In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import io
import os

# Function to save plot as byte stream
def save_plot_to_bytes(fig):
    """Render *fig* as a PNG into an in-memory buffer and return the buffer.

    ``fig`` must expose ``savefig(file_like, format=...)``. The returned
    ``io.BytesIO`` is rewound to offset 0, so a caller can read the image
    bytes immediately.
    """
    png_buffer = io.BytesIO()
    fig.savefig(png_buffer, format='png')
    # Writing leaves the cursor at the end of the data; rewind for readers.
    png_buffer.seek(0)
    return png_buffer

# Get list of tickers (file names without extensions) from the directory
list_of_tickers = os.listdir('../stock_data_csv')
tickers_without_extension = [os.path.splitext(ticker)[0] for ticker in list_of_tickers if ticker.endswith('.csv')]

# joblib.dump opens the target path directly and does not create missing
# parent directories, so make sure the output folder exists up front.
os.makedirs('./EDA_Results', exist_ok=True)

# Iterate through each ticker and perform EDA
for ticker in tickers_without_extension:
    # Load the dataset and order it chronologically
    df = pd.read_csv(f"../stock_data_csv/{ticker}.csv")
    df["published_date"] = pd.to_datetime(df["published_date"])
    df.sort_values("published_date", inplace=True)

    # Perform EDA
    eda_stats = df.describe()
    missing_values = df.isnull().sum()
    # Leave the 'status' column out of the correlation whenever it is present
    correlation_matrix = df.drop(columns=['status']).corr() if 'status' in df.columns else df.corr()

    # Print results for verification (optional)
    print(f"Dataset Overview for {ticker}:")
    # DataFrame.info() writes its report itself and returns None; wrapping it
    # in print() would emit a stray "None" line after the report.
    df.info()
    print(f"\nDescriptive Statistics for {ticker}:")
    print(eda_stats)
    print(f"\nMissing Values for {ticker}:")
    print(missing_values)
    print(f"\nCorrelation Matrix for {ticker}:")
    print(correlation_matrix)

    # Save Closing Price Plot as byte stream
    closing_fig = plt.figure(figsize=(10, 5))
    sns.lineplot(x=df["published_date"], y=df["close"], label="Close Price")
    plt.title(f"{ticker} Stock Closing Price Over Time")
    plt.xlabel("Date")
    plt.ylabel("Close Price")
    plt.legend()
    closing_price_img_stream = save_plot_to_bytes(closing_fig)  # Save plot to byte stream
    plt.close(closing_fig)

    # Moving Averages Plot
    df['10-day MA'] = df['close'].rolling(window=10).mean()
    df['50-day MA'] = df['close'].rolling(window=50).mean()
    moving_avg_fig = plt.figure(figsize=(12, 6))
    # Plot against the date column explicitly: sort_values() keeps the original
    # row labels, so plotting the bare Series would use those labels as x.
    plt.plot(df['published_date'], df['close'], label='Closing Price', alpha=0.8)
    plt.plot(df['published_date'], df['10-day MA'], label='10-day MA', linestyle='dashed')
    plt.plot(df['published_date'], df['50-day MA'], label='50-day MA', linestyle='dashed')
    plt.title(f"{ticker} Closing Price with Moving Averages")
    plt.xlabel("Date")
    plt.ylabel("Close Price")
    plt.legend()
    moving_avg_img_stream = save_plot_to_bytes(moving_avg_fig)  # Save plot to byte stream
    plt.close(moving_avg_fig)

    # Heatmap Plot for Correlation
    heatmap_fig = plt.figure(figsize=(8, 5))
    sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
    plt.title(f"{ticker} Feature Correlation Matrix")
    heatmap_img_stream = save_plot_to_bytes(heatmap_fig)  # Save plot to byte stream
    plt.close(heatmap_fig)

    # Combine all results into a dictionary
    # (df already carries the two moving-average columns added above)
    eda_results = {
        'dataframe': df,
        'eda_statistics': eda_stats,
        'missing_values': missing_values,
        'correlation_matrix': correlation_matrix,
        'closing_price_plot': closing_price_img_stream,
        'moving_avg_plot': moving_avg_img_stream,
        'heatmap_plot': heatmap_img_stream,
    }

    # Save the results for each ticker in a separate joblib file
    joblib.dump(eda_results, f'./EDA_Results/eda_results_{ticker}.joblib')

    print(f"\nEDA results and plots for {ticker} have been saved into 'eda_results_{ticker}.joblib'")


Dataset Overview for AHPC:
<class 'pandas.core.frame.DataFrame'>
Index: 3287 entries, 0 to 3286
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   published_date   3287 non-null   datetime64[ns]
 1   open             3287 non-null   float64       
 2   high             3287 non-null   float64       
 3   low              3287 non-null   float64       
 4   close            3287 non-null   float64       
 5   per_change       3286 non-null   float64       
 6   traded_quantity  3287 non-null   float64       
 7   traded_amount    3287 non-null   float64       
 8   status           3287 non-null   int64         
dtypes: datetime64[ns](1), float64(7), int64(1)
memory usage: 256.8 KB
None

Descriptive Statistics for AHPC:
                      published_date         open         high          low  \
count                           3287  3287.000000  3287.000000  3287.000000   
mean   2017-06-07 15:

  sqr = _ensure_numeric((avg - values) ** 2)



EDA results and plots for MPFL have been saved into 'eda_results_MPFL.joblib'
Dataset Overview for NBL:
<class 'pandas.core.frame.DataFrame'>
Index: 2637 entries, 0 to 2636
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   published_date   2637 non-null   datetime64[ns]
 1   open             2637 non-null   float64       
 2   high             2637 non-null   float64       
 3   low              2637 non-null   float64       
 4   close            2637 non-null   float64       
 5   per_change       2636 non-null   float64       
 6   traded_quantity  2637 non-null   float64       
 7   traded_amount    2637 non-null   float64       
 8   status           2637 non-null   int64         
dtypes: datetime64[ns](1), float64(7), int64(1)
memory usage: 206.0 KB
None

Descriptive Statistics for NBL:
                      published_date         open         high          low  \
count                  

  sqr = _ensure_numeric((avg - values) ** 2)



EDA results and plots for RLFL have been saved into 'eda_results_RLFL.joblib'
Dataset Overview for SADBL:
<class 'pandas.core.frame.DataFrame'>
Index: 2287 entries, 0 to 2286
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   published_date   2287 non-null   datetime64[ns]
 1   open             2287 non-null   float64       
 2   high             2287 non-null   float64       
 3   low              2287 non-null   float64       
 4   close            2287 non-null   float64       
 5   per_change       2286 non-null   float64       
 6   traded_quantity  2287 non-null   float64       
 7   traded_amount    2287 non-null   float64       
 8   status           2287 non-null   int64         
dtypes: datetime64[ns](1), float64(7), int64(1)
memory usage: 178.7 KB
None

Descriptive Statistics for SADBL:
                      published_date         open         high          low  \
count              