# Exploratory Data Analysis (EDA)
Load the enhanced stock data, then print summary statistics, missing-value counts, correlations with daily return, and per-ticker return and volume summaries (text output only).

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from pathlib import Path

# Read the enhanced CSV into `combined_df`, parsing the 'Date' column to
# datetime; abort with a pointer to the notebook named in the error message
# when the file is absent.
enhanced_csv = 'enhanced_stock_data.csv'
if Path(enhanced_csv).exists():
    combined_df = pd.read_csv(enhanced_csv).assign(
        Date=lambda frame: pd.to_datetime(frame['Date'])
    )
else:
    raise FileNotFoundError(f"{enhanced_csv} not found. Run data_preprocessing_feature_engineering.ipynb first.")
print(f"Loaded enhanced data: {combined_df.shape}")

Loaded enhanced data: (11130, 12)


In [2]:
# Structural overview of the loaded frame: shape, dtypes/non-null counts,
# describe() statistics, and a preview of the first rows.
print("=== Basic Data Overview ===")
print(f"DataFrame Shape: {combined_df.shape}")
print("\nColumn Names and Data Types:")
# DataFrame.info() writes its report to stdout and returns None, so wrapping
# it in print() appended a stray "None" line after the report.
combined_df.info()
print("\nSummary Statistics (Numerical Columns):")
print(combined_df.describe())
print("\nFirst 5 Rows:")
print(combined_df.head())

=== Basic Data Overview ===
DataFrame Shape: (11130, 12)

Column Names and Data Types:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11130 entries, 0 to 11129
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Date           11130 non-null  datetime64[ns]
 1   Ticker         11130 non-null  object        
 2   Open           11130 non-null  float64       
 3   High           11130 non-null  float64       
 4   Low            11130 non-null  float64       
 5   Close          11130 non-null  float64       
 6   Adj Close      11130 non-null  float64       
 7   Volume         11130 non-null  float64       
 8   Daily Return   11130 non-null  float64       
 9   SMA_20         11130 non-null  float64       
 10  SMA_50         11130 non-null  float64       
 11  Volume Change  11130 non-null  float64       
dtypes: datetime64[ns](1), float64(10), object(1)
memory usage: 1.0+ MB
None

Summary Stat

In [3]:
# Null audit: absolute count of missing cells per column, followed by the
# same counts expressed as a percentage of the total row count.
print("\n=== Missing Values Analysis ===")
missing_summary = combined_df.isnull().sum()
missing_pct = (missing_summary / len(combined_df)) * 100
print("Missing Values per Column:", missing_summary, sep="\n")
print("\nPercentage of Missing Values:", missing_pct, sep="\n")


=== Missing Values Analysis ===
Missing Values per Column:
Date             0
Ticker           0
Open             0
High             0
Low              0
Close            0
Adj Close        0
Volume           0
Daily Return     0
SMA_20           0
SMA_50           0
Volume Change    0
dtype: int64

Percentage of Missing Values:
Date             0.0
Ticker           0.0
Open             0.0
High             0.0
Low              0.0
Close            0.0
Adj Close        0.0
Volume           0.0
Daily Return     0.0
SMA_20           0.0
SMA_50           0.0
Volume Change    0.0
dtype: float64


In [4]:
# Pairwise correlations over the numeric feature columns; only the
# 'Daily Return' column of the matrix is printed, strongest first.
numerical_cols = [
    'Open', 'High', 'Low', 'Close', 'Adj Close',
    'Volume', 'Daily Return', 'SMA_20', 'SMA_50', 'Volume Change',
]
corr_matrix = combined_df.loc[:, numerical_cols].corr()
daily_return_corr = corr_matrix.loc[:, 'Daily Return'].sort_values(ascending=False)

print("=== Correlation Matrix (Top Correlations) ===")
print(daily_return_corr)
# NOTE(review): the insight below is a fixed string, not derived from
# corr_matrix; the OHLC-vs-OHLC correlations it mentions are not printed here.
print("Key Insights: High correlation between OHLC prices (expected). Check for multicollinearity in models.")

=== Correlation Matrix (Top Correlations) ===
Daily Return     1.000000
Volume Change    1.000000
High             0.425198
Open             0.425198
Low              0.425198
Close            0.425198
Volume           0.425198
Adj Close        0.425198
SMA_20          -0.144857
SMA_50          -0.164451
Name: Daily Return, dtype: float64
Key Insights: High correlation between OHLC prices (expected). Check for multicollinearity in models.


In [5]:
# Per-ticker summary of daily returns (mean, std, min, max), scaled by 100.
# Assumes 'Daily Return' is stored as a fractional change, so the scaled
# values read as percentages — TODO confirm against the preprocessing step.
print("=== Daily Returns Summary by Ticker ===")
# Scale first and round last: rounding to 4 places and then multiplying by
# 100 lets binary floating-point error back in after the rounding
# (e.g. 0.0007 * 100 -> 0.06999999999999999).
returns_summary = (
    combined_df.groupby('Ticker')['Daily Return']
    .agg(['mean', 'std', 'min', 'max'])
    .mul(100)
    .round(2)
)
print(returns_summary)
print("\nKey Insights: Compare volatility (std) and average returns across stocks.")

# Per-ticker traded-volume summary (mean, std, max), rounded to 2 decimals.
print("\n=== Overall Volume Insights ===")
volume_summary = combined_df.groupby('Ticker')['Volume'].agg(['mean', 'std', 'max']).round(2)
print(volume_summary)

=== Daily Returns Summary by Ticker ===
                 mean    std     min     max
Ticker                                      
ADANIPORTS.NS   -0.08   2.21  -60.12    0.00
AXISBANK.NS      0.01   3.31  -60.12   66.94
BAJFINANCE.NS    0.07   2.50  -12.41   66.94
BHARTIARTL.NS    0.50  13.99  -12.41  380.83
HDFCBANK.NS      0.42  14.21  -68.74  380.83
HINDUNILVR.NS    0.02   3.90  -68.74   80.89
ICICIBANK.NS     0.10   2.98   -6.89   80.89
INFY.NS          0.07   2.09   -6.89   56.45
ITC.NS           0.00   2.86  -53.69   56.45
LT.NS            0.05   3.81  -53.69   88.66
M&M.NS           0.46   9.82    0.00  252.47
RELIANCE.NS    252.47   0.00  252.47  252.47
SBIN.NS          0.25   9.58  -65.48  252.47
TATAMOTORS.NS   -0.20   3.82  -80.82    0.00
TCS.NS           0.05   5.33  -80.82  120.35

Key Insights: Compare volatility (std) and average returns across stocks.

=== Overall Volume Insights ===
                  mean     std     max
Ticker                                
ADANIPORT