# Data Preprocessing & Feature Engineering
Load raw data from CSV, add features (Daily Return, SMAs, Volume Change), clean NaNs. Outputs `enhanced_stock_data.csv`.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

print("Libraries imported")

Libraries imported


In [3]:
# Load raw CSV (checkpoint from data_collection.ipynb)
raw_csv = 'raw_stock_data.csv'
if not Path(raw_csv).exists():
    raise FileNotFoundError(f"{raw_csv} not found. Run data_collection.ipynb first.")
combined_df = pd.read_csv(raw_csv)
combined_df['Date'] = pd.to_datetime(combined_df['Date'], utc=True, errors='coerce')
combined_df['Date'] = combined_df['Date'].dt.tz_convert(None)  # remove timezone info

print(f"Loaded raw data: {combined_df.shape}")
print(combined_df.head())

Loaded raw data: (11130, 8)
        Date       Ticker  Open  High  Low  Close  Adj Close  Volume
0 2022-09-26  RELIANCE.NS   NaN   NaN  NaN    NaN        NaN     NaN
1 2022-09-27  RELIANCE.NS   NaN   NaN  NaN    NaN        NaN     NaN
2 2022-09-28  RELIANCE.NS   NaN   NaN  NaN    NaN        NaN     NaN
3 2022-09-29  RELIANCE.NS   NaN   NaN  NaN    NaN        NaN     NaN
4 2022-09-30  RELIANCE.NS   NaN   NaN  NaN    NaN        NaN     NaN


In [4]:
# Timezone handling and sorting
combined_df['Date'] = pd.to_datetime(combined_df['Date'], utc=True)
combined_df['Date'] = combined_df['Date'].dt.tz_convert(None)
combined_df.sort_values(by=['Ticker', 'Date'], inplace=True)

# Daily Returns
combined_df['Daily Return'] = combined_df.groupby('Ticker')['Adj Close'].pct_change(fill_method=None)

# Moving Averages
combined_df['SMA_20'] = combined_df.groupby('Ticker')['Close'].transform(lambda x: x.rolling(window=20).mean())
combined_df['SMA_50'] = combined_df.groupby('Ticker')['Close'].transform(lambda x: x.rolling(window=50).mean())

# Volume Change
combined_df['Volume Change'] = combined_df.groupby('Ticker')['Volume'].pct_change(fill_method=None)

print("DataFrame with engineered features:")
print(combined_df.tail())

DataFrame with engineered features:
           Date  Ticker         Open         High          Low        Close  \
1479 2025-09-18  TCS.NS  1389.800049  1389.800049  1389.800049  1389.800049   
1480 2025-09-19  TCS.NS  1389.800049  1389.800049  1389.800049  1389.800049   
1481 2025-09-22  TCS.NS  1389.800049  1389.800049  1389.800049  1389.800049   
1482 2025-09-23  TCS.NS  1389.800049  1389.800049  1389.800049  1389.800049   
1483        NaT  TCS.NS  3062.399902  3062.399902  3062.399902  3062.399902   

        Adj Close       Volume  Daily Return       SMA_20       SMA_50  \
1479  1389.800049  1389.800049      0.000000  1389.800049  1389.800049   
1480  1389.800049  1389.800049      0.000000  1389.800049  1389.800049   
1481  1389.800049  1389.800049      0.000000  1389.800049  1389.800049   
1482  1389.800049  1389.800049      0.000000  1389.800049  1389.800049   
1483  3062.399902  3062.399902      1.203482  1473.430042  1423.252046   

      Volume Change  
1479       0.000000  


In [5]:
print("=== Missing Values Analysis ===")
missing_summary = combined_df.isnull().sum()
print("Missing Values per Column:")
print(missing_summary)
print("\nPercentage of Missing Values:")
print((missing_summary / len(combined_df)) * 100)

# Fill NaNs (fixed for pandas 2.0+)
combined_df.ffill(inplace=True)
combined_df.bfill(inplace=True)
print(f"\nAfter filling: Total missing values = {combined_df.isnull().sum().sum()}")

# Save enhanced CSV for next notebooks
enhanced_csv = 'enhanced_stock_data.csv'
combined_df.to_csv(enhanced_csv, index=False)
print(f"✓ Enhanced data saved to '{enhanced_csv}' ({len(combined_df)} rows). Run eda.ipynb or visualization.ipynb next.")

=== Missing Values Analysis ===
Missing Values per Column:
Date               15
Ticker              0
Open              741
High              741
Low               741
Close             741
Adj Close         741
Volume            741
Daily Return      756
SMA_20           1008
SMA_50           1428
Volume Change     756
dtype: int64

Percentage of Missing Values:
Date              0.134771
Ticker            0.000000
Open              6.657682
High              6.657682
Low               6.657682
Close             6.657682
Adj Close         6.657682
Volume            6.657682
Daily Return      6.792453
SMA_20            9.056604
SMA_50           12.830189
Volume Change     6.792453
dtype: float64

After filling: Total missing values = 0
✓ Enhanced data saved to 'enhanced_stock_data.csv' (11130 rows). Run eda.ipynb or visualization.ipynb next.
