In [55]:
"""
Module: data_preprocessing.py
Purpose: Clean and prepare raw financial data for modeling.
"""

import os
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller

def preprocess_data(data_dict, save_path="data/processed", price_col="Adj Close"):
    """
    Preprocess raw data: clean, feature engineer, and save each ticker to CSV.

    Each DataFrame is copied before processing, so the frames passed in
    ``data_dict`` are left unmodified. For every ticker the copy gets a
    sorted datetime index, has missing values forward- then backward-filled,
    and gains three columns derived from ``price_col``: ``Daily_Return``,
    ``Rolling_Mean_20`` and ``Rolling_Std_20``. An ADF test is attempted on
    the returns; its p-value (NaN if the test could not be run) is stored in
    ``df.attrs['ADF_pvalue']``. The result is written to
    ``<save_path>/<ticker>_processed.csv``.

    Args:
        data_dict (dict): Dictionary of DataFrames {ticker: DataFrame}
        save_path (str): Directory to save processed data (created if missing)
        price_col (str): Name of the price column the features are built from

    Returns:
        dict: Processed DataFrames for each ticker

    Raises:
        KeyError: If a ticker's DataFrame has no ``price_col`` column.
    """
    os.makedirs(save_path, exist_ok=True)
    processed_dict = {}

    for ticker, raw_df in data_dict.items():
        print(f"🛠 Preprocessing {ticker}...")

        if price_col not in raw_df.columns:
            raise KeyError(
                f"{ticker}: column '{price_col}' not found; "
                f"available columns: {list(raw_df.columns)}"
            )

        # Work on a copy so the caller's DataFrame is never mutated.
        df = raw_df.copy()

        # Ensure a chronologically ordered datetime index
        df.index = pd.to_datetime(df.index)
        df.sort_index(inplace=True)

        # Handle missing values: forward fill first, then backward fill so
        # that leading gaps (which have no earlier value) are also filled.
        df.ffill(inplace=True)
        df.bfill(inplace=True)

        # Feature Engineering
        prices = df[price_col]
        df['Daily_Return'] = prices.pct_change()
        df['Rolling_Mean_20'] = prices.rolling(window=20).mean()
        df['Rolling_Std_20'] = prices.rolling(window=20).std()

        # Stationarity test (ADF on returns). This is best-effort: a failure
        # must not abort preprocessing, but the reason is reported instead of
        # being silently discarded.
        returns = df['Daily_Return'].dropna()
        adf_pvalue = np.nan
        try:
            adf_pvalue = adfuller(returns)[1]
        except Exception as exc:
            print(f"   ⚠️ ADF test failed for {ticker}: {exc}")
        df.attrs['ADF_pvalue'] = adf_pvalue
        print(f"   📊 ADF p-value (returns): {adf_pvalue:.4f}")

        # Save processed CSV
        file_path = os.path.join(save_path, f"{ticker}_processed.csv")
        df.to_csv(file_path)
        print(f"   ✅ Saved processed data to {file_path}")

        processed_dict[ticker] = df

    return processed_dict


def merge_assets(processed_dict):
    """
    Combine every asset's adjusted close and daily return into one DataFrame.

    Columns are named ``<ticker>_Adj_Close`` and ``<ticker>_Daily_Return`` and
    are added in the iteration order of ``processed_dict``. Rows containing
    any missing value are removed before the frame is returned.

    Args:
        processed_dict (dict): Dictionary of DataFrames {ticker: DataFrame},
            each with 'Adj Close' and 'Daily_Return' columns

    Returns:
        DataFrame: The combined, NaN-free frame
    """
    # (source column, suffix used in the combined column name)
    column_specs = (
        ('Adj Close', 'Adj_Close'),
        ('Daily_Return', 'Daily_Return'),
    )

    combined = pd.DataFrame()
    for symbol, frame in processed_dict.items():
        for source_col, suffix in column_specs:
            combined[f"{symbol}_{suffix}"] = frame[source_col]

    return combined.dropna()


if __name__ == "__main__":
    # Test run: expects <base>/data/raw/<TICKER>_raw.csv for each ticker below.
    tickers = ['SPY', 'TSLA']

    # __file__ is undefined when run from a notebook; fall back to the
    # current working directory in that case.
    try:
        BASE_DIR = os.path.dirname(os.path.abspath(__file__))
    except NameError:
        BASE_DIR = os.getcwd()

    RAW_DATA_DIR = os.path.join(BASE_DIR, "data", "raw")

    # Header names as read from the raw CSV -> the names used from here on.
    # Keys that are absent from a given file are simply ignored by rename().
    column_names = {
        'Price': 'Date',
        'Unnamed: 1': 'Close',
        'Unnamed: 2': 'High',
        'Unnamed: 3': 'Low',
        'Unnamed: 4': 'Open',
        'Unnamed: 5': 'Volume',
    }

    raw_data = {}
    for t in tickers:
        # Skip the two metadata rows at the top of the raw file
        df = pd.read_csv(
            os.path.join(RAW_DATA_DIR, f"{t}_raw.csv"),
            skiprows=2,
            header=0,
        )
        print(f"Columns for {t}: {df.columns.tolist()}")

        df = df.rename(columns=column_names)
        df = df.assign(Date=pd.to_datetime(df['Date'])).set_index('Date')
        raw_data[t] = df

    print(raw_data['SPY'].head())

    # NOTE: the frames are written out as loaded; preprocess_data() is not
    # called in this test run.
    output_dir = 'processed_data'
    os.makedirs(output_dir, exist_ok=True)

    for ticker, df in raw_data.items():
        csv_path = os.path.join(output_dir, f'{ticker}_processed.csv')
        df.to_csv(csv_path)
        print(f'Saved {ticker} data to {csv_path}')


Columns for SPY: ['Date', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5']
Columns for TSLA: ['Date', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5']
                 Close        High         Low        Open     Volume
Date                                                                 
2015-07-01  174.917114  175.363889  174.124717  175.110995  135979900
2015-07-02  174.756927  175.566188  174.335441  175.397596  104373700
2015-07-06  174.259628  175.043588  173.256487  173.458805  117975400
2015-07-07  175.355438  175.481879  172.059407  174.461888  173820200
2015-07-08  172.413498  174.293327  172.177466  174.006719  164020100
Saved SPY data to processed_data\SPY_processed.csv
Saved TSLA data to processed_data\TSLA_processed.csv
