In [None]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import ta  


In [None]:
# Folder holding one CSV per ticker, named "<TICKER>.csv".
data_folder = "/Users/saamsani/Desktop/CMPT /stock_prediction_project/data/stock_data"

# os.listdir returns every entry in arbitrary order. Keep only CSV files (skips
# things like .DS_Store) and sort so the load order is reproducible between runs.
all_files = sorted(
    name for name in os.listdir(data_folder)
    if name.lower().endswith(".csv")
)

# Files with fewer rows than this are skipped.
min_rows = 30

# One DataFrame per successfully loaded ticker; concatenated in a later cell.
stock_dfs = []

print("Loading all stock CSVs and adding Ticker column...")

for file in tqdm(all_files):
    try:
        file_path = os.path.join(data_folder, file)
        df = pd.read_csv(file_path)
        if len(df) >= min_rows:
            # Strip only the extension; str.replace would also remove ".csv"
            # if it happened to occur in the middle of the file name.
            ticker = os.path.splitext(file)[0]
            df["Ticker"] = ticker
            stock_dfs.append(df)
    except Exception as e:
        # Best effort: report the bad file and keep loading the rest.
        print(f"Failed to load {file}: {e}")


In [None]:
# Stack every per-ticker frame into one long table with a fresh 0..n-1 index.
combined_df = pd.concat(stock_dfs, ignore_index=True)

print("Combined data shape:", combined_df.shape)



In [None]:
df = combined_df.copy()

# Fix incorrect column name (messed up from last notebook)
df = df.rename(columns={'Price': 'Date'})
# Keep only rows whose 'Date' is a real YYYY-MM-DD string (drops stray header
# rows such as 'Date' / 'Ticker', and NaN). The .copy() makes the filtered
# frame independent so the column assignments below do not write into a slice.
df = df[df['Date'].str.match(r'^\d{4}-\d{2}-\d{2}$', na=False)].copy()

# Make sure Date is in datetime format and sort the data
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values(by=['Ticker', 'Date'])

# Convert key price/volume columns to numbers; unparseable values become NaN
cols_to_fix = ['Open', 'High', 'Low', 'Close', 'Volume']
for col in cols_to_fix:
    df[col] = pd.to_numeric(df[col], errors='coerce')


print("Adding technical indicators...")

indicator_cols = ['SMA_10', 'EMA_10', 'RSI_14', 'ATR_14']


def _add_indicators(group):
    """Return a copy of one ticker's rows with the indicator columns added.

    `group` must hold a single ticker, already sorted by Date. Computing the
    indicators per ticker keeps every rolling window inside that ticker's own
    price history instead of running across the boundary into the previous
    ticker of the concatenated frame.
    """
    group = group.copy()

    # The longest window used below is 14 rows. Shorter tickers get NaN
    # indicators (and are removed by the dropna further down) rather than being
    # handed to ta with less than one full window of data.
    if len(group) < 14:
        for name in indicator_cols:
            group[name] = np.nan
        return group

    close = group['Close']

    # 10-day Simple Moving Average: equal weight on the last 10 days
    group['SMA_10'] = ta.trend.sma_indicator(close, window=10)
    # 10-day Exponential Moving Average: more weight on recent prices
    group['EMA_10'] = ta.trend.ema_indicator(close, window=10)
    # Relative Strength Index (14 days): momentum oscillator on a 0-100 scale
    group['RSI_14'] = ta.momentum.rsi(close, window=14)
    # Average True Range (14 days): volatility from high/low/close ranges
    group['ATR_14'] = ta.volatility.average_true_range(
        group['High'], group['Low'], close, window=14
    )
    return group


# df is sorted by Ticker, so sort=False keeps the existing row order.
per_ticker = [
    _add_indicators(group)
    for _, group in df.groupby('Ticker', sort=False)
]
if per_ticker:
    df = pd.concat(per_ticker)
else:
    # No rows survived cleaning: still create the columns later cells expect.
    df = df.assign(**{name: np.nan for name in indicator_cols})

# Remove rows with a missing value in ANY column (indicator warm-up rows and
# rows whose price/volume fields could not be parsed).
df = df.dropna()

print("Data shape after adding indicators:", df.shape)
df.head()


In [None]:
# Target_A: 1 if the next row's Close (same ticker) is above this row's Close,
# 0 if it is lower or unchanged.
next_close = df.groupby('Ticker')['Close'].shift(-1)

# The last row of each ticker has no next close. Comparing against NaN yields
# False, which would silently label an unknown outcome as 0. Keep those rows as
# <NA> (nullable Int64) so a dropna on the target column can remove them.
df['Target_A'] = (next_close > df['Close']).astype('Int64').where(next_close.notna())



In [None]:
# Target_B: 1 if there is a breakout, i.e. the highest High over the NEXT 3 rows
# (same ticker) is at least 10% above this row's Close; otherwise 0.
#
# rolling() looks backwards, so shift(-1).rolling(3) would cover the rows at
# t-1, t and t+1. To get a forward-looking window, reverse the series, take the
# rolling max (now covering t, t+1, t+2), reverse back, then shift by one so the
# window becomes t+1 .. t+3.
future_high = (
    df.groupby('Ticker')['High']
    .transform(lambda x: x.iloc[::-1].rolling(3).max().iloc[::-1].shift(-1))
)

# The last 3 rows of each ticker do not have a full 3-row future window, so
# future_high is NaN there. Keep those labels as <NA> (nullable Int64) instead
# of letting the NaN comparison turn them into 0.
df['Target_B'] = (
    (future_high >= df['Close'] * 1.10)
    .astype('Int64')
    .where(future_high.notna())
)



In [None]:
# Target_C: 1 if the Close 5 rows ahead (same ticker) is above this row's Close,
# else 0.
close_in_5 = df.groupby('Ticker')['Close'].shift(-5)

# The last 5 rows of each ticker have no value 5 rows ahead. Comparing against
# NaN yields False, which would label an unknown outcome as 0. Keep those rows
# as <NA> (nullable Int64) so a dropna on the target column can remove them.
df['Target_C'] = (close_in_5 > df['Close']).astype('Int64').where(close_in_5.notna())

print(df[['Date', 'Ticker', 'Close', 'Target_C']].head())


In [None]:
# Every indicator and every target must be present for a row to be usable.
required_columns = [
    'SMA_10', 'EMA_10', 'RSI_14', 'ATR_14',
    'Target_A', 'Target_B', 'Target_C',
]

# Discard rows that are missing any of the required columns.
df = df.dropna(subset=required_columns)

print("Final shape after dropping NaNs:", df.shape)


In [None]:
# Save the engineered dataset for model training (row index is not written).
output_path = "/Users/saamsani/Desktop/CMPT /stock_prediction_project/data/engineered_stock_data.csv"
df.to_csv(output_path, index=False)
