# Stock Movement Prediction – Training Notebook
This notebook downloads stock data, builds 20‑day windows, trains MLP and LSTM models, evaluates accuracy, and saves weights for deployment.

## 1. Imports

In [21]:
import numpy as np
import pandas as pd
import yfinance as yf
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from tqdm import tqdm
import os
from datetime import datetime, timedelta

SEED=42
np.random.seed(SEED)
torch.manual_seed(SEED)
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## 2. Parameters

In [None]:

# Load S&P 500 constituents list
url = "https://raw.githubusercontent.com/datasets/s-and-p-500-companies/master/data/constituents.csv"
sp500 = pd.read_csv(url)

# Clean tickers: replace '.' with '-' for yfinance compatibility
sp500['Symbol'] = sp500['Symbol'].str.replace('.', '-', regex=False)

# Hardcoded So users can easily add and remove specific tickers according to preferences
TOP50 = [
    "AAPL", "MSFT", "AMZN", "NVDA", "GOOGL", "GOOG", "META", "TSLA", "BRK-B", "UNH",
    "XOM", "JPM", "JNJ", "V", "PG", "LLY", "HD", "MA", "CVX", "AVGO",
    "COST", "PEP", "PFE", "KO", "MRK", "ABBV", "WMT", "BAC", "TMO", "DIS",
    "ADBE", "CSCO", "CRM", "MCD", "ACN", "LIN", "ABT", "ORCL", "NKE", "DHR",
    "CMCSA", "TXN", "NEE", "WFC", "PM", "VZ", "RTX", "UPS", "INTC", "AMD"
]

# Filter the S&P 500 tickers to only include those in our TOP50 list
TICKERS = [t for t in sp500['Symbol'].tolist() if t in TOP50]

print("Total tickers loaded:", len(TICKERS))
print("Tickers:", TICKERS)


Total tickers loaded: 50
Tickers: ['ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'GOOGL', 'GOOG', 'AMZN', 'AAPL', 'BAC', 'BRK-B', 'AVGO', 'CVX', 'CSCO', 'KO', 'CMCSA', 'COST', 'DHR', 'XOM', 'HD', 'INTC', 'JNJ', 'JPM', 'LLY', 'LIN', 'MA', 'MCD', 'MRK', 'META', 'MSFT', 'NEE', 'NKE', 'NVDA', 'ORCL', 'PEP', 'PFE', 'PM', 'PG', 'RTX', 'CRM', 'TSLA', 'TXN', 'TMO', 'UPS', 'UNH', 'VZ', 'V', 'WMT', 'DIS', 'WFC']


## 3. Download Data
We fetch daily OHLCV stock prices using yfinance, compute daily returns, and generate labels (1 = next‑day price up).

In [23]:
# End date = today
END_DATE = datetime.today().strftime("%Y-%m-%d")

# Start date = 7 years before today
START_DATE = (datetime.today() - timedelta(days=365*7)).strftime("%Y-%m-%d")

print("START_DATE:", START_DATE)
print("END_DATE:", END_DATE)

all_frames = []
for ticker in TICKERS:
    print(f"\nDownloading {ticker}...")

    df = yf.download(
        ticker,
        start=START_DATE,
        end=END_DATE,
        progress=False,
        auto_adjust=False,
        group_by=None
    )

    if df is None or df.empty:
        print(f"❌ Skipping {ticker} (empty DataFrame)")
        continue

    # Your MultiIndex fix:
    df.columns = df.columns.droplevel(0)

    if "Close" not in df.columns:
        print(f"❌ Skipping {ticker} (missing Close)")
        print(df.columns)
        continue

    df = df.sort_index()
    df["Return"] = df["Close"].pct_change()
    df["Target"] = (df["Return"].shift(-1) > 0).astype(int)
    df["Ticker"] = ticker

    df = df.dropna(subset=["Return", "Target"])

    if df.empty:
        print(f"❌ Skipping {ticker} (no valid rows after cleaning)")
        continue

    print(f"✅ Loaded {ticker}, rows:", len(df))
    all_frames.append(df)
data = pd.concat(all_frames)
print("Final dataset size:", data.shape)
data.head()

print("\nFinal tickers used:", [df["Ticker"].iloc[0] for df in all_frames])



START_DATE: 2018-12-09
END_DATE: 2025-12-07

Downloading ABT...
✅ Loaded ABT, rows: 1757

Downloading ABBV...
✅ Loaded ABBV, rows: 1757

Downloading ACN...
✅ Loaded ACN, rows: 1757

Downloading ADBE...
✅ Loaded ADBE, rows: 1757

Downloading AMD...
✅ Loaded AMD, rows: 1757

Downloading GOOGL...
✅ Loaded GOOGL, rows: 1757

Downloading GOOG...
✅ Loaded GOOG, rows: 1757

Downloading AMZN...
✅ Loaded AMZN, rows: 1757

Downloading AAPL...
✅ Loaded AAPL, rows: 1757

Downloading BAC...
✅ Loaded BAC, rows: 1757

Downloading BRK-B...
✅ Loaded BRK-B, rows: 1757

Downloading AVGO...
✅ Loaded AVGO, rows: 1757

Downloading CVX...
✅ Loaded CVX, rows: 1757

Downloading CSCO...
✅ Loaded CSCO, rows: 1757

Downloading KO...
✅ Loaded KO, rows: 1757

Downloading CMCSA...
✅ Loaded CMCSA, rows: 1757

Downloading COST...
✅ Loaded COST, rows: 1757

Downloading DHR...
✅ Loaded DHR, rows: 1757

Downloading XOM...
✅ Loaded XOM, rows: 1757

Downloading HD...
✅ Loaded HD, rows: 1757

Downloading INTC...
✅ Loaded IN

## 4. Build Sliding Windows
Each sample consists of the last 20 daily returns → label is next day's movement.

In [24]:
X_list=[]; y_list=[]
for ticker in TICKERS:
    df=data[data['Ticker']==ticker].sort_index()
    ret=df['Return'].values
    tgt=df['Target'].values
    for i in range(WINDOW_SIZE,len(df)):
        X_list.append(ret[i-WINDOW_SIZE:i])
        y_list.append(tgt[i-1])

X=np.array(X_list,dtype=np.float32)
y=np.array(y_list,dtype=np.float32)
X.shape, y.shape

((86850, 20), (86850,))

## 5. Train/Val/Test Split

In [25]:
n=len(X)
train_end=int(0.7*n)
val_end=int(0.85*n)

X_train, y_train = X[:train_end], y[:train_end]
X_val, y_val = X[train_end:val_end], y[train_end:val_end]
X_test, y_test = X[val_end:], y[val_end:]

X_train.shape, X_val.shape, X_test.shape

((60794, 20), (13028, 20), (13028, 20))

## 6. Create Torch Datasets

In [26]:
X_train_t=torch.tensor(X_train)
y_train_t=torch.tensor(y_train.reshape(-1,1))
X_val_t=torch.tensor(X_val)
y_val_t=torch.tensor(y_val.reshape(-1,1))
X_test_t=torch.tensor(X_test)
y_test_t=torch.tensor(y_test.reshape(-1,1))

train_loader=DataLoader(TensorDataset(X_train_t,y_train_t),batch_size=BATCH_SIZE,shuffle=True)
val_loader=DataLoader(TensorDataset(X_val_t,y_val_t),batch_size=BATCH_SIZE)
test_loader=DataLoader(TensorDataset(X_test_t,y_test_t),batch_size=BATCH_SIZE)

## 7. Define MLP and LSTM Models

In [27]:
class MLPClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net=nn.Sequential(
            nn.Linear(input_dim,128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Linear(64,1),
            nn.Sigmoid()
        )
    def forward(self,x): return self.net(x.float())

class LSTMClassifier(nn.Module):
    def __init__(self, feature_dim, hidden_size=64, num_layers=1):
        super().__init__()
        self.lstm=nn.LSTM(feature_dim,hidden_size,num_layers,batch_first=True)
        self.fc=nn.Linear(hidden_size,1)
        self.sigmoid=nn.Sigmoid()
    def forward(self,x):
        out,_=self.lstm(x.float())
        out=self.fc(out[:,-1,:])
        return self.sigmoid(out)

## 8. Training Helpers

In [28]:
def train_epoch(model,loader,crit,opt):
    model.train(); total=correct=loss_sum=0
    for xb,yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        opt.zero_grad()
        pred=model(xb)
        loss=crit(pred,yb)
        loss.backward()
        opt.step()
        loss_sum+=loss.item()*len(xb)
        correct+=((pred>=0.5).float()==yb).sum().item()
        total+=len(xb)
    return loss_sum/total, correct/total

def eval_epoch(model,loader,crit):
    model.eval(); total=correct=loss_sum=0; preds=[]; labels=[]
    with torch.no_grad():
        for xb,yb in loader:
            xb,yb=xb.to(device),yb.to(device)
            pred=model(xb)
            loss=crit(pred,yb)
            loss_sum+=loss.item()*len(xb)
            correct+=((pred>=0.5).float()==yb).sum().item()
            total+=len(xb)
            preds.extend((pred>=0.5).cpu().numpy().astype(int))
            labels.extend(yb.cpu().numpy().astype(int))
    return loss_sum/total, correct/total, preds, labels

## 9. Train MLP

In [29]:
mlp=MLPClassifier(WINDOW_SIZE).to(device)
crit=nn.BCELoss()
opt=torch.optim.Adam(mlp.parameters(),lr=LR)
best=None; best_val=float("inf")

for epoch in range(1,EPOCHS_MLP+1):
    tr_l,tr_a=train_epoch(mlp,train_loader,crit,opt)
    va_l,va_a,_,_=eval_epoch(mlp,val_loader,crit)
    print(f"MLP {epoch}: Train {tr_l:.4f}/{tr_a:.4f}, Val {va_l:.4f}/{va_a:.4f}")
    if va_l<best_val: best_val=va_l; best=mlp.state_dict()

mlp.load_state_dict(best)


MLP 1: Train 0.6916/0.5261, Val 0.6909/0.5319
MLP 2: Train 0.6905/0.5292, Val 0.6901/0.5325
MLP 3: Train 0.6893/0.5325, Val 0.6900/0.5305
MLP 4: Train 0.6885/0.5340, Val 0.6898/0.5313
MLP 5: Train 0.6879/0.5357, Val 0.6894/0.5319
MLP 6: Train 0.6874/0.5367, Val 0.6895/0.5312
MLP 7: Train 0.6869/0.5366, Val 0.6894/0.5328
MLP 8: Train 0.6863/0.5387, Val 0.6891/0.5312
MLP 9: Train 0.6860/0.5405, Val 0.6894/0.5335
MLP 10: Train 0.6857/0.5396, Val 0.6902/0.5355
MLP 11: Train 0.6851/0.5408, Val 0.6888/0.5347
MLP 12: Train 0.6846/0.5422, Val 0.6895/0.5339
MLP 13: Train 0.6842/0.5445, Val 0.6888/0.5368
MLP 14: Train 0.6834/0.5450, Val 0.6889/0.5339
MLP 15: Train 0.6832/0.5461, Val 0.6885/0.5393


<All keys matched successfully>

## 10. Test MLP

In [30]:
tl,ta,preds,labels=eval_epoch(mlp,test_loader,crit)
print("MLP Test Accuracy:", ta)

MLP Test Accuracy: 0.526327909118821


## 11. Prepare LSTM Data

In [31]:
X_train_seq=X_train.reshape(-1,WINDOW_SIZE,1)
X_val_seq=X_val.reshape(-1,WINDOW_SIZE,1)
X_test_seq=X_test.reshape(-1,WINDOW_SIZE,1)

train_loader_seq=DataLoader(TensorDataset(torch.tensor(X_train_seq),y_train_t),batch_size=BATCH_SIZE,shuffle=True)
val_loader_seq=DataLoader(TensorDataset(torch.tensor(X_val_seq),y_val_t),batch_size=BATCH_SIZE)
test_loader_seq=DataLoader(TensorDataset(torch.tensor(X_test_seq),y_test_t),batch_size=BATCH_SIZE)

## 12. Train LSTM

In [32]:
lstm=LSTMClassifier(feature_dim=1).to(device)
opt2=torch.optim.Adam(lstm.parameters(),lr=LR)
best_l=None; best_val_l=float("inf")

for epoch in range(1,EPOCHS_LSTM+1):
    tr_l,tr_a=train_epoch(lstm,train_loader_seq,crit,opt2)
    va_l,va_a,_,_=eval_epoch(lstm,val_loader_seq,crit)
    print(f"LSTM {epoch}: Train {tr_l:.4f}/{tr_a:.4f}, Val {va_l:.4f}/{va_a:.4f}")
    if va_l<best_val_l: best_val_l=va_l; best_l=lstm.state_dict()

lstm.load_state_dict(best_l)

LSTM 1: Train 0.6920/0.5252, Val 0.6930/0.5230
LSTM 2: Train 0.6918/0.5263, Val 0.6920/0.5230
LSTM 3: Train 0.6917/0.5264, Val 0.6918/0.5233
LSTM 4: Train 0.6916/0.5283, Val 0.6915/0.5246
LSTM 5: Train 0.6913/0.5294, Val 0.6915/0.5249
LSTM 6: Train 0.6913/0.5298, Val 0.6913/0.5283
LSTM 7: Train 0.6912/0.5301, Val 0.6910/0.5301
LSTM 8: Train 0.6911/0.5294, Val 0.6910/0.5284
LSTM 9: Train 0.6906/0.5307, Val 0.6908/0.5275
LSTM 10: Train 0.6898/0.5306, Val 0.6901/0.5292
LSTM 11: Train 0.6896/0.5294, Val 0.6899/0.5262
LSTM 12: Train 0.6900/0.5302, Val 0.6906/0.5270
LSTM 13: Train 0.6893/0.5325, Val 0.6898/0.5282
LSTM 14: Train 0.6890/0.5317, Val 0.6896/0.5288
LSTM 15: Train 0.6889/0.5318, Val 0.6901/0.5276


<All keys matched successfully>

## 13. Test LSTM

In [33]:
tl,ta,preds,labels=eval_epoch(lstm,test_loader_seq,crit)
print("LSTM Test Accuracy:", ta)

LSTM Test Accuracy: 0.5181148295977894


## 14. Save Weights

In [34]:
os.makedirs("../models/saved_weights",exist_ok=True)
torch.save(mlp.state_dict(),"../models/saved_weights/mlp_weights.pth")
torch.save(lstm.state_dict(),"../models/saved_weights/lstm_weights.pth")
print("Saved mlp_weights.pth and lstm_weights.pth")

Saved mlp_weights.pth and lstm_weights.pth
