# Stock Movement Prediction – Training Notebook
This notebook downloads stock data, builds 20‑day windows, trains MLP and LSTM models, evaluates accuracy, and saves weights for deployment.

## 1. Imports

In [1]:
import numpy as np
import pandas as pd
import yfinance as yf
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

SEED=42
np.random.seed(SEED)
torch.manual_seed(SEED)
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## 2. Parameters

In [2]:

url = "https://raw.githubusercontent.com/datasets/s-and-p-500-companies/master/data/constituents.csv"
sp500 = pd.read_csv(url)

TICKERS = [t.replace('.', '-') for t in sp500['Symbol'].tolist()]
print(len(TICKERS))
print(TICKERS[:10])

START_DATE="2015-01-01"
END_DATE="2024-12-31"
WINDOW_SIZE=20
BATCH_SIZE=64
EPOCHS_MLP=15
EPOCHS_LSTM=15
LR=1e-3

TICKERS

503
['MMM', 'AOS', 'ABT', 'ABBV', 'ACN', 'ADBE', 'AMD', 'AES', 'AFL', 'A']


['MMM',
 'AOS',
 'ABT',
 'ABBV',
 'ACN',
 'ADBE',
 'AMD',
 'AES',
 'AFL',
 'A',
 'APD',
 'ABNB',
 'AKAM',
 'ALB',
 'ARE',
 'ALGN',
 'ALLE',
 'LNT',
 'ALL',
 'GOOGL',
 'GOOG',
 'MO',
 'AMZN',
 'AMCR',
 'AEE',
 'AEP',
 'AXP',
 'AIG',
 'AMT',
 'AWK',
 'AMP',
 'AME',
 'AMGN',
 'APH',
 'ADI',
 'AON',
 'APA',
 'APO',
 'AAPL',
 'AMAT',
 'APTV',
 'ACGL',
 'ADM',
 'ANET',
 'AJG',
 'AIZ',
 'T',
 'ATO',
 'ADSK',
 'ADP',
 'AZO',
 'AVB',
 'AVY',
 'AXON',
 'BKR',
 'BALL',
 'BAC',
 'BAX',
 'BDX',
 'BRK-B',
 'BBY',
 'TECH',
 'BIIB',
 'BLK',
 'BX',
 'XYZ',
 'BK',
 'BA',
 'BKNG',
 'BSX',
 'BMY',
 'AVGO',
 'BR',
 'BRO',
 'BF-B',
 'BLDR',
 'BG',
 'BXP',
 'CHRW',
 'CDNS',
 'CZR',
 'CPT',
 'CPB',
 'COF',
 'CAH',
 'KMX',
 'CCL',
 'CARR',
 'CAT',
 'CBOE',
 'CBRE',
 'CDW',
 'COR',
 'CNC',
 'CNP',
 'CF',
 'CRL',
 'SCHW',
 'CHTR',
 'CVX',
 'CMG',
 'CB',
 'CHD',
 'CI',
 'CINF',
 'CTAS',
 'CSCO',
 'C',
 'CFG',
 'CLX',
 'CME',
 'CMS',
 'KO',
 'CTSH',
 'COIN',
 'CL',
 'CMCSA',
 'CAG',
 'COP',
 'ED',
 'STZ',
 'CEG',


## 3. Download Data
We fetch daily OHLCV stock prices using yfinance, compute daily returns, and generate labels (1 = next‑day price up).

In [3]:
all_frames = []

for ticker in TICKERS:
    print(f"\nDownloading {ticker}...")

    df = yf.download(
        ticker,
        start=START_DATE,
        end=END_DATE,
        progress=False,
        auto_adjust=False,
        group_by=None
    )

    if df is None or df.empty:
        print(f"❌ Skipping {ticker} (empty DataFrame)")
        continue

    # Your MultiIndex fix:
    df.columns = df.columns.droplevel(0)

    if "Close" not in df.columns:
        print(f"❌ Skipping {ticker} (missing Close)")
        print(df.columns)
        continue

    df = df.sort_index()
    df["Return"] = df["Close"].pct_change()
    df["Target"] = (df["Return"].shift(-1) > 0).astype(int)
    df["Ticker"] = ticker

    df = df.dropna(subset=["Return", "Target"])

    if df.empty:
        print(f"❌ Skipping {ticker} (no valid rows after cleaning)")
        continue

    print(f"✅ Loaded {ticker}, rows:", len(df))
    all_frames.append(df)
data = pd.concat(all_frames)
print("Final dataset size:", data.shape)
data.head()

print("\nFinal tickers used:", [df["Ticker"].iloc[0] for df in all_frames])




Downloading MMM...
✅ Loaded MMM, rows: 2514

Downloading AOS...
✅ Loaded AOS, rows: 2514

Downloading ABT...
✅ Loaded ABT, rows: 2514

Downloading ABBV...
✅ Loaded ABBV, rows: 2514

Downloading ACN...
✅ Loaded ACN, rows: 2514

Downloading ADBE...
✅ Loaded ADBE, rows: 2514

Downloading AMD...
✅ Loaded AMD, rows: 2514

Downloading AES...
✅ Loaded AES, rows: 2514

Downloading AFL...
✅ Loaded AFL, rows: 2514

Downloading A...
✅ Loaded A, rows: 2514

Downloading APD...
✅ Loaded APD, rows: 2514

Downloading ABNB...
✅ Loaded ABNB, rows: 1018

Downloading AKAM...
✅ Loaded AKAM, rows: 2514

Downloading ALB...
✅ Loaded ALB, rows: 2514

Downloading ARE...
✅ Loaded ARE, rows: 2514

Downloading ALGN...
✅ Loaded ALGN, rows: 2514

Downloading ALLE...
✅ Loaded ALLE, rows: 2514

Downloading LNT...
✅ Loaded LNT, rows: 2514

Downloading ALL...
✅ Loaded ALL, rows: 2514

Downloading GOOGL...
✅ Loaded GOOGL, rows: 2514

Downloading GOOG...
✅ Loaded GOOG, rows: 2514

Downloading MO...
✅ Loaded MO, rows: 251

HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: WBA"}}}

1 Failed download:
['WBA']: YFTzMissingError('possibly delisted; no timezone found')


❌ Skipping WBA (empty DataFrame)

Downloading WMT...
✅ Loaded WMT, rows: 2514

Downloading DIS...
✅ Loaded DIS, rows: 2514

Downloading WBD...
✅ Loaded WBD, rows: 2514

Downloading WM...
✅ Loaded WM, rows: 2514

Downloading WAT...
✅ Loaded WAT, rows: 2514

Downloading WEC...
✅ Loaded WEC, rows: 2514

Downloading WFC...
✅ Loaded WFC, rows: 2514

Downloading WELL...
✅ Loaded WELL, rows: 2514

Downloading WST...
✅ Loaded WST, rows: 2514

Downloading WDC...
✅ Loaded WDC, rows: 2514

Downloading WY...
✅ Loaded WY, rows: 2514

Downloading WSM...
✅ Loaded WSM, rows: 2514

Downloading WMB...
✅ Loaded WMB, rows: 2514

Downloading WTW...
✅ Loaded WTW, rows: 2514

Downloading WDAY...
✅ Loaded WDAY, rows: 2514

Downloading WYNN...
✅ Loaded WYNN, rows: 2514

Downloading XEL...
✅ Loaded XEL, rows: 2514

Downloading XYL...
✅ Loaded XYL, rows: 2514

Downloading YUM...
✅ Loaded YUM, rows: 2514

Downloading ZBRA...
✅ Loaded ZBRA, rows: 2514

Downloading ZBH...
✅ Loaded ZBH, rows: 2514

Downloading ZTS..

## 4. Build Sliding Windows
Each sample consists of the last 20 daily returns → label is next day's movement.

In [4]:
X_list=[]; y_list=[]
for ticker in TICKERS:
    df=data[data['Ticker']==ticker].sort_index()
    ret=df['Return'].values
    tgt=df['Target'].values
    for i in range(WINDOW_SIZE,len(df)):
        X_list.append(ret[i-WINDOW_SIZE:i])
        y_list.append(tgt[i-1])

X=np.array(X_list,dtype=np.float32)
y=np.array(y_list,dtype=np.float32)
X.shape, y.shape

((1214394, 20), (1214394,))

## 5. Train/Val/Test Split

In [5]:
n=len(X)
train_end=int(0.7*n)
val_end=int(0.85*n)

X_train, y_train = X[:train_end], y[:train_end]
X_val, y_val = X[train_end:val_end], y[train_end:val_end]
X_test, y_test = X[val_end:], y[val_end:]

X_train.shape, X_val.shape, X_test.shape

((850075, 20), (182159, 20), (182160, 20))

## 6. Create Torch Datasets

In [6]:
X_train_t=torch.tensor(X_train)
y_train_t=torch.tensor(y_train.reshape(-1,1))
X_val_t=torch.tensor(X_val)
y_val_t=torch.tensor(y_val.reshape(-1,1))
X_test_t=torch.tensor(X_test)
y_test_t=torch.tensor(y_test.reshape(-1,1))

train_loader=DataLoader(TensorDataset(X_train_t,y_train_t),batch_size=BATCH_SIZE,shuffle=True)
val_loader=DataLoader(TensorDataset(X_val_t,y_val_t),batch_size=BATCH_SIZE)
test_loader=DataLoader(TensorDataset(X_test_t,y_test_t),batch_size=BATCH_SIZE)

## 7. Define MLP and LSTM Models

In [7]:
class MLPClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net=nn.Sequential(
            nn.Linear(input_dim,128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Linear(64,1),
            nn.Sigmoid()
        )
    def forward(self,x): return self.net(x.float())

class LSTMClassifier(nn.Module):
    def __init__(self, feature_dim, hidden_size=64, num_layers=1):
        super().__init__()
        self.lstm=nn.LSTM(feature_dim,hidden_size,num_layers,batch_first=True)
        self.fc=nn.Linear(hidden_size,1)
        self.sigmoid=nn.Sigmoid()
    def forward(self,x):
        out,_=self.lstm(x.float())
        out=self.fc(out[:,-1,:])
        return self.sigmoid(out)

## 8. Training Helpers

In [8]:
def train_epoch(model,loader,crit,opt):
    model.train(); total=correct=loss_sum=0
    for xb,yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        opt.zero_grad()
        pred=model(xb)
        loss=crit(pred,yb)
        loss.backward()
        opt.step()
        loss_sum+=loss.item()*len(xb)
        correct+=((pred>=0.5).float()==yb).sum().item()
        total+=len(xb)
    return loss_sum/total, correct/total

def eval_epoch(model,loader,crit):
    model.eval(); total=correct=loss_sum=0; preds=[]; labels=[]
    with torch.no_grad():
        for xb,yb in loader:
            xb,yb=xb.to(device),yb.to(device)
            pred=model(xb)
            loss=crit(pred,yb)
            loss_sum+=loss.item()*len(xb)
            correct+=((pred>=0.5).float()==yb).sum().item()
            total+=len(xb)
            preds.extend((pred>=0.5).cpu().numpy().astype(int))
            labels.extend(yb.cpu().numpy().astype(int))
    return loss_sum/total, correct/total, preds, labels

## 9. Train MLP

In [9]:
mlp=MLPClassifier(WINDOW_SIZE).to(device)
crit=nn.BCELoss()
opt=torch.optim.Adam(mlp.parameters(),lr=LR)
best=None; best_val=float("inf")

for epoch in range(1,EPOCHS_MLP+1):
    tr_l,tr_a=train_epoch(mlp,train_loader,crit,opt)
    va_l,va_a,_,_=eval_epoch(mlp,val_loader,crit)
    print(f"MLP {epoch}: Train {tr_l:.4f}/{tr_a:.4f}, Val {va_l:.4f}/{va_a:.4f}")
    if va_l<best_val: best_val=va_l; best=mlp.state_dict()

mlp.load_state_dict(best)


MLP 1: Train 0.6908/0.5236, Val 0.6896/0.5248
MLP 2: Train 0.6892/0.5278, Val 0.6881/0.5298
MLP 3: Train 0.6881/0.5307, Val 0.6873/0.5325
MLP 4: Train 0.6871/0.5336, Val 0.6863/0.5361
MLP 5: Train 0.6864/0.5349, Val 0.6853/0.5388
MLP 6: Train 0.6857/0.5362, Val 0.6845/0.5402
MLP 7: Train 0.6851/0.5378, Val 0.6840/0.5400
MLP 8: Train 0.6846/0.5387, Val 0.6837/0.5417
MLP 9: Train 0.6842/0.5400, Val 0.6835/0.5420
MLP 10: Train 0.6838/0.5412, Val 0.6828/0.5422
MLP 11: Train 0.6835/0.5425, Val 0.6826/0.5419
MLP 12: Train 0.6831/0.5432, Val 0.6824/0.5439
MLP 13: Train 0.6827/0.5431, Val 0.6816/0.5452
MLP 14: Train 0.6826/0.5430, Val 0.6816/0.5450
MLP 15: Train 0.6820/0.5455, Val 0.6813/0.5469


<All keys matched successfully>

## 10. Test MLP

In [10]:
tl,ta,preds,labels=eval_epoch(mlp,test_loader,crit)
print("MLP Test Accuracy:", ta)

MLP Test Accuracy: 0.5459266578831796


## 11. Prepare LSTM Data

In [11]:
X_train_seq=X_train.reshape(-1,WINDOW_SIZE,1)
X_val_seq=X_val.reshape(-1,WINDOW_SIZE,1)
X_test_seq=X_test.reshape(-1,WINDOW_SIZE,1)

train_loader_seq=DataLoader(TensorDataset(torch.tensor(X_train_seq),y_train_t),batch_size=BATCH_SIZE,shuffle=True)
val_loader_seq=DataLoader(TensorDataset(torch.tensor(X_val_seq),y_val_t),batch_size=BATCH_SIZE)
test_loader_seq=DataLoader(TensorDataset(torch.tensor(X_test_seq),y_test_t),batch_size=BATCH_SIZE)

## 12. Train LSTM

In [12]:
lstm=LSTMClassifier(feature_dim=1).to(device)
opt2=torch.optim.Adam(lstm.parameters(),lr=LR)
best_l=None; best_val_l=float("inf")

for epoch in range(1,EPOCHS_LSTM+1):
    tr_l,tr_a=train_epoch(lstm,train_loader_seq,crit,opt2)
    va_l,va_a,_,_=eval_epoch(lstm,val_loader_seq,crit)
    print(f"LSTM {epoch}: Train {tr_l:.4f}/{tr_a:.4f}, Val {va_l:.4f}/{va_a:.4f}")
    if va_l<best_val_l: best_val_l=va_l; best_l=lstm.state_dict()

lstm.load_state_dict(best_l)

LSTM 1: Train 0.6922/0.5212, Val 0.6923/0.5188
LSTM 2: Train 0.6918/0.5222, Val 0.6920/0.5201
LSTM 3: Train 0.6913/0.5229, Val 0.6909/0.5206
LSTM 4: Train 0.6907/0.5237, Val 0.6906/0.5212
LSTM 5: Train 0.6903/0.5241, Val 0.6902/0.5221
LSTM 6: Train 0.6901/0.5238, Val 0.6901/0.5229
LSTM 7: Train 0.6897/0.5247, Val 0.6899/0.5233
LSTM 8: Train 0.6894/0.5257, Val 0.6890/0.5264
LSTM 9: Train 0.6885/0.5272, Val 0.6889/0.5244
LSTM 10: Train 0.6878/0.5287, Val 0.6878/0.5287
LSTM 11: Train 0.6870/0.5301, Val 0.6866/0.5296
LSTM 12: Train 0.6861/0.5318, Val 0.6862/0.5306
LSTM 13: Train 0.6852/0.5329, Val 0.6846/0.5327
LSTM 14: Train 0.6841/0.5353, Val 0.6845/0.5330
LSTM 15: Train 0.6831/0.5368, Val 0.6829/0.5351


<All keys matched successfully>

## 13. Test LSTM

In [13]:
tl,ta,preds,labels=eval_epoch(lstm,test_loader_seq,crit)
print("LSTM Test Accuracy:", ta)

LSTM Test Accuracy: 0.5364679402722881


## 14. Save Weights

In [14]:
os.makedirs("../models/saved_weights",exist_ok=True)
torch.save(mlp.state_dict(),"../models/saved_weights/mlp_weights.pth")
torch.save(lstm.state_dict(),"../models/saved_weights/lstm_weights.pth")
print("Saved mlp_weights.pth and lstm_weights.pth")

Saved mlp_weights.pth and lstm_weights.pth
