# Stock Movement Prediction – Training Notebook
This notebook downloads stock data, builds 20‑day windows, trains MLP and LSTM models, evaluates accuracy, and saves weights for deployment.

## 1. Imports

In [25]:
import numpy as np
import pandas as pd
import yfinance as yf
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
from tqdm import tqdm
import os

SEED=42
np.random.seed(SEED)
torch.manual_seed(SEED)
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

## 2. Parameters

In [26]:
TICKERS=["AAPL","MSFT","AMZN","GOOGL","META","TSLA","SPY"]
START_DATE="2015-01-01"
END_DATE="2024-12-31"
WINDOW_SIZE=20
BATCH_SIZE=64
EPOCHS_MLP=15
EPOCHS_LSTM=15
LR=1e-3

TICKERS

['AAPL', 'MSFT', 'AMZN', 'GOOGL', 'META', 'TSLA', 'SPY']

## 3. Download Data
We fetch daily OHLCV stock prices using yfinance, compute daily returns, and generate labels (1 = next‑day price up).

In [27]:
all_frames = []

for ticker in TICKERS:
    print(f"\nDownloading {ticker}...")

    df = yf.download(
        ticker,
        start=START_DATE,
        end=END_DATE,
        progress=False,
        auto_adjust=False,
        group_by=None
    )

    if df is None or df.empty:
        print(f"❌ Skipping {ticker} (empty DataFrame)")
        continue

    # Your MultiIndex fix:
    df.columns = df.columns.droplevel(0)

    if "Close" not in df.columns:
        print(f"❌ Skipping {ticker} (missing Close)")
        print(df.columns)
        continue

    df = df.sort_index()
    df["Return"] = df["Close"].pct_change()
    df["Target"] = (df["Return"].shift(-1) > 0).astype(int)
    df["Ticker"] = ticker

    df = df.dropna(subset=["Return", "Target"])

    if df.empty:
        print(f"❌ Skipping {ticker} (no valid rows after cleaning)")
        continue

    print(f"✅ Loaded {ticker}, rows:", len(df))
    all_frames.append(df)
data = pd.concat(all_frames)
print("Final dataset size:", data.shape)
data.head()

print("\nFinal tickers used:", [df["Ticker"].iloc[0] for df in all_frames])




Downloading AAPL...
✅ Loaded AAPL, rows: 2514

Downloading MSFT...
✅ Loaded MSFT, rows: 2514

Downloading AMZN...
✅ Loaded AMZN, rows: 2514

Downloading GOOGL...
✅ Loaded GOOGL, rows: 2514

Downloading META...
✅ Loaded META, rows: 2514

Downloading TSLA...
✅ Loaded TSLA, rows: 2514

Downloading SPY...
✅ Loaded SPY, rows: 2514
Final dataset size: (17598, 9)

Final tickers used: ['AAPL', 'MSFT', 'AMZN', 'GOOGL', 'META', 'TSLA', 'SPY']


## 4. Build Sliding Windows
Each sample consists of the last 20 daily returns → label is next day's movement.

In [28]:
X_list=[]; y_list=[]
for ticker in TICKERS:
    df=data[data['Ticker']==ticker].sort_index()
    ret=df['Return'].values
    tgt=df['Target'].values
    for i in range(WINDOW_SIZE,len(df)):
        X_list.append(ret[i-WINDOW_SIZE:i])
        y_list.append(tgt[i-1])

X=np.array(X_list,dtype=np.float32)
y=np.array(y_list,dtype=np.float32)
X.shape, y.shape

((17458, 20), (17458,))

## 5. Train/Val/Test Split

In [29]:
n=len(X)
train_end=int(0.7*n)
val_end=int(0.85*n)

X_train, y_train = X[:train_end], y[:train_end]
X_val, y_val = X[train_end:val_end], y[train_end:val_end]
X_test, y_test = X[val_end:], y[val_end:]

X_train.shape, X_val.shape, X_test.shape

((12220, 20), (2619, 20), (2619, 20))

## 6. Create Torch Datasets

In [30]:
X_train_t=torch.tensor(X_train)
y_train_t=torch.tensor(y_train.reshape(-1,1))
X_val_t=torch.tensor(X_val)
y_val_t=torch.tensor(y_val.reshape(-1,1))
X_test_t=torch.tensor(X_test)
y_test_t=torch.tensor(y_test.reshape(-1,1))

train_loader=DataLoader(TensorDataset(X_train_t,y_train_t),batch_size=BATCH_SIZE,shuffle=True)
val_loader=DataLoader(TensorDataset(X_val_t,y_val_t),batch_size=BATCH_SIZE)
test_loader=DataLoader(TensorDataset(X_test_t,y_test_t),batch_size=BATCH_SIZE)

## 7. Define MLP and LSTM Models

In [31]:
class MLPClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.net=nn.Sequential(
            nn.Linear(input_dim,128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128,64),
            nn.ReLU(),
            nn.Linear(64,1),
            nn.Sigmoid()
        )
    def forward(self,x): return self.net(x.float())

class LSTMClassifier(nn.Module):
    def __init__(self, feature_dim, hidden_size=64, num_layers=1):
        super().__init__()
        self.lstm=nn.LSTM(feature_dim,hidden_size,num_layers,batch_first=True)
        self.fc=nn.Linear(hidden_size,1)
        self.sigmoid=nn.Sigmoid()
    def forward(self,x):
        out,_=self.lstm(x.float())
        out=self.fc(out[:,-1,:])
        return self.sigmoid(out)

## 8. Training Helpers

In [32]:
def train_epoch(model,loader,crit,opt):
    model.train(); total=correct=loss_sum=0
    for xb,yb in loader:
        xb, yb = xb.to(device), yb.to(device)
        opt.zero_grad()
        pred=model(xb)
        loss=crit(pred,yb)
        loss.backward()
        opt.step()
        loss_sum+=loss.item()*len(xb)
        correct+=((pred>=0.5).float()==yb).sum().item()
        total+=len(xb)
    return loss_sum/total, correct/total

def eval_epoch(model,loader,crit):
    model.eval(); total=correct=loss_sum=0; preds=[]; labels=[]
    with torch.no_grad():
        for xb,yb in loader:
            xb,yb=xb.to(device),yb.to(device)
            pred=model(xb)
            loss=crit(pred,yb)
            loss_sum+=loss.item()*len(xb)
            correct+=((pred>=0.5).float()==yb).sum().item()
            total+=len(xb)
            preds.extend((pred>=0.5).cpu().numpy().astype(int))
            labels.extend(yb.cpu().numpy().astype(int))
    return loss_sum/total, correct/total, preds, labels

## 9. Train MLP

In [33]:
mlp=MLPClassifier(WINDOW_SIZE).to(device)
crit=nn.BCELoss()
opt=torch.optim.Adam(mlp.parameters(),lr=LR)
best=None; best_val=float("inf")

for epoch in range(1,EPOCHS_MLP+1):
    tr_l,tr_a=train_epoch(mlp,train_loader,crit,opt)
    va_l,va_a,_,_=eval_epoch(mlp,val_loader,crit)
    print(f"MLP {epoch}: Train {tr_l:.4f}/{tr_a:.4f}, Val {va_l:.4f}/{va_a:.4f}")
    if va_l<best_val: best_val=va_l; best=mlp.state_dict()

mlp.load_state_dict(best)


MLP 1: Train 0.6916/0.5281, Val 0.6926/0.5193
MLP 2: Train 0.6910/0.5326, Val 0.6928/0.5193
MLP 3: Train 0.6906/0.5327, Val 0.6935/0.5193
MLP 4: Train 0.6908/0.5336, Val 0.6922/0.5204
MLP 5: Train 0.6898/0.5360, Val 0.6922/0.5132
MLP 6: Train 0.6895/0.5322, Val 0.6926/0.5059
MLP 7: Train 0.6898/0.5307, Val 0.6923/0.5078
MLP 8: Train 0.6889/0.5381, Val 0.6929/0.5078
MLP 9: Train 0.6884/0.5358, Val 0.6953/0.5128
MLP 10: Train 0.6886/0.5345, Val 0.6961/0.5116
MLP 11: Train 0.6882/0.5385, Val 0.6960/0.5139
MLP 12: Train 0.6871/0.5403, Val 0.6967/0.5094
MLP 13: Train 0.6871/0.5421, Val 0.6965/0.5128
MLP 14: Train 0.6870/0.5434, Val 0.6954/0.5101
MLP 15: Train 0.6861/0.5419, Val 0.6969/0.5136


<All keys matched successfully>

## 10. Test MLP

In [34]:
tl,ta,preds,labels=eval_epoch(mlp,test_loader,crit)
print("MLP Test Accuracy:", ta)

MLP Test Accuracy: 0.5421916762122948


## 11. Prepare LSTM Data

In [35]:
X_train_seq=X_train.reshape(-1,WINDOW_SIZE,1)
X_val_seq=X_val.reshape(-1,WINDOW_SIZE,1)
X_test_seq=X_test.reshape(-1,WINDOW_SIZE,1)

train_loader_seq=DataLoader(TensorDataset(torch.tensor(X_train_seq),y_train_t),batch_size=BATCH_SIZE,shuffle=True)
val_loader_seq=DataLoader(TensorDataset(torch.tensor(X_val_seq),y_val_t),batch_size=BATCH_SIZE)
test_loader_seq=DataLoader(TensorDataset(torch.tensor(X_test_seq),y_test_t),batch_size=BATCH_SIZE)

## 12. Train LSTM

In [36]:
lstm=LSTMClassifier(feature_dim=1).to(device)
opt2=torch.optim.Adam(lstm.parameters(),lr=LR)
best_l=None; best_val_l=float("inf")

for epoch in range(1,EPOCHS_LSTM+1):
    tr_l,tr_a=train_epoch(lstm,train_loader_seq,crit,opt2)
    va_l,va_a,_,_=eval_epoch(lstm,val_loader_seq,crit)
    print(f"LSTM {epoch}: Train {tr_l:.4f}/{tr_a:.4f}, Val {va_l:.4f}/{va_a:.4f}")
    if va_l<best_val_l: best_val_l=va_l; best_l=lstm.state_dict()

lstm.load_state_dict(best_l)

LSTM 1: Train 0.6917/0.5273, Val 0.6924/0.5193
LSTM 2: Train 0.6912/0.5326, Val 0.6927/0.5193
LSTM 3: Train 0.6911/0.5326, Val 0.6929/0.5193
LSTM 4: Train 0.6913/0.5326, Val 0.6928/0.5193
LSTM 5: Train 0.6911/0.5326, Val 0.6923/0.5193
LSTM 6: Train 0.6911/0.5326, Val 0.6933/0.5193
LSTM 7: Train 0.6911/0.5326, Val 0.6926/0.5193
LSTM 8: Train 0.6910/0.5326, Val 0.6923/0.5193
LSTM 9: Train 0.6910/0.5326, Val 0.6922/0.5193
LSTM 10: Train 0.6911/0.5326, Val 0.6923/0.5193
LSTM 11: Train 0.6910/0.5327, Val 0.6933/0.5193
LSTM 12: Train 0.6910/0.5327, Val 0.6928/0.5193
LSTM 13: Train 0.6910/0.5326, Val 0.6925/0.5193
LSTM 14: Train 0.6910/0.5327, Val 0.6929/0.5189
LSTM 15: Train 0.6910/0.5327, Val 0.6925/0.5193


<All keys matched successfully>

## 13. Test LSTM

In [37]:
tl,ta,preds,labels=eval_epoch(lstm,test_loader_seq,crit)
print("LSTM Test Accuracy:", ta)

LSTM Test Accuracy: 0.5441008018327605


## 14. Save Weights

In [38]:
os.makedirs("../models/saved_weights",exist_ok=True)
torch.save(mlp.state_dict(),"../models/saved_weights/mlp_weights.pth")
torch.save(lstm.state_dict(),"../models/saved_weights/lstm_weights.pth")
print("Saved mlp_weights.pth and lstm_weights.pth")

Saved mlp_weights.pth and lstm_weights.pth
