# NIFTY50 Stock Predictor - Training on Google Colab

This notebook trains the HybridForecaster model on all 50 NIFTY stocks.

**Instructions:**
1. Upload your `Nifty50-predictor` folder to Google Drive
2. Make sure Runtime → Change runtime type → GPU is selected
3. Run all cells in order


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
import sys
import os
import math
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# ---------------------------------------------------------------------------
# Environment check and global configuration.
#
# Reports what is present on the mounted Google Drive, then defines the
# paths, hyperparameters, column lists and device used by the later cells.
# ---------------------------------------------------------------------------


def _print_dir_listing(folder):
    """Print one line per entry of *folder*, sorted, prefixed by a folder/file icon."""
    for item in sorted(folder.iterdir()):
        print(f"  {'📁' if item.is_dir() else '📄'} {item.name}")


print("=" * 60)
print("CHECKING YOUR GOOGLE DRIVE STRUCTURE")
print("=" * 60)

drive_root = Path("/content/drive/MyDrive")
print("\nFiles/folders in your Google Drive root:")
_print_dir_listing(drive_root)

DRIVE_PATH = Path("/content/drive/MyDrive/Nifty50-predictor")

if not DRIVE_PATH.exists():
    print(f"\n❌ ERROR: Folder not found: {DRIVE_PATH}")
    print("\nLooking for similar folders...")
    # Suggest likely candidates in case the folder was uploaded under another name.
    for item in drive_root.iterdir():
        if "nifty" in item.name.lower() or "predictor" in item.name.lower():
            print(f"  Found: {item}")
    print("\n⚠️  Please upload your Nifty50-predictor folder to Google Drive root!")
    print("   Or update DRIVE_PATH below to match your folder name.")
else:
    print(f"\n✅ Project folder found: {DRIVE_PATH}")
    print("\nContents of project folder:")
    _print_dir_listing(DRIVE_PATH)

PROCESSED_DIR = DRIVE_PATH / "data" / "processed"
MODELS_DIR = DRIVE_PATH / "models"

if PROCESSED_DIR.exists():
    csv_files = list(PROCESSED_DIR.glob("*.csv"))
    print(f"\n✅ Processed data folder found: {PROCESSED_DIR}")
    print(f"   CSV files found: {len(csv_files)}")
    if csv_files:
        print(f"   First 5 files: {[f.name for f in csv_files[:5]]}")
else:
    print(f"\n❌ ERROR: Processed data folder not found: {PROCESSED_DIR}")
    print("\n⚠️  You need to run download.py and feature_engineering.py first!")
    print("   Or upload the data/processed folder with CSV files.")

    # Show what *is* in data/ to help spot a misnamed or misplaced folder.
    data_dir = DRIVE_PATH / "data"
    if data_dir.exists():
        print("\nContents of data folder:")
        _print_dir_listing(data_dir)

# Training hyperparameters shared by every per-ticker run.
SEQ_LENGTH = 120
BATCH_SIZE = 64
LEARNING_RATE = 1e-4
EPOCHS = 50

# Columns read from each processed CSV: model inputs and regression targets.
FEATURE_COLUMNS = ["Open", "High", "Low", "Close", "Volume"]
TARGET_COLUMNS = ["Return_3M", "Return_1Y", "Return_3Y"]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"\n{'=' * 60}")
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Only create the models folder inside an existing project folder. Creating it
# unconditionally (with parents=True) would silently materialise the missing
# project folder on Drive and hide the error reported above on the next run.
if DRIVE_PATH.exists():
    MODELS_DIR.mkdir(parents=True, exist_ok=True)
print(f"\nProject path: {DRIVE_PATH}")
print(f"Processed data: {PROCESSED_DIR}")
print(f"Models will be saved to: {MODELS_DIR}")


In [None]:
class StockDataset(Dataset):
    """Sliding-window dataset built from per-ticker CSV files.

    Each CSV is cut into overlapping windows of ``seq_length`` consecutive
    rows. A sample pairs the window's feature columns with the target columns
    of the window's last row. Windows with NaN in either part are skipped.
    Features are min-max scaled per column.

    Args:
        data_dir: Directory holding one ``<ticker>.csv`` per stock.
        seq_length: Number of rows per input window.
        tickers: Optional list of tickers to load; missing files are ignored.
            When falsy, every ``*.csv`` in ``data_dir`` is loaded.
        train: When true, the scaler min/max are computed from this data;
            otherwise ``scaler_params`` must supply them.
        scaler_params: Dict with ``'min'`` and ``'max'`` per-feature values,
            used when ``train`` is false.
        feature_columns: Input column names; defaults to the module-level
            ``FEATURE_COLUMNS``.
        target_columns: Target column names; defaults to the module-level
            ``TARGET_COLUMNS``.

    Raises:
        ValueError: If ``train`` is false, samples exist, and ``scaler_params``
            lacks ``'min'``/``'max'``.
    """

    def __init__(self, data_dir, seq_length=120, tickers=None, train=True,
                 scaler_params=None, feature_columns=None, target_columns=None):
        self.data_dir = Path(data_dir)
        self.seq_length = seq_length
        self.train = train

        # The module-level defaults are only looked up when no explicit
        # column list is passed.
        self.feature_columns = list(
            FEATURE_COLUMNS if feature_columns is None else feature_columns
        )
        self.target_columns = list(
            TARGET_COLUMNS if target_columns is None else target_columns
        )

        self.X = []
        self.Y = []
        self.scaler_params = scaler_params or {}

        if tickers:
            csv_files = [self.data_dir / f"{t}.csv" for t in tickers]
            csv_files = [f for f in csv_files if f.exists()]
        else:
            csv_files = list(self.data_dir.glob("*.csv"))

        for csv_file in csv_files:
            self._process_stock(csv_file)

        # Explicit reshape keeps a well-formed (0, seq, features) array when no
        # window was collected, so len() and normalisation behave sanely.
        self.X = np.asarray(self.X, dtype=np.float32).reshape(
            -1, self.seq_length, len(self.feature_columns)
        )
        self.Y = np.asarray(self.Y, dtype=np.float32).reshape(
            -1, len(self.target_columns)
        )

        self._normalize_data()

    def _process_stock(self, filepath):
        """Append every NaN-free window of one CSV file to ``self.X`` / ``self.Y``."""
        df = pd.read_csv(filepath, index_col=0, parse_dates=True)

        features = df[self.feature_columns].values
        targets = df[self.target_columns].values

        # +1 so the window ending on the very last row is included; a file
        # shorter than seq_length gives a non-positive count and no windows.
        n_windows = len(df) - self.seq_length + 1
        for start in range(n_windows):
            end = start + self.seq_length
            x = features[start:end]
            y = targets[end - 1]

            if np.isnan(x).any() or np.isnan(y).any():
                continue

            self.X.append(x)
            self.Y.append(y)

    def _normalize_data(self):
        """Min-max scale ``self.X`` per feature column, fitting the scaler when training."""
        if len(self.X) == 0:
            # Nothing to fit or scale; min()/max() would raise on an empty array.
            return

        if self.train:
            X_flat = self.X.reshape(-1, self.X.shape[-1])
            self.scaler_params = {
                'min': X_flat.min(axis=0),
                'max': X_flat.max(axis=0),
            }
        elif 'min' not in self.scaler_params or 'max' not in self.scaler_params:
            raise ValueError(
                "scaler_params with 'min' and 'max' is required when train=False"
            )

        # Coerce so lists or float64 arrays (e.g. reloaded from a checkpoint)
        # work and the features stay float32.
        min_vals = np.asarray(self.scaler_params['min'], dtype=np.float32)
        max_vals = np.asarray(self.scaler_params['max'], dtype=np.float32)
        range_vals = max_vals - min_vals
        range_vals[range_vals == 0] = 1  # constant column: avoid division by zero
        self.X = (self.X - min_vals) / range_vals

    def get_scaler_params(self):
        """Return the dict of scaler parameters used for normalisation."""
        return self.scaler_params

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return torch.tensor(self.X[idx]), torch.tensor(self.Y[idx])

# Visible confirmation that this notebook cell executed.
print("StockDataset class defined ✅")


In [None]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch-first sequence, then apply dropout.

    Args:
        d_model: Size of the last (feature) dimension of the input. Both even
            and odd sizes are supported.
        max_len: Number of positions the encoding table is precomputed for.
        dropout: Dropout probability applied after the addition.
    """

    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd (cosine) column than even
        # (sine) columns, so trim the frequencies to fit.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
        pe = pe.unsqueeze(0)

        # Registered as a buffer: stored with the module but not a parameter.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe)``, slicing the table to ``x.size(1)`` positions."""
        x = x + self.pe[:, :x.size(1), :]
        return self.dropout(x)


class HybridForecaster(nn.Module):
    """CNN -> bidirectional LSTM -> Transformer encoder with three scalar output heads.

    ``forward`` takes a batch-first tensor whose last dimension has
    ``input_size`` entries and returns a tuple of three tensors, one per head
    (``head_3m``, ``head_1y``, ``head_3y``).
    """

    def __init__(self, input_size=5, seq_length=120, cnn_channels=64, lstm_hidden=128,
                 lstm_layers=2, transformer_heads=4, transformer_layers=2, dropout=0.2):
        super().__init__()

        # Two same-padding convolution stages followed by dropout.
        conv_layers = [
            nn.Conv1d(input_size, cnn_channels, kernel_size=3, padding=1),
            nn.BatchNorm1d(cnn_channels),
            nn.ReLU(),
            nn.Conv1d(cnn_channels, cnn_channels, kernel_size=3, padding=1),
            nn.BatchNorm1d(cnn_channels),
            nn.ReLU(),
            nn.Dropout(dropout),
        ]
        self.cnn = nn.Sequential(*conv_layers)

        # Inter-layer LSTM dropout is only requested for a stacked LSTM.
        recurrent_dropout = dropout if lstm_layers > 1 else 0
        self.bilstm = nn.LSTM(
            input_size=cnn_channels,
            hidden_size=lstm_hidden,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=True,
            dropout=recurrent_dropout,
        )

        # Forward and backward hidden states are concatenated.
        model_dim = lstm_hidden * 2

        self.pos_encoder = PositionalEncoding(model_dim, seq_length, dropout)

        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=model_dim,
                nhead=transformer_heads,
                dim_feedforward=model_dim * 4,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=transformer_layers,
        )

        self.feature_pool = nn.AdaptiveAvgPool1d(1)

        self.head_3m = self._make_head(model_dim, dropout)
        self.head_1y = self._make_head(model_dim, dropout)
        self.head_3y = self._make_head(model_dim, dropout)

    @staticmethod
    def _make_head(in_features, dropout):
        """Build one Linear -> ReLU -> Dropout -> Linear head with a single output."""
        return nn.Sequential(
            nn.Linear(in_features, 64), nn.ReLU(), nn.Dropout(dropout), nn.Linear(64, 1)
        )

    def forward(self, x):
        """Run the full stack and return the three head outputs as a tuple."""
        # Conv1d wants channels on dim 1; swap back to batch-first afterwards.
        conv_out = self.cnn(x.transpose(1, 2)).transpose(1, 2)

        lstm_out, _ = self.bilstm(conv_out)

        encoded = self.transformer(self.pos_encoder(lstm_out))

        # Average over the sequence dimension to get one vector per sample.
        pooled = self.feature_pool(encoded.transpose(1, 2)).squeeze(-1)

        return self.head_3m(pooled), self.head_1y(pooled), self.head_3y(pooled)

# Visible confirmation that this notebook cell executed.
print("HybridForecaster model defined ✅")


In [None]:
def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    The model must return three predictions per batch; each is compared with
    the matching column of ``Y`` (columns 0, 1, 2) and the three losses are
    summed before backpropagation.

    Args:
        model: Module returning a 3-tuple of predictions.
        dataloader: Iterable of ``(X, Y)`` tensor batches.
        optimizer: Optimizer stepping ``model``'s parameters.
        criterion: Loss callable taking ``(prediction, target)``.
        device: Device the batches are moved to.

    Returns:
        The summed loss averaged over the number of batches, as a float.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    model.train()
    total_loss = 0.0
    n_batches = 0

    for X, Y in dataloader:
        X = X.to(device)
        Y = Y.to(device)

        optimizer.zero_grad()

        pred_3m, pred_1y, pred_3y = model(X)

        # Squeeze only the trailing dimension: a bare squeeze() also drops the
        # batch dimension when a batch holds a single sample, which makes the
        # prediction and target shapes disagree.
        loss_3m = criterion(pred_3m.squeeze(-1), Y[:, 0])
        loss_1y = criterion(pred_1y.squeeze(-1), Y[:, 1])
        loss_3y = criterion(pred_3y.squeeze(-1), Y[:, 2])

        loss = loss_3m + loss_1y + loss_3y

        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        n_batches += 1

    if n_batches == 0:
        raise ValueError("dataloader produced no batches; cannot compute an average loss")

    return total_loss / n_batches


def train_stock(ticker, processed_dir, models_dir, device):
    """Train a HybridForecaster on one ticker and save a checkpoint.

    Args:
        ticker: Stock symbol; ``<ticker>.csv`` is read from ``processed_dir``.
        processed_dir: Directory with the processed CSV files.
        models_dir: Directory the checkpoint ``<ticker>_model.pt`` is written to
            (created if missing).
        device: Torch device (or device string) to train on.

    Returns:
        The lowest average epoch loss seen, or ``None`` when the data could not
        be loaded or produced no samples.
    """
    import copy  # local import: snapshots the best model/optimizer state

    print(f"\n{'='*60}")
    print(f"Training model for: {ticker}")
    print(f"{'='*60}")

    try:
        dataset = StockDataset(
            data_dir=processed_dir,
            seq_length=SEQ_LENGTH,
            tickers=[ticker],
            train=True,
        )
    except Exception as e:
        # Per-ticker boundary: report the problem and let the caller move on.
        print(f"❌ Error loading {ticker}: {e}")
        return None

    if len(dataset) == 0:
        print(f"❌ No samples for {ticker}, skipping...")
        return None

    print(f"Dataset size: {len(dataset)} samples")

    dataloader = DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=2,
        # Pinned host memory only helps host-to-GPU copies; skip it on CPU.
        pin_memory=torch.device(device).type == 'cuda',
    )

    sample_x, _ = dataset[0]
    input_size = sample_x.shape[1]

    model = HybridForecaster(
        input_size=input_size,
        seq_length=SEQ_LENGTH,
    ).to(device)

    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=5
    )

    best_loss = float('inf')
    best_epoch = None
    best_model_state = None
    best_optimizer_state = None

    for epoch in tqdm(range(EPOCHS), desc=f"Training {ticker}"):
        avg_loss = train_one_epoch(model, dataloader, optimizer, criterion, device)
        scheduler.step(avg_loss)

        if avg_loss < best_loss:
            best_loss = avg_loss
            best_epoch = epoch + 1
            # Snapshot the weights of the best epoch so the saved checkpoint
            # corresponds to the recorded best_loss, not to the final epoch.
            best_model_state = copy.deepcopy(model.state_dict())
            best_optimizer_state = copy.deepcopy(optimizer.state_dict())

        if (epoch + 1) % 10 == 0:
            print(f"  Epoch {epoch + 1}/{EPOCHS} | Loss: {avg_loss:.6f}")

    # No epoch ever improved on inf (e.g. NaN losses): fall back to final state.
    if best_model_state is None:
        best_model_state = model.state_dict()
        best_optimizer_state = optimizer.state_dict()

    models_dir = Path(models_dir)
    models_dir.mkdir(parents=True, exist_ok=True)
    model_path = models_dir / f"{ticker}_model.pt"
    torch.save({
        'model_state_dict': best_model_state,
        'optimizer_state_dict': best_optimizer_state,
        'scaler_params': dataset.get_scaler_params(),
        'best_loss': best_loss,
        'best_epoch': best_epoch,
        'epochs': EPOCHS,
        'ticker': ticker,
    }, model_path)

    print(f"✅ Model saved: {model_path}")
    print(f"   Best loss: {best_loss:.6f}")

    return best_loss

# Visible confirmation that this notebook cell executed.
print("Training functions defined ✅")


In [None]:
# Derive the ticker list from the processed CSV file names (file stem = ticker).
csv_files = sorted(PROCESSED_DIR.glob("*.csv"))
tickers = [csv_path.stem for csv_path in csv_files]

print(f"Found {len(tickers)} stocks to train:", ", ".join(tickers), sep="\n")


In [None]:
# Train one model per ticker, recording the loss of every run that returned one.
banner = "#" * 60
print("\n" + banner)
print(f"STARTING TRAINING FOR ALL {len(tickers)} STOCKS")
print(banner)

results = {}

for position, ticker in enumerate(tickers, start=1):
    print(f"\n[{position}/{len(tickers)}] Processing {ticker}...")

    loss = train_stock(ticker, PROCESSED_DIR, MODELS_DIR, device)
    if loss is not None:
        results[ticker] = loss

    # Release cached GPU memory before the next ticker's model is built.
    torch.cuda.empty_cache()


In [None]:
# Final report: success count, per-ticker losses (ascending), and saved model count.
rule = "#" * 60
print("\n" + rule)
print("TRAINING COMPLETE!")
print(rule)

print(f"\nSuccessfully trained: {len(results)}/{len(tickers)} models")

if results:
    print("\nResults summary:")
    print("-" * 40)
    sorted_results = sorted(results.items(), key=lambda pair: pair[1])
    for ticker, loss in sorted_results:
        print(f"  {ticker:15} | Loss: {loss:.6f}")

    best_ticker, best_value = sorted_results[0]
    worst_ticker, worst_value = sorted_results[-1]
    print(f"\nBest performing: {best_ticker} (loss: {best_value:.6f})")
    print(f"Worst performing: {worst_ticker} (loss: {worst_value:.6f})")

print(f"\nModels saved to: {MODELS_DIR}")
saved_models = list(MODELS_DIR.glob('*.pt'))
print(f"Total models: {len(saved_models)}")
