In [30]:
!pip install torch numpy pandas scikit-learn matplotlib seaborn



In [34]:
import sqlite3
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns

# Load data from database
conn = sqlite3.connect('stocks.db')
df = pd.read_sql_query("""
    SELECT Date, Ticker, Open, High, Low, Close, Volume, 
           SMA_50, SMA_200, EMA_50, RSI, MACD 
    FROM preprocessed_stock_data
    WHERE Date BETWEEN '2015-01-01' AND '2025-03-28'
    ORDER BY Date
""", conn)
conn.close()

print(f"Loaded data shape: {df.shape}")
print(f"Unique tickers: {df['Ticker'].nunique()}")
print(f"Date range: {df['Date'].min()} to {df['Date'].max()}")
df.head()

Loaded data shape: (284652, 12)
Unique tickers: 51
Date range: 2015-01-01 00:00:00+05:30 to 2025-03-27 00:00:00+05:30


Unnamed: 0,Date,Ticker,Open,High,Low,Close,Volume,SMA_50,SMA_200,EMA_50,RSI,MACD
0,2015-01-01 00:00:00+05:30,ADANIENT.NS,0.002012,0.00202,0.002032,0.002033,0.00614,0.002129,0.002224,0.002159,0.0,0.77524
1,2015-01-01 00:00:00+05:30,ADANIENT.NS,0.002012,0.00202,0.002032,0.002033,0.00614,0.002129,0.002224,0.002159,0.0,0.77524
2,2015-01-01 00:00:00+05:30,ADANIPORTS.NS,0.009476,0.009472,0.009569,0.009477,0.002265,0.01002,0.010532,0.010169,0.0,0.76132
3,2015-01-01 00:00:00+05:30,ADANIPORTS.NS,0.009476,0.009472,0.009569,0.009477,0.002265,0.01002,0.010532,0.010169,0.0,0.748682
4,2015-01-01 00:00:00+05:30,APOLLOHOSP.NS,0.034677,0.034462,0.034932,0.034451,0.00012,0.036492,0.038401,0.037037,0.0,0.775929


In [35]:
# Feature selection
features = ['Open', 'High', 'Low', 'Close', 'Volume', 
            'SMA_50', 'SMA_200', 'EMA_50', 'RSI', 'MACD']

# Create sequence data
def create_sequences(data, look_back=60):
    X, y = [], []
    for i in range(len(data)-look_back-1):
        X.append(data[i:(i+look_back)])
        y.append(data[i+look_back, 3])  # Close price as target
    return np.array(X), np.array(y)

# Normalize data per stock
scaler_dict = {}
processed_data = []

for ticker, group in df.groupby('Ticker'):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled_data = scaler.fit_transform(group[features])
    scaler_dict[ticker] = scaler
    
    X, y = create_sequences(scaled_data)
    for seq, target in zip(X, y):
        processed_data.append({
            'ticker': ticker,
            'sequence': seq,
            'target': target
        })

# Convert to numpy arrays
X = np.array([item['sequence'] for item in processed_data])
y = np.array([item['target'] for item in processed_data])

print(f"Final dataset shape: X={X.shape}, y={y.shape}")

Final dataset shape: X=(281541, 60, 10), y=(281541,)


In [36]:
class StockDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets
        
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return (
            torch.FloatTensor(self.sequences[idx]),
            torch.FloatTensor([self.targets[idx]])
        )

# Split data
train_size = int(0.8 * len(X))
train_data = StockDataset(X[:train_size], y[:train_size])
val_data = StockDataset(X[train_size:], y[train_size:])

# Create data loaders
batch_size = 64
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_data, batch_size=batch_size)

In [40]:
import math


class StockTransformer(nn.Module):
    def __init__(self, input_dim, num_layers=3, nhead=5, dim_feedforward=512):
        super().__init__()
        # Ensure input_dim is divisible by nhead
        assert input_dim % nhead == 0, "input_dim must be divisible by nhead"
        
        self.input_dim = input_dim
        self.pos_encoder = PositionalEncoding(input_dim)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=input_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=0.1,
            batch_first=True  # Add this for modern PyTorch versions
        )
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.decoder = nn.Linear(input_dim, 1)

    def forward(self, src):
        src = src.permute(1, 0, 2)  # (seq_len, batch, features)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src)
        output = self.decoder(output[-1])
        return output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=0.1)
        
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)

In [41]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = StockTransformer(input_dim=len(features)).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

# Training loop
num_epochs = 100
train_losses, val_losses = [], []

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for batch, (seq, target) in enumerate(train_loader):
        seq, target = seq.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(seq)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for seq, target in val_loader:
            seq, target = seq.to(device), target.to(device)
            output = model(seq)
            val_loss += criterion(output, target).item()
    
    # Save losses
    train_loss = epoch_loss/len(train_loader)
    val_loss = val_loss/len(val_loader)
    train_losses.append(train_loss)
    val_losses.append(val_loss)
    
    print(f'Epoch {epoch+1}/{num_epochs} | '
          f'Train Loss: {train_loss:.6f} | '
          f'Val Loss: {val_loss:.6f}')

# Plot training progress
plt.figure(figsize=(10, 6))
plt.plot(train_losses, label='Training Loss')
plt.plot(val_losses, label='Validation Loss')
plt.title('Training Progress')
plt.xlabel('Epochs')
plt.ylabel('MSE Loss')
plt.legend()
plt.show()



Epoch 1/100 | Train Loss: 0.028029 | Val Loss: 0.037791
Epoch 2/100 | Train Loss: 0.008470 | Val Loss: 0.033618


KeyboardInterrupt: 