In [6]:
!pip install torch numpy pandas scikit-learn matplotlib seaborn



In [8]:
import sqlite3
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt

# Load data
conn = sqlite3.connect('stocks.db')
df = pd.read_sql("""
    SELECT Date, Ticker, Close, SMA_50, SMA_200, EMA_50, RSI, MACD 
    FROM preprocessed_stock_data
    WHERE Date BETWEEN '2015-03-30' AND '2024-03-27'
    ORDER BY Date
""", conn)
conn.close()

# Filter to 5 liquid stocks for faster training
top_stocks = df.groupby('Ticker')['Close'].count().nlargest(20).index.tolist()
df = df[df['Ticker'].isin(top_stocks)]
print(f"Working with stocks: {top_stocks}")

Working with stocks: ['BHARTIARTL.NS', 'HCLTECH.NS', 'HDFCBANK.NS', 'HINDUNILVR.NS', 'ICICIBANK.NS', 'INFY.NS', 'ITC.NS', 'KOTAKBANK.NS', 'LT.NS', 'RELIANCE.NS', 'SBIN.NS', 'TCS.NS', 'ADANIENT.NS', 'ADANIPORTS.NS', 'APOLLOHOSP.NS', 'ASIANPAINT.NS', 'AXISBANK.NS', 'BAJAJ-AUTO.NS', 'BAJAJFINSV.NS', 'BAJFINANCE.NS']


In [9]:
# Per-stock scaling
scaler_dict = {}
features = ['Close', 'SMA_50', 'SMA_200', 'EMA_50', 'RSI', 'MACD']
look_back = 20  # Reduced sequence length

def create_sequences(data):
    X, y = [], []
    for i in range(len(data)-look_back-1):
        X.append(data[i:(i+look_back)])
        y.append(data[i+look_back, 0])  # Predict Close price
    return np.array(X), np.array(y)

processed_data = []
for ticker, group in df.groupby('Ticker'):
    scaler = MinMaxScaler(feature_range=(-1, 1))
    scaled_data = scaler.fit_transform(group[features])
    X_ticker, y_ticker = create_sequences(scaled_data)
    
    for seq, target in zip(X_ticker, y_ticker):
        processed_data.append({
            'ticker': ticker,
            'sequence': seq,
            'target': target,
            'scaler': scaler
        })

# Convert to arrays
X = np.array([item['sequence'] for item in processed_data])
y = np.array([item['target'] for item in processed_data])
print(f"Final dataset shape: X={X.shape}, y={y.shape}")

Final dataset shape: X=(115020, 20, 6), y=(115020,)


In [10]:
class StockLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=50, num_layers=2):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=0.2
        )
        self.fc = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])  # Last time step
        return out.squeeze()

In [11]:
# Split data
train_size = int(0.8 * len(X))
X_train, X_val = X[:train_size], X[train_size:]
y_train, y_val = y[:train_size], y[train_size:]

# Create datasets
class StockDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return torch.FloatTensor(self.X[idx]), torch.FloatTensor([self.y[idx]])

train_loader = DataLoader(StockDataset(X_train, y_train), batch_size=32, shuffle=True)
val_loader = DataLoader(StockDataset(X_val, y_val), batch_size=32)

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = StockLSTM(input_size=len(features)).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [12]:
# Train for just 10 epochs with early stopping
num_epochs = 10
best_loss = float('inf')
patience = 3
no_improve = 0

train_losses = []
val_losses = []

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for X_val, y_val in val_loader:
            X_val, y_val = X_val.to(device), y_val.to(device)
            outputs = model(X_val)
            val_loss += criterion(outputs, y_val).item()
    
    # Track losses
    avg_train = epoch_loss/len(train_loader)
    avg_val = val_loss/len(val_loader)
    train_losses.append(avg_train)
    val_losses.append(avg_val)
    
    # Early stopping
    if avg_val < best_loss:
        best_loss = avg_val
        no_improve = 0
        torch.save(model.state_dict(), 'best_model.pth')
    else:
        no_improve += 1
    
    print(f'Epoch {epoch+1}: Train Loss {avg_train:.4f}, Val Loss {avg_val:.4f}')
    
    if no_improve >= patience:
        print("Early stopping!")
        break

# Plot training
plt.plot(train_losses, label='Train')
plt.plot(val_losses, label='Validation')
plt.title('Training Progress')
plt.legend()
plt.show()

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1: Train Loss 0.3269, Val Loss 0.2704


KeyboardInterrupt: 