<a href="https://colab.research.google.com/github/Shreya667777/Datasets/blob/main/finalproject2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === EXTENDED FULL PIPELINE with Multi-Stock Support, Embedding Visualization, Backtesting ===

!pip install torch_geometric scikit-learn keras umap-learn --quiet

import os, torch, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from glob import glob
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from keras.models import Sequential
from keras.layers import LSTM, Dense, Input
from google.colab import files
from torch_geometric.data import Data
from torch_geometric.nn import GATConv, global_mean_pool
import torch.nn as nn
import torch.nn.functional as F
from datetime import datetime

# ==== Reproducibility ====
# One fixed seed is pushed into every RNG this notebook draws from:
# PyTorch, NumPy and Python's built-in `random` module.
seed = 42
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(seed)

# ==== Technical Indicators ====
def compute_rsi(series, period=14):
    """Return the Relative Strength Index of *series*.

    Gains and losses are averaged with a plain rolling mean over ``period``
    observations (``min_periods=1``, so values are produced as soon as one
    price change is available).

    Args:
        series: pandas Series of prices.
        period: rolling window length used for the average gain/loss.

    Returns:
        A pandas Series aligned with *series*. The first element is NaN
        because ``diff()`` has no previous value there; positions where both
        the average gain and the average loss are zero are NaN as well (0/0).
    """
    change = series.diff()
    # Split each price change into its upward and (positive) downward part.
    ups = change.clip(lower=0)
    downs = -change.clip(upper=0)
    rolling_args = {"window": period, "min_periods": 1}
    mean_up = ups.rolling(**rolling_args).mean()
    mean_down = downs.rolling(**rolling_args).mean()
    relative_strength = mean_up / mean_down
    return 100 - (100 / (1 + relative_strength))

def add_technical_indicators(df):
    """Add moving-average and RSI columns derived from ``df['Close']``.

    The frame passed in is modified in place: ``Close`` is coerced to numeric
    (unparseable entries become NaN) and the columns ``MA_5``, ``MA_20`` and
    ``RSI`` are written to it.

    Args:
        df: DataFrame containing a ``Close`` column.

    Returns:
        A new DataFrame with every row that contains a NaN in *any* column
        removed (this includes the warm-up rows of the 20-period average).
    """
    close = pd.to_numeric(df['Close'], errors='coerce')
    df['Close'] = close
    # Simple moving averages over 5 and 20 rows.
    for span in (5, 20):
        df[f'MA_{span}'] = close.rolling(window=span).mean()
    df['RSI'] = compute_rsi(close)
    return df.dropna()

def build_graph_from_window(window):
    """Turn one window of feature rows into a chain graph.

    Each row of *window* becomes a node (its values are the node features)
    and every pair of consecutive rows is linked in both directions.

    Args:
        window: sequence/array of feature rows, one row per time step.

    Returns:
        A ``Data`` object with ``x`` (float32 node features) and
        ``edge_index`` (int64): all forward edges ``i -> i+1`` followed by
        all backward edges ``i+1 -> i``.
    """
    node_features = torch.tensor(window, dtype=torch.float32)
    last_node = len(window) - 1
    forward_edges = [[src, src + 1] for src in range(last_node)]
    backward_edges = [[dst, src] for src, dst in forward_edges]
    edge_pairs = torch.tensor(forward_edges + backward_edges, dtype=torch.long)
    # Transpose from a list of (src, dst) pairs to two rows: sources, targets.
    return Data(x=node_features, edge_index=edge_pairs.t().contiguous())

def create_windows(data, window_size):
    """Stack overlapping slices of *data*, each ``window_size`` items long.

    Windows start at indices ``0 .. len(data) - window_size - 1``; the slice
    starting at ``len(data) - window_size`` is not produced.

    Args:
        data: sliceable sequence or array.
        window_size: number of consecutive items per window.

    Returns:
        ``np.ndarray`` whose first axis indexes the windows. When *data* has
        no more than ``window_size`` items the result is an empty array.
    """
    num_windows = len(data) - window_size
    windows = []
    for start in range(num_windows):
        windows.append(data[start:start + window_size])
    return np.array(windows)

class GATEncoder(nn.Module):
    """Two-layer graph-attention encoder that yields one vector per graph.

    Args:
        input_dim: number of features per node.
        hidden_dim: per-head width of the first attention layer.
        output_dim: width of the pooled graph embedding.
    """

    def __init__(self, input_dim, hidden_dim=16, output_dim=8):
        super().__init__()
        first_layer_heads = 2
        # The heads of the first layer are concatenated, so the second layer
        # receives hidden_dim * first_layer_heads features per node.
        self.conv1 = GATConv(input_dim, hidden_dim, heads=first_layer_heads, concat=True)
        self.conv2 = GATConv(hidden_dim * first_layer_heads, output_dim, heads=1)

    def forward(self, x, edge_index, batch):
        """Encode nodes, then mean-pool them into one embedding per graph.

        Args:
            x: node feature tensor.
            edge_index: edge list tensor passed to both attention layers.
            batch: per-node graph assignment used by ``global_mean_pool``.

        Returns:
            The mean of the final node embeddings for every graph in *batch*.
        """
        hidden = self.conv1(x, edge_index)
        hidden = F.relu(hidden)
        node_embeddings = self.conv2(hidden, edge_index)
        return global_mean_pool(node_embeddings, batch)

class SiameseGNN(nn.Module):
    """Siamese similarity model built on a shared graph encoder.

    Both inputs are embedded by the same ``encoder``; the two embeddings are
    concatenated and scored by a small MLP followed by a sigmoid.

    Args:
        encoder: callable ``encoder(x, edge_index, batch)`` returning a
            ``(num_graphs, embedding_dim)`` tensor.
        embedding_dim: width of the embedding produced by ``encoder``. The
            default of 8 matches the value that used to be hard-coded here;
            pass the encoder's real output width when it differs.
    """

    def __init__(self, encoder, embedding_dim=8):
        super().__init__()
        self.encoder = encoder
        self.fc = nn.Sequential(
            # Input is the concatenation of the two graph embeddings.
            nn.Linear(embedding_dim * 2, 16),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(16, 1)
        )

    def forward(self, data1, data2):
        """Score the similarity of two graphs (or two batches of graphs).

        Args:
            data1: object exposing ``x``, ``edge_index`` and ``batch``.
            data2: object exposing ``x``, ``edge_index`` and ``batch``.

        Returns:
            A tuple ``(sim, x1, x2)``: ``sim`` is the sigmoid score with all
            size-1 dimensions squeezed away (a 0-d tensor when a single pair
            is scored); ``x1`` and ``x2`` are the encoder embeddings.
        """
        x1 = self.encoder(data1.x, data1.edge_index, data1.batch)
        x2 = self.encoder(data2.x, data2.edge_index, data2.batch)
        x = torch.cat([x1, x2], dim=1)
        sim = torch.sigmoid(self.fc(x))
        return sim.squeeze(), x1, x2

def generate_pairs(graphs, num_pairs=2000):
    """Sample labelled graph pairs for Siamese training.

    The first half of the result are positive pairs (label 1): a graph and
    its immediate successor in *graphs*. The second half are negative pairs
    (label 0): two distinct graphs that are NOT neighbours in *graphs*, so a
    negative can never coincide with something the positive half labels 1.

    Args:
        graphs: ordered sequence of graphs (consecutive entries are treated
            as similar).
        num_pairs: total number of pairs; ``num_pairs // 2`` of each label.

    Returns:
        ``(pairs, labels)`` where ``pairs`` is a list of 2-tuples and
        ``labels`` is a float32 tensor of matching length.

    Raises:
        ValueError: if pairs are requested but fewer than three graphs are
            available (no non-adjacent pair exists).
    """
    half = num_pairs // 2
    num_graphs = len(graphs)
    if half > 0 and num_graphs < 3:
        raise ValueError(
            f"generate_pairs needs at least 3 graphs, got {num_graphs}"
        )

    pairs = []
    for _ in range(half):
        idx = random.randint(0, num_graphs - 2)
        pairs.append((graphs[idx], graphs[idx + 1]))

    for _ in range(half):
        # Rejection-sample until the two indices are not neighbours;
        # otherwise the pair would match the positive definition above.
        while True:
            idx1, idx2 = random.sample(range(num_graphs), 2)
            if abs(idx1 - idx2) > 1:
                break
        pairs.append((graphs[idx1], graphs[idx2]))

    labels = [1] * half + [0] * half
    return pairs, torch.tensor(labels, dtype=torch.float32)

# === Load CSVs and Process ===
# Colab upload prompt; every uploaded file name is then read as a CSV.
uploaded = files.upload()
window_size = 10
scaler = MinMaxScaler()
all_graphs, timestamps = [], []

for fname in uploaded:
    df = pd.read_csv(fname)
    # Files without a 'Close' column are skipped.
    if 'Close' not in df.columns:
        continue
    df = add_technical_indicators(df)
    features = ['Close', 'MA_5', 'MA_20', 'RSI']
    # `data` keeps the unscaled feature matrix of the most recently processed
    # file; the backtesting section further down reads prices from it.
    data = df[features].values
    # The scaler is re-fitted for every file.
    data_scaled = scaler.fit_transform(data)
    X = create_windows(data_scaled, window_size)
    for i, window in enumerate(X):
        graph = build_graph_from_window(window)
        # Single-graph batch vector: every node belongs to graph 0.
        graph.batch = torch.zeros(graph.num_nodes, dtype=torch.long)
        all_graphs.append(graph)
        # Label the window with the date of its last row, or with its
        # position when the file has no 'Date' column.
        if 'Date' in df.columns:
            stamp = df.iloc[i+window_size-1]['Date']
        else:
            stamp = i
        timestamps.append(stamp)

# === Train Siamese GNN ===
pairs, labels = generate_pairs(all_graphs)
encoder = GATEncoder(input_dim=4)
model = SiameseGNN(encoder)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()
batch_size = 32

model.train()
for epoch in range(30):
    total_loss = 0.0
    num_batches = 0
    # generate_pairs returns every positive pair first and every negative
    # pair afterwards. Without shuffling, each mini-batch would contain a
    # single class, so draw a fresh random order every epoch.
    order = torch.randperm(len(pairs)).tolist()
    for start in range(0, len(order), batch_size):
        batch_idx = order[start:start + batch_size]
        batch_labels = labels[batch_idx]
        pred_batch = []
        # Pairs are scored one at a time; the model returns a 0-d score for a
        # single pair, and the scores are stacked into one batch vector.
        for k in batch_idx:
            g1, g2 = pairs[k]
            g1.batch = torch.zeros(g1.num_nodes, dtype=torch.long)
            g2.batch = torch.zeros(g2.num_nodes, dtype=torch.long)
            pred, _, _ = model(g1, g2)
            pred_batch.append(pred)
        pred_batch = torch.stack(pred_batch)
        loss = criterion(pred_batch, batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1
    # BCELoss already averages within a batch, so the epoch figure is the
    # mean over batches (dividing by the number of pairs under-reported the
    # loss by roughly a factor of batch_size).
    print(f"Epoch {epoch+1}, Loss: {total_loss/max(num_batches, 1):.4f}")

# === Evaluation ===
# NOTE: the metrics below are computed on the same `pairs` the model was
# trained on, i.e. they are in-sample figures.
model.eval()
preds, targets = [], []
with torch.no_grad():
    for (g1, g2), label in zip(pairs, labels):
        g1.batch = torch.zeros(g1.num_nodes, dtype=torch.long)
        g2.batch = torch.zeros(g2.num_nodes, dtype=torch.long)
        score, _, _ = model(g1, g2)
        # Scores strictly above 0.5 count as "similar" (class 1).
        preds.append(int(score.item() > 0.5))
        targets.append(int(label.item()))
acc, prec, rec, f1 = (
    metric(targets, preds)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)
print(f"\n✅ Siamese GNN Accuracy: {acc*100:.2f}% | Precision: {prec:.2f}, Recall: {rec:.2f}, F1: {f1:.2f}")

# === Save model weights ===
torch.save(model.state_dict(), "siamese_gnn_weights.pth")

# === Embedding extraction ===
# Run every graph through the encoder (no gradients) and collect one
# embedding vector per graph.
graph_vectors = []
with torch.no_grad():
    for graph in all_graphs:
        graph.batch = torch.zeros(graph.num_nodes, dtype=torch.long)
        pooled = encoder(graph.x, graph.edge_index, graph.batch)
        # Drop the leading graph dimension of size 1 before converting.
        graph_vectors.append(pooled.squeeze().numpy())
embeddings = np.array(graph_vectors)

# === Embedding Visualization (PCA + t-SNE) ===
# Both projections reduce the graph embeddings to 2-D for plotting.
pca = PCA(n_components=2).fit_transform(embeddings)
# t-SNE requires perplexity < number of samples; keep the usual value of 30
# but shrink it for small inputs instead of letting the call fail.
tsne_perplexity = max(1, min(30, len(embeddings) - 1))
tsne = TSNE(n_components=2, perplexity=tsne_perplexity, random_state=seed).fit_transform(embeddings)

plt.figure(figsize=(12,5))
plt.subplot(1,2,1)
plt.scatter(pca[:,0], pca[:,1], c='blue', s=10)
plt.title("PCA of Graph Embeddings")
plt.grid(True)

plt.subplot(1,2,2)
plt.scatter(tsne[:,0], tsne[:,1], c='green', s=10)
plt.title("t-SNE of Graph Embeddings")
plt.grid(True)
plt.show()

# === LSTM Forecasting ===
# Supervised pairs: the previous `seq_len` embeddings are the input and the
# embedding that follows them is the target.
seq_len = 5
num_sequences = len(embeddings) - seq_len
X_lstm = np.array([embeddings[s:s + seq_len] for s in range(num_sequences)])
y_lstm = np.array([embeddings[s + seq_len] for s in range(num_sequences)])

# Two stacked LSTM layers followed by a linear read-out of width 8.
lstm = Sequential()
lstm.add(Input(shape=X_lstm.shape[1:]))
lstm.add(LSTM(64, return_sequences=True))
lstm.add(LSTM(64))
lstm.add(Dense(8))
lstm.compile(optimizer='adam', loss='mse')
lstm.fit(X_lstm, y_lstm, epochs=20, batch_size=32, verbose=0)

# NOTE: predictions are made on the same sequences used for fitting, so the
# reported error is an in-sample figure.
y_pred = lstm.predict(X_lstm)
mse = np.mean(np.square(y_lstm - y_pred))
print(f"\n✅ LSTM Forecast MSE: {mse:.4f}")

# === Change-Point Detection ===
# distances[i] is the Euclidean distance between embedding i and embedding
# i+1; a change point is any step whose distance exceeds mean + 1.5 * std.
distances = [
    np.linalg.norm(current - following)
    for current, following in zip(embeddings[:-1], embeddings[1:])
]
distance_array = np.array(distances)
threshold = distance_array.mean() + 1.5*distance_array.std()
change_points = np.where(distance_array > threshold)[0]

plt.figure(figsize=(12,4))
plt.plot(distances, label='Embedding Distance')
plt.scatter(change_points, distance_array[change_points], color='red', label='Change Points')
plt.title("Graph Embedding Distance & Detected Change Points")
plt.xlabel("Window Index")
plt.ylabel("Distance")
plt.xticks(np.arange(0, len(distances), 500))  # ✅ Set X-axis ticks every 500
plt.legend()
plt.grid(True)
plt.show()

# === Backtesting (Simple Strategy) ===
# Strategy: at every detected change point, buy with all available cash when
# flat, otherwise sell the whole position. The portfolio value is recorded
# at every step.
profits = []
initial_cash = 100000
cash = initial_cash
position = 0
buy_price = 0
change_points_set = set(change_points)

# `data` holds the unscaled features of the last processed file only, so the
# backtest is run on that file's windows. `window_offset` maps a local window
# index to its global position in `timestamps` / `change_points` (it is 0
# when a single file was loaded).
prices = data[:, 0]
num_windows = max(len(prices) - window_size, 0)
window_offset = len(timestamps) - num_windows

for k in range(max(num_windows - 1, 0)):
    # The distance for window k compares windows k and k+1, and window k+1
    # ends at row k + window_size. That is the first row at which the signal
    # can be observed, so trades and valuation use that row's close instead
    # of the close at row k, which predates the signal by window_size rows.
    # NOTE(review): the change-point threshold is still computed from the
    # whole distance series, so detection itself is in-sample.
    price = prices[k + window_size]
    if (window_offset + k) in change_points_set:
        if position == 0:
            position = cash // price
            buy_price = price
            cash -= position * buy_price
        else:
            cash += position * price
            position = 0
    profits.append(cash + position * price)

plt.figure(figsize=(10,4))
plt.plot(profits, label='Portfolio Value')
plt.axhline(initial_cash, color='gray', linestyle='--', label='Initial Cash')
plt.title("Backtest Around Change Points")
plt.legend()
plt.xticks(np.arange(0, len(distances), 500))  # ✅ Set X-axis ticks every 500
plt.grid(True)
plt.show()

# With no simulated step the portfolio is still the untouched cash balance.
final_value = profits[-1] if profits else cash
print(f"\n📈 Final Portfolio Value: ₹{final_value:,.2f}")



[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25h

Saving bajaj_auto_stock_data.csv to bajaj_auto_stock_data.csv
Epoch 1, Loss: 0.0228
Epoch 2, Loss: 0.0221
Epoch 3, Loss: 0.0221
Epoch 4, Loss: 0.0220
Epoch 5, Loss: 0.0220
Epoch 6, Loss: 0.0220
Epoch 7, Loss: 0.0219
Epoch 8, Loss: 0.0218
Epoch 9, Loss: 0.0213
Epoch 10, Loss: 0.0198
Epoch 11, Loss: 0.0186
Epoch 12, Loss: 0.0202
Epoch 13, Loss: 0.0148
Epoch 14, Loss: 0.0140
Epoch 15, Loss: 0.0120
Epoch 16, Loss: 0.0115
Epoch 17, Loss: 0.0105
Epoch 18, Loss: 0.0098
Epoch 19, Loss: 0.0095
Epoch 20, Loss: 0.0088
