In [3]:
!pip install torch-geometric


Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: torch-geometric
Successfully installed torch-geometric-2.6.1


In [9]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, SAGEConv, global_mean_pool
from torch_geometric.data import Data, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.metrics.pairwise import cosine_similarity
import random
import pickle
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

print("=== GNN LINK PREDICTION WITH ALL FEATURES ===\n")

# Seed the three RNGs used by this notebook (torch, NumPy, stdlib random)
# with the same value, in that order, so reruns are repeatable.
for _seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    _seed_rng(42)

# Prefer the GPU when one is visible; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# --- 1. LOAD ALL FEATURES ---
print("\n=== LOADING FEATURES ===")

# Load embeddings
embeddings_df = pd.read_pickle('/kaggle/input/features-ready/embeddings.pkl')
print(f"Loaded embeddings: {embeddings_df.shape}")

# Load PCA features
pca_features_df = pd.read_pickle('/kaggle/input/features-ready/node_features_pca.pkl')
print(f"Loaded PCA features: {pca_features_df.shape}")

# Load graph features
graph_features_df = pd.read_pickle('/kaggle/input/features-ready/simple_graph_features.pkl')
print(f"Loaded graph features: {graph_features_df.shape}")

# Keep only the nodes present in all three tables.
common_nodes = set(embeddings_df.index) & set(pca_features_df.index) & set(graph_features_df.index)
print(f"Common nodes: {len(common_nodes)}")

# Combine all features
print("Combining all features...")
# Sorting fixes the row order, which defines the node -> index mapping below.
node_list = sorted(common_nodes)

# Select every node's rows in one .loc call per table and join them
# column-wise (embeddings | PCA features | graph features). This replaces a
# Python loop doing three .loc lookups per node, and avoids building a tensor
# from a list of ndarrays, which PyTorch warns is extremely slow.
all_node_features = np.concatenate(
    [
        embeddings_df.loc[node_list].values,        # 64 dims
        pca_features_df.loc[node_list].values,      # 256 dims
        graph_features_df.loc[node_list].values,    # 10 dims
    ],
    axis=1,
).astype(np.float32)

# A duplicated index label would make .loc return extra rows and silently
# misalign features with node_list, so fail loudly instead.
assert all_node_features.shape[0] == len(node_list), \
    "feature tables contain duplicate node ids"

# Convert to tensor (float32, the same dtype torch.FloatTensor produced)
node_features_tensor = torch.from_numpy(all_node_features)
print(f"Combined features shape: {node_features_tensor.shape}")

# Create node mapping: node id -> row index in node_features_tensor
node_to_idx = {node: idx for idx, node in enumerate(node_list)}
print(f"Node mapping created for {len(node_to_idx)} nodes")

# --- 2. LOAD EDGES AND CREATE GRAPH ---
print("\n=== LOADING EDGES ===")

# Read the positive-edge CSVs for the train and test splits.
train_edges_df, test_edges_df = map(pd.read_csv, (
    '/kaggle/input/features-ready/train_positives.csv',
    '/kaggle/input/features-ready/test_positives (1).csv',
))

print(f"Train edges: {len(train_edges_df)}")
print(f"Test edges: {len(test_edges_df)}")

# Convert edges to indices
def edges_to_indices(edges_df, node_mapping):
    """Translate an edge table of node ids into a ``[2, E]`` index tensor.

    Args:
        edges_df: DataFrame with ``node_1`` and ``node_2`` columns holding
            node ids.
        node_mapping: dict mapping a node id to its integer index.

    Returns:
        A ``torch.long`` tensor of shape ``[2, E]``. Row 0 holds the mapped
        ``node_1`` values and row 1 the mapped ``node_2`` values, in table
        order. Edges with an endpoint missing from ``node_mapping`` are
        dropped. When no edge survives, the result has shape ``[2, 0]`` so
        callers can still index ``shape[1]`` and concatenate along dim 1.
    """
    # Iterate over the two columns directly; DataFrame.iterrows builds a
    # Series per row and is far slower on large edge tables.
    endpoints = zip(edges_df['node_1'].tolist(), edges_df['node_2'].tolist())
    valid_edges = [
        [node_mapping[src], node_mapping[dst]]
        for src, dst in endpoints
        if src in node_mapping and dst in node_mapping
    ]

    if not valid_edges:
        # torch.LongTensor([]).t() would have shape [0], not [2, 0].
        return torch.empty((2, 0), dtype=torch.long)

    return torch.tensor(valid_edges, dtype=torch.long).t()

# Map both edge tables onto integer node indices via node_to_idx.
train_edge_index, test_edge_index = (
    edges_to_indices(split_df, node_to_idx)
    for split_df in (train_edges_df, test_edges_df)
)

print(f"Train edge tensor shape: {train_edge_index.shape}")
print(f"Test edge tensor shape: {test_edge_index.shape}")

# --- 3. GENERATE NEGATIVE EDGES ---
print("\n=== GENERATING NEGATIVE EDGES ===")

def generate_negative_edges(num_nodes, positive_edges, num_negatives):
    """Sample distinct node pairs that do not occur in ``positive_edges``.

    Pairs are treated as undirected: ``(a, b)`` and ``(b, a)`` count as the
    same edge, both when excluding positives and when avoiding duplicate
    negatives. Self-pairs ``(a, a)`` are never returned. Sampling uses the
    stdlib ``random`` module, so seed it for reproducible output.

    Args:
        num_nodes: number of nodes; indices are drawn from ``0..num_nodes-1``.
        positive_edges: integer tensor of shape ``[2, E]`` of edges to exclude.
        num_negatives: number of negative pairs to return.

    Returns:
        A ``torch.long`` tensor of shape ``[2, num_negatives]`` (``[2, 0]``
        when ``num_negatives`` is not positive).

    Raises:
        ValueError: if fewer than ``num_negatives`` distinct non-edges exist.
            Without this check the rejection loop below would never end.
    """
    # One bulk conversion instead of two .item() calls per edge.
    seen = {tuple(sorted(pair)) for pair in positive_edges.t().tolist()}

    if num_negatives <= 0:
        return torch.empty((2, 0), dtype=torch.long)

    # Count the candidate pairs that are still free. Only positives that are
    # real candidates (no self-loops, both endpoints in range) reduce it.
    total_pairs = num_nodes * (num_nodes - 1) // 2
    blocked = sum(1 for lo, hi in seen if lo != hi and lo >= 0 and hi < num_nodes)
    available = total_pairs - blocked
    if num_negatives > available:
        raise ValueError(
            f"Requested {num_negatives} negative edges but only "
            f"{available} non-edges exist for {num_nodes} nodes"
        )

    negative_edges = []
    while len(negative_edges) < num_negatives:
        node1 = random.randint(0, num_nodes - 1)
        node2 = random.randint(0, num_nodes - 1)

        if node1 == node2:
            continue

        edge = (node1, node2) if node1 < node2 else (node2, node1)
        if edge not in seen:
            negative_edges.append([node1, node2])
            seen.add(edge)  # Avoid returning the same pair twice

    return torch.tensor(negative_edges, dtype=torch.long).t()

# Generate negative edges
# Training negatives only need to avoid the training positives.
train_neg_edge_index = generate_negative_edges(
    len(node_list), train_edge_index, train_edge_index.shape[1]
)
# Test negatives must avoid every known positive (train and test) and also
# the negatives already used for training; otherwise the same pair could be
# trained on and then evaluated on.
test_neg_edge_index = generate_negative_edges(
    len(node_list),
    torch.cat([train_edge_index, test_edge_index, train_neg_edge_index], dim=1),
    test_edge_index.shape[1]
)

print(f"Train negative edges: {train_neg_edge_index.shape}")
print(f"Test negative edges: {test_neg_edge_index.shape}")

# --- 4. DEFINE GNN MODEL ---
print("\n=== DEFINING GNN MODEL ===")

class LinkPredictionGNN(nn.Module):
    """GCN encoder followed by an MLP that scores node pairs.

    The encoder is a stack of ``GCNConv`` layers: the first maps
    ``input_dim`` to ``hidden_dim`` and the remaining ``num_layers - 1``
    keep ``hidden_dim``. The predictor takes the concatenated embeddings of
    the two endpoints (``2 * hidden_dim`` values) and ends in a sigmoid, so
    each score lies in (0, 1).
    """

    def __init__(self, input_dim, hidden_dim=128, num_layers=2):
        super().__init__()

        # Encoder layers, created in order so parameter names stay
        # convs.0, convs.1, ...
        in_dims = [input_dim] + [hidden_dim] * (num_layers - 1)
        self.convs = nn.ModuleList([GCNConv(in_dim, hidden_dim) for in_dim in in_dims])

        # Pair scorer: 2*hidden -> hidden -> hidden//2 -> 1, sigmoid at the end.
        half_dim = hidden_dim // 2
        head_layers = [
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, half_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(half_dim, 1),
            nn.Sigmoid(),
        ]
        self.link_predictor = nn.Sequential(*head_layers)

    def forward(self, x, edge_index, edge_pairs):
        """Score each column of ``edge_pairs``.

        ``x`` and ``edge_index`` are passed to every ``GCNConv`` layer;
        ``edge_pairs`` is unpacked into two index sequences used to pick the
        endpoint embeddings. Returns the predictor output, one row per pair.
        """
        last_layer = len(self.convs) - 1
        hidden = x
        for depth, layer in enumerate(self.convs):
            hidden = layer(hidden, edge_index)
            if depth == last_layer:
                # No activation or dropout after the final encoder layer.
                continue
            hidden = F.dropout(F.relu(hidden), training=self.training)

        # Concatenate the two endpoint embeddings of every candidate pair.
        src, dst = edge_pairs
        pair_repr = torch.cat((hidden[src], hidden[dst]), dim=1)

        return self.link_predictor(pair_repr)

# Size the encoder input from the feature matrix and build the model on the
# selected device.
input_dim = node_features_tensor.size(1)
model = LinkPredictionGNN(input_dim, hidden_dim=128, num_layers=2).to(device)

print(f"Model created with input dim: {input_dim}")
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

# --- 5. TRAINING PREPARATION ---
print("\n=== PREPARING TRAINING ===")

# Put the feature matrix and the training graph on the compute device.
node_features_tensor = node_features_tensor.to(device)
train_edge_index = train_edge_index.to(device)

# Positives are the training edges themselves; negatives are the sampled
# non-edges.
train_pos_pairs, train_neg_pairs = train_edge_index, train_neg_edge_index.to(device)

# Stack the pairs column-wise; labels follow the same order
# (ones for the positives, then zeros for the negatives).
num_pos, num_neg = train_pos_pairs.shape[1], train_neg_pairs.shape[1]
train_pairs = torch.cat((train_pos_pairs, train_neg_pairs), dim=1)
train_labels = torch.cat((torch.ones(num_pos), torch.zeros(num_neg))).to(device)

print(f"Training pairs: {train_pairs.shape[1]}")
print(f"Positive: {train_pos_pairs.shape[1]}, Negative: {train_neg_pairs.shape[1]}")

# --- 6. TRAINING LOOP ---
print("\n=== TRAINING MODEL ===")

optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
criterion = nn.BCELoss()

num_epochs = 400
best_loss = float('inf')
best_state = None  # in-memory copy of the weights that produced best_loss

# NOTE(review): the "best" checkpoint is chosen by training loss measured in
# train mode (dropout active); there is no validation split in this notebook.
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward pass. squeeze(-1) drops only the trailing size-1 dimension, so a
    # single-pair batch keeps shape [1] and still matches train_labels.
    predictions = model(node_features_tensor, train_edge_index, train_pairs).squeeze(-1)

    # Compute loss
    loss = criterion(predictions, train_labels)

    # Backward pass
    loss.backward()

    loss_value = loss.item()
    if loss_value < best_loss:
        best_loss = loss_value
        # Snapshot BEFORE optimizer.step(): these are the weights that
        # actually produced loss_value. Cloning keeps the snapshot from
        # aliasing the live parameters that step() is about to modify.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

    optimizer.step()

    if epoch % 20 == 0:
        print(f"Epoch {epoch:3d}, Loss: {loss_value:.4f}")

# Write the checkpoint once instead of on every improving epoch. Fall back to
# the current weights if no epoch ever improved on the initial best_loss.
if best_state is None:
    best_state = model.state_dict()
torch.save(best_state, '/kaggle/working/best_gnn_model.pth')

print(f"Training completed. Best loss: {best_loss:.4f}")

# --- 7. TESTING ---
print("\n=== TESTING MODEL ===")

# Load best model. map_location lets a checkpoint written on one device be
# restored on whichever device this session is using.
model.load_state_dict(
    torch.load('/kaggle/working/best_gnn_model.pth', map_location=device)
)
model.eval()

# Prepare test data
test_pos_pairs = test_edge_index.to(device)
test_neg_pairs = test_neg_edge_index.to(device)

# Positives first, then negatives; the labels below follow the same order.
test_pairs = torch.cat([test_pos_pairs, test_neg_pairs], dim=1)
test_labels = torch.cat([
    torch.ones(test_pos_pairs.shape[1]),
    torch.zeros(test_neg_pairs.shape[1])
]).cpu().numpy()

print(f"Test pairs: {test_pairs.shape[1]}")
print(f"Positive: {test_pos_pairs.shape[1]}, Negative: {test_neg_pairs.shape[1]}")

# Get predictions. Message passing still uses the training edges only.
# squeeze(-1) keeps a 1-D array even when there is a single test pair; a bare
# squeeze() would collapse it to 0-d and break the metric functions.
with torch.no_grad():
    test_predictions = (
        model(node_features_tensor, train_edge_index, test_pairs)
        .squeeze(-1)
        .cpu()
        .numpy()
    )

# Convert probabilities to binary predictions
test_pred_binary = (test_predictions > 0.5).astype(int)

# Compute metrics
accuracy = accuracy_score(test_labels, test_pred_binary)
f1 = f1_score(test_labels, test_pred_binary)
precision = precision_score(test_labels, test_pred_binary)
recall = recall_score(test_labels, test_pred_binary)
auc = roc_auc_score(test_labels, test_predictions)

print(f"\n=== TEST RESULTS ===")
print(f"Accuracy:  {accuracy:.4f}")
print(f"F1 Score:  {f1:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"AUC:       {auc:.4f}")

# --- 8. ADDITIONAL SIMILARITY-BASED ANALYSIS ---
print("\n=== SIMILARITY-BASED ANALYSIS ===")

# Get final node embeddings from GNN.
# Mirror LinkPredictionGNN.forward: ReLU after every conv layer except the
# last. Chaining the conv layers without the activation yields embeddings
# that differ from the ones the link predictor was trained on. Dropout is
# inactive in eval mode, so it is not applied here.
model.eval()
with torch.no_grad():
    final_embeddings = node_features_tensor
    last_layer = len(model.convs) - 1
    for i, conv in enumerate(model.convs):
        final_embeddings = conv(final_embeddings, train_edge_index)
        if i < last_layer:
            final_embeddings = F.relu(final_embeddings)

final_embeddings = final_embeddings.cpu().numpy()

def compute_similarity_features(node1_idx, node2_idx, embeddings):
    """Return four similarity/distance measures between two embedding rows.

    ``embeddings`` is indexed by ``node1_idx`` and ``node2_idx``; each row is
    reshaped to ``(1, d)`` before comparison. The result is a dict with keys
    ``cosine_similarity``, ``euclidean_distance``, ``l1_distance`` and
    ``dot_product``.
    """
    vec_a = embeddings[node1_idx].reshape(1, -1)
    vec_b = embeddings[node2_idx].reshape(1, -1)

    # Both distance measures work on the same element-wise difference.
    delta = vec_a - vec_b

    return {
        # cosine_similarity returns a 1x1 matrix for two single-row inputs
        'cosine_similarity': cosine_similarity(vec_a, vec_b)[0, 0],
        # norm of the difference row (Euclidean distance)
        'euclidean_distance': np.linalg.norm(delta),
        # sum of absolute differences (L1 norm)
        'l1_distance': np.sum(np.abs(delta)),
        'dot_product': np.dot(vec_a.flatten(), vec_b.flatten()),
    }

# Compute similarity features for test pairs
print("Computing similarity features for test pairs...")
# Copy the index pairs to host memory once. Calling .item() twice per pair
# on a device tensor forces a separate transfer for every element.
test_pair_list = test_pairs.t().tolist()
similarity_features = [
    compute_similarity_features(node1_idx, node2_idx, final_embeddings)
    for node1_idx, node2_idx in test_pair_list
]

# Convert to DataFrame for analysis; rows are in the same order as test_pairs,
# so test_labels and test_predictions line up positionally.
sim_df = pd.DataFrame(similarity_features)
sim_df['label'] = test_labels
sim_df['gnn_prediction'] = test_predictions

print("\nSimilarity feature statistics by label:")
print(sim_df.groupby('label').agg({
    'cosine_similarity': ['mean', 'std'],
    'euclidean_distance': ['mean', 'std'],
    'l1_distance': ['mean', 'std'],
    'dot_product': ['mean', 'std']
}))

# Simple cosine similarity baseline.
# NOTE(review): this baseline thresholds cosine similarity of the GNN's own
# output embeddings (final_embeddings), not of the raw input features.
cos_sim_threshold = 0.5
cos_sim_predictions = (sim_df['cosine_similarity'] > cos_sim_threshold).astype(int)
cos_sim_f1 = f1_score(test_labels, cos_sim_predictions)

print(f"\nBaseline cosine similarity F1 (threshold={cos_sim_threshold}): {cos_sim_f1:.4f}")
print(f"GNN F1 score: {f1:.4f}")
print(f"Improvement: {f1 - cos_sim_f1:.4f}")

# --- 9. SAVE RESULTS ---
print("\n=== SAVING RESULTS ===")

# Store the weights as CPU tensors. Pickling device tensors ties the file to
# that device: a plain pickle.load on a CPU-only machine would fail while
# trying to restore them.
cpu_state_dict = {
    name: tensor.detach().cpu()
    for name, tensor in model.state_dict().items()
}

results = {
    'model_state_dict': cpu_state_dict,
    'test_metrics': {
        'accuracy': accuracy,
        'f1_score': f1,
        'precision': precision,
        'recall': recall,
        'auc': auc
    },
    'similarity_analysis': sim_df,
    'node_to_idx_mapping': node_to_idx,
    'final_embeddings': final_embeddings,
    # Must match the arguments LinkPredictionGNN was constructed with.
    'model_config': {
        'input_dim': input_dim,
        'hidden_dim': 128,
        'num_layers': 2
    }
}

with open('/kaggle/working/gnn_results.pkl', 'wb') as f:
    pickle.dump(results, f)

print("✅ Results saved to '/kaggle/working/gnn_results.pkl'")

print(f"\n🎯 FINAL SUMMARY:")
print(f"✅ Trained GNN on {input_dim}-dimensional features")
print(f"✅ Used {len(node_list)} nodes, {train_pairs.shape[1]} training pairs")
print(f"✅ Test F1 Score: {f1:.4f}")
print(f"✅ Test Accuracy: {accuracy:.4f}")
print(f"✅ Model and results saved for future use")

# --- 10. PREDICTION FUNCTION ---
print("\n=== CREATING PREDICTION FUNCTION ===")

def predict_link(node1_id, node2_id, model, node_features, edge_index, node_mapping):
    """Predict whether two nodes should be linked.

    Args:
        node1_id, node2_id: node ids as used for the keys of ``node_mapping``.
        model: callable invoked as ``model(node_features, edge_index, pair)``
            whose result holds a single score; it is switched to eval mode.
        node_features: node feature tensor passed through to ``model``.
        edge_index: edge tensor passed through to ``model``.
        node_mapping: dict mapping node id to integer node index.

    Returns:
        ``{"error": "Node not found in mapping"}`` if either id is unknown;
        otherwise a dict with ``node1``, ``node2``, ``probability`` (float)
        and ``prediction`` (``'LINK'`` when probability > 0.5, else
        ``'NO LINK'``).
    """
    if node1_id not in node_mapping or node2_id not in node_mapping:
        return {"error": "Node not found in mapping"}

    model.eval()
    with torch.no_grad():
        idx1 = node_mapping[node1_id]
        idx2 = node_mapping[node2_id]

        # Build the [2, 1] pair tensor on the same device as the features
        # that were passed in, instead of depending on a module-level
        # `device` variable being defined and matching.
        test_pair = torch.tensor(
            [[idx1], [idx2]], dtype=torch.long, device=node_features.device
        )
        prediction = model(node_features, edge_index, test_pair).item()

    return {
        'node1': node1_id,
        'node2': node2_id,
        'probability': prediction,
        'prediction': 'LINK' if prediction > 0.5 else 'NO LINK'
    }

# Smoke-test predict_link on one hard-coded pair of node ids.
print("Testing prediction function...")
sample_node_a, sample_node_b = 1777, 19409
test_prediction = predict_link(
    sample_node_a,
    sample_node_b,
    model,
    node_features_tensor,
    train_edge_index,
    node_to_idx,
)
print(f"Sample prediction: {test_prediction}")

print("\n🚀 GNN Link Prediction Pipeline Complete!")

=== GNN LINK PREDICTION WITH ALL FEATURES ===

Using device: cuda

=== LOADING FEATURES ===
Loaded embeddings: (28281, 64)
Loaded PCA features: (28281, 256)
Loaded graph features: (28281, 10)
Common nodes: 28281
Combining all features...
Combined features shape: torch.Size([28281, 330])
Node mapping created for 28281 nodes

=== LOADING EDGES ===
Train edges: 74201
Test edges: 18551
Train edge tensor shape: torch.Size([2, 74201])
Test edge tensor shape: torch.Size([2, 18551])

=== GENERATING NEGATIVE EDGES ===
Train negative edges: torch.Size([2, 74201])
Test negative edges: torch.Size([2, 18551])

=== DEFINING GNN MODEL ===
Model created with input dim: 330
Model parameters: 100,097

=== PREPARING TRAINING ===
Training pairs: 148402
Positive: 74201, Negative: 74201

=== TRAINING MODEL ===
Epoch   0, Loss: 0.6988
Epoch  20, Loss: 0.5439
Epoch  40, Loss: 0.4365
Epoch  60, Loss: 0.3396
Epoch  80, Loss: 0.2991
Epoch 100, Loss: 0.2696
Epoch 120, Loss: 0.2486
Epoch 140, Loss: 0.2230
Epoch 16