In [1]:
from py2neo import Graph
import pandas as pd
import networkx as nx
import random
import torch
from torch_geometric.utils import from_networkx
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, average_precision_score
from node2vec import Node2Vec
import optuna

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [2]:
nodes_df = pd.read_csv('/home/vidur/mediagraph/data/Data_2_Years/Entities.csv')
edges_df = pd.read_csv('/home/vidur/mediagraph/data/Data_2_Years/Relations.csv')

In [3]:
# Create an undirected NetworkX graph
G = nx.Graph()

# Add nodes with attributes
for index, row in nodes_df.iterrows():
    G.add_node(row['name'], type=row['type'])

# Add edges with attributes
for index, row in edges_df.iterrows():
    G.add_edge(row['source'], row['target'], weight=row['weight'], dates=row['dates'])


In [4]:
# Split edges into training and testing sets
edges = list(G.edges(data=True))
random.shuffle(edges)

split_ratio = 0.8
split_index = int(len(edges) * split_ratio)
train_edges = edges[:split_index]
test_edges = edges[split_index:]

# Create training graph
G_train = nx.Graph()
G_train.add_nodes_from(G.nodes(data=True))
G_train.add_edges_from(train_edges)

# Function to generate negative edges
def generate_negative_edges(G, num_edges):
    non_edges = list(nx.non_edges(G))
    random.shuffle(non_edges)
    return non_edges[:num_edges]

# Generate negative edges for training
num_train_edges = len(train_edges)
negative_train_edges = generate_negative_edges(G_train, num_train_edges)

In [5]:
# Create a list of nodes from the training graph
nodes = list(G_train.nodes())

# Create mappings from node names to indices and vice versa
node2idx = {node: idx for idx, node in enumerate(nodes)}
idx2node = {idx: node for node, idx in node2idx.items()}

In [6]:
# Function to convert edge list from node names to indices
def edges_to_indices(edge_list, mapping):
    return [(mapping[u], mapping[v]) for u, v in edge_list]

# Convert positive training edges to indices
train_edge_list = [(u, v) for u, v, _ in train_edges]
train_edge_indices = edges_to_indices(train_edge_list, node2idx)

# Convert negative training edges to indices
negative_train_edge_list = negative_train_edges  # List of (u, v) tuples
negative_train_edge_indices = edges_to_indices(negative_train_edge_list, node2idx)

# Convert to PyTorch tensors
train_edge_index = torch.tensor(train_edge_indices, dtype=torch.long).t().contiguous()
negative_train_edge_index = torch.tensor(negative_train_edge_indices, dtype=torch.long).t().contiguous()

# Combine positive and negative edges
edge_label_index = torch.cat([train_edge_index, negative_train_edge_index], dim=1)

# Create labels: 1 for positive edges and 0 for negative edges
edge_label = torch.cat([torch.ones(train_edge_index.size(1)), torch.zeros(negative_train_edge_index.size(1))])


In [7]:
# Convert test edges to indices
test_edge_list = [(u, v) for u, v, _ in test_edges]
test_edge_indices = edges_to_indices(test_edge_list, node2idx)

# Generate negative test edges
negative_test_edges = generate_negative_edges(G_train, len(test_edge_indices))
negative_test_edge_indices = edges_to_indices(negative_test_edges, node2idx)

# Convert to PyTorch tensors
test_pos_edge_index = torch.tensor(test_edge_indices, dtype=torch.long).t().contiguous()
test_neg_edge_index = torch.tensor(negative_test_edge_indices, dtype=torch.long).t().contiguous()

# Combine for testing
test_edge_label_index = torch.cat([test_pos_edge_index, test_neg_edge_index], dim=1)
test_edge_label = torch.cat([torch.ones(test_pos_edge_index.size(1)), torch.zeros(test_neg_edge_index.size(1))])


In [8]:
# Option 1: One-Hot Encoding (Simple)
# Initialize node features using one-hot encoding
num_nodes = len(nodes)
node_features = torch.eye(num_nodes)

# Option 2: Node2Vec Embeddings (Recommended for better performance)
# Uncomment the following lines if you prefer using Node2Vec embeddings

# node2vec = Node2Vec(G_train, dimensions=64, walk_length=30, num_walks=200, workers=4, seed=42)
# node2vec_model = node2vec.fit(window=10, min_count=1, batch_words=4)
# embedding_matrix = torch.zeros((num_nodes, 64))
# for node, idx in node2idx.items():
#     embedding_matrix[idx] = torch.tensor(node2vec_model.wv[node])
# node_features = embedding_matrix


In [9]:
# Create PyTorch Geometric data object
data = Data()

# Assign node features
data.x = node_features

# Assign edge indices (training edges only)
data.edge_index = train_edge_index


In [10]:
class LinkPredictor(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, num_layers, dropout_rate):
        super(LinkPredictor, self).__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        for _ in range(num_layers - 1):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.lin = torch.nn.Linear(hidden_channels, 1)

    def encode(self, x, edge_index):
        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = self.dropout(x)
        return x

    def decode(self, z, edge_label_index):
        # Using dot product for link prediction
        z_i = z[edge_label_index[0]]
        z_j = z[edge_label_index[1]]
        return torch.sigmoid((z_i * z_j).sum(dim=1))

    def forward(self, x, edge_index, edge_label_index):
        z = self.encode(x, edge_index)
        return self.decode(z, edge_label_index)

In [11]:
# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    hidden_channels = trial.suggest_categorical('hidden_channels', [16, 32, 64])
    num_layers = trial.suggest_int('num_layers', 1, 3)
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.0, 0.5)
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)

    # Initialize the model
    model = LinkPredictor(in_channels=data.num_features,
                          hidden_channels=hidden_channels,
                          num_layers=num_layers,
                          dropout_rate=dropout_rate)
    
    # Define optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = torch.nn.BCELoss()

    # Training loop
    model.train()
    for epoch in range(1, 51):  # Using 50 epochs for tuning
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, edge_label_index)
        loss = criterion(out, edge_label)
        loss.backward()
        optimizer.step()
    
    # Validation on test set
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index, test_edge_label_index)
        loss = criterion(out, test_edge_label)
    
    return loss.item()

# Create an Optuna study and optimize
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=20)  # Adjust n_trials as needed

print("Best hyperparameters: ", study.best_params)

[I 2024-12-05 15:13:01,179] A new study created in memory with name: no-name-ac3985f0-05e3-44fe-bec2-e682980f7dd4
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.0, 0.5)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
[I 2024-12-05 15:15:00,150] Trial 0 finished with value: 0.6859686374664307 and parameters: {'hidden_channels': 32, 'num_layers': 3, 'dropout_rate': 0.025885316330696517, 'lr': 0.0003131666923445802, 'weight_decay': 1.5064642704172467e-05}. Best is trial 0 with value: 0.6859686374664307.
[I 2024-12-05 15:17:08,225] Trial 1 finished with value: 0.7003882527351379 and parameters: {'hidden_channels': 64, 'num_layers': 3, 'dropout_rate': 0.34428035792876394, 'lr': 1.97068191049806e-05, 'weight_decay': 0.006394597618761897}. Best is trial 0 with value: 0.6859686374664307.
[I 2024-12-05 15:19:05,496] Trial 2 finished with value: 0.6931395530700684 and parameters: {'hidden_channels': 32, 'num_layers

Best hyperparameters:  {'hidden_channels': 16, 'num_layers': 3, 'dropout_rate': 0.3334704825843034, 'lr': 0.02492698553992736, 'weight_decay': 0.00020654818724697086}


In [12]:
# Extract best hyperparameters
best_params = study.best_params
hidden_channels = best_params['hidden_channels']
num_layers = best_params['num_layers']
dropout_rate = best_params['dropout_rate']
lr = best_params['lr']
weight_decay = best_params['weight_decay']

# Initialize the model with best hyperparameters
model = LinkPredictor(in_channels=data.num_features,
                      hidden_channels=hidden_channels,
                      num_layers=num_layers,
                      dropout_rate=dropout_rate)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
criterion = torch.nn.BCELoss()

# Training loop
num_epochs = 200  # Adjust as needed
for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index, edge_label_index)
    loss = criterion(out, edge_label)
    loss.backward()
    optimizer.step()
    
    if epoch % 20 == 0 or epoch == 1:
        print(f'Epoch {epoch}/{num_epochs}, Loss: {loss.item():.4f}')

Epoch 1/200, Loss: 0.7120
Epoch 20/200, Loss: 0.6917
Epoch 40/200, Loss: 0.6413
Epoch 60/200, Loss: 0.6140
Epoch 80/200, Loss: 0.5905
Epoch 100/200, Loss: 0.5771
Epoch 120/200, Loss: 0.5718
Epoch 140/200, Loss: 0.5711
Epoch 160/200, Loss: 0.5664
Epoch 180/200, Loss: 0.5660
Epoch 200/200, Loss: 0.5632


In [13]:
model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index, test_edge_label_index)
    out = out.cpu().numpy()
    test_labels = test_edge_label.numpy()
    auc = roc_auc_score(test_labels, out)
    ap = average_precision_score(test_labels, out)
    print(f'\nEvaluation on Test Set:')
    print(f'AUC-ROC: {auc:.4f}')
    print(f'Average Precision: {ap:.4f}')


Evaluation on Test Set:
AUC-ROC: 0.7953
Average Precision: 0.8430


In [14]:
# Function to add new nodes and predict links
def predict_new_links(new_nodes, G_train, node2idx, idx2node, data, model, threshold=0.5):
    model.eval()
    
    # Add new nodes to mappings and node features
    for node in new_nodes:
        if node not in node2idx:
            new_idx = len(node2idx)
            node2idx[node] = new_idx
            idx2node[new_idx] = node
            # Extend node features (e.g., zero vectors or random)
            new_feature = torch.zeros(1, data.x.size(1))  # Adjust as needed
            data.x = torch.cat([data.x, new_feature], dim=0)
    
    # Generate potential edges involving new nodes
    potential_edges = []
    for node in new_nodes:
        new_idx = node2idx[node]
        for existing_node, existing_idx in node2idx.items():
            if existing_node != node:
                potential_edges.append((new_idx, existing_idx))
    
    # Convert potential edges to tensor indices
    potential_edge_index = torch.tensor(potential_edges, dtype=torch.long).t().contiguous()
    
    # Predict links
    with torch.no_grad():
        out = model(data.x, data.edge_index, potential_edge_index)
        out = out.cpu().numpy()
        predicted_edges = potential_edge_index[:, out > threshold]
    
    # Convert indices back to node names
    predicted_edges_named = [(idx2node[u], idx2node[v]) for u, v in predicted_edges.t().tolist()]
    
    return predicted_edges_named

# Example: Predicting links for new nodes
new_nodes = ['New Entity 1', 'New Entity 2']  # Replace with your new nodes
predicted_new_edges = predict_new_links(new_nodes, G_train, node2idx, idx2node, data, model, threshold=0.5)
print("\nPredicted New Edges:")
for edge in predicted_new_edges:
    print(edge)


Predicted New Edges:
('New Entity 1', 'Kanadia village')
('New Entity 1', 'M Manohar Reddy')
('New Entity 1', 'Puvayan')
('New Entity 1', 'Tobacco')
('New Entity 1', 'Moti')
('New Entity 1', 'Sawaikar')
('New Entity 1', 'Lok Bandhu')
('New Entity 1', 'Niti Aayog')
('New Entity 1', 'Jalmuru')
('New Entity 1', 'Harpanahalli')
('New Entity 1', 'Behvalpur')
('New Entity 1', 'Supreme Court')
('New Entity 1', 'Rashidpur village')
('New Entity 1', 'Bajawala')
('New Entity 1', 'Shahul Hameed')
('New Entity 1', 'the Apex Bank')
('New Entity 1', 'Bharatiya Kisan Sangam')
('New Entity 1', 'Sunil Kumar Singh')
('New Entity 1', 'Ochlandra')
('New Entity 1', 'Rangampet')
('New Entity 1', 'Rajesh Khirsagar')
('New Entity 1', 'Shishupal')
('New Entity 1', 'Abhel Charolia')
('New Entity 1', 'Venkaiah Naidu')
('New Entity 1', 'Pullambadi')
('New Entity 1', 'Sattari Shetkari Manch')
('New Entity 1', 'Swarn Singh')
('New Entity 1', 'Veluru')
('New Entity 1', 'Union')
('New Entity 1', 'the General Body of

In [15]:
# Save the model
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'best_params': best_params,  # Save the best hyperparameters if using Optuna
    'node2idx': node2idx,  # Optional: save mappings if needed
    'idx2node': idx2node,  # Optional: save mappings if needed
}, 'model_checkpoint.pth')  # Choose your path


In [16]:
# #Load the model
# # Define the model architecture again
# model = LinkPredictor(
#     in_channels=data.num_features, 
#     hidden_channels=best_params['hidden_channels'],
#     num_layers=best_params['num_layers'],
#     dropout_rate=best_params['dropout_rate']
# )

# # Load the model
# checkpoint = torch.load('path_to_save/model_checkpoint.pth')
# model.load_state_dict(checkpoint['model_state_dict'])

# # If continuing training or need optimizer state
# optimizer = torch.optim.Adam(model.parameters(), lr=best_params['lr'])
# optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# # Load mappings if saved
# node2idx = checkpoint['node2idx']
# idx2node = checkpoint['idx2node']
