In [2]:
from py2neo import Graph
import pandas as pd
import networkx as nx
import random
import torch
from torch_geometric.utils import from_networkx
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, average_precision_score
from node2vec import Node2Vec
import optuna
import datetime

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


In [3]:
nodes_df = pd.read_csv('/home/vidur/mediagraph/data/Data_2_Years/Entities.csv')
edges_df = pd.read_csv('/home/vidur/mediagraph/data/Data_2_Years/Relations.csv')

In [3]:
# Create an undirected NetworkX graph
G = nx.Graph()

# Add nodes with attributes
for index, row in nodes_df.iterrows():
    G.add_node(row['name'], type=row['type'])

# Add edges with attributes
for index, row in edges_df.iterrows():
    # Assume 'dates' is a string like "05-2020,09-2021"
    G.add_edge(row['source'], row['target'], weight=row['weight'], dates=row['dates'])


In [6]:
train_start = datetime.datetime.strptime("01-2020", "%m-%Y")
train_end = datetime.datetime.strptime("07-2021", "%m-%Y")
test_start = datetime.datetime.strptime("08-2021", "%m-%Y")
test_end = datetime.datetime.strptime("12-2021", "%m-%Y")

def parse_dates(dates_str):
    """
    Parses a string of dates separated by commas into a list of datetime objects.
    Example input: "05-2020,09-2021"
    """
    date_list = dates_str.split(',')
    date_objs = []
    for date_str in date_list:
        try:
            date_obj = datetime.datetime.strptime(date_str.strip(), "%m-%Y")
            date_objs.append(date_obj)
        except ValueError:
            pass  # Handle or log invalid date formats if necessary
    return date_objs

# Lists to hold training and testing edges
train_edges = []
test_edges = []

for index, row in edges_df.iterrows():
    edge_dates = parse_dates(row['dates'])
    in_train = False
    in_test = False
    for date in edge_dates:
        if train_start <= date <= train_end:
            in_train = True
        elif test_start <= date <= test_end:
            in_test = True
    if in_train:
        # Include edge in training set
        train_edges.append((row['source'], row['target'], {'weight': row['weight'], 'dates': row['dates']}))
    elif in_test and not in_train:
        # Include edge in testing set only if not in training
        test_edges.append((row['source'], row['target'], {'weight': row['weight'], 'dates': row['dates']}))


In [7]:
G_train = nx.Graph()
G_train.add_nodes_from(G.nodes(data=True))
G_train.add_edges_from(train_edges)

In [8]:
def generate_negative_edges(G, num_edges):
    """
    Generates a list of negative edges (node pairs without an edge) from graph G.
    """
    non_edges = list(nx.non_edges(G))
    random.shuffle(non_edges)
    return non_edges[:num_edges]

# Generate negative edges for training
num_train_edges = len(train_edges)
negative_train_edges = generate_negative_edges(G_train, num_train_edges)

# Generate negative edges for testing
num_test_edges = len(test_edges)
negative_test_edges = generate_negative_edges(G_train, num_test_edges)


In [9]:
# Create a list of all nodes (from the original graph)
all_nodes = list(G.nodes())

# Create mappings from node names to indices and vice versa
node2idx = {node: idx for idx, node in enumerate(all_nodes)}
idx2node = {idx: node for node, idx in node2idx.items()}


In [10]:
def edges_to_indices(edge_list, mapping):
    """
    Converts a list of edge tuples with node names to tuples with node indices.
    """
    return [(mapping[u], mapping[v]) for u, v in edge_list]


In [11]:
# Training edges
train_edge_list = [(u, v) for u, v, _ in train_edges]
train_edge_indices = edges_to_indices(train_edge_list, node2idx)
negative_train_edge_list = negative_train_edges  # List of (u, v) tuples
negative_train_edge_indices = edges_to_indices(negative_train_edge_list, node2idx)

# Convert to PyTorch tensors
train_edge_index = torch.tensor(train_edge_indices, dtype=torch.long).t().contiguous()
negative_train_edge_index = torch.tensor(negative_train_edge_indices, dtype=torch.long).t().contiguous()

# Testing edges
test_edge_list = [(u, v) for u, v, _ in test_edges]
test_edge_indices = edges_to_indices(test_edge_list, node2idx)
negative_test_edge_list = negative_test_edges  # List of (u, v) tuples
negative_test_edge_indices = edges_to_indices(negative_test_edge_list, node2idx)

# Convert to PyTorch tensors
test_pos_edge_index = torch.tensor(test_edge_indices, dtype=torch.long).t().contiguous()
test_neg_edge_index = torch.tensor(negative_test_edge_indices, dtype=torch.long).t().contiguous()


In [16]:
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Option 1: One-Hot Encoding (Simple)
# Initialize node features using one-hot encoding
# num_nodes = len(all_nodes)
# node_features = torch.eye(num_nodes)

# Option 2: Node2Vec Embeddings (Recommended for better performance)
# Uncomment the following lines if you prefer using Node2Vec embeddings

try:
    logger.info("Training Node2Vec model...")
    node2vec = Node2Vec(G_train, dimensions=64, walk_length=30, num_walks=200, workers=4, seed=42)
    node2vec_model = node2vec.fit(window=10, min_count=1, batch_words=4)
    logger.info("Node2Vec training completed.")

    num_nodes = len(all_nodes)
    embedding_matrix = torch.zeros((num_nodes, 64))
    missing_nodes = []

    for node, idx in node2idx.items():
        if node in node2vec_model.wv:
            embedding_matrix[idx] = torch.tensor(node2vec_model.wv[node], dtype=torch.float)
        else:
            # Assign a default vector (e.g., zeros) if the node is missing
            embedding_matrix[idx] = torch.zeros(64)
            missing_nodes.append(node)

    node_features = embedding_matrix

    if missing_nodes:
        logger.warning(f"{len(missing_nodes)} nodes are missing embeddings and have been assigned zero vectors.")
        # Optionally, you can print or log the missing nodes
        logger.warning(f"Missing nodes: {missing_nodes}")

except Exception as e:
    logger.error(f"An error occurred during Node2Vec embedding generation: {e}")
    logger.info("Falling back to one-hot encoding for node features.")

    # If Node2Vec fails, use one-hot encoding
    node_features = torch.eye(len(all_nodes))



INFO:__main__:Training Node2Vec model...
Computing transition probabilities: 100%|██████████| 22079/22079 [02:33<00:00, 144.29it/s]
Generating walks (CPU: 1): 100%|██████████| 50/50 [12:39<00:00, 15.19s/it]
Generating walks (CPU: 2): 100%|██████████| 50/50 [12:41<00:00, 15.24s/it]
Generating walks (CPU: 3): 100%|██████████| 50/50 [12:42<00:00, 15.26s/it]
Generating walks (CPU: 4): 100%|██████████| 50/50 [12:44<00:00, 15.30s/it]
INFO:gensim.models.word2vec:collecting all words and their counts
INFO:gensim.models.word2vec:PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #10000, processed 255601 words, keeping 19866 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #20000, processed 510738 words, keeping 21751 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #30000, processed 766049 words, keeping 22079 word types
INFO:gensim.models.word2vec:PROGRESS: at sentence #40000, processed 1020722 words, keep

In [8]:
data = Data()

# Assign node features
data.x = node_features

# Assign edge indices (training edges only)
data.edge_index = train_edge_index

NameError: name 'node_features' is not defined

In [18]:
edge_label_index = torch.cat([train_edge_index, negative_train_edge_index], dim=1)
edge_label = torch.cat([torch.ones(train_edge_index.size(1)), torch.zeros(negative_train_edge_index.size(1))])

# Testing labels
test_edge_label_index = torch.cat([test_pos_edge_index, test_neg_edge_index], dim=1)
test_edge_label = torch.cat([torch.ones(test_pos_edge_index.size(1)), torch.zeros(test_neg_edge_index.size(1))])


In [6]:
class LinkPredictor(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, num_layers, dropout_rate):
        super(LinkPredictor, self).__init__()
        self.convs = torch.nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        for _ in range(num_layers - 1):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        self.dropout = torch.nn.Dropout(dropout_rate)
    
    def encode(self, x, edge_index):
        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = self.dropout(x)
        return x
    
    def decode(self, z, edge_label_index):
        # Using dot product for link prediction
        z_i = z[edge_label_index[0]]
        z_j = z[edge_label_index[1]]
        return torch.sigmoid((z_i * z_j).sum(dim=1))
    
    def forward(self, x, edge_index, edge_label_index):
        z = self.encode(x, edge_index)
        return self.decode(z, edge_label_index)


In [7]:
# Define the objective function for Optuna
def objective(trial):
    # Suggest hyperparameters
    hidden_channels = trial.suggest_categorical('hidden_channels', [16, 32, 64, 128, 256])
    num_layers = trial.suggest_int('num_layers', 1, 3, 5)
    dropout_rate = trial.suggest_uniform('dropout_rate', 0.0, 0.5)
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
    weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)

    # Initialize the model
    model = LinkPredictor(
        in_channels=data.num_features,
        hidden_channels=hidden_channels,
        num_layers=num_layers,
        dropout_rate=dropout_rate
    )
    
    # Define optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = torch.nn.BCELoss()
    
    # Training loop
    model.train()
    for epoch in range(1, 101):  # Using 50 epochs for tuning
        optimizer.zero_grad()
        out = model(data.x, data.edge_index, edge_label_index)
        loss = criterion(out, edge_label)
        loss.backward()
        optimizer.step()
    
    # Validation on test set
    model.eval()
    with torch.no_grad():
        out = model(data.x, data.edge_index, test_edge_label_index)
        loss = criterion(out, test_edge_label)
    
    return loss.item()

# Create an Optuna study and optimize
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)  # Adjust n_trials as needed

print("Best hyperparameters: ", study.best_params)


[I 2024-12-08 00:36:55,309] A new study created in memory with name: no-name-7b2b3eab-ff4f-4009-88fd-e0aa4068f8ed
  num_layers = trial.suggest_int('num_layers', 1, 3, 5)
  dropout_rate = trial.suggest_uniform('dropout_rate', 0.0, 0.5)
  lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
  weight_decay = trial.suggest_loguniform('weight_decay', 1e-6, 1e-2)
[W 2024-12-08 00:36:55,313] Trial 0 failed with parameters: {'hidden_channels': 16, 'num_layers': 1, 'dropout_rate': 0.08523084192314684, 'lr': 0.007654617356295958, 'weight_decay': 7.459636145764032e-05} because of the following error: NameError("name 'data' is not defined").
Traceback (most recent call last):
  File "/home/vidur/mediagraph/.venv/lib/python3.10/site-packages/optuna/study/_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_3605706/3520098750.py", line 12, in objective
    in_channels=data.num_features,
NameError: name 'data' is not defined. Did you mean: 'Data'?
[W 2024-12-08 0

NameError: name 'data' is not defined

In [35]:
# Extract best hyperparameters
best_params = study.best_params
hidden_channels = best_params['hidden_channels']
num_layers = best_params['num_layers']
dropout_rate = best_params['dropout_rate']
lr = best_params['lr']
weight_decay = best_params['weight_decay']

# Initialize the model with best hyperparameters
model = LinkPredictor(
    in_channels=data.num_features,
    hidden_channels=hidden_channels,
    num_layers=num_layers,
    dropout_rate=dropout_rate
)

# Define optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
criterion = torch.nn.BCELoss()

# Training loop
num_epochs = 10000  # Adjust as needed
for epoch in range(1, num_epochs + 1):
    model.train()
    optimizer.zero_grad()
    out = model(data.x, data.edge_index, edge_label_index)
    loss = criterion(out, edge_label)
    loss.backward()
    optimizer.step()
    
    if epoch % 20 == 0 or epoch == 1:
        print(f'Epoch {epoch}/{num_epochs}, Loss: {loss.item():.4f}')

Epoch 1/10000, Loss: 0.6961
Epoch 20/10000, Loss: 0.6849
Epoch 40/10000, Loss: 0.6530
Epoch 60/10000, Loss: 0.6435
Epoch 80/10000, Loss: 0.6255
Epoch 100/10000, Loss: 0.6248
Epoch 120/10000, Loss: 0.6179
Epoch 140/10000, Loss: 0.6156
Epoch 160/10000, Loss: 0.6068
Epoch 180/10000, Loss: 0.6105
Epoch 200/10000, Loss: 0.6103
Epoch 220/10000, Loss: 0.6129
Epoch 240/10000, Loss: 0.6090
Epoch 260/10000, Loss: 0.6043
Epoch 280/10000, Loss: 0.6053
Epoch 300/10000, Loss: 0.6018
Epoch 320/10000, Loss: 0.6031
Epoch 340/10000, Loss: 0.6073
Epoch 360/10000, Loss: 0.6032
Epoch 380/10000, Loss: 0.6063
Epoch 400/10000, Loss: 0.6049
Epoch 420/10000, Loss: 0.6014
Epoch 440/10000, Loss: 0.6025
Epoch 460/10000, Loss: 0.6013
Epoch 480/10000, Loss: 0.6026
Epoch 500/10000, Loss: 0.6035
Epoch 520/10000, Loss: 0.6101
Epoch 540/10000, Loss: 0.6021
Epoch 560/10000, Loss: 0.6039
Epoch 580/10000, Loss: 0.5974
Epoch 600/10000, Loss: 0.5983
Epoch 620/10000, Loss: 0.6048
Epoch 640/10000, Loss: 0.6043
Epoch 660/10000,

In [4]:
# Load the model (optional, shown for completeness)
checkpoint = torch.load('model_checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
best_params = checkpoint['best_params']
node2idx = checkpoint['node2idx']
idx2node = checkpoint['idx2node']

model.eval()
with torch.no_grad():
    out = model(data.x, data.edge_index, test_edge_label_index)
    out = out.cpu().numpy()
    test_labels = test_edge_label.numpy()
    auc = roc_auc_score(test_labels, out)
    ap = average_precision_score(test_labels, out)
    print(f'\nEvaluation on Test Set:')
    print(f'AUC-ROC: {auc:.4f}')
    print(f'Average Precision: {ap:.4f}')

  checkpoint = torch.load('model_checkpoint.pth')


NameError: name 'model' is not defined

In [24]:
def predict_new_links(new_nodes, G_train, node2idx, idx2node, data, model, threshold=0.5):
    """
    Adds new nodes to the graph and predicts potential links for them.
    
    Parameters:
    - new_nodes: List of new node names to add.
    - G_train: Training graph.
    - node2idx: Mapping from node names to indices.
    - idx2node: Mapping from indices to node names.
    - data: PyTorch Geometric data object.
    - model: Trained LinkPredictor model.
    - threshold: Probability threshold for predicting links.
    
    Returns:
    - List of predicted new edges as tuples of node names.
    """
    model.eval()
    
    # Add new nodes to mappings and node features
    for node in new_nodes:
        if node not in node2idx:
            new_idx = len(node2idx)
            node2idx[node] = new_idx
            idx2node[new_idx] = node
            # Extend node features (e.g., zero vectors)
            new_feature = torch.zeros(1, data.x.size(1))
            data.x = torch.cat([data.x, new_feature], dim=0)
    
    # Generate potential edges involving new nodes
    potential_edges = []
    for node in new_nodes:
        new_idx = node2idx[node]
        for existing_node, existing_idx in node2idx.items():
            if existing_node != node:
                potential_edges.append((new_idx, existing_idx))
    
    # Convert potential edges to tensor indices
    potential_edge_index = torch.tensor(potential_edges, dtype=torch.long).t().contiguous()
    
    # Predict links
    with torch.no_grad():
        out = model(data.x, data.edge_index, potential_edge_index)
        out = out.cpu().numpy()
        predicted_edges = potential_edge_index[:, out > threshold]
    
    # Convert indices back to node names
    predicted_edges_named = [(idx2node[u], idx2node[v]) for u, v in predicted_edges.t().tolist()]
    
    return predicted_edges_named

# Example: Predicting links for new nodes
new_nodes = ['Vidur Kaushik', 'New Entity 2']  # Replace with your new nodes
predicted_new_edges = predict_new_links(new_nodes, G_train, node2idx, idx2node, data, model, threshold=0.5)
print("\nPredicted New Edges:")
for edge in predicted_new_edges:
    print(edge)



Predicted New Edges:
