In [None]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import GraphConv
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.exceptions import UndefinedMetricWarning, ConvergenceWarning
import warnings
import copy
import numpy as np
import pickle
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import SAGEConv
from sklearn.model_selection import ParameterGrid
from torch.optim import Adam



In [None]:
class Model(torch.nn.Module):
    """Three-layer GraphSAGE encoder (mean aggregation) producing per-node outputs.

    Layers: SAGEConv(in_feats -> h_feats), SAGEConv(h_feats -> h_feats),
    SAGEConv(h_feats -> num_classes). Dropout follows each of the first two
    ReLU activations; batch normalization is applied once, after the first
    dropout.
    """

    def __init__(self, in_feats, h_feats, num_classes, dropout_rate=0.1):
        """Build the layers.

        Args:
            in_feats: size of each input node feature vector.
            h_feats: hidden size shared by the first two layers.
            num_classes: size of each output node vector.
            dropout_rate: probability passed to ``nn.Dropout``.
        """
        super().__init__()
        # Registration order is kept stable so state_dict keys and the
        # random-initialisation sequence do not change.
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type='mean')
        self.conv2 = SAGEConv(h_feats, h_feats, aggregator_type='mean')
        self.conv3 = SAGEConv(h_feats, num_classes, aggregator_type='mean')
        self.dropout = nn.Dropout(dropout_rate)
        self.batchnorm = nn.BatchNorm1d(h_feats)

    def forward(self, graph, x):
        """Run the three graph convolutions over ``graph`` with node inputs ``x``.

        Returns the output of the last convolution (no activation applied).
        """
        # Block 1: conv -> ReLU -> dropout -> batch norm.
        hidden = self.dropout(F.relu(self.conv1(graph, x)))
        hidden = self.batchnorm(hidden)
        # Block 2: conv -> ReLU -> dropout.
        hidden = self.dropout(F.relu(self.conv2(graph, hidden)))
        # Output layer: raw values, left unactivated.
        return self.conv3(graph, hidden)

In [None]:
# Load the pickled graph object used throughout the notebook as G_dgl.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('G_dgl.pkl', 'rb') as graph_file:
    G_dgl = pickle.load(graph_file)

In [None]:
# Load the pickled graph stored in G_dgl_with_twitter_features_converted.pkl.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('G_dgl_with_twitter_features_converted.pkl', 'rb') as graph_file:
    G_dgl_with_twitter_features_converted = pickle.load(graph_file)

In [None]:
# Preview the first five rows of the 'features' node data stored on
# G_dgl_with_twitter_features_converted.
print(G_dgl_with_twitter_features_converted.ndata['features'][0:5])

In [None]:
# Scale every column of G_dgl.ndata['features'] by that column's maximum and
# store the result as G_dgl.ndata['normalized_features'].
raw_features = G_dgl.ndata['features']

# number of feature columns
num_columns = raw_features.shape[1]
print(num_columns)

# Divide in floating point: writing the quotient back into an integer tensor
# would silently truncate it.
float_features = raw_features if raw_features.is_floating_point() else raw_features.float()

# Per-column maximum, computed in one vectorized reduction instead of a
# Python loop over columns.
column_max = float_features.max(dim=0).values

# A column whose maximum is exactly 0 would yield 0/0 = NaN; divide such
# columns by 1 so they stay unchanged.
safe_column_max = torch.where(column_max == 0, torch.ones_like(column_max), column_max)

# The division allocates a new tensor, so the raw features are left untouched.
normalized_features = float_features / safe_column_max

# assign the normalized features to the node data
G_dgl.ndata['normalized_features'] = normalized_features


In [None]:
# Compress the dynamic range of the raw features with log(1 + x).
log_transformed_features = G_dgl.ndata['features'].log1p()

# Keep the transformed copy on the graph next to the raw features.
G_dgl.ndata['log_transformed_features'] = log_transformed_features


In [None]:
# Print rows 200-209 of the raw 'features' node data on G_dgl.
print(G_dgl.ndata['features'][200:210])

In [None]:
# Print rows 200-209 of G_dgl.ndata['log_transformed_features'] (same rows as
# the raw-feature preview, for side-by-side comparison).
print(G_dgl.ndata['log_transformed_features'][200:210])

In [None]:
# Print the first 10 rows of G_dgl.ndata['normalized_features'].
print(G_dgl.ndata['normalized_features'][0:10])

In [None]:
# Scale each column of the log-transformed features by its column maximum and
# store the result as G_dgl.ndata['normalized_log_features'].
log_features = G_dgl.ndata['log_transformed_features']
max_value = log_features.max(dim=0)[0]

# A column whose maximum is exactly 0 would produce 0/0 = NaN, and NaN inputs
# make every downstream loss NaN; divide such columns by 1 instead.
safe_max_value = torch.where(max_value == 0, torch.ones_like(max_value), max_value)
G_dgl.ndata['normalized_log_features'] = log_features / safe_max_value

# print rows 200-209 of the normalized log features for checking
print(G_dgl.ndata['normalized_log_features'][200:210])


In [None]:
# Instantiate the GraphSAGE encoder.

# Structure-only configuration: 8 input features per node, 128 hidden units,
# 128-dimensional node outputs, dropout 0.1.
model = Model(in_feats=8, h_feats=128, num_classes=128, dropout_rate=0.1)

# Alternative for structure + twitter features (128 input features per node):
# model = Model(in_feats=128, h_feats=128, num_classes=128, dropout_rate=0.1)

In [None]:
import torch
# Adam over the encoder's parameters; training uses positive and negative
# edges, and the validation edges decide when to stop.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Binary cross-entropy on raw logits (the sigmoid is applied inside the loss).
criterion = torch.nn.BCEWithLogitsLoss()

In [None]:
# Training-loop settings and early-stopping bookkeeping.
num_epochs, patience = 200, 20
best_val_loss, best_model = float('inf'), None
early_stopping_counter = 0

In [None]:
import pickle as pkl


def _load_edge_indices(path):
    """Unpickle and return the object stored at ``path``.

    NOTE: pickle can execute arbitrary code; only load files you trust.
    """
    with open(path, 'rb') as handle:
        return pkl.load(handle)


# Positive / negative edge index sets for each split.
positive_train_edge_indices = _load_edge_indices('positive_train_edge_indices.pkl')
negative_train_edge_indices = _load_edge_indices('negative_train_edge_indices.pkl')

positive_validation_edge_indices = _load_edge_indices('positive_validation_edge_indices.pkl')
negative_validation_edge_indices = _load_edge_indices('negative_validation_edge_indices.pkl')

positive_test_edge_indices = _load_edge_indices('positive_test_edge_indices.pkl')
negative_test_edge_indices = _load_edge_indices('negative_test_edge_indices.pkl')

In [None]:
def generate_edge_embeddings(h, edges):
    """Build one embedding per edge by joining its two endpoint embeddings.

    Args:
        h: node embedding matrix, indexed by node id along the first axis.
        edges: indexable pair whose item 0 holds source node ids and item 1
            holds destination node ids.

    Returns:
        Tensor whose i-th row is ``h[src_i]`` followed by ``h[dst_i]``
        (concatenated along dim 1).
    """
    source_ids = edges[0]
    target_ids = edges[1]
    # Look up both endpoints and place them side by side, source first.
    return torch.cat((h[source_ids], h[target_ids]), dim=1)

In [None]:
# List the names of all node-data fields currently stored on G_dgl.
print(G_dgl.ndata.keys())

In [None]:
import copy

# Edge-scoring head: maps a concatenated (source, destination) pair of
# 128-dimensional node embeddings (256 values) to one logit per edge.
linear = nn.Linear(256, 1)

# The optimizer was built from model.parameters() only, so the head would
# otherwise stay at its random initialisation. Register its weights as an extra
# parameter group; it inherits lr and the other defaults from the optimizer.
optimizer.add_param_group({'params': list(linear.parameters())})

# Loop-invariant inputs; cast once so training and validation use the same dtype.
node_features = G_dgl.ndata['normalized_log_features'].float()

# Reset early-stopping state so re-running this cell starts from scratch.
# `patience` is read as the threshold and never overwritten here.
best_val_loss = float('inf')
best_model = None
best_linear_state = copy.deepcopy(linear.state_dict())
early_stopping_counter = 0

for epoch in range(num_epochs):
    model.train()

    # forward pass over the whole graph
    logits = model(G_dgl, node_features)

    # generate edge embeddings
    pos_train_edge_embs = generate_edge_embeddings(logits, positive_train_edge_indices)
    neg_train_edge_embs = generate_edge_embeddings(logits, negative_train_edge_indices)

    # concatenate positive and negative edge embeddings; label 1 = positive, 0 = negative
    train_edge_embs = torch.cat([pos_train_edge_embs, neg_train_edge_embs], dim=0)
    train_edge_labels = torch.cat(
        [torch.ones(pos_train_edge_embs.shape[0]), torch.zeros(neg_train_edge_embs.shape[0])],
        dim=0,
    ).unsqueeze(1)

    # calculate loss and update both the encoder and the scoring head
    loss = criterion(linear(train_edge_embs), train_edge_labels)
    print(f"Training Loss: {loss.item()}")
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # validation
    model.eval()

    with torch.no_grad():
        # repeat the same process as above for validation samples
        logits = model(G_dgl, node_features)
        pos_val_edge_embs = generate_edge_embeddings(logits, positive_validation_edge_indices)
        neg_val_edge_embs = generate_edge_embeddings(logits, negative_validation_edge_indices)
        val_edge_embs = torch.cat([pos_val_edge_embs, neg_val_edge_embs], dim=0)
        val_edge_labels = torch.cat(
            [torch.ones(pos_val_edge_embs.shape[0]), torch.zeros(neg_val_edge_embs.shape[0])],
            dim=0,
        ).unsqueeze(1)

        val_loss = criterion(linear(val_edge_embs), val_edge_labels)

    # Compare plain floats so best_val_loss does not hold on to a tensor.
    current_val_loss = val_loss.item()
    print(f"Validation Loss: {current_val_loss}")

    # early stopping based on validation loss
    if current_val_loss < best_val_loss:
        best_val_loss = current_val_loss
        early_stopping_counter = 0
        # Snapshot encoder and head together: they are only meaningful as a pair.
        best_model = copy.deepcopy(model)
        best_linear_state = copy.deepcopy(linear.state_dict())
    else:
        early_stopping_counter += 1
        # >= (not ==) so the stop still fires if the counter ever overshoots,
        # e.g. when the loss is NaN from the very first epoch.
        if early_stopping_counter >= patience:
            print('early stopping due to validation loss not improving')
            break

# Put the head back into the state that matches best_model, so that later
# cells using `best_model` with `linear` score edges consistently.
linear.load_state_dict(best_linear_state)

In [None]:
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

# Evaluate the best checkpoint on the held-out test edges.
# switch to evaluation mode
best_model.eval()

with torch.no_grad():
    # Embed the nodes with the best model, on the same graph and feature key
    # used for training. 'normalized_log_features' is only computed for G_dgl
    # in this notebook, and the encoder was built for G_dgl's input width.
    # NOTE(review): switch graph, feature key and model together if the
    # twitter-feature graph is the intended evaluation target.
    logits = best_model(G_dgl, G_dgl.ndata['normalized_log_features'].float())

    # generate edge embeddings for the test samples
    pos_test_edge_embs = generate_edge_embeddings(logits, positive_test_edge_indices)
    neg_test_edge_embs = generate_edge_embeddings(logits, negative_test_edge_indices)

    # concatenate the positive and negative edge embeddings and labels
    test_edge_embs = torch.cat([pos_test_edge_embs, neg_test_edge_embs], dim=0)
    test_edge_labels = torch.cat(
        [torch.ones(pos_test_edge_embs.shape[0]), torch.zeros(neg_test_edge_embs.shape[0])],
        dim=0,
    )

    # edge probabilities from the linear scoring head
    predictions = torch.sigmoid(linear(test_edge_embs))

# flatten to 1-D numpy arrays for scikit-learn
predictions = predictions.view(-1).cpu().numpy()
test_edge_labels = test_edge_labels.cpu().numpy()

# AUC uses the probabilities; the other metrics use a 0.5 decision threshold
auc = roc_auc_score(test_edge_labels, predictions)
predictions_binary = (predictions > 0.5).astype(int)
f1 = f1_score(test_edge_labels, predictions_binary)
precision = precision_score(test_edge_labels, predictions_binary)
recall = recall_score(test_edge_labels, predictions_binary)

print(f"AUC: {auc}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

In [None]:
# Train and evaluate the model five times from fresh initialisations, record
# the test metrics of every run, and write them to results.csv.
import pandas as pd

best_f1 = 0
best_auc = 0

# Number of consecutive non-improving validation epochs tolerated per run.
EARLY_STOPPING_PATIENCE = 10

# One dict per run. The DataFrame is built once at the end because
# DataFrame.append was removed in pandas 2.0 (and was quadratic in a loop).
run_records = []

# Loop-invariant inputs; cast once so training, validation and test all use
# the same dtype.
node_features = G_dgl.ndata['features'].float()

# run 5 times and document each run's performance
for i in range(5):
    model = Model(8, 128, 128, 0.1)
    # Edge-scoring head: concatenated (source, destination) embeddings -> one logit.
    linear = nn.Linear(256, 1)
    # Optimize the encoder AND the head; with model.parameters() alone the
    # head would stay at its random initialisation.
    optimizer = torch.optim.Adam(
        list(model.parameters()) + list(linear.parameters()), lr=0.01
    )
    criterion = nn.BCEWithLogitsLoss()

    # Per-run early-stopping state (reset for every run).
    best_val_loss = float('inf')
    best_model_wts = copy.deepcopy(model.state_dict())
    best_linear_wts = copy.deepcopy(linear.state_dict())
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        model.train()

        # forward pass
        logits = model(G_dgl, node_features)

        # generate edge embeddings
        pos_train_edge_embs = generate_edge_embeddings(logits, positive_train_edge_indices)
        neg_train_edge_embs = generate_edge_embeddings(logits, negative_train_edge_indices)

        # concatenate positive and negative edge embeddings; label 1 = positive, 0 = negative
        train_edge_embs = torch.cat([pos_train_edge_embs, neg_train_edge_embs], dim=0)
        train_edge_labels = torch.cat(
            [torch.ones(pos_train_edge_embs.shape[0]), torch.zeros(neg_train_edge_embs.shape[0])],
            dim=0,
        ).unsqueeze(1)

        # calculate loss
        loss = criterion(linear(train_edge_embs), train_edge_labels)
        print(f"Training Loss: {loss.item()}")
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # validation
        model.eval()

        with torch.no_grad():
            # repeat the same process as above for validation samples
            logits = model(G_dgl, node_features)
            pos_val_edge_embs = generate_edge_embeddings(logits, positive_validation_edge_indices)
            neg_val_edge_embs = generate_edge_embeddings(logits, negative_validation_edge_indices)
            val_edge_embs = torch.cat([pos_val_edge_embs, neg_val_edge_embs], dim=0)
            val_edge_labels = torch.cat(
                [torch.ones(pos_val_edge_embs.shape[0]), torch.zeros(neg_val_edge_embs.shape[0])],
                dim=0,
            ).unsqueeze(1)

            val_loss = criterion(linear(val_edge_embs), val_edge_labels)

        # Compare plain floats so best_val_loss does not hold on to a tensor.
        current_val_loss = val_loss.item()
        print(f"Validation Loss: {current_val_loss}")

        # early stopping based on validation loss
        if current_val_loss <= best_val_loss:
            best_val_loss = current_val_loss
            epochs_without_improvement = 0
            # Snapshot encoder and head together: they are only meaningful as a pair.
            best_model_wts = copy.deepcopy(model.state_dict())
            best_linear_wts = copy.deepcopy(linear.state_dict())
        else:
            epochs_without_improvement += 1
            # >= (not ==) so the stop still fires if the counter overshoots,
            # e.g. when the loss is NaN from the very first epoch.
            if epochs_without_improvement >= EARLY_STOPPING_PATIENCE:
                print('early stopping due to validation loss not improving')
                break

    # Load the best encoder weights and the matching head weights
    model.load_state_dict(best_model_wts)
    linear.load_state_dict(best_linear_wts)

    # switch to evaluation mode
    model.eval()

    with torch.no_grad():
        # generate the embeddings using the best model
        logits = model(G_dgl, node_features)

        # generate edge embeddings for the test samples
        pos_test_edge_embs = generate_edge_embeddings(logits, positive_test_edge_indices)
        neg_test_edge_embs = generate_edge_embeddings(logits, negative_test_edge_indices)

        # concatenate the positive and negative edge embeddings and labels
        test_edge_embs = torch.cat([pos_test_edge_embs, neg_test_edge_embs], dim=0)
        test_edge_labels = torch.cat(
            [torch.ones(pos_test_edge_embs.shape[0]), torch.zeros(neg_test_edge_embs.shape[0])],
            dim=0,
        )

        # edge probabilities from the linear scoring head
        predictions = torch.sigmoid(linear(test_edge_embs))

    # flatten to 1-D numpy arrays for scikit-learn
    predictions = predictions.view(-1).cpu().numpy()
    test_edge_labels = test_edge_labels.cpu().numpy()

    # AUC uses the probabilities; the other metrics use a 0.5 decision threshold
    auc = roc_auc_score(test_edge_labels, predictions)
    predictions_binary = (predictions > 0.5).astype(int)
    f1 = f1_score(test_edge_labels, predictions_binary)
    precision = precision_score(test_edge_labels, predictions_binary)
    recall = recall_score(test_edge_labels, predictions_binary)

    print(f"AUC: {auc}")
    print(f"F1 Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

    # record this run's metrics
    run_records.append({
        'Run': i + 1,
        'AUC': auc,
        'F1': f1,
        'Precision': precision,
        'Recall': recall
    })

    # save the best encoder weights for this run (the scoring head is not persisted)
    torch.save(best_model_wts, f"best_model_run_{i + 1}.pth")

# Build the results table once and save it to a CSV file
results = pd.DataFrame(run_records, columns=['Run', 'AUC', 'F1', 'Precision', 'Recall'])
results.to_csv('results.csv', index=False)
