In [None]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import GraphConv
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
from sklearn.exceptions import UndefinedMetricWarning, ConvergenceWarning
import warnings
import copy
import numpy as np
import pickle
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn import SAGEConv
from sklearn.model_selection import ParameterGrid
from torch.optim import Adam


In [None]:
class Model(torch.nn.Module):
    """Three-layer GraphSAGE network using mean aggregation.

    Layer widths are in_feats -> h_feats -> h_feats -> num_classes. The first
    two layers are followed by ReLU and dropout; batch normalisation is applied
    once, after the dropout of the first layer.

    Args:
        in_feats: width of the input node features.
        h_feats: width of the two hidden layers.
        num_classes: width of the output produced for every node.
        dropout_rate: probability handed to nn.Dropout (default 0.1).
    """

    def __init__(self, in_feats, h_feats, num_classes, dropout_rate=0.1):
        super().__init__()
        aggregator = 'mean'
        self.conv1 = SAGEConv(in_feats, h_feats, aggregator_type=aggregator)
        self.conv2 = SAGEConv(h_feats, h_feats, aggregator_type=aggregator)
        self.conv3 = SAGEConv(h_feats, num_classes, aggregator_type=aggregator)
        # One dropout module is shared by both hidden layers.
        self.dropout = nn.Dropout(dropout_rate)
        # Batch norm is sized for the hidden width and used after layer 1 only.
        self.batchnorm = nn.BatchNorm1d(h_feats)

    def forward(self, graph, x):
        """Return the output of the last SAGEConv layer for `graph` and node features `x`."""
        # Layer 1: conv -> ReLU -> dropout -> batch norm.
        hidden = self.batchnorm(self.dropout(F.relu(self.conv1(graph, x))))
        # Layer 2: conv -> ReLU -> dropout.
        hidden = self.dropout(F.relu(self.conv2(graph, hidden)))
        # Layer 3: plain conv, no activation.
        return self.conv3(graph, hidden)

In [None]:
# Load the object pickled in 'G_deepwalk.gpickle' into G_deepwalk.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('G_deepwalk.gpickle', 'rb') as f:
    G_deepwalk = pickle.load(f)

In [None]:
# Load the object pickled in 'G_deepwalk_reorder.gpickle' into G_deepwalk_reorder.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('G_deepwalk_reorder.gpickle', 'rb') as f:
    G_deepwalk_reorder = pickle.load(f)

In [None]:
# Build a dict pairing the i-th node of G_deepwalk with the i-th node of
# G_deepwalk_reorder, in the iteration order of the two `.nodes` collections
# (zip stops at the shorter one).
# NOTE(review): `mapping` is not referenced again in the visible part of this notebook.
mapping = dict(zip(G_deepwalk.nodes, G_deepwalk_reorder.nodes))

In [None]:
# Read G_dgl_with_twitter_features_converted.pkl.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('G_dgl_with_twitter_features_converted.pkl', 'rb') as f:
    G_dgl_with_twitter_features_converted = pickle.load(f)

In [None]:
# List the node-data field names available on G_dgl_with_twitter_features_converted
# right after loading.
print(G_dgl_with_twitter_features_converted.ndata.keys())

In [None]:
# Read G_dgl.pkl; its 'normalized_log_features' field is copied into the
# twitter-feature graph further down.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('G_dgl.pkl', 'rb') as f:
    G_dgl = pickle.load(f)

In [None]:
# List the node-data field names available on G_dgl.
print(G_dgl.ndata.keys())

In [None]:
# Copy the 'normalized_log_features' node field from G_dgl into
# G_dgl_with_twitter_features_converted.
# NOTE(review): assumes both graphs have the same number of nodes in the same
# order, so row i describes the same node in both — TODO confirm.
G_dgl_with_twitter_features_converted.ndata['normalized_log_features'] = G_dgl.ndata['normalized_log_features']

In [None]:
# List the node-data field names again; 'normalized_log_features' was just assigned.
print(G_dgl_with_twitter_features_converted.ndata.keys())

In [None]:
# Write the graph back to G_dgl_with_twitter_features_converted.pkl.
# NOTE: this overwrites the very file the graph was loaded from, so the
# on-disk input changes every time this cell runs.
with open('G_dgl_with_twitter_features_converted.pkl', 'wb') as f:
    pickle.dump(G_dgl_with_twitter_features_converted, f)

In [None]:
# List the node-data field names after the write-back (the in-memory graph is unchanged by the dump).
print(G_dgl_with_twitter_features_converted.ndata.keys())

In [None]:
# Add two node fields, each a column-wise concatenation (dim 1):
#   original_combined_features   = normalized_log_features + twitter_features
#   normalized_combined_features = normalized_log_features + normalized_twitter_features
# Note that both use the *normalized* log features; "original" refers only to
# the twitter part.
G_dgl_with_twitter_features_converted.ndata['original_combined_features'] = torch.cat((G_dgl_with_twitter_features_converted.ndata['normalized_log_features'], G_dgl_with_twitter_features_converted.ndata['twitter_features']), 1)
G_dgl_with_twitter_features_converted.ndata['normalized_combined_features'] = torch.cat((G_dgl_with_twitter_features_converted.ndata['normalized_log_features'], G_dgl_with_twitter_features_converted.ndata['normalized_twitter_features']), 1)

In [None]:
# Write the graph, now carrying the two combined-feature fields, back to
# G_dgl_with_twitter_features_converted.pkl (overwrites the file in place).
with open('G_dgl_with_twitter_features_converted.pkl', 'wb') as f:
    pickle.dump(G_dgl_with_twitter_features_converted, f)

In [None]:
# Fetch the raw twitter_features node field; the PCA cells below take it as input.
# (No PCA happens in this cell.)
twitter_features = G_dgl_with_twitter_features_converted.ndata['twitter_features']

# print the shape of twitter_features
print(twitter_features.shape)

In [None]:
# Project original_combined_features to a 128-dimensional vector with a linear layer.
# NOTE(review): the layer is freshly (randomly) initialised and is not trained
# anywhere in the visible part of this notebook, so this is a random projection
# whose values differ between kernel runs unless a seed is set.
original_combined_features = G_dgl_with_twitter_features_converted.ndata['original_combined_features']
map_original_combined_features = nn.Linear(original_combined_features.shape[1], 128)

# Run the projection without autograd so the stored node field is a plain
# tensor. Without this, the field would require grad and keep a reference to
# the layer's computation graph while sitting in graph.ndata.
with torch.no_grad():
    G_dgl_with_twitter_features_converted.ndata['map_original_combined_features'] = map_original_combined_features(original_combined_features)

In [None]:
# Reduce twitter_features to 128 dimensions with PCA.
# NOTE(review): PCA is imported here rather than in the import cell at the top,
# and the same import is repeated in the PCA cells below.
from sklearn.decomposition import PCA
pca = PCA(n_components=128)

# fit the model with twitter_features
pca.fit(twitter_features)

# transform twitter_features (same data the model was fitted on)
pca_twitter_features = pca.transform(twitter_features)

In [None]:
# Show the normalized_log_features row of node 0 as an example.
print(G_dgl_with_twitter_features_converted.ndata['normalized_log_features'][0])

In [None]:
# Show the 128-dimensional PCA projection of row 0 as an example.
print(pca_twitter_features[0])

In [None]:
# Reduce twitter_features to 64 dimensions with PCA.
# `pca` is rebound here, so the 128-component model fitted earlier is no longer reachable by name.
from sklearn.decomposition import PCA
pca = PCA(n_components=64)

# fit the model with twitter_features
pca.fit(twitter_features)

# transform twitter_features (same data the model was fitted on)
pca_64_twitter_features = pca.transform(twitter_features)

In [None]:
# Show the 64-dimensional PCA projection of row 0 as an example.
print(pca_64_twitter_features[0])

In [None]:
# Reduce twitter_features to 32 dimensions with PCA (`pca` is rebound again).
from sklearn.decomposition import PCA
pca = PCA(n_components=32)

# fit the model with twitter_features
pca.fit(twitter_features)

# transform twitter_features (same data the model was fitted on)
pca_32_twitter_features = pca.transform(twitter_features)

In [None]:
# Normalize pca_32_twitter_features with sklearn's `normalize` using its default
# arguments (per the scikit-learn docs: L2 norm applied to each row).
from sklearn.preprocessing import normalize
normalized_pca_32_twitter_features = normalize(pca_32_twitter_features)

In [None]:
# Concatenate normalized_log_features with the normalized 32-dim PCA features
# column-wise and store the result as node field
# 'combine_normalized_pca_32_twitter_features'.
G_dgl_with_twitter_features_converted.ndata['combine_normalized_pca_32_twitter_features'] = torch.cat((G_dgl_with_twitter_features_converted.ndata['normalized_log_features'], torch.from_numpy(normalized_pca_32_twitter_features)), 1)

In [None]:
# Show the (un-normalized) 32-dimensional PCA projection of row 0 as an example.
print(pca_32_twitter_features[0])

In [None]:
# Reduce twitter_features to 16 dimensions with PCA (`pca` is rebound again).
from sklearn.decomposition import PCA
pca = PCA(n_components=16)

# fit the model with twitter_features
pca.fit(twitter_features)

# transform twitter_features (same data the model was fitted on)
pca_16_twitter_features = pca.transform(twitter_features)

# Add node field 'pca_16_normalized_twitter_features': normalized_log_features
# concatenated with the PCA output. The PCA part is NOT passed through
# `normalize` here, unlike the 'combine_normalized_pca_*' fields.
G_dgl_with_twitter_features_converted.ndata['pca_16_normalized_twitter_features'] = torch.cat((G_dgl_with_twitter_features_converted.ndata['normalized_log_features'], torch.tensor(pca_16_twitter_features)), 1)

In [None]:
# Show the 16-dimensional PCA projection of row 0 (the raw PCA output, not the
# concatenated 'pca_16_normalized_twitter_features' node field).
print(pca_16_twitter_features[0])

In [None]:
# Reduce twitter_features to 8 dimensions with PCA (`pca` is rebound again).
from sklearn.decomposition import PCA
pca = PCA(n_components=8)

# fit the model with twitter_features
pca.fit(twitter_features)

# transform twitter_features (same data the model was fitted on)
pca_8_twitter_features = pca.transform(twitter_features)


# Add node field 'pca_8_normalized_twitter_features': normalized_log_features
# concatenated with the PCA output (the PCA part is not passed through `normalize`).
G_dgl_with_twitter_features_converted.ndata['pca_8_normalized_twitter_features'] = torch.cat((G_dgl_with_twitter_features_converted.ndata['normalized_log_features'], torch.tensor(pca_8_twitter_features)), 1)

In [None]:
# Normalize pca_8_twitter_features with sklearn's `normalize` (default arguments).
from sklearn.preprocessing import normalize
normalized_pca_8_twitter_features = normalize(pca_8_twitter_features)

# Concatenate normalized_log_features with the normalized 8-dim PCA features and
# store the result as 'combine_normalized_pca_8_twitter_features'. This is the
# node field the training cells below feed to the model.
G_dgl_with_twitter_features_converted.ndata['combine_normalized_pca_8_twitter_features'] = torch.cat((G_dgl_with_twitter_features_converted.ndata['normalized_log_features'], torch.from_numpy(normalized_pca_8_twitter_features)), 1)

In [None]:
# Store the graph, including every node field added so far, back to
# G_dgl_with_twitter_features_converted.pkl (overwrites the file in place).
with open('G_dgl_with_twitter_features_converted.pkl', 'wb') as f:
    pickle.dump(G_dgl_with_twitter_features_converted, f)

In [None]:
# Show row 0 of the 'pca_8_normalized_twitter_features' node field as an example.
print(G_dgl_with_twitter_features_converted.ndata['pca_8_normalized_twitter_features'][0])

In [None]:
# Use PCA to convert twitter_features to a 4-dimensional vector, then normalize it.
from sklearn.decomposition import PCA
from sklearn.preprocessing import normalize
pca = PCA(n_components=4)

# Re-read twitter_features from the graph (same field as used by the earlier
# PCA cells), then fit the model with it.
twitter_features = G_dgl_with_twitter_features_converted.ndata['twitter_features']
pca.fit(twitter_features)

# transform twitter_features (same data the model was fitted on)
pca_4_twitter_features = pca.transform(twitter_features)

# normalize pca_4_twitter_features (sklearn `normalize`, default arguments)
normalized_pca_4_twitter_features = normalize(pca_4_twitter_features)

# Concatenate normalized_log_features with the normalized 4-dim PCA features and
# store the result as 'combine_normalized_pca_4_twitter_features'.
G_dgl_with_twitter_features_converted.ndata['combine_normalized_pca_4_twitter_features'] = torch.cat((G_dgl_with_twitter_features_converted.ndata['normalized_log_features'], torch.from_numpy(normalized_pca_4_twitter_features)), 1)

In [None]:
# Show the first ten rows of normalized_pca_4_twitter_features.
print(normalized_pca_4_twitter_features[0:10])

In [None]:
# Add node field 'pca_32_normalized_twitter_features': normalized_log_features
# concatenated with the un-normalized 32-dim PCA output computed in an earlier cell.
G_dgl_with_twitter_features_converted.ndata['pca_32_normalized_twitter_features'] = torch.cat((G_dgl_with_twitter_features_converted.ndata['normalized_log_features'], torch.tensor(pca_32_twitter_features)), 1)

In [None]:
# Add node field 'pca_normalized_log_features': normalized_log_features
# concatenated with pca_twitter_features (the 128-component PCA output computed
# in an earlier cell).
G_dgl_with_twitter_features_converted.ndata['pca_normalized_log_features'] = torch.cat((G_dgl_with_twitter_features_converted.ndata['normalized_log_features'], torch.tensor(pca_twitter_features)), 1)

In [None]:
# List all node-data field names now present on the graph.
print(G_dgl_with_twitter_features_converted.ndata.keys())

In [None]:
# Show the shape of the 'pca_normalized_log_features' node field.
print(G_dgl_with_twitter_features_converted.ndata['pca_normalized_log_features'].shape)

In [None]:
# Read the 'features' node field. (Nothing is combined in this cell; it only
# reads the field and reports its shape.)
features = G_dgl_with_twitter_features_converted.ndata['features']

# print the shape of features
print(features.shape)

In [None]:
# Bind the 'normalized_twitter_features' node field to a variable (nothing is printed here).
normalized_twitter_features = G_dgl_with_twitter_features_converted.ndata['normalized_twitter_features']

In [None]:
# Read G_dgl.pkl again, replacing the G_dgl object loaded earlier in this notebook.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('G_dgl.pkl', 'rb') as f:
    G_dgl = pickle.load(f)

In [None]:
# List all node-data field names of the freshly reloaded G_dgl.
print(G_dgl.ndata.keys())

In [None]:
# Write the graph, including the PCA-derived node fields, back to
# G_dgl_with_twitter_features_converted.pkl (overwrites the file in place).
with open('G_dgl_with_twitter_features_converted.pkl', 'wb') as f:
    pickle.dump(G_dgl_with_twitter_features_converted, f)

In [None]:
# Print the shape of the 'combined_features' node field.
# NOTE(review): no visible cell creates a field with exactly this name (only
# 'original_combined_features' and 'normalized_combined_features'); presumably it
# already exists in the loaded pickle — verify, otherwise this lookup fails.
print(G_dgl_with_twitter_features_converted.ndata['combined_features'].shape)

In [None]:
# Model for the combined structural + twitter node features:
# in_feats=16, h_feats=128, num_classes=128 (output embedding width), dropout_rate=0.1.
# NOTE(review): in_feats=16 must equal the width of the feature field fed to the
# model ('combine_normalized_pca_8_twitter_features' in the training cells) — TODO confirm.
model = Model(16, 128, 128, 0.1)

In [None]:
import torch
# Adam optimizer (lr=0.01) for training on positive and negative edges; the
# validation edges are used later to decide when to stop.
# Only the parameters of `model` are registered here; any module created later
# is not updated by this optimizer unless it is added explicitly.
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Binary cross-entropy on raw logits; it expects un-sigmoided scores and float
# 0/1 targets of the same shape.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Training / early-stopping state used by the training loop below.
best_val_loss = float('inf')  # lowest validation loss seen so far
best_model = None             # deep copy of the model at that point
num_epochs = 200              # upper bound on training epochs
patience = 0                  # used by the loops as the count of consecutive non-improving epochs
early_stopping_counter = 0    # NOTE(review): not referenced anywhere in the visible notebook

In [None]:
# store all edge_indices in separate files
# Load the train / validation / test edge-index sets (positive and negative)
# from their separate pickle files. Nothing is stored in this cell.
# NOTE(review): `pickle` is already imported at the top; `pkl` is a second alias
# for the same module. pickle.load can execute arbitrary code; only open trusted files.
import pickle as pkl
with open('positive_train_edge_indices.pkl', 'rb') as f:
    positive_train_edge_indices = pkl.load(f)

with open('negative_train_edge_indices.pkl', 'rb') as f:
    negative_train_edge_indices = pkl.load(f)

with open('positive_validation_edge_indices.pkl', 'rb') as f:
    positive_validation_edge_indices = pkl.load(f)

with open('negative_validation_edge_indices.pkl', 'rb') as f:
    negative_validation_edge_indices = pkl.load(f)

with open('positive_test_edge_indices.pkl', 'rb') as f:
    positive_test_edge_indices = pkl.load(f)

with open('negative_test_edge_indices.pkl', 'rb') as f:
    negative_test_edge_indices = pkl.load(f)

In [None]:
def generate_edge_embeddings(h, edges):
    """Build one embedding per edge by joining its two endpoint embeddings.

    Args:
        h: 2-D tensor of node embeddings, indexed by node id along dim 0.
        edges: indexable whose item 0 holds the source node ids and item 1 the
            destination node ids.

    Returns:
        Tensor whose row i is h[source_i] followed by h[destination_i]
        (concatenated along dim 1).
    """
    # Look up the embeddings of the sources (slot 0) and destinations (slot 1).
    endpoint_embeddings = [h[edges[slot]] for slot in (0, 1)]
    return torch.cat(endpoint_embeddings, dim=1)

In [None]:
# Replace the 'map_original_combined_features' node field with a detached copy
# so it no longer carries autograd history from the linear layer that produced
# it. (Nothing is printed here.)
G_dgl_with_twitter_features_converted.ndata['map_original_combined_features'] = G_dgl_with_twitter_features_converted.ndata['map_original_combined_features'].detach()

In [None]:
import copy

# Edge-scoring head: takes the concatenated (source, destination) node
# embeddings built by generate_edge_embeddings and returns one logit per edge.
# The input width 256 corresponds to two 128-wide node embeddings.
transform = nn.Sequential(
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Linear(128, 1)
)

# The optimizer has to cover the scoring head as well as the GNN: an optimizer
# built from model.parameters() alone never updates `transform`, which would
# then stay at its random initial weights for the whole run.
optimizer = torch.optim.Adam(
    list(model.parameters()) + list(transform.parameters()), lr=0.01
)

# Early-stopping state, reset here so that re-running this cell starts clean.
best_val_loss = float('inf')
best_model = None
best_transform_state = None
patience = 0          # consecutive epochs without validation improvement
max_patience = 20

# Loop-invariant model input.
node_features = G_dgl_with_twitter_features_converted.ndata['combine_normalized_pca_8_twitter_features'].float()

for epoch in range(num_epochs):
    model.train()

    # forward pass
    logits = model(G_dgl_with_twitter_features_converted, node_features)

    # generate edge embeddings
    pos_train_edge_embs = generate_edge_embeddings(logits, positive_train_edge_indices)
    neg_train_edge_embs = generate_edge_embeddings(logits, negative_train_edge_indices)

    # concatenate positive and negative edge embeddings; labels are 1 for
    # positive edges and 0 for negative edges, shaped (N, 1) to match the logits
    train_edge_embs = torch.cat([pos_train_edge_embs, neg_train_edge_embs], dim=0)
    train_edge_labels = torch.cat([torch.ones(pos_train_edge_embs.shape[0]), torch.zeros(neg_train_edge_embs.shape[0])], dim=0).unsqueeze(1)

    # calculate loss and update both the GNN and the scoring head
    loss = criterion(transform(train_edge_embs), train_edge_labels)
    print(f"Training Loss: {loss.item()}")
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # validation
    model.eval()

    with torch.no_grad():
        # repeat the same process as above for validation samples
        logits = model(G_dgl_with_twitter_features_converted, node_features)
        pos_val_edge_embs = generate_edge_embeddings(logits, positive_validation_edge_indices)
        neg_val_edge_embs = generate_edge_embeddings(logits, negative_validation_edge_indices)
        val_edge_embs = torch.cat([pos_val_edge_embs, neg_val_edge_embs], dim=0)
        val_edge_labels = torch.cat([torch.ones(pos_val_edge_embs.shape[0]), torch.zeros(neg_val_edge_embs.shape[0])], dim=0).unsqueeze(1)

        val_loss = criterion(transform(val_edge_embs), val_edge_labels)
        print(f"Validation Loss: {val_loss.item()}")

        # early stopping based on validation loss
        if val_loss.item() < best_val_loss:
            best_val_loss = val_loss.item()
            patience = 0
            # Checkpoint the GNN *and* the scoring head: the head keeps
            # changing in later epochs, so it must be saved with the model.
            best_model = copy.deepcopy(model)
            best_transform_state = copy.deepcopy(transform.state_dict())
        else:
            patience += 1
            if patience >= max_patience:
                print('early stopping due to validation loss not improving')
                break

# Put the head back into the state that belongs to best_model, so that later
# cells evaluating `best_model` together with `transform` use a matching pair.
if best_transform_state is not None:
    transform.load_state_dict(best_transform_state)

In [None]:
# Evaluate the checkpointed model on the held-out test edges.
# (These metrics are already imported in the first cell; the import is repeated here.)
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score

# switch to evaluation mode
best_model.eval()

with torch.no_grad():
    # generate the node embeddings using the best model
    logits = best_model(G_dgl_with_twitter_features_converted, G_dgl_with_twitter_features_converted.ndata['combine_normalized_pca_8_twitter_features'].float())

    # generate edge embeddings for the test samples
    pos_test_edge_embs = generate_edge_embeddings(logits, positive_test_edge_indices)
    neg_test_edge_embs = generate_edge_embeddings(logits, negative_test_edge_indices)

    # concatenate the positive and negative edge embeddings and labels
    # (labels stay 1-D here: 1 for positive edges, 0 for negative edges)
    test_edge_embs = torch.cat([pos_test_edge_embs, neg_test_edge_embs], dim=0)
    test_edge_labels = torch.cat([torch.ones(pos_test_edge_embs.shape[0]), torch.zeros(neg_test_edge_embs.shape[0])], dim=0)


    # test_loss = criterion(linear(test_edge_embs), val_edge_labels)
    # Score each edge with the `transform` head and squash to a probability.
    # `transform` is the module as the training cell left it; it is a separate
    # object from `best_model`.

    predictions = torch.sigmoid(transform(test_edge_embs))

    # flatten the predictions and convert both arrays to numpy for sklearn
    predictions = predictions.view(-1).cpu().numpy()
    test_edge_labels = test_edge_labels.cpu().numpy()

    # calculate the scores (no test loss is computed in this cell)
    # NOTE(review): the decision threshold 0.42 is hard-coded, while the 10-run
    # cell below uses 0.5 — confirm how 0.42 was chosen; it should not be tuned
    # on the test set.
    auc = roc_auc_score(test_edge_labels, predictions)
    predictions_binary = (predictions > 0.42).astype(int)
    f1 = f1_score(test_edge_labels, predictions_binary)
    precision = precision_score(test_edge_labels, predictions_binary)
    recall = recall_score(test_edge_labels, predictions_binary)

print(f"AUC: {auc}")
print(f"F1 Score: {f1}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
# print(f"Test Loss: {test_loss.item()}")

In [None]:
# Train and evaluate the model 10 times from scratch and append each run's test
# metrics to a text file.
# NOTE(review): pandas, `column_names` and `results` are not used by the code in
# this cell; they are kept in case later cells rely on them.
import pandas as pd
# Define the path to the result file
result_file_path = 'results_with_twitter.txt'

column_names = ['Run', 'AUC', 'F1', 'Precision', 'Recall']
results = ""

best_f1 = 0
best_auc = 0
best_val_loss = float('inf')
patience = 0
max_patience = 10  # stop a run after this many consecutive non-improving epochs

# Loop-invariant model input.
node_features = G_dgl_with_twitter_features_converted.ndata['combine_normalized_pca_8_twitter_features'].float()

for i in range(10):
    model = Model(16, 128, 128, 0.1)
    # Linear scoring head over the concatenated (source, destination) embeddings.
    linear = nn.Linear(256, 1)
    # Optimise the GNN and the scoring head together. An optimizer built from
    # model.parameters() alone would leave `linear` at its random initial weights.
    optimizer = torch.optim.Adam(
        list(model.parameters()) + list(linear.parameters()), lr=0.01
    )
    criterion = nn.BCEWithLogitsLoss()

    # Per-run early-stopping state; nothing carries over from the previous run.
    best_val_loss = float('inf')
    best_model = None
    best_linear_state = None
    best_f1 = 0
    best_auc = 0
    patience = 0

    for epoch in range(num_epochs):
        model.train()

        # forward pass
        logits = model(G_dgl_with_twitter_features_converted, node_features)

        # generate edge embeddings
        pos_train_edge_embs = generate_edge_embeddings(logits, positive_train_edge_indices)
        neg_train_edge_embs = generate_edge_embeddings(logits, negative_train_edge_indices)

        # concatenate positive and negative edge embeddings; labels are shaped
        # (N, 1) to match the logits
        train_edge_embs = torch.cat([pos_train_edge_embs, neg_train_edge_embs], dim=0)
        train_edge_labels = torch.cat([torch.ones(pos_train_edge_embs.shape[0]), torch.zeros(neg_train_edge_embs.shape[0])], dim=0).unsqueeze(1)

        # calculate loss and update both the GNN and the scoring head
        loss = criterion(linear(train_edge_embs), train_edge_labels)
        print(f"Training Loss: {loss.item()}")
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # validation
        model.eval()

        with torch.no_grad():
            # repeat the same process as above for validation samples
            logits = model(G_dgl_with_twitter_features_converted, node_features)
            pos_val_edge_embs = generate_edge_embeddings(logits, positive_validation_edge_indices)
            neg_val_edge_embs = generate_edge_embeddings(logits, negative_validation_edge_indices)
            val_edge_embs = torch.cat([pos_val_edge_embs, neg_val_edge_embs], dim=0)
            val_edge_labels = torch.cat([torch.ones(pos_val_edge_embs.shape[0]), torch.zeros(neg_val_edge_embs.shape[0])], dim=0).unsqueeze(1)

            val_logits = linear(val_edge_embs)
            val_loss = criterion(val_logits, val_edge_labels)
            print(f"Validation Loss: {val_loss.item()}")

            # Check the validation performance
            if val_loss.item() <= best_val_loss:
                best_val_loss = val_loss.item()
                # Checkpoint the GNN and the head together; the head keeps
                # changing in later epochs.
                best_model = copy.deepcopy(model)
                best_linear_state = copy.deepcopy(linear.state_dict())
                patience = 0

                # Record the validation AUC / F1 of the checkpointed epoch so
                # the early-stopping message reports real numbers.
                val_predictions = torch.sigmoid(val_logits).view(-1).cpu().numpy()
                val_targets = val_edge_labels.view(-1).cpu().numpy()
                val_predictions_binary = (val_predictions > 0.5).astype(int)
                best_auc = roc_auc_score(val_targets, val_predictions)
                best_f1 = f1_score(val_targets, val_predictions_binary)
            else:
                patience += 1
                if patience >= max_patience:  # early stopping
                    print(f'Early stopping at epoch {epoch}. Best F1: {best_f1}, best AUC: {best_auc}, best Validation Loss: {best_val_loss}.')
                    break

    # Restore the head that belongs to best_model before testing.
    if best_linear_state is not None:
        linear.load_state_dict(best_linear_state)

    # switch to evaluation mode
    best_model.eval()

    with torch.no_grad():
        # generate the embeddings using the best model
        logits = best_model(G_dgl_with_twitter_features_converted, node_features)

        # generate edge embeddings for the test samples
        pos_test_edge_embs = generate_edge_embeddings(logits, positive_test_edge_indices)
        neg_test_edge_embs = generate_edge_embeddings(logits, negative_test_edge_indices)

        # concatenate the positive and negative edge embeddings and labels
        test_edge_embs = torch.cat([pos_test_edge_embs, neg_test_edge_embs], dim=0)
        test_edge_labels = torch.cat([torch.ones(pos_test_edge_embs.shape[0]), torch.zeros(neg_test_edge_embs.shape[0])], dim=0)

        # calculate predictions using the linear layer
        predictions = torch.sigmoid(linear(test_edge_embs))

        # flatten the predictions and convert both arrays to numpy for sklearn
        predictions = predictions.view(-1).cpu().numpy()
        test_edge_labels = test_edge_labels.cpu().numpy()

        # calculate scores (decision threshold 0.5)
        auc = roc_auc_score(test_edge_labels, predictions)
        predictions_binary = (predictions > 0.5).astype(int)
        f1 = f1_score(test_edge_labels, predictions_binary)
        precision = precision_score(test_edge_labels, predictions_binary)
        recall = recall_score(test_edge_labels, predictions_binary)

    print(f"AUC: {auc}")
    print(f"F1 Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

    # Append this run's metrics; the file is opened in append mode, so results
    # from earlier executions of this cell are kept.
    with open(result_file_path, 'a') as f:
        # first write the parameters
        f.write(f"Parameters: 8-dimension normalized twitter features concatenating structural features\n")
        f.write(f"Run: {i + 1}\n")
        f.write(f"AUC: {auc}\n")
        f.write(f"F1 Score: {f1}\n")
        f.write(f"Precision: {precision}\n")
        f.write(f"Recall: {recall}\n\n")
    
