In [None]:
import dgl
import torch
import torch.nn.functional as F
from dgl.nn.pytorch import GATConv
from sklearn.metrics import roc_auc_score
import numpy as np
import torch.backends.cudnn as cudnn
import random
import pickle as pkl

In [None]:
# Load the pickled train / validation / test DGL graphs from the working directory.
# NOTE(review): pickle.load runs arbitrary code from the file; only load files you produced yourself.
def _load_pickled_graph(path):
    """Unpickle and return the object stored at ``path``."""
    with open(path, 'rb') as f:
        return pkl.load(f)


G_train_dgl_twitter = _load_pickled_graph('G_train_dgl_twitter.gpickle')
G_val_dgl_twitter = _load_pickled_graph('G_val_dgl_twitter.gpickle')
G_test_dgl_twitter = _load_pickled_graph('G_test_dgl_twitter.gpickle')

In [None]:
# List the node-data fields stored on the training graph.
train_node_fields = G_train_dgl_twitter.ndata.keys()
print(train_node_fields)

In [None]:
# combine features and twitter_features => combined_features

# step 1: normalize features
# step 2: pca twitter features
# step 3: combine features and twitter features

In [None]:
# Step 1: standardize the raw node features of the training graph.
from sklearn.preprocessing import StandardScaler

# scikit-learn works on NumPy arrays, so pull the feature tensor out of the graph first.
features = G_train_dgl_twitter.ndata['features'].numpy()

# Fit the scaler on the training features and standardize them in the same call.
scaler = StandardScaler()
normalized_features = scaler.fit_transform(features)

# Keep the standardized copy on the graph next to the raw features.
G_train_dgl_twitter.ndata['normalized_features'] = torch.tensor(normalized_features)


In [None]:
# Show the first five rows of the raw and of the standardized training features.
for field_name in ('features', 'normalized_features'):
    print(G_train_dgl_twitter.ndata[field_name][0:5])

In [None]:
# Standardize the validation and test node features with the scaler that was
# fitted on the training graph.
#
# transform() (not fit_transform()) is used on purpose: re-fitting would scale
# every split with its own mean/variance, so the three graphs would no longer
# share one feature scale and held-out statistics would leak into preprocessing.
# This cell must run while `scaler` is still the one fitted on the training
# 'features'; a scaler fitted on a different feature width will be rejected by
# scikit-learn.
for split_graph in (G_val_dgl_twitter, G_test_dgl_twitter):
    # scikit-learn works on NumPy arrays, so convert the tensor explicitly.
    features = split_graph.ndata['features'].numpy()
    normalized_features = scaler.transform(features)
    split_graph.ndata['normalized_features'] = torch.tensor(normalized_features)

In [None]:
# Report the dimensions of the training graph's twitter feature matrix.
twitter_features_shape = G_train_dgl_twitter.ndata['twitter_features'].shape
print(twitter_features_shape)

In [None]:
# Report how many nodes the training graph contains.
train_node_count = G_train_dgl_twitter.number_of_nodes()
print(train_node_count)

In [None]:
# Step 2: reduce the training graph's twitter features to 8 principal components.
from sklearn.decomposition import PCA

pca = PCA(n_components=8)

# Move the tensor to the CPU and convert it, since scikit-learn operates on NumPy arrays.
twitter_features = G_train_dgl_twitter.ndata['twitter_features']
twitter_features_numpy = twitter_features.cpu().numpy()

# Fit the PCA on the training features and project them in one call.
twitter_features_pca = pca.fit_transform(twitter_features_numpy)
print(twitter_features_pca.shape)

# Store the projected features on the graph as a tensor.
twitter_features_pca_tensor = torch.tensor(twitter_features_pca)
G_train_dgl_twitter.ndata['pca_8_twitter_features'] = twitter_features_pca_tensor


In [None]:
# Show the first ten rows of the PCA-reduced twitter features.
pca_head = G_train_dgl_twitter.ndata['pca_8_twitter_features'][0:10]
print(pca_head)

In [None]:
# Count the distinct rows among the PCA-reduced twitter features.
distinct_pca_rows = np.unique(G_train_dgl_twitter.ndata['pca_8_twitter_features'], axis=0)
print(len(distinct_pca_rows))

In [None]:
import torch

# Deduplicate the rows of the PCA-reduced twitter features with torch.unique.
tensor = G_train_dgl_twitter.ndata['pca_8_twitter_features']
tensor_unique = torch.unique(tensor, dim=0)

# Number of distinct rows.
print(tensor_unique.shape[0])

print("Some examples of unique rows:")

# Show at most the first 100 distinct rows.
for unique_row in tensor_unique[:100]:
    print(unique_row)


In [None]:
# Report the total number of rows in the PCA-reduced twitter features.
pca_row_count = len(G_train_dgl_twitter.ndata['pca_8_twitter_features'])
print(pca_row_count)

In [None]:
# Report the number of distinct rows in the PCA-reduced twitter features.
unique_pca_rows = np.unique(G_train_dgl_twitter.ndata['pca_8_twitter_features'], axis=0)
print(len(unique_pca_rows))

In [None]:
# Bind the training graph's PCA-reduced twitter features to `tensor` for inspection
tensor = G_train_dgl_twitter.ndata['pca_8_twitter_features']

In [None]:
# Show the first ten distinct rows of the PCA-reduced twitter features.
unique_rows = np.unique(G_train_dgl_twitter.ndata['pca_8_twitter_features'], axis=0)
print(unique_rows[0:10])

In [None]:
# Show up to the first 1000 distinct scalar values of the PCA output
# (without an axis argument np.unique works on the flattened array).
unique_pca_values = np.unique(twitter_features_pca)
print(unique_pca_values[0:1000])

In [None]:
# Show the first five rows of the PCA output array.
pca_array_head = twitter_features_pca[0:5]
print(pca_array_head)

In [None]:
# Sanity check: does the training graph carry any non-zero twitter feature at all?
features = G_train_dgl_twitter.ndata['twitter_features']

# torch.any is truthy as soon as a single entry is non-zero.
has_nonzero_entry = torch.any(features)
print("There are some non-zero features." if has_nonzero_entry else "All features are zero.")


In [None]:
# Show rows 10 to 99 of the PCA-reduced twitter features.
pca_slice = G_train_dgl_twitter.ndata['pca_8_twitter_features'][10:100]
print(pca_slice)

In [None]:
# Project the validation and test twitter features onto the principal
# components learned from the training graph.
#
# transform() (not fit_transform()) is required here: re-fitting the PCA per
# split would produce a different set of components for every graph, so column k
# of 'pca_8_twitter_features' would mean something different in train, val and
# test and a model trained on one could not be applied to the others.
for split_graph in (G_val_dgl_twitter, G_test_dgl_twitter):
    # Same tensor -> NumPy conversion as used when the PCA was fitted.
    twitter_features = split_graph.ndata['twitter_features'].cpu().numpy()
    twitter_features_pca = pca.transform(twitter_features)
    split_graph.ndata['pca_8_twitter_features'] = torch.tensor(twitter_features_pca)

In [None]:
# Step 3: build the model input for the training graph by concatenating the raw
# node features with the PCA-reduced twitter features, then standardizing the result.
features = G_train_dgl_twitter.ndata['features']
twitter_features = G_train_dgl_twitter.ndata['pca_8_twitter_features']

# Column-wise concatenation, converted to NumPy for scikit-learn.
combined_features = torch.cat((features, twitter_features), dim=1).numpy()

# A new scaler is fitted on the combined training features; it rebinds `scaler`.
scaler = StandardScaler()
normalized_combined_features = scaler.fit_transform(combined_features)

# Store the standardized combined features on the graph.
G_train_dgl_twitter.ndata['combined_features'] = torch.tensor(normalized_combined_features)

In [None]:
# Show the first five rows of the training graph's combined features.
train_combined_head = G_train_dgl_twitter.ndata['combined_features'][0:5]
print(train_combined_head)

In [None]:
# Build the combined features for the validation graph.
features = G_val_dgl_twitter.ndata['features']
twitter_features = G_val_dgl_twitter.ndata['pca_8_twitter_features']

# Concatenate column-wise and convert to NumPy for scikit-learn.
combined_features = torch.cat((features, twitter_features), dim=1).numpy()

# Standardize with the scaler fitted on the TRAINING combined features.
# transform() is used instead of fit_transform() so that validation data is put
# on the same scale the model is trained on and validation statistics do not
# influence preprocessing.
normalized_combined_features = scaler.transform(combined_features)

# Add the normalized features to the graph
G_val_dgl_twitter.ndata['combined_features'] = torch.tensor(normalized_combined_features)

In [None]:
# Build the combined features for the test graph.
features = G_test_dgl_twitter.ndata['features']
twitter_features = G_test_dgl_twitter.ndata['pca_8_twitter_features']

# Concatenate column-wise and convert to NumPy for scikit-learn.
combined_features = torch.cat((features, twitter_features), dim=1).numpy()

# Standardize with the scaler fitted on the TRAINING combined features.
# transform() is used instead of fit_transform() so that test data is put on the
# same scale the model is trained on and test statistics do not influence
# preprocessing.
normalized_combined_features = scaler.transform(combined_features)

# Add the normalized features to the graph
G_test_dgl_twitter.ndata['combined_features'] = torch.tensor(normalized_combined_features)

In [None]:
# Show the first five combined-feature rows of the train, validation and test graphs, in that order.
for split_graph in (G_train_dgl_twitter, G_val_dgl_twitter, G_test_dgl_twitter):
    print(split_graph.ndata['combined_features'][0:5])

In [None]:
# Write the three graphs, now carrying the extra node features, back to their pickle files.
for graph_path, split_graph in (
    ('G_train_dgl_twitter.gpickle', G_train_dgl_twitter),
    ('G_val_dgl_twitter.gpickle', G_val_dgl_twitter),
    ('G_test_dgl_twitter.gpickle', G_test_dgl_twitter),
):
    with open(graph_path, 'wb') as f:
        pkl.dump(split_graph, f)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
import copy
import random
# Draw a fresh integer seed between 0 and 1000, print it so the run can be
# reproduced, and apply it to Python's, NumPy's and PyTorch's generators.
seed = random.randint(0, 1000)
print(seed)
for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
    apply_seed(seed)

class GATModel(torch.nn.Module):
    """Three-layer graph attention network built from DGL ``GATConv`` layers.

    The first two layers keep every attention head by flattening the heads into
    one vector per node; the last layer averages over its heads instead.
    (Relies on GATConv's documented output layout of nodes x heads x features.)

    Args:
        in_dim: size of the input node features.
        hidden_dim: per-head output size of the first two layers.
        out_dim: per-head output size of the last layer.
        num_heads: number of attention heads used by every layer.
        dropout_rate: dropout probability applied after layers one and two.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, num_heads, dropout_rate=0.1):
        super().__init__()
        # Width of a hidden representation once all heads are flattened together.
        flattened_dim = hidden_dim * num_heads
        self.conv1 = GATConv(in_dim, hidden_dim, num_heads=num_heads)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.conv2 = GATConv(flattened_dim, hidden_dim, num_heads=num_heads)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.conv3 = GATConv(flattened_dim, out_dim, num_heads=num_heads)

    def forward(self, g, h):
        """Run the three attention layers on graph ``g`` with node features ``h``."""
        hidden = self.conv1(g, h).flatten(1)
        # Dropout is applied before the ELU non-linearity.
        hidden = F.elu(self.dropout1(hidden))
        hidden = self.conv2(g, hidden).flatten(1)
        hidden = F.elu(self.dropout2(hidden))
        # Average over dimension 1 (the heads) instead of flattening.
        return self.conv3(g, hidden).mean(1)


# Get the number of input features
in_feats = G_train_dgl_twitter.ndata['combined_features'].shape[1]

# Define the model hyperparameters
hidden_size = 128
out_feats = 2  # Assuming binary classification
num_heads = 3

# Create the GAT model
# NOTE(review): DGL's GATConv rejects graphs with zero-in-degree nodes by default
# and the notebook's self-loop cell comes after this one - confirm the loaded
# graphs already contain self-loops.
model = GATModel(in_feats, hidden_size, out_feats, num_heads)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()
best_val_loss = float('inf')
best_model = None
num_epochs = 200
patience = 20  # epochs without validation improvement tolerated before stopping
epochs_without_improvement = 0

# The per-class node indices never change between epochs, so compute them once.
train_labels = G_train_dgl_twitter.ndata['label'].squeeze()
zero_indices = torch.where(train_labels == 0)[0]
one_indices = torch.where(train_labels == 1)[0]

# Get the minimum count between 0 and 1 labels
min_count = min(zero_indices.shape[0], one_indices.shape[0])

# Validation always uses the full validation graph, so its inputs are fixed too.
val_features = G_val_dgl_twitter.ndata['combined_features'].float()
val_targets = F.one_hot(G_val_dgl_twitter.ndata['label'].squeeze(), num_classes=out_feats).float()

# Training loop
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # Randomly select 'min_count' indices from each class, then shuffle them
    selected_zero_indices = zero_indices[torch.randperm(zero_indices.shape[0])[:min_count]]
    selected_one_indices = one_indices[torch.randperm(one_indices.shape[0])[:min_count]]
    selected_indices = torch.cat((selected_zero_indices, selected_one_indices))
    selected_indices = selected_indices[torch.randperm(selected_indices.shape[0])]

    # Train on the subgraph induced by this epoch's class-balanced node sample
    subgraph = dgl.node_subgraph(G_train_dgl_twitter, selected_indices)
    selected_features = subgraph.ndata['combined_features']
    selected_labels = subgraph.ndata['label'].squeeze()

    # Forward pass and compute the loss, convert selected_features to float
    logits = model(subgraph, selected_features.float())
    labels = F.one_hot(selected_labels, num_classes=out_feats).float()
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()

    model.eval()
    with torch.no_grad():
        val_logits = model(G_val_dgl_twitter, val_features)
        val_loss = criterion(val_logits, val_targets)

    # Keep the checkpoint with the lowest validation loss seen so far
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_model = copy.deepcopy(model)
        torch.save(model.state_dict(), 'best_model.pt')  # Save the best model
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1

    print(f"Epoch: {epoch + 1}/{num_epochs}, Loss: {loss:.4f}, Validation Loss: {val_loss:.4f}")

    # Early stopping: `patience` was declared but never enforced before.
    if epochs_without_improvement >= patience:
        print(f"Early stopping after epoch {epoch + 1}: no validation improvement in {patience} epochs")
        break

best_model.eval()
with torch.no_grad():
    # Build a class-balanced evaluation subset of the test graph.
    labels = G_test_dgl_twitter.ndata['label'].squeeze()
    zero_indices = torch.where(labels == 0)[0]
    one_indices = torch.where(labels == 1)[0]

    # Both classes are cut down to the size of the smaller one.
    min_count = min(len(zero_indices), len(one_indices))
    selected_zero_indices = zero_indices[torch.randperm(len(zero_indices))[:min_count]]
    selected_one_indices = one_indices[torch.randperm(len(one_indices))[:min_count]]

    # Merge the two samples and shuffle their order.
    selected_indices = torch.cat((selected_zero_indices, selected_one_indices))
    selected_indices = selected_indices[torch.randperm(len(selected_indices))]

    # Evaluate on the subgraph induced by the sampled nodes.
    subgraph = dgl.node_subgraph(G_test_dgl_twitter, selected_indices)
    selected_features = subgraph.ndata['combined_features']
    ground_truth = subgraph.ndata['label'].squeeze()

    # Predicted class = position of the larger logit; score = softmax probability of class 1.
    logits = best_model(subgraph, selected_features.float())
    predicted_labels = torch.max(logits, 1).indices
    predicted_probs = F.softmax(logits, dim=1)[:, 1]

    # Convert once to NumPy for the scikit-learn metric functions.
    y_true = ground_truth.detach().numpy()
    y_pred = predicted_labels.detach().numpy()
    y_score = predicted_probs.detach().numpy()

    auc = roc_auc_score(y_true, y_score)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    print(f"AUC: {auc:.4f}, F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}")



In [None]:
# define the model
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score
import copy
import random
# Draw a fresh integer seed between 0 and 1000, print it so the run can be
# reproduced, and apply it to Python's, NumPy's and PyTorch's generators.
seed = random.randint(0, 1000)
print(seed)
for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
    apply_seed(seed)

class GATModel(torch.nn.Module):
    """Three-layer graph attention network built from DGL ``GATConv`` layers.

    The first two layers keep every attention head by flattening the heads into
    one vector per node; the last layer averages over its heads instead.
    (Relies on GATConv's documented output layout of nodes x heads x features.)

    Args:
        in_dim: size of the input node features.
        hidden_dim: per-head output size of the first two layers.
        out_dim: per-head output size of the last layer.
        num_heads: number of attention heads used by every layer.
        dropout_rate: dropout probability applied after layers one and two.
    """

    def __init__(self, in_dim, hidden_dim, out_dim, num_heads, dropout_rate=0.1):
        super().__init__()
        # Width of a hidden representation once all heads are flattened together.
        flattened_dim = hidden_dim * num_heads
        self.conv1 = GATConv(in_dim, hidden_dim, num_heads=num_heads)
        self.dropout1 = nn.Dropout(dropout_rate)
        self.conv2 = GATConv(flattened_dim, hidden_dim, num_heads=num_heads)
        self.dropout2 = nn.Dropout(dropout_rate)
        self.conv3 = GATConv(flattened_dim, out_dim, num_heads=num_heads)

    def forward(self, g, h):
        """Run the three attention layers on graph ``g`` with node features ``h``."""
        hidden = self.conv1(g, h).flatten(1)
        # Dropout is applied before the ELU non-linearity.
        hidden = F.elu(self.dropout1(hidden))
        hidden = self.conv2(g, hidden).flatten(1)
        hidden = F.elu(self.dropout2(hidden))
        # Average over dimension 1 (the heads) instead of flattening.
        return self.conv3(g, hidden).mean(1)


# Input width is the number of columns of the combined training features.
in_feats = G_train_dgl_twitter.ndata['combined_features'].shape[1]

# Model hyperparameters: hidden size per head, number of output logits
# (binary classification is assumed), and number of attention heads.
hidden_size, out_feats, num_heads = 128, 2, 3


In [None]:
# Give every node in the train / val / test graphs exactly one self-loop.
#
# Existing self-loops are removed first so this cell is idempotent:
# dgl.add_self_loop adds one loop per node regardless of what is already there,
# and the graphs are pickled back to the same files the notebook loads from, so
# without the removal every re-run would stack another duplicate self-loop.
G_train_dgl_twitter = dgl.add_self_loop(dgl.remove_self_loop(G_train_dgl_twitter))
G_val_dgl_twitter = dgl.add_self_loop(dgl.remove_self_loop(G_val_dgl_twitter))
G_test_dgl_twitter = dgl.add_self_loop(dgl.remove_self_loop(G_test_dgl_twitter))

In [None]:
# Write the three graphs back to their pickle files.
for graph_path, split_graph in (
    ('G_train_dgl_twitter.gpickle', G_train_dgl_twitter),
    ('G_val_dgl_twitter.gpickle', G_val_dgl_twitter),
    ('G_test_dgl_twitter.gpickle', G_test_dgl_twitter),
):
    with open(graph_path, 'wb') as f:
        pkl.dump(split_graph, f)

In [None]:
# import multiple metrics
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
import copy
import random

In [None]:
def _balanced_subgraph(graph):
    """Return a class-balanced, node-induced subgraph of ``graph``.

    Samples (without replacement) as many label-0 and label-1 nodes as the
    smaller class contains, shuffles the selection, and returns the subgraph
    induced by those nodes. Reads ``graph.ndata['label']``.
    """
    labels = graph.ndata['label'].squeeze()
    zero_indices = torch.where(labels == 0)[0]
    one_indices = torch.where(labels == 1)[0]
    min_count = min(zero_indices.shape[0], one_indices.shape[0])

    selected_zero = zero_indices[torch.randperm(zero_indices.shape[0])[:min_count]]
    selected_one = one_indices[torch.randperm(one_indices.shape[0])[:min_count]]
    selected = torch.cat((selected_zero, selected_one))
    selected = selected[torch.randperm(selected.shape[0])]
    return dgl.node_subgraph(graph, selected)


def _forward_on(net, graph):
    """Run ``net`` on ``graph``; return (logits, one-hot float targets, integer labels)."""
    node_labels = graph.ndata['label'].squeeze()
    logits = net(graph, graph.ndata['combined_features'].float())
    targets = F.one_hot(node_labels, num_classes=out_feats).float()
    return logits, targets, node_labels


# Model configuration. The input width is read from the data instead of being
# hard-coded, so the cell keeps working if the feature pipeline changes.
in_feats = G_train_dgl_twitter.ndata['combined_features'].shape[1]
hidden_size = 128
out_feats = 2  # two logits, matched against one-hot encoded binary labels
num_heads = 3

# Probability of class 1 above which a node is predicted positive.
DECISION_THRESHOLD = 0.4

for i in range(10):
    # Set the random seed, a randomly selected number
    seed = random.randint(0, 1000)
    print(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Create the GAT model
    model = GATModel(in_feats, hidden_size, out_feats, num_heads)

    # Define the optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCEWithLogitsLoss()
    best_val_loss = float('inf')
    best_train_loss = float('nan')
    best_epoch = None
    best_model = None
    num_epochs = 200
    patience = 20  # epochs without validation improvement tolerated before stopping
    epochs_without_improvement = 0

    # Training loop
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()

        # One optimization step on a freshly sampled, class-balanced training subgraph.
        train_logits, train_targets, _ = _forward_on(model, _balanced_subgraph(G_train_dgl_twitter))
        loss = criterion(train_logits, train_targets)
        loss.backward()
        optimizer.step()
        train_loss = loss.item()

        # Validation loss on a freshly sampled, class-balanced validation subgraph.
        model.eval()
        with torch.no_grad():
            val_logits, val_targets, _ = _forward_on(model, _balanced_subgraph(G_val_dgl_twitter))
            val_loss = criterion(val_logits, val_targets).item()

        # Keep the checkpoint with the lowest validation loss seen so far.
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_train_loss = train_loss
            best_epoch = epoch + 1
            best_model = copy.deepcopy(model)
            torch.save(model.state_dict(), 'best_model.pt')
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
        print(f"Epoch: {epoch + 1}/{num_epochs}, Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}")

        # Early stopping: `patience` was declared but never enforced before.
        if epochs_without_improvement >= patience:
            print(f"Early stopping after epoch {epoch + 1}: no validation improvement in {patience} epochs")
            break

    if best_model is None:
        # Only possible when no epoch produced a comparable (non-NaN) validation loss.
        raise RuntimeError(f"No valid checkpoint was produced for seed {seed}")

    best_model.eval()
    with torch.no_grad():
        # Evaluate the best checkpoint on a class-balanced test subgraph.
        test_logits, _, ground_truth = _forward_on(best_model, _balanced_subgraph(G_test_dgl_twitter))

        # Score = softmax probability of class 1; label = score above the threshold.
        predicted_probs = F.softmax(test_logits, dim=1)[:, 1]
        predicted_labels = (predicted_probs > DECISION_THRESHOLD).long()

        # Convert once to NumPy for the scikit-learn metric functions.
        y_true = ground_truth.numpy()
        y_pred = predicted_labels.numpy()
        y_score = predicted_probs.numpy()

        auc = roc_auc_score(y_true, y_score)
        f1 = f1_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        accuracy = accuracy_score(y_true, y_pred)
        macro_f1 = f1_score(y_true, y_pred, average='macro')
        macro_precision = precision_score(y_true, y_pred, average='macro')
        macro_recall = recall_score(y_true, y_pred, average='macro')

        # Append this run to the results file. The epoch and both losses describe
        # the selected (best) checkpoint, i.e. the model the test metrics belong to.
        with open("GAT_with_results.txt", "a") as f:
            f.write(f"Random seed: {seed}, Epoch: {best_epoch}/{num_epochs}, Loss: {best_train_loss:.4f}, Validation Loss: {best_val_loss:.4f}, AUC: {auc:.4f}, F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, Macro-F1: {macro_f1:.4f}, Macro-Precision: {macro_precision:.4f}, Macro-recall: {macro_recall:.4f}\n")
        print(f"AUC: {auc:.4f}, F1: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, Accuracy: {accuracy:.4f}, Macro-F1: {macro_f1:.4f}, Macro-Precision: {macro_precision:.4f}, Macro-recall: {macro_recall:.4f}\n")

