In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q torch_geometric

In [None]:
import os
import random
import warnings

import networkx as nx
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.nn import Module
from torch_geometric.datasets import Planetoid
from torch_geometric.nn import GCNConv, BatchNorm
from torch_geometric.utils import from_networkx, negative_sampling

# Loading Dataset

In [None]:
# --- Graph structure: one "source,target" pair per line ----------------------
with open('/content/DANI.txt', 'r') as file:
    lines = file.readlines()

# Blank lines (e.g. a trailing newline at EOF) carry no edge and would break
# the two-column DataFrame construction, so they are skipped.
edges = [line.strip().split(',') for line in lines if line.strip()]
edges_df = pd.DataFrame(edges, columns=['source', 'target'])
edges_df = edges_df.astype(int)
G = nx.from_pandas_edgelist(edges_df, source='source', target='target')

num_nodes = int(edges_df.to_numpy().max()) + 1

# Node ids below the maximum that never occur in an edge are absent from G;
# add them explicitly so the per-node attribute assignments below cannot raise
# KeyError. Nodes that already exist keep their insertion order.
G.add_nodes_from(range(num_nodes))

# --- Node features: line j of cascades.txt fills feature column j ------------
with open('/content/cascades.txt', 'r') as file:
    lines = file.readlines()

num_features = len(lines)

# Allocate as float32 directly: the per-node vectors are stored as float32, so
# a float64 buffer would only double the memory of this dense matrix.
node_features = np.zeros((num_nodes, num_features), dtype=np.float32)
print("Shape of node_features:", node_features.shape)

for j, line in enumerate(lines):
    for entry in line.strip().split(';'):
        if not entry:
            # Produced by a trailing ';' or an empty line; nothing to record.
            continue
        node_id, value = entry.split(',')
        node_features[int(node_id), j] = float(value)

for node_id, features in enumerate(node_features):
    G.nodes[node_id]['x'] = features

# --- Node labels: line k of community.txt lists the members of community k ---
with open('/content/community.txt', 'r') as file:
    lines = file.readlines()

labels = [list(map(int, l.strip().split())) for l in lines]
print("Number of labels: ", len(labels))

# from_networkx requires every node to carry the same attribute keys, so each
# node starts with the sentinel label -1 ("listed in no community").
for node_id in G.nodes:
    G.nodes[node_id]['y'] = -1

# A node listed in several communities keeps the label of the last one.
for label, nodes in enumerate(labels):
    for node_id in nodes:
        G.nodes[node_id]['y'] = label

data = from_networkx(G)
device = torch.device('cpu')
data = data.to(device)
data

Shape of node_features: (1000, 20000)
Number of labels:  28


  data_dict[key] = torch.as_tensor(value)


Data(x=[1000, 20000], edge_index=[2, 15384], y=[1000])

# Feature Engineering for Logistic Regression

In [None]:
def get_link_labels(pos_edge_index, neg_edge_index):
    """Merge positive and negative edges and build their binary targets.

    Args:
        pos_edge_index: tensor whose dimension 1 enumerates the positive edges.
        neg_edge_index: tensor whose dimension 1 enumerates the negative edges.

    Returns:
        (edge_index, labels): the two inputs concatenated along the last
        dimension (positives first), and a float vector holding 1.0 for every
        positive edge followed by 0.0 for every negative edge. Both are moved
        to the module-level ``device``.
    """
    n_pos = pos_edge_index.size(1)
    n_neg = neg_edge_index.size(1)

    combined_edges = torch.cat((pos_edge_index, neg_edge_index), dim=-1).to(device)
    targets = torch.cat((torch.ones(n_pos), torch.zeros(n_neg))).to(device)
    return combined_edges, targets

def extract_features(X, edges):
    """Build one feature row per edge by concatenating its endpoint features.

    Args:
        X: 2-D array-like of shape (num_nodes, d) holding node features.
        edges: (num_edges, 2) collection of ``(u, v)`` node indices; may be a
            NumPy array, a nested sequence, or a ``torch.Tensor``.

    Returns:
        ``np.ndarray`` of shape (num_edges, 2 * d) with ``X``'s dtype, where
        row ``i`` is ``[X[u_i], X[v_i]]``. An empty ``edges`` yields an array
        of shape (0, 2 * d).

    Raises:
        ValueError: if ``edges`` is non-empty and not of shape (num_edges, 2).
    """
    X = np.asarray(X)
    if isinstance(edges, torch.Tensor):
        edges = edges.detach().cpu().numpy()
    edges = np.asarray(edges)

    feature_dim = X.shape[1]
    if edges.size == 0:
        return np.empty((0, 2 * feature_dim), dtype=X.dtype)
    if edges.ndim != 2 or edges.shape[1] != 2:
        raise ValueError(f"edges must have shape (num_edges, 2), got {edges.shape}")

    # Gather both endpoints with fancy indexing straight into a preallocated
    # buffer instead of looping over edges and stacking a list of rows.
    edge_features = np.empty((edges.shape[0], 2 * feature_dim), dtype=X.dtype)
    edge_features[:, :feature_dim] = X[edges[:, 0]]
    edge_features[:, feature_dim:] = X[edges[:, 1]]
    return edge_features


# Seed every RNG that can influence the split or the negative sampling so the
# metrics reported further down are reproducible across kernel restarts.
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# NOTE(review): if data.edge_index stores both directions of each undirected
# edge, this split can place (u, v) in train and (v, u) in test — consider
# splitting on undirected pairs instead. Left unchanged here.
X_train_positive_edges, X_test_positive_edges = train_test_split(data.edge_index.T, test_size=0.2, random_state=42)

num_graph_nodes = data.x.shape[0]
node_feature_matrix = data.x.numpy()

# Negatives are drawn against the FULL edge set rather than the split's own
# positives; otherwise a true edge held out in the other split could be
# sampled and labelled as a negative. num_neg_samples keeps the classes balanced.
pos_edge_index = X_train_positive_edges.T
neg_edge_index = negative_sampling(data.edge_index, num_nodes=num_graph_nodes,
                                   num_neg_samples=pos_edge_index.size(1))
X_train, y_train = get_link_labels(pos_edge_index, neg_edge_index)
X_train_features = extract_features(node_feature_matrix, X_train.T)

pos_edge_index = X_test_positive_edges.T
neg_edge_index = negative_sampling(data.edge_index, num_nodes=num_graph_nodes,
                                   num_neg_samples=pos_edge_index.size(1))
X_test, y_test = get_link_labels(pos_edge_index, neg_edge_index)
X_test_features = extract_features(node_feature_matrix, X_test.T)

# Standardise with statistics fitted on the training edges only.
scaler = StandardScaler()
X_train_features = torch.tensor(scaler.fit_transform(X_train_features), dtype=torch.float32)
X_test_features = torch.tensor(scaler.transform(X_test_features), dtype=torch.float32)

# Logistic Regression

In [None]:
class LogisticRegression(nn.Module):
    """Single linear layer followed by a sigmoid.

    ``forward`` returns probabilities of shape (batch, 1), which is the form
    ``nn.BCELoss`` consumes.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        logits = self.linear(x)
        return logits.sigmoid()


# Baseline: logistic regression on the concatenated endpoint features,
# trained full-batch with Adam and binary cross-entropy.
model = LogisticRegression(X_train_features.shape[1])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # (N, 1) probabilities -> (N,) so they line up with the label vector.
    outputs = model(X_train_features).squeeze()
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Only every tenth epoch is reported.
    if (epoch + 1) % 10 != 0:
        continue
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


# Score both splits without tracking gradients.
model.eval()
with torch.no_grad():
    train_outputs, test_outputs = model(X_train_features), model(X_test_features)

# Hard decisions at the 0.5 probability threshold.
train_predicted, test_predicted = (
    (scores > 0.5).float() for scores in (train_outputs, test_outputs)
)

train_accuracy, test_accuracy = (
    accuracy_score(y_train, train_predicted),
    accuracy_score(y_test, test_predicted),
)
train_auc, test_auc = (
    roc_auc_score(y_train, train_outputs),
    roc_auc_score(y_test, test_outputs),
)
train_f1, test_f1 = (
    f1_score(y_train, train_predicted),
    f1_score(y_test, test_predicted),
)

print(f'\nTrain Accuracy: {train_accuracy:.4f}')
print(f'Train AUC: {train_auc:.4f}')
print(f'Train F1 Score: {train_f1:.4f}')
print(f'\nTest Accuracy: {test_accuracy:.4f}')
print(f'Test AUC: {test_auc:.4f}')
print(f'Test F1 Score: {test_f1:.4f}')


Epoch [10/50], Loss: 1.1703
Epoch [20/50], Loss: 0.7358
Epoch [30/50], Loss: 0.6740
Epoch [40/50], Loss: 0.6330
Epoch [50/50], Loss: 0.6120

Train Accuracy: 0.6708
Train AUC: 0.7379
Train F1 Score: 0.6579

Test Accuracy: 0.6493
Test AUC: 0.7130
Test F1 Score: 0.6293


## Knowledge Distillation

In [None]:
directory = '/content/'
path = None

# Walk the tree in sorted order so the same checkpoint is selected on every
# run; os.walk itself yields entries in arbitrary filesystem order.
for root, dirs, files in os.walk(directory):
    dirs.sort()  # in-place sort steers the order of the top-down traversal
    checkpoints = sorted(name for name in files if name.endswith('.pth'))
    if checkpoints:
        path = os.path.join(root, checkpoints[0])
        break

if path is None:
    # Surface the problem here instead of failing later with an opaque error
    # when the missing path is handed to the checkpoint loader.
    warnings.warn(f"No '.pth' checkpoint found under {directory!r}; the teacher model cannot be loaded.")

In [None]:
class GCN(Module):
    """Four-layer graph convolutional encoder with a dot-product link scorer.

    The first three layers each apply GCNConv -> BatchNorm -> ReLU -> dropout;
    the fourth GCNConv produces the final node embeddings with no activation.

    Args:
        num_features: size of the input node features.
        hidden_channels: width of the three hidden layers.
        out_channels: size of the output node embeddings.
        dropout: dropout probability applied after each hidden layer.
    """

    def __init__(self, num_features, hidden_channels, out_channels, dropout=0.5):
        super().__init__()
        # Attribute names are kept as-is because they form the state_dict keys.
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.bn1 = BatchNorm(hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.bn2 = BatchNorm(hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.bn3 = BatchNorm(hidden_channels)
        self.conv4 = GCNConv(hidden_channels, out_channels)

        self.dropout = dropout

    def forward(self, x, edge_index):
        """Return node embeddings for features ``x`` on graph ``edge_index``."""
        hidden_blocks = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        h = x
        for conv, norm in hidden_blocks:
            h = F.relu(norm(conv(h, edge_index)))
            h = F.dropout(h, p=self.dropout, training=self.training)
        return self.conv4(h, edge_index)

    def link_logits(self, x, edge_index):
        """Score each edge as the dot product of its two endpoint embeddings."""
        src, dst = edge_index[0], edge_index[1]
        return (x[src] * x[dst]).sum(dim=-1)


In [None]:
# Fail with a clear message if the checkpoint search found nothing, rather
# than letting torch.load choke on a None path.
if path is None:
    raise FileNotFoundError("No '.pth' teacher checkpoint was found; cannot load the teacher model.")

teacher_model = GCN(num_features=data.x.shape[1], hidden_channels=128, out_channels=64, dropout=0.5).to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# from a GPU run still loads on a CPU-only machine.
checkpoint = torch.load(path, map_location=device)
teacher_model.load_state_dict(checkpoint['model_state_dict'])
teacher_model.eval()

GCN(
  (conv1): GCNConv(20000, 128)
  (bn1): BatchNorm(128)
  (conv2): GCNConv(128, 128)
  (bn2): BatchNorm(128)
  (conv3): GCNConv(128, 128)
  (bn3): BatchNorm(128)
  (conv4): GCNConv(128, 64)
)

In [None]:
def count_params(x):
    """Return the total number of scalar elements across ``x.parameters()``.

    Args:
        x: any object exposing ``parameters()`` that yields tensors
           (e.g. an ``nn.Module``).

    Returns:
        int: sum of ``numel()`` over every parameter, trainable or not.
    """
    return sum(p.numel() for p in x.parameters())
# torch.as_tensor replaces the legacy torch.FloatTensor(...) constructor and
# accepts either a tensor or a NumPy array as input.
X_train_features_tensor = torch.as_tensor(X_train_features, dtype=torch.float32)
X_test_features_tensor = torch.as_tensor(X_test_features, dtype=torch.float32)

In [None]:
class LogisticRegression(nn.Module):
    """Single linear layer that returns raw logits.

    No sigmoid is applied in ``forward``; callers pass the logits of shape
    (batch, 1) to logit-based losses or apply the sigmoid themselves.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.linear = nn.Linear(input_dim, 1)

    def forward(self, x):
        logits = self.linear(x)
        return logits


temperature = 1.0   # softening applied to teacher and student logits
alpha = 0.6         # weight of the distillation term vs. the hard-label term
num_epochs = 100

student_model = LogisticRegression(X_train_features.shape[1])
optimizer = torch.optim.Adam(student_model.parameters(), lr=0.001)
criterion = nn.KLDivLoss(reduction='batchmean')

# The teacher is frozen and its inputs never change, so its soft targets are
# computed once up front instead of once per epoch.
teacher_model.eval()
with torch.no_grad():
    teacher_embeddings = teacher_model(data.x, X_train_positive_edges.T)
    teacher_logits = teacher_model.link_logits(teacher_embeddings, X_train)
    teacher_output = torch.sigmoid(teacher_logits / temperature)
    # KLDivLoss compares full distributions, so each Bernoulli prediction is
    # expanded to [p(edge), p(no edge)]. Feeding p(edge) alone drops the
    # (1 - p) half of the divergence and rewards predicting "edge" everywhere.
    teacher_dist = torch.stack([teacher_output, 1.0 - teacher_output], dim=-1)

student_model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()

    student_output = student_model(X_train_features_tensor).view(-1)
    scaled_logits = student_output / temperature
    # log sigmoid(z) and log(1 - sigmoid(z)) = log sigmoid(-z), computed in a
    # numerically stable way (no clamp needed before taking the log).
    student_log_dist = torch.stack(
        [F.logsigmoid(scaled_logits), F.logsigmoid(-scaled_logits)], dim=-1
    )

    # With 'batchmean' this is the mean per-edge Bernoulli KL(teacher || student).
    # The T^2 factor keeps the soft-target gradients on the same scale as the
    # hard-label term when temperature != 1 (no effect at temperature == 1).
    distillation_loss = criterion(student_log_dist, teacher_dist) * temperature ** 2

    ce_loss = F.binary_cross_entropy_with_logits(student_output, y_train.float())

    loss = (1 - alpha) * ce_loss + alpha * distillation_loss

    loss.backward()
    optimizer.step()

    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch + 1}/{num_epochs}], CE Loss: {ce_loss.item():.4f}, Distillation Loss: {distillation_loss.item():.4f}')


# Evaluate the distilled student on the held-out edges.
student_model.eval()
with torch.no_grad():
    student_logits = student_model(X_test_features_tensor).view(-1)
    student_probs = student_logits.sigmoid()
    # Hard decisions at the 0.5 probability threshold.
    student_predictions = student_probs.gt(0.5).float()

# Accuracy = number of matching predictions / number of test edges.
accuracy_student = (student_predictions == y_test.float()).sum().item() / len(y_test)

# scikit-learn metrics are fed NumPy arrays.
y_test_np = y_test.cpu().numpy()
student_probs_np = student_probs.cpu().numpy()
student_predictions_np = student_predictions.cpu().numpy()

auc_student = roc_auc_score(y_test_np, student_probs_np)
f1_student = f1_score(y_test_np, student_predictions_np)

print('\nStudent Model Evaluation')
print(f'Student Model Accuracy on Test Set: {accuracy_student:.4f}')
print(f'Student Model AUC on Test Set: {auc_student:.4f}')
print(f'Student Model F1-Score on Test Set: {f1_student:.4f}')


# Evaluate the teacher on the same held-out edges. Message passing uses only
# the training positive edges; the test edges are scored from the embeddings.
teacher_model.eval()
with torch.no_grad():
    teacher_output = teacher_model(data.x, X_train_positive_edges.T)
    teacher_logits = teacher_model.link_logits(teacher_output, X_test)
    teacher_probs = teacher_logits.sigmoid()
    # Hard decisions at the 0.5 probability threshold.
    teacher_predictions = teacher_probs.gt(0.5).float()

# Accuracy = number of matching predictions / number of test edges.
accuracy_teacher = (teacher_predictions == y_test.float()).sum().item() / len(y_test)

# scikit-learn metrics are fed NumPy arrays.
teacher_probs_np = teacher_probs.cpu().numpy()
teacher_predictions_np = teacher_predictions.cpu().numpy()

auc_teacher = roc_auc_score(y_test_np, teacher_probs_np)
f1_teacher = f1_score(y_test_np, teacher_predictions_np)

print('\nTeacher Model Evaluation')
print(f'Teacher Model Accuracy on Test Set: {accuracy_teacher:.4f}')
print(f'Teacher Model AUC on Test Set: {auc_teacher:.4f}')
print(f'Teacher Model F1-Score on Test Set: {f1_teacher:.4f}')


Epoch [10/100], CE Loss: 0.8542, Distillation Loss: 0.5457
Epoch [20/100], CE Loss: 0.8682, Distillation Loss: 0.4788
Epoch [30/100], CE Loss: 0.6653, Distillation Loss: 0.3589
Epoch [40/100], CE Loss: 0.6315, Distillation Loss: 0.3026
Epoch [50/100], CE Loss: 0.6231, Distillation Loss: 0.2946
Epoch [60/100], CE Loss: 0.6254, Distillation Loss: 0.2768
Epoch [70/100], CE Loss: 0.6160, Distillation Loss: 0.2709
Epoch [80/100], CE Loss: 0.6166, Distillation Loss: 0.2661
Epoch [90/100], CE Loss: 0.6170, Distillation Loss: 0.2626
Epoch [100/100], CE Loss: 0.6165, Distillation Loss: 0.2594

Student Model Evaluation
Student Model Accuracy on Test Set: 0.6685
Student Model AUC on Test Set: 0.7203
Student Model F1-Score on Test Set: 0.6761

Teacher Model Evaluation
Teacher Model Accuracy on Test Set: 0.8006
Teacher Model AUC on Test Set: 0.9538
Teacher Model F1-Score on Test Set: 0.8287


In [None]:
# Size of the teacher GCN: count_params sums numel() over all of its parameters.
count_params(teacher_model)

2602176

In [None]:
# Size of the distilled student (one linear layer: input_dim weights + 1 bias).
count_params(student_model)

40001

# MLP Performance

In [None]:
class MLP(nn.Module):
    """Three hidden Linear -> BatchNorm1d -> ReLU blocks and a sigmoid output.

    Args:
        input_dim: size of each input feature vector.
        hidden_dim1, hidden_dim2, hidden_dim3: widths of the hidden layers.

    ``forward`` returns probabilities of shape (batch, 1).
    """

    def __init__(self, input_dim, hidden_dim1=256, hidden_dim2=128, hidden_dim3=64):
        super().__init__()
        # Hidden block 1
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.bn1 = nn.BatchNorm1d(hidden_dim1)
        # Hidden block 2
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.bn2 = nn.BatchNorm1d(hidden_dim2)
        # Hidden block 3
        self.fc3 = nn.Linear(hidden_dim2, hidden_dim3)
        self.bn3 = nn.BatchNorm1d(hidden_dim3)
        # Output head: a single logit per sample
        self.fc4 = nn.Linear(hidden_dim3, 1)

    def forward(self, x):
        hidden_blocks = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        h = x
        for linear, norm in hidden_blocks:
            h = F.relu(norm(linear(h)))
        return self.fc4(h).sigmoid()



# MLP baseline on the same concatenated endpoint features,
# trained full-batch with Adam and binary cross-entropy.
model = MLP(X_train_features.shape[1])
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 50
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # (N, 1) probabilities -> (N,) so they line up with the label vector.
    outputs = model(X_train_features).squeeze()
    loss = criterion(outputs, y_train)
    loss.backward()
    optimizer.step()

    # Only every tenth epoch is reported.
    if (epoch + 1) % 10 != 0:
        continue
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


# Score both splits in eval mode (BatchNorm uses its running statistics).
model.eval()
with torch.no_grad():
    train_outputs, test_outputs = model(X_train_features), model(X_test_features)

# Hard decisions at the 0.5 probability threshold.
train_predicted, test_predicted = (
    (scores > 0.5).float() for scores in (train_outputs, test_outputs)
)

train_accuracy, test_accuracy = (
    accuracy_score(y_train, train_predicted),
    accuracy_score(y_test, test_predicted),
)
train_auc, test_auc = (
    roc_auc_score(y_train, train_outputs),
    roc_auc_score(y_test, test_outputs),
)
train_f1, test_f1 = (
    f1_score(y_train, train_predicted),
    f1_score(y_test, test_predicted),
)

print(f'\nTrain Accuracy: {train_accuracy:.4f}')
print(f'Train AUC: {train_auc:.4f}')
print(f'Train F1 Score: {train_f1:.4f}')
print(f'\nTest Accuracy: {test_accuracy:.4f}')
print(f'Test AUC: {test_auc:.4f}')
print(f'Test F1 Score: {test_f1:.4f}')


Epoch [10/50], Loss: 0.6174
Epoch [20/50], Loss: 0.6013
Epoch [30/50], Loss: 0.5922
Epoch [40/50], Loss: 0.5823
Epoch [50/50], Loss: 0.5645

Train Accuracy: 0.6417
Train AUC: 0.7677
Train F1 Score: 0.7246

Test Accuracy: 0.6341
Test AUC: 0.7041
Test F1 Score: 0.7186


In [None]:
# Size of the MLP baseline trained in the previous cell (`model` is the MLP at this point).
count_params(model)

10282369

In [None]:
class GCN(torch.nn.Module):
    """Four stacked GCNConv layers, all ``hidden_channels`` wide.

    The first three layers are each followed by ReLU and dropout (p=0.5);
    the fourth layer's output is returned without activation.

    Args:
        num_features: size of the input node features.
        hidden_channels: width of every layer, including the output.
    """

    def __init__(self, num_features, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.conv4 = GCNConv(hidden_channels, hidden_channels)

    def forward(self, x, edge_index):
        h = x
        for conv in (self.conv1, self.conv2, self.conv3):
            h = F.relu(conv(h, edge_index))
            h = F.dropout(h, p=0.5, training=self.training)
        return self.conv4(h, edge_index)

In [None]:
# Instantiate the 4-layer GCN from the previous cell with 64 hidden channels;
# here it is only used to report its parameter count.
example_model = GCN(num_features=data.x.shape[1], hidden_channels=64)
count_params(example_model)

1292544