In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
import numpy as np
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall, AUC
from scipy.io import loadmat
from scipy.io import savemat
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from scipy.spatial.distance import squareform
from scipy.spatial.distance import pdist
from scipy.spatial.distance import cdist
import torch
from torch_geometric.datasets import Planetoid
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

In [2]:
mat = loadmat('..\\results\\var_k\dblp\\res_dblp_exp_2_002_128.mat')
features_matrix = mat['embs']

mat = loadmat("../data/dblp.mat")
A = mat["network"]
labels_matrix = mat["group"]
labels_count = labels_matrix.shape[1]

groups = labels_matrix.toarray()
A = A.toarray()
X = features_matrix

print(groups[:5, :5])
print(A[:5, :5])

[[1. 0.]
 [1. 0.]
 [1. 0.]
 [1. 1.]
 [1. 0.]]
[[0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0.]]


In [3]:
X_train, X_test, A_train, A_test = train_test_split(X, A, test_size=0.25, random_state=2137)

In [4]:
def create_masks(num_nodes, train_percent=0.8, val_percent=0.1):
    """
    Randomly assign train, validation, and test masks for the nodes in the graph.
    
    Parameters:
    - num_nodes (int): Total number of nodes in the graph.
    - train_percent (float): Percentage of nodes to include in the training set.
    - val_percent (float): Percentage of nodes to include in the validation set.
    
    Returns:
    - Tuple of Tensors: (train_mask, val_mask, test_mask)
    """
    indices = np.random.permutation(num_nodes)
    train_size = int(num_nodes * train_percent)
    val_size = int(num_nodes * val_percent)

    train_mask = torch.zeros(num_nodes, dtype=torch.bool)
    val_mask = torch.zeros(num_nodes, dtype=torch.bool)
    test_mask = torch.zeros(num_nodes, dtype=torch.bool)

    train_mask[indices[:train_size]] = True
    val_mask[indices[train_size:train_size + val_size]] = True
    test_mask[indices[train_size + val_size:]] = True

    return train_mask, val_mask, test_mask

In [5]:
from torch_geometric.data import Data

def create_torch_geo_data(features, adjacency_matrix, num_nodes):
    edge_index = adjacency_to_edge_index(adjacency_matrix)
    x = features_to_tensor(features)
    train_mask, val_mask, test_mask = create_masks(num_nodes)
    data = Data(x=x, edge_index=edge_index, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
    return data

In [6]:
import torch
import numpy as np

def adjacency_to_edge_index(adjacency_matrix):
    # Find the indices of nonzero elements (edges) in the adjacency matrix
    src, dst = adjacency_matrix.nonzero()
    # Create a tensor containing edge pairs
    edge_index = np.vstack([src, dst])
    return torch.tensor(edge_index, dtype=torch.long)

def features_to_tensor(features):
    return torch.tensor(features, dtype=torch.float)

In [7]:
class GCN(torch.nn.Module):
    def __init__(self, num_features, hidden_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(num_features, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        return x  # Output is node embeddings

In [8]:
def compute_edge_scores(node_embeddings):
    # Using dot product to score edges
    scores = torch.matmul(node_embeddings, node_embeddings.t())
    return torch.sigmoid(scores)  # Use sigmoid to map scores to probabilities

In [9]:
num_features = X.shape[1]
print(num_features)
num_classes = 13326  # You'll need to set this based on your dataset
data = create_torch_geo_data(X, A, num_classes)

# Initialize model
model = GCN(num_features, num_classes)

print(data)

128
Data(x=[13326, 128], edge_index=[2, 68562], train_mask=[13326], val_mask=[13326], test_mask=[13326])


In [10]:
def create_edge_set(edge_index):
    edge_set = set()
    for i in range(edge_index.shape[1]):
        edge_set.add((edge_index[0, i].item(), edge_index[1, i].item()))
        edge_set.add((edge_index[1, i].item(), edge_index[0, i].item()))  # Add reverse direction for undirected graphs
    return edge_set

edge_set = create_edge_set(data.edge_index)

In [11]:
def get_negative_samples(edge_set, num_nodes, num_neg_samples):
    neg_edge_index = []
    while len(neg_edge_index) < num_neg_samples:
        i = np.random.randint(0, num_nodes)
        j = np.random.randint(0, num_nodes)
        if i != j and (i, j) not in edge_set:
            neg_edge_index.append([i, j])
    return torch.tensor(neg_edge_index).t()

negative_edge_index = get_negative_samples(edge_set, data.num_nodes, num_neg_samples=1000)

In [12]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = torch.nn.BCEWithLogitsLoss()

model.train()
for epoch in range(6):
    print(f"epoch: {epoch} starting")
    optimizer.zero_grad()
    embeddings = model(data)

    print(f"epoch: {epoch} computing scores")

    pos_scores = compute_edge_scores(embeddings)[data.edge_index[0], data.edge_index[1]]
    neg_scores = compute_edge_scores(embeddings)[negative_edge_index[0], negative_edge_index[1]]
    
    # Labels: 1s for positive samples, 0s for negative samples
    labels = torch.cat([torch.ones(pos_scores.shape[0]), torch.zeros(neg_scores.shape[0])])
    scores = torch.cat([pos_scores, neg_scores])
    
    print(f"epoch: {epoch} concluding")
    loss = criterion(scores, labels)
    loss.backward()
    optimizer.step()

epoch: 0 starting
epoch: 0 computing scores
epoch: 0 concluding
epoch: 1 starting
epoch: 1 computing scores
epoch: 1 concluding
epoch: 2 starting
epoch: 2 computing scores
epoch: 2 concluding
epoch: 3 starting
epoch: 3 computing scores
epoch: 3 concluding
epoch: 4 starting
epoch: 4 computing scores
epoch: 4 concluding
epoch: 5 starting
epoch: 5 computing scores
epoch: 5 concluding


In [17]:
def calculate_metrics(predictions, labels):
    # Binarize predictions based on a threshold
    threshold = 0.5
    preds = predictions > threshold
    labels = labels.bool()
    
    # True positives, false positives, true negatives, false negatives
    tp = (preds & labels).sum().float()
    fp = (preds & ~labels).sum().float()
    tn = (~preds & ~labels).sum().float()
    fn = (~preds & labels).sum().float()

    precision = tp / (tp + fp) if tp + fp > 0 else 0
    recall = tp / (tp + fn) if tp + fn > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    
    return precision, recall, f1

def validate():
    model.eval()
    with torch.no_grad():
        embeddings = model(data)
        pos_edge_index = data.edge_index
        neg_edge_index = negative_sampling(edge_index=pos_edge_index, num_nodes=data.num_nodes, num_neg_samples=pos_edge_index.size(1))

        pos_scores = compute_edge_scores(embeddings)[pos_edge_index[0], pos_edge_index[1]]
        neg_scores = compute_edge_scores(embeddings)[neg_edge_index[0], neg_edge_index[1]]
        scores = torch.cat([pos_scores, neg_scores], dim=0)
        labels = torch.cat([torch.ones(pos_scores.shape[0]), torch.zeros(neg_scores.shape[0])], dim=0)

        val_loss = criterion(scores, labels)
        precision, recall, f1 = calculate_metrics(scores, labels)
        
        print(f'Validation Loss: {val_loss.item():.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')

validate()

Validation Loss: 0.8133, Precision: 0.5000, Recall: 1.0000, F1: 0.6667
