In [1]:
import torch
from torch_geometric.data import Data
from torch_geometric.datasets import DBLP, IMDB
from torch_geometric.transforms import NormalizeFeatures
import random
from collections import defaultdict
import copy
import ast
import csv
import numpy as np

dataset = DBLP(root='../data/dblp', transform=NormalizeFeatures())

In [2]:
print(f'Dataset: {dataset}:')
print('======================')
print(f'Number of graphs: {len(dataset)}')
print(f'Number of features: {dataset.num_features}')

hetero_data = dataset[0]  # Get the first graph object.

print()
print(hetero_data)
print('===========================================================================================================')

# Gather some statistics about the graph.
print(f'Number of nodes: {hetero_data.num_nodes}')
print(f'Number of edges: {hetero_data.num_edges}')
print(f'Average node degree: {hetero_data.num_edges / hetero_data.num_nodes:.2f}')
# print(f'Number of training nodes: {data.train_mask.sum()}')
# print(f'Training node label rate: {int(data.train_mask.sum()) / data.num_nodes:.2f}')
# print(f'Has isolated nodes: {data.has_isolated_nodes()}')
# print(f'Has self-loops: {data.has_self_loops()}')

torch.set_printoptions(threshold=10000)

Dataset: DBLP():
Number of graphs: 1
Number of features: {'author': 334, 'paper': 4231, 'term': 50, 'conference': 0}

HeteroData(
  author={
    x=[4057, 334],
    y=[4057],
    train_mask=[4057],
    val_mask=[4057],
    test_mask=[4057],
  },
  paper={ x=[14328, 4231] },
  term={ x=[7723, 50] },
  conference={ num_nodes=20 },
  (author, to, paper)={ edge_index=[2, 19645] },
  (paper, to, author)={ edge_index=[2, 19645] },
  (paper, to, term)={ edge_index=[2, 85810] },
  (paper, to, conference)={ edge_index=[2, 14328] },
  (term, to, paper)={ edge_index=[2, 85810] },
  (conference, to, paper)={ edge_index=[2, 14328] }
)
Number of nodes: 26128
Number of edges: 239566
Average node degree: 9.17


In [6]:
# Function to convert the heterogeneous graph to a homogeneous graph with only author nodes
def convert_to_author_graph(hetero_data):    
    # Initialize adjacency list to count co-authorship
    adj_dict = defaultdict(int)
    
    # Process 'writes' and 'written_by' edges to count co-authorships
    author_to_paper = hetero_data['author', 'to', 'paper'].edge_index
    paper_to_author = hetero_data['paper', 'to', 'author'].edge_index
    
    for author, paper in author_to_paper.t().tolist():
        coauthors = paper_to_author[1, paper_to_author[0] == paper].tolist()
        print(author)
        print(coauthors)
        for coauthor in coauthors:
            if author != coauthor:
                edge = tuple([author, coauthor])
                adj_dict[edge] += 1
    
    # Create edge_index and edge_weight tensors
    edge_index = torch.tensor(list(adj_dict.keys()), dtype=torch.long).t().contiguous()
    edge_weight = torch.tensor(list(adj_dict.values()), dtype=torch.float)
    
    # Create the homogeneous graph
    data = Data()
    data.x = hetero_data['author'].x
    data.edge_index = edge_index
    data.edge_weight = edge_weight

    data.train_mask = hetero_data['author'].train_mask
    data.val_mask = hetero_data['author'].val_mask
    data.test_mask = hetero_data['author'].test_mask
    data.y = hetero_data['author'].y
    
    return data

# Convert the dataset
data = convert_to_author_graph(hetero_data)

# Print the converted data
print(data)
print(data.y.max().item() + 1)

0
[0]
0
[0]
1
[1]
1
[1]
1
[1]
1
[1, 5]
1
[1]
1
[1]
1
[1]
1
[1]
1
[1]
1
[1]
2
[2]
2
[2, 1183, 2353]
2
[2, 1183, 2353, 3626]
3
[3]
3
[3]
3
[3]
3
[3, 339, 2646]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3, 339]
3
[3, 339]
3
[3, 339]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3]
3
[3, 650]
3
[3]
3
[3, 3416]
3
[3, 3416]
3
[3]
3
[3]
3
[3, 767]
4
[4, 2602, 2814]
5
[5]
5
[5]
5
[5]
5
[5]
5
[5]
5
[5]
5
[1, 5]
5
[5]
5
[5]
6
[6]
7
[7]
7
[7]
7
[7]
7
[7]
7
[7]
7
[7]
8
[8]
8
[8]
8
[8, 412]
9
[9, 3011]
10
[10]
10
[10]
10
[10]
10
[10]
11
[11, 557]
11
[11, 557, 803]
11
[11]
12
[12, 123, 141, 1829]
12
[12, 1829]
13
[13, 1353]
13
[13, 1881, 3282]
13
[13]
13
[13]
13
[13, 2171]
13
[13, 1353]
13
[13, 2171]
13
[13, 2171]
13
[13, 1353]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14, 850]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14]
14
[14, 1132]
14
[14]
14
[14]
14
[14]
14
[14, 1014, 1015]
14
[14]
14


In [18]:
import torch
from torch_geometric.data import Data
from collections import defaultdict

def convert_to_movie_graph(hetero_data):
    # Initialize adjacency list to count shared actors
    adj_dict = defaultdict(int)
    
    # Process 'acts in' edges to count shared actors
    movie_to_actor = hetero_data['movie', 'to', 'actor'].edge_index
    actor_to_movie = hetero_data['actor', 'to', 'movie'].edge_index
    
    # Create a dictionary mapping movies to their actors
    movie_to_actors_dict = defaultdict(list)
    for movie, actor in movie_to_actor.t().tolist():
        movie_to_actors_dict[movie].append(actor)
    
    # Create edges between movies that share actors
    for movie, actors in movie_to_actors_dict.items():
        for actor in actors:
            co_movies = actor_to_movie[1, actor_to_movie[0] == actor].tolist()
            for co_movie in co_movies:
                if movie != co_movie:
                    edge = tuple(sorted([movie, co_movie]))
                    adj_dict[edge] += 1
    
    # Create edge_index and edge_weight tensors
    edge_index = torch.tensor(list(adj_dict.keys()), dtype=torch.long).t().contiguous()
    edge_weight = torch.tensor(list(adj_dict.values()), dtype=torch.float)
    
    # Create the homogeneous graph
    data = Data()
    data.x = hetero_data['movie'].x
    data.edge_index = edge_index
    data.edge_weight = edge_weight

    data.train_mask = hetero_data['movie'].train_mask
    data.val_mask = hetero_data['movie'].val_mask
    data.test_mask = hetero_data['movie'].test_mask
    data.y = hetero_data['movie'].y
    
    return data


data = convert_to_movie_graph(hetero_data)
print(data)

Data(x=[4278, 3066], edge_index=[2, 40540], edge_weight=[40540], train_mask=[4278], val_mask=[4278], test_mask=[4278], y=[4278])


In [20]:
print(torch.where(data.edge_weight != 2))

(tensor([   14,    15,    17,    93,   130,   133,   154,   214,   222,   226,
          229,   328,   330,   459,   463,   466,   517,   568,   609,   610,
          642,   676,   689,   697,   776,   812,   818,   839,   901,   902,
         1009,  1059,  1146,  1149,  1236,  1237,  1238,  1244,  1367,  1573,
         1649,  1663,  1686,  1700,  1822,  1861,  1999,  2155,  2170,  2220,
         2262,  2362,  2398,  2630,  2657,  2658,  2659,  2662,  2706,  2712,
         2713,  2888,  2987,  3009,  3049,  3056,  3138,  3212,  3214,  3215,
         3235,  3236,  3322,  3546,  3574,  3831,  3858,  3895,  3978,  3994,
         4053,  4186,  4259,  4319,  4565,  4590,  4592,  4594,  4596,  4600,
         4912,  4914,  4916,  4920,  4934,  4935,  5084,  5134,  5135,  5242,
         5277,  5316,  5322,  5343,  5373,  5411,  5763,  5792,  5798,  5885,
         6123,  6168,  6182,  6328,  6580,  6581,  6594,  6643,  6803,  6826,
         6976,  6992,  7152,  7153,  7356,  7357,  7404,  7474,

In [129]:
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GATConv


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        torch.manual_seed(1234567)
        self.conv1 = GCNConv(dataset.num_features['author'], hidden_channels)
        self.conv2 = GCNConv(hidden_channels, data.y.max().item() + 1)

    def forward(self, x, edge_index, edge_weight):
        x = self.conv1(x, edge_index, edge_weight=edge_weight)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.conv2(x, edge_index, edge_weight=edge_weight)
        return x

model = GCN(hidden_channels=16)

In [130]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    optimizer.zero_grad()  # Clear gradients.
    out = model(data.x, data.edge_index, data.edge_weight)  # Perform a single forward pass.
    loss = criterion(out[data.train_mask], data.y[data.train_mask])  # Compute the loss solely based on the training nodes.
    loss.backward()  # Derive gradients.
    optimizer.step()  # Update parameters based on gradients.
    return loss

def test(mask):
    model.eval()
    out = model(data.x, data.edge_index, data.edge_weight)
    pred = out.argmax(dim=1)  # Use the class with highest probability.
    test_correct = pred[mask] == data.y[mask]  # Check against ground-truth labels.
    test_acc = int(test_correct.sum()) / int(mask.sum())  # Derive ratio of correct predictions.
    return test_acc

In [131]:
for epoch in range(1, 101):
    loss = train()
    val_acc = test(data.val_mask)
    test_acc = test(data.test_mask)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}')

Epoch: 001, Loss: 1.3845, Val: 0.3500, Test: 0.3985
Epoch: 002, Loss: 1.3738, Val: 0.4125, Test: 0.4513
Epoch: 003, Loss: 1.3638, Val: 0.4250, Test: 0.4762
Epoch: 004, Loss: 1.3560, Val: 0.4225, Test: 0.4756
Epoch: 005, Loss: 1.3382, Val: 0.4175, Test: 0.4777
Epoch: 006, Loss: 1.3278, Val: 0.4225, Test: 0.4790
Epoch: 007, Loss: 1.3174, Val: 0.4225, Test: 0.4876
Epoch: 008, Loss: 1.3024, Val: 0.4250, Test: 0.4980
Epoch: 009, Loss: 1.2891, Val: 0.4325, Test: 0.5124
Epoch: 010, Loss: 1.2801, Val: 0.4425, Test: 0.5269
Epoch: 011, Loss: 1.2614, Val: 0.4575, Test: 0.5422
Epoch: 012, Loss: 1.2554, Val: 0.4800, Test: 0.5662
Epoch: 013, Loss: 1.2352, Val: 0.4975, Test: 0.5895
Epoch: 014, Loss: 1.2166, Val: 0.5250, Test: 0.6107
Epoch: 015, Loss: 1.2056, Val: 0.5500, Test: 0.6291
Epoch: 016, Loss: 1.1879, Val: 0.5675, Test: 0.6463
Epoch: 017, Loss: 1.1750, Val: 0.5725, Test: 0.6570
Epoch: 018, Loss: 1.1602, Val: 0.5875, Test: 0.6641
Epoch: 019, Loss: 1.1614, Val: 0.6000, Test: 0.6684
Epoch: 020, 

In [132]:
class GAT(torch.nn.Module):
    def __init__(self, hidden_channels, heads):
        super().__init__()
        torch.manual_seed(1234567)
        self.conv1 = GATConv(dataset.num_features['author'], hidden_channels, heads=heads)
        self.conv2 = GATConv(hidden_channels * heads, data.y.max().item() + 1)

    def forward(self, x, edge_index, edge_weight):
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv1(x, edge_index, edge_attr=edge_weight)
        x = F.elu(x)
        x = F.dropout(x, p=0.6, training=self.training)
        x = self.conv2(x, edge_index, edge_attr=edge_weight)
        return x

GAT_model = GAT(hidden_channels=8, heads=8)
print(GAT_model)

GAT(
  (conv1): GATConv(334, 8, heads=8)
  (conv2): GATConv(64, 4, heads=1)
)


In [133]:
GAT_optimizer = torch.optim.Adam(GAT_model.parameters(), lr=0.005, weight_decay=5e-4)
loss_fn = torch.nn.CrossEntropyLoss()

def GAT_train():
    GAT_model.train()
    GAT_optimizer.zero_grad()
    out = GAT_model(data.x, data.edge_index, data.edge_weight)
    loss = loss_fn(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    GAT_optimizer.step()
    return loss

def GAT_test(mask):
    GAT_model.eval()
    out = GAT_model(data.x, data.edge_index, data.edge_weight)
    out = F.softmax(out, dim=1)
    pred = out.argmax(dim=1)
    correct = pred[mask] == data.y[mask]
    acc = int(correct.sum()) / int(mask.sum())
    return acc

In [134]:
for epoch in range(1, 81):
    loss = GAT_train()
    val_acc = GAT_test(data.val_mask)
    test_acc = GAT_test(data.test_mask)
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}, Val: {val_acc:.4f}, Test: {test_acc:.4f}')

Epoch: 001, Loss: 1.3887, Val: 0.3850, Test: 0.4173
Epoch: 002, Loss: 1.3673, Val: 0.4200, Test: 0.4673
Epoch: 003, Loss: 1.3503, Val: 0.4400, Test: 0.4906
Epoch: 004, Loss: 1.3355, Val: 0.4475, Test: 0.5020
Epoch: 005, Loss: 1.3290, Val: 0.4500, Test: 0.5088
Epoch: 006, Loss: 1.3132, Val: 0.4500, Test: 0.5146
Epoch: 007, Loss: 1.3004, Val: 0.4475, Test: 0.5192
Epoch: 008, Loss: 1.2886, Val: 0.4625, Test: 0.5269
Epoch: 009, Loss: 1.2724, Val: 0.4675, Test: 0.5352
Epoch: 010, Loss: 1.2699, Val: 0.4700, Test: 0.5413
Epoch: 011, Loss: 1.2543, Val: 0.4825, Test: 0.5514
Epoch: 012, Loss: 1.2375, Val: 0.5025, Test: 0.5609
Epoch: 013, Loss: 1.2329, Val: 0.5050, Test: 0.5729
Epoch: 014, Loss: 1.2131, Val: 0.5150, Test: 0.5809
Epoch: 015, Loss: 1.1991, Val: 0.5225, Test: 0.5956
Epoch: 016, Loss: 1.1981, Val: 0.5275, Test: 0.6131
Epoch: 017, Loss: 1.1826, Val: 0.5375, Test: 0.6202
Epoch: 018, Loss: 1.1563, Val: 0.5475, Test: 0.6349
Epoch: 019, Loss: 1.1492, Val: 0.5625, Test: 0.6506
Epoch: 020, 

In [135]:
from torch_geometric.explain import Explainer, GNNExplainer, GraphMaskExplainer

explainer = Explainer(
    model=model,
    algorithm=GNNExplainer(epochs=100),
    explanation_type='model',
    node_mask_type='object',
    edge_mask_type='object',
    model_config=dict(
        mode='multiclass_classification',
        task_level='node',
        return_type='raw',
    ),
)

In [136]:
node_index = 13 # which node index to explain
explanation = explainer(data.x, data.edge_index, edge_weight=data.edge_weight, index=node_index)

In [137]:
explanation.visualize_graph("../img/graph_vis.png")

In [123]:
print(explanation.edge_mask[torch.where(explanation.edge_mask != 0)])

tensor([0.7616, 0.7887, 0.7695, 0.7367, 0.2187, 0.7649, 0.7768, 0.2155, 0.2339,
        0.7861, 0.3289, 0.7800, 0.7724, 0.2204, 0.2195, 0.7524, 0.7827, 0.2139,
        0.7774])
