# Homework

TODO: Leírást gyártani

In [None]:
!pip install -r requirements.txt -q

# pip didn't find pyg-lib for some reason
#!pip install pyg-lib>=0.4 torch-scatter>=2.1 torch-sparse>=0.6 torch-cluster>=1.6 torch-spline_conv>=1.2 -f https://data.pyg.org/whl/torch-2.5.0+cu121.html --force-reinstall


In [1]:
# General
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#Torch
import torch

# Graph
import networkx as nx
import torch_geometric as pyg	# This import doesn't work
import torch_geometric.nn as gnn
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.sampler import NegativeSampling
from torch_geometric.utils.convert import from_networkx

from model import *

DEVICE = "cuda:0" if torch.cuda.is_available() else "cpu"
torch.manual_seed(42)

# Default device breaks node and edge sampling
#torch.set_default_device(DEVICE)

print(f"Training on {DEVICE} with PyTorch version {torch.__version__}")


Training on cuda:0 with PyTorch version 2.5.1+cu121


## Data loading

Loading, creating and splitting the data into dataloaders, defining train and test functions, import node2vec embedding

In [2]:
# Load files created previously

# Number of different feature names for the embedding size
emb_size = 155522

allfeats = pd.read_pickle("feature_bow.pkl")
allfeats.head()


Unnamed: 0,Features,Bag_of_Words
12,"{'claychristensen': 1, 'coachella': 1, 'gabrie...","[tensor(31517), tensor(86), tensor(115673), te..."
13,"{'brainpicker': 1, 'ev': 1, 'eventbrite': 1, '...","[tensor(3462), tensor(11073), tensor(24296), t..."
17,"{'amyquispe': 1, 'baratunde': 1, 'busterbenson...","[tensor(70048), tensor(6685), tensor(24196), t..."
20,"{'aaronsw': 1, 'abdur': 1, 'amac': 1, 'dustin'...","[tensor(24104), tensor(51836), tensor(25369), ..."
47,"{'al3x': 1, 'alexandrak': 1, 'allspaw': 1, 'av...","[tensor(25362), tensor(25367), tensor(25368), ..."


In [3]:
# Create graph: from pandas to NetworkX to torch_geometric data object
# Complete edge list
edge_list = pd.read_csv("./Data/twitter_combined.txt", sep=" ", header=None)
print("Edge list:\n", edge_list.head())

# Complete graph
G = nx.from_pandas_edgelist(edge_list, 0, 1, create_using=nx.DiGraph)
print("Complete graph nodes and edges: ", G.number_of_nodes(), G.number_of_edges())

# Restrict to nonzero feature nodes
G = G.subgraph(allfeats.index)
print("Restricted graph nodes and edges: ", G.number_of_nodes(), G.number_of_edges())

# Give each node its it's bag of words as attribute
nx.set_node_attributes(G, {idx: allfeats.Bag_of_Words[idx].to(DEVICE) for idx in allfeats.index}, "BoW")

# Example
print("Features of node index 12:\n", G.nodes[12])

# Create the graph with a bag of words as the only node attribute
graph = from_networkx(G)
graph


Edge list:
            0          1
0  214328887   34428380
1   17116707   28465635
2  380580781   18996905
3  221036078  153460275
4  107830991   17868918
Complete graph nodes and edges:  81306 1768149
Restricted graph nodes and edges:  70923 1655299
Features of node index 12:
 {'BoW': tensor([ 31517,     86, 115673,   6741,   6741,  25306,   7622,  25768,  13811,
        115629,   7120,   1681,  45262,  24086,   8859,   7310,  45288,  45228,
         25336,  25336,  31027,   2976,   2976,  19157,  88730,  24332,  38647,
         25698,  51775,  59974,  60065,  60010,  60015,  39992,   3173,   3122,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0,      0, 

Data(edge_index=[2, 1655299], BoW=[70923, 195], num_nodes=70923)

In [4]:
# Split the data: Apply train-val-test masks and create dataloaders

transform = RandomLinkSplit(num_val=0.1, num_test=0.1)
train_data, val_data, test_data = transform(graph)

train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 20],
    neg_sampling=NegativeSampling("binary"),
    batch_size=1024,
    shuffle=True,
)

val_loader = LinkNeighborLoader(
    data=val_data,
    num_neighbors=[20, 20],
    neg_sampling=NegativeSampling("binary"),
    batch_size=2048,
    shuffle=False,
)

test_loader = LinkNeighborLoader(
    data=test_data,
    num_neighbors=[20, 20],
    neg_sampling=NegativeSampling("binary"),
    batch_size=2048,
    shuffle=False,
)


In [5]:
# Load node2vec embedding
state = torch.load("node2vec_trained.pt")
n2v = gnn.Node2Vec(train_data.edge_index, 20, 10, 10).to(DEVICE)
n2v.load_state_dict(state)


  state = torch.load("node2vec_trained.pt")


<All keys matched successfully>

## Baseline models

### Not using the graph structure

### Embedding + dot product

**Theory:** People tend to follow people similar to them (i.e. form echo chambers)

**Problem:** Symmetric, which is not good for directed edges

In [6]:
lr, emb_dim = 0.001, 20
model = DotProduct(emb_size, emb_dim).to(DEVICE)

train_model(model=model, train_loader=train_loader, val_loader=val_loader, optimizer="Adam", lr=lr)


Epoch 1: Train loss: 1.1412, - accuracy: 0.1504
Epoch 1: Valid loss: 0.8563 - accuracy: 0.1745
Epoch 2: Train loss: 0.7814, - accuracy: 0.1932
Epoch 2: Valid loss: 0.7141 - accuracy: 0.2066
Epoch 3: Train loss: 0.6839, - accuracy: 0.2180
Epoch 3: Valid loss: 0.6516 - accuracy: 0.2236
Epoch 4: Train loss: 0.6368, - accuracy: 0.2297
Epoch 4: Valid loss: 0.6203 - accuracy: 0.2305
Epoch 5: Train loss: 0.6125, - accuracy: 0.2350
Epoch 5: Valid loss: 0.6006 - accuracy: 0.2344
Best val_acc: 0.23436670515412225


([1.1412242747716594,
  0.7813725444780435,
  0.6838626762998896,
  0.6368485486949614,
  0.6124703146206356],
 [0.15035291914387186,
  0.19324692408708083,
  0.21800110402864736,
  0.22972555599773758,
  0.23498403991418482],
 [0.8563484172732605,
  0.7140930204708388,
  0.6515710797892104,
  0.620263319019188,
  0.6005873164216738],
 [0.17446031349278568,
  0.20659230457295916,
  0.22363489727323047,
  0.2304622043872679,
  0.23436670515412225])

### Two fully connected layers


In [7]:
lr, emb_dim, hidden_dim = 0.1, 20, 256
model = FCN(emb_size, emb_dim, hidden_dim).to(DEVICE)

train_model(model=model, train_loader=train_loader, val_loader=val_loader, optimizer="SGD", lr=lr)


Epoch 1: Train loss: 0.6757, - accuracy: 0.4889
Epoch 1: Valid loss: 0.6557 - accuracy: 0.4529
Epoch 2: Train loss: 0.6424, - accuracy: 0.4192
Epoch 2: Valid loss: 0.6290 - accuracy: 0.4111
Epoch 3: Train loss: 0.6248, - accuracy: 0.4511
Epoch 3: Valid loss: 0.6165 - accuracy: 0.4701
Epoch 4: Train loss: 0.6170, - accuracy: 0.4794
Epoch 4: Valid loss: 0.6107 - accuracy: 0.4872
Epoch 5: Train loss: 0.6128, - accuracy: 0.4873
Epoch 5: Valid loss: 0.6068 - accuracy: 0.4929
Best val_acc: 0.49287893970961477


([0.6757211400515514,
  0.6423971508348928,
  0.6247718452117542,
  0.6170047130982695,
  0.612798991490737],
 [0.48888117797289166,
  0.41915972998872564,
  0.4510776361704554,
  0.47943085888444775,
  0.4872640252038715],
 [0.6557093546416328,
  0.6289802695720973,
  0.616530851566073,
  0.6106971181710316,
  0.606817088963628],
 [0.45288659692608824,
  0.4111423071782251,
  0.4701236406364098,
  0.4872213592540935,
  0.49287893970961477])

## Basic GNN model

Two CGN layers with reverse message passing, because the people we follow influence us.<br>
(The reverse is also true, but to a lesser degree)


In [8]:
lr, emb_dim, hidden = 0.001, 20, 256
model = GCN(emb_size, emb_dim, hidden, n2v).to(DEVICE)

train_model(model=model, train_loader=train_loader, val_loader=val_loader, optimizer="Adam", lr=lr)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## Advanced GNN model

In [None]:
# TODO: Something that actually works
