## Data preprocessing

### Install requirements

In [None]:
!pip install -r requirements.txt -q

# pip didn't find pyg-lib for some reason
#!pip install pyg-lib>=0.4 torch-scatter>=2.1 torch-sparse>=0.6 torch-cluster>=1.6 torch-spline_conv>=1.2 -f https://data.pyg.org/whl/torch-2.5.0+cu121.html --force-reinstall


### Download data

In [None]:
# Ego networks
!wget -P /Data https://snap.stanford.edu/data/twitter.tar.gz
!unzip -qq /Data/twitter.tar.gz
!rm twitter.tar.gz

# Edge list
!wget -P /Data https://snap.stanford.edu/data/twitter_combined.txt.gz
!unzip /Data/twitter_combined.txt.gz
!rm twitter_combined.txt.gz


### Import packages

In [1]:
# General
import os
import numpy as np
import pandas as pd

#Torch
import torch
import torch.optim as optim

# Graph
import networkx as nx
import torch_geometric.nn as gnn
from torch_geometric.transforms import RandomLinkSplit
from torch_geometric.loader import LinkNeighborLoader
from torch_geometric.sampler import NegativeSampling
from torch_geometric.utils.convert import from_networkx

# Run on the first CUDA device when one is available, otherwise on the CPU
if torch.cuda.is_available():
	DEVICE = "cuda:0"
else:
	DEVICE = "cpu"
# Fixed seed for torch's random number generator (the returned Generator is the cell output)
torch.manual_seed(42)


<torch._C.Generator at 0x1e58483e670>

### Data preparation

In [None]:
# Get nodeID's with associated files (ego nodes)
# The part of each file name before the first "." is the node ID; dict.fromkeys drops
# repeated IDs while keeping the order in which os.listdir returned them
nodes = list(dict.fromkeys(file.split(".")[0] for file in os.listdir("./Data/twitter")))
nodes[:5]


['100318079', '10146102', '101859065', '101903164', '102765423']

In [None]:
# Per-file Series of normalised feature names; merged after the loop so that
# allfeatnames lists every name once, in order of discovery
featname_parts = []
# Per-file Series of node features as dictionaries of followed #-s and @-s (with multiplicities)
feat_parts = []

for c, node in enumerate(nodes, start=1):
	if not c%10:
		print(f"File {c} out of {len(nodes)}")

	# Remove duplicates with special characters and lowercase-uppercase shenanigans
	featnames = pd.read_csv(f"./Data/twitter/{node}.featnames", sep=" ", header=None)[1].str.rstrip(".,;:'+-!?)(}{][").str.lstrip("#@").str.lower()
	featname_parts.append(featnames)

	# Feature vectors except ego node (first column of the file is the node ID)
	featdf = pd.read_csv(f"./Data/twitter/{node}.feat", sep=" ", header=None).set_index(0).set_axis(featnames, axis=1)

	# Feature vector of ego node (the file has no ID column, so the ID from the file name is prepended)
	egofeat = pd.read_csv(f"./Data/twitter/{node}.egofeat", sep=" ", header=None).set_axis(featnames, axis=1)
	egofeat = pd.concat([pd.DataFrame([int(node)], columns=[0]), egofeat], axis=1).set_index(0)

	# All feature vectors in a DataFrame; columns that ended up with the same normalised
	# name are summed, which is where multiplicities above 1 come from
	featdf = pd.concat([egofeat, featdf])
	featdf = featdf.T.groupby(level=0).sum().T

	# Create bag-of-words style dictionaries to later turn into tensors (zero entries are left out)
	featdf["Features"] = [{col: featdf.at[idx, col] for col in featdf.columns if featdf.at[idx, col]} for idx in featdf.index]

	feat_parts.append(featdf.Features)

	# Edges - use twitter_combined instead
	# edgedf = pd.read_csv(f"./Data/twitter/{node}.edges", sep=" ", header=None).T

# Combine once after the loop: growing a Series/DataFrame with pd.concat inside the loop
# copies everything collected so far again on every iteration
allfeatnames = pd.Series(pd.concat([pd.Series(dtype=str), *featname_parts]).unique())
allfeats = pd.concat([pd.DataFrame(columns=["Features"]), *feat_parts])

# Drop missing (NaN) feature names and save
allfeatnames = allfeatnames.dropna()
allfeatnames.to_json("feature_names.json")
print(allfeatnames)

# Drop the empty dicts (there's over 10.000 unique nodes following nothing)
allfeats = allfeats.loc[allfeats["Features"] != {}].sort_index()
allfeats


File 10 out of 973
File 20 out of 973
File 30 out of 973
File 40 out of 973
File 50 out of 973
File 60 out of 973
File 70 out of 973
File 80 out of 973
File 90 out of 973
File 100 out of 973
File 110 out of 973
File 120 out of 973
File 130 out of 973
File 140 out of 973
File 150 out of 973
File 160 out of 973
File 170 out of 973
File 180 out of 973
File 190 out of 973
File 200 out of 973
File 210 out of 973
File 220 out of 973
File 230 out of 973
File 240 out of 973
File 250 out of 973
File 260 out of 973
File 270 out of 973
File 280 out of 973
File 290 out of 973
File 300 out of 973
File 310 out of 973
File 320 out of 973
File 330 out of 973
File 340 out of 973
File 350 out of 973
File 360 out of 973
File 370 out of 973
File 380 out of 973
File 390 out of 973
File 400 out of 973
File 410 out of 973
File 420 out of 973
File 430 out of 973
File 440 out of 973
File 450 out of 973
File 460 out of 973
File 470 out of 973
File 480 out of 973
File 490 out of 973
File 500 out of 973
File 510 

Unnamed: 0,Features
12,"{'claychristensen': 1, 'coachella': 1, 'gabrie..."
12,"{'charlieroseshow': 1, 'claychristensen': 1, '..."
12,"{'claychristensen': 1, 'jack': 1, 'makeitwork'..."
12,"{'jack': 1, 'jbrewer': 1, 'twitter': 1, 'twitt..."
12,"{'claychristensen': 1, 'foodtruck': 1, 'gzahnd..."
...,...
568552194,"{'a_olivieri11': 1, 'atlanta_falcons': 1, 'dak..."
568627575,"{'lucasgomezbr': 1, 'natystadler': 1}"
568655523,"{'dailyprofet': 1, 'paixaojovem': 1}"
568699879,"{'hotmail.com': 1, 'lualone': 1, 'mileybrcom':..."


In [None]:
# allfeats contains many duplicate indices which we need to deal with

def dict_aggregation(dicts):
	"""Merge several feature dictionaries into one.

	Used for duplicate indices, whose feature vectors differ across files.
	For a key that occurs in more than one dictionary the largest value
	(multiplicity) is kept.

	Args:
		dicts: Iterable of dictionaries mapping a feature name to its multiplicity.

	Returns:
		A new dictionary; keys appear in the order in which they were first seen.
	"""
	merged = {}
	for current in dicts:
		for name, count in current.items():
			# Store the value unless an equal or larger one is already recorded
			if name not in merged or not (merged[name] >= count):
				merged[name] = count
	return merged

# Node IDs that occur more than once in the index
duplicate_ids = allfeats.index[allfeats.index.duplicated()].unique()

# Give every row of such a node the aggregated dict ...
for idx in duplicate_ids:
	rows = allfeats.loc[idx]
	agg = dict_aggregation(rows.Features)
	allfeats.loc[idx, "Features"] = [agg for _ in range(len(rows))]

# ... then keep only the first row per node ID
allfeats = allfeats.loc[~allfeats.index.duplicated()]
allfeats


Unnamed: 0,Features,Bag_of_Words
12,"{'claychristensen': 1, 'coachella': 1, 'gabrie...",12
13,"{'brainpicker': 1, 'ev': 1, 'eventbrite': 1, '...",13
17,"{'amyquispe': 1, 'baratunde': 1, 'busterbenson...",17
20,"{'aaronsw': 1, 'abdur': 1, 'amac': 1, 'dustin'...",20
47,"{'al3x': 1, 'alexandrak': 1, 'allspaw': 1, 'av...",47
...,...,...
568552194,"{'a_olivieri11': 1, 'atlanta_falcons': 1, 'dak...",568552194
568627575,"{'lucasgomezbr': 1, 'natystadler': 1}",568627575
568655523,"{'dailyprofet': 1, 'paixaojovem': 1}",568655523
568699879,"{'hotmail.com': 1, 'lualone': 1, 'mileybrcom':...",568699879


In [None]:
def id2bow(index, bag_size = 195):
	"""Create a Bag of Words tensor given an index.

	Reads the node's feature dictionary from the global `allfeats` and writes, for
	every feature name, the name's index label in the global `allfeatnames` into the
	output once per unit of multiplicity. The remaining slots stay 0.

	NOTE(review): 0 is the padding value and may also be the label of a real feature
	name in `allfeatnames` - confirm that downstream code accounts for this.

	Args:
		index: Node ID, a label of `allfeats.index`.
		bag_size: Length of the returned tensor.

	Returns:
		A 1-D `torch.int64` tensor of length `bag_size`.

	Raises:
		ValueError: If the multiplicities of the node add up to more than `bag_size`
			(the values would otherwise be cut off silently).
		KeyError: If a feature name of the node is not contained in `allfeatnames`.
	"""
	# Build the name -> label lookup once per `allfeatnames` object instead of scanning
	# the whole Series for every single key; it is rebuilt when `allfeatnames` is rebound
	cached = getattr(id2bow, "_lookup", None)
	if cached is None or cached[0] is not allfeatnames:
		cached = (allfeatnames, {name: int(label) for label, name in allfeatnames.items()})
		id2bow._lookup = cached
	name2label = cached[1]

	bow = torch.zeros(bag_size, dtype=torch.int64)
	dictionary = allfeats.loc[index, "Features"]
	j = 0
	for key,value in dictionary.items():
		if j + value > bag_size:
			raise ValueError(f"Features of node {index} do not fit into bag_size={bag_size}")
		# Accounts for multiplicities
		bow[j:j+value] = name2label[key]
		j += value
	# Occasional progress check
	# Note: Remaining index values are not uniformly distributed
	# (the divisor is presumably the largest node ID divided by 100 - TODO confirm)
	if not index%1000:
		print(f"About {index/5687702:.2f}% done")
	return bow

# bag_size should be at least this (set one higher just to be sure)
print("bag_size: ", allfeats.Features.apply(lambda x: sum(x.values())).max())

# Apply id2bow to every node ID and attach the result as a new column in one step.
# Creating an integer column first and then overwriting it with tensors through
# .loc[:, ...] is an incompatible-dtype assignment, which recent pandas versions deprecate.
allfeats["Bag_of_Words"] = allfeats.index.to_series().apply(id2bow)
allfeats


bag_size:  194
About 2.48% done
About 2.48% done
About 2.49% done
About 2.51% done
About 2.59% done
About 2.66% done
About 2.70% done
About 2.75% done
About 2.78% done
About 2.80% done
About 3.01% done
About 3.02% done
About 3.03% done
About 3.28% done
About 3.30% done
About 3.37% done
About 3.53% done
About 3.66% done
About 3.77% done
About 3.91% done
About 4.31% done
About 4.32% done
About 4.34% done
About 4.58% done
About 4.65% done
About 4.80% done
About 4.80% done
About 4.80% done
About 5.63% done
About 6.02% done
About 6.95% done
About 6.96% done
About 7.76% done
About 8.04% done
About 8.38% done
About 9.75% done
About 10.26% done
About 10.67% done
About 12.20% done
About 12.97% done
About 13.15% done
About 13.39% done
About 14.10% done
About 16.38% done
About 17.40% done
About 18.14% done
About 22.18% done
About 22.23% done
About 22.46% done
About 23.08% done
About 23.96% done
About 29.03% done
About 34.05% done
About 34.59% done
About 34.84% done
About 35.67% done
About 37.26% 

         115629,   7120,   1681,  45262,  24086,   8859,   7310,  45288,  45228,
          25336,  25336,  31027,   2976,   2976,  19157,  88730,  24332,  38647,
          25698,  51775,  59974,  60065,  60010,  60015,  39992,   3173,   3122,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0,      0,      0,
              0,      0,    

Unnamed: 0,Features,Bag_of_Words
12,"{'claychristensen': 1, 'coachella': 1, 'gabrie...","[tensor(31517), tensor(86), tensor(115673), te..."
13,"{'brainpicker': 1, 'ev': 1, 'eventbrite': 1, '...","[tensor(3462), tensor(11073), tensor(24296), t..."
17,"{'amyquispe': 1, 'baratunde': 1, 'busterbenson...","[tensor(70048), tensor(6685), tensor(24196), t..."
20,"{'aaronsw': 1, 'abdur': 1, 'amac': 1, 'dustin'...","[tensor(24104), tensor(51836), tensor(25369), ..."
47,"{'al3x': 1, 'alexandrak': 1, 'allspaw': 1, 'av...","[tensor(25362), tensor(25367), tensor(25368), ..."
...,...,...
568552194,"{'a_olivieri11': 1, 'atlanta_falcons': 1, 'dak...","[tensor(148053), tensor(130385), tensor(148073..."
568627575,"{'lucasgomezbr': 1, 'natystadler': 1}","[tensor(67889), tensor(67751), tensor(0), tens..."
568655523,"{'dailyprofet': 1, 'paixaojovem': 1}","[tensor(14243), tensor(56482), tensor(0), tens..."
568699879,"{'hotmail.com': 1, 'lualone': 1, 'mileybrcom':...","[tensor(676), tensor(306), tensor(318), tensor..."


In [None]:
# Save the feature dictionaries and Bag of Words tensors; a later cell reloads this file
allfeats.to_pickle("feature_bow.pkl")
allfeats


Unnamed: 0,Features,Bag_of_Words
12,"{'claychristensen': 1, 'coachella': 1, 'gabrie...","[tensor(31517), tensor(86), tensor(115673), te..."
13,"{'brainpicker': 1, 'ev': 1, 'eventbrite': 1, '...","[tensor(3462), tensor(11073), tensor(24296), t..."
17,"{'amyquispe': 1, 'baratunde': 1, 'busterbenson...","[tensor(70048), tensor(6685), tensor(24196), t..."
20,"{'aaronsw': 1, 'abdur': 1, 'amac': 1, 'dustin'...","[tensor(24104), tensor(51836), tensor(25369), ..."
47,"{'al3x': 1, 'alexandrak': 1, 'allspaw': 1, 'av...","[tensor(25362), tensor(25367), tensor(25368), ..."
...,...,...
568552194,"{'a_olivieri11': 1, 'atlanta_falcons': 1, 'dak...","[tensor(148053), tensor(130385), tensor(148073..."
568627575,"{'lucasgomezbr': 1, 'natystadler': 1}","[tensor(67889), tensor(67751), tensor(0), tens..."
568655523,"{'dailyprofet': 1, 'paixaojovem': 1}","[tensor(14243), tensor(56482), tensor(0), tens..."
568699879,"{'hotmail.com': 1, 'lualone': 1, 'mileybrcom':...","[tensor(676), tensor(306), tensor(318), tensor..."


### Creating a node2vec embedding of the graph nodes

The node2vec embedding is trained on the training edges only, so the data split has to be created first.

In [2]:
# Load files created previously

# Number of different feature names for the embedding size
# NOTE(review): hard-coded value - confirm it still matches the saved feature names
emb_size = 155522

# Features and Bag of Words tensors as written by allfeats.to_pickle
# (unpickling can execute code, so only load a file you created yourself)
allfeats = pd.read_pickle("feature_bow.pkl")
allfeats.head()


Unnamed: 0,Features,Bag_of_Words
12,"{'claychristensen': 1, 'coachella': 1, 'gabrie...","[tensor(31517), tensor(86), tensor(115673), te..."
13,"{'brainpicker': 1, 'ev': 1, 'eventbrite': 1, '...","[tensor(3462), tensor(11073), tensor(24296), t..."
17,"{'amyquispe': 1, 'baratunde': 1, 'busterbenson...","[tensor(70048), tensor(6685), tensor(24196), t..."
20,"{'aaronsw': 1, 'abdur': 1, 'amac': 1, 'dustin'...","[tensor(24104), tensor(51836), tensor(25369), ..."
47,"{'al3x': 1, 'alexandrak': 1, 'allspaw': 1, 'av...","[tensor(25362), tensor(25367), tensor(25368), ..."


In [3]:
# Build the graph: pandas edge list -> NetworkX DiGraph -> torch_geometric data object
# Complete edge list (column 0 is used as source, column 1 as target)
edge_list = pd.read_csv("./Data/twitter_combined.txt", sep=" ", header=None)

# Complete graph
G = nx.from_pandas_edgelist(edge_list, source=0, target=1, create_using=nx.DiGraph)
print("Complete graph nodes and edges: ", G.number_of_nodes(), G.number_of_edges())

# Restrict to nonzero feature nodes
G = G.subgraph(allfeats.index)
print("Restricted graph nodes and edges: ", G.number_of_nodes(), G.number_of_edges())

# Give each node its bag of words as an attribute (tensors are moved to DEVICE)
bow_by_node = {node_id: bow.to(DEVICE) for node_id, bow in allfeats.Bag_of_Words.items()}
nx.set_node_attributes(G, bow_by_node, "BoW")

# Create the graph with a bag of words as the only node attribute
graph = from_networkx(G)
graph


Complete graph nodes and edges:  81306 1768149
Restricted graph nodes and edges:  70923 1655299


Data(edge_index=[2, 1655299], BoW=[70923, 195], num_nodes=70923)

In [4]:
# Split the data: Apply train-val-test masks and create dataloaders

transform = RandomLinkSplit(num_val=0.1, num_test=0.1)
train_data, val_data, test_data = transform(graph)

# Training: the edges of train_data are the positive examples, negatives are sampled on the fly
train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=[20, 20],
    neg_sampling=NegativeSampling("binary"),
    batch_size=1024,
    shuffle=True,
)


def _eval_loader(split_data, batch_size=2048):
    """Create the loader for a validation or test split.

    The held-out edges of a split (and the negative examples RandomLinkSplit
    drew for them) are stored in `edge_label_index` / `edge_label`, while
    `edge_index` only holds the edges used for message passing. Without
    `edge_label_index` the loader would iterate over the message-passing edges
    and the evaluation would not see the held-out edges at all. No additional
    negative sampling is requested because the labels already contain negatives.

    Args:
        split_data: Validation or test data object returned by RandomLinkSplit.
        batch_size: Number of labelled edges per mini-batch.

    Returns:
        A LinkNeighborLoader over the labelled edges of `split_data`, not shuffled.
    """
    return LinkNeighborLoader(
        data=split_data,
        num_neighbors=[20, 20],
        edge_label_index=split_data.edge_label_index,
        edge_label=split_data.edge_label,
        batch_size=batch_size,
        shuffle=False,
    )


val_loader = _eval_loader(val_data)
test_loader = _eval_loader(test_data)


In [None]:
def n2v_train(numEpoch = 5,
			  batch_size = 128,
			  lr = 0.001,
			  **kwargs
			  ):
	"""Train a Node2Vec model and return it.

	Args:
		numEpoch: Number of passes over the random-walk loader.
		batch_size: Batch size of the random-walk loader.
		lr: Learning rate of the Adam optimizer.
		**kwargs: Passed on unchanged to the `gnn.Node2Vec` constructor.

	Returns:
		The trained Node2Vec model, located on DEVICE.
	"""
	model = gnn.Node2Vec(**kwargs).to(DEVICE)
	walk_loader = model.loader(batch_size=batch_size, shuffle=True)
	adam = optim.Adam(model.parameters(), lr=lr)

	model.train()
	for epoch_no in range(1, numEpoch + 1):
		batch_losses = []
		for pos_rw, neg_rw in walk_loader:
			adam.zero_grad()
			batch_loss = model.loss(pos_rw.to(DEVICE), neg_rw.to(DEVICE))
			batch_loss.backward()
			adam.step()
			batch_losses.append(batch_loss.item())
		# Report the mean batch loss of the epoch
		print(f"Epoch {epoch_no} loss: {sum(batch_losses)/len(walk_loader):.4f}")
	return model

# Arguments for the Node2Vec constructor (n2v_train forwards them through **kwargs).
# Only the training edges are used for the random walks.
# NOTE(review): num_nodes is not passed, so Node2Vec presumably derives it from
# edge_index - confirm that this covers every node of the graph.
n2v_config = dict(
	edge_index = train_data.edge_index,
	embedding_dim = 20,
	walk_length = 10,
	context_size = 10,
	walks_per_node = 500,
)

node2vec_trained = n2v_train(numEpoch=100, batch_size=512, **n2v_config)

# Save
torch.save(node2vec_trained.state_dict(), "node2vec_trained.pt")


#### Obsolete code

This was my original plan before settling on bag-of-words embeddings of the node features.<br>
(Note: This would have resulted in ~71k (sparse) tensor files, which are not large but are slow to look up, and the graph would need too much memory anyway.)


In [None]:
def dict2vec(d, idx):
	"""Turn a feature dictionary into a vector and save it as a sparse tensor file.

	Args:
		d: Dictionary mapping a feature name to its multiplicity.
		idx: Node ID; used in the name of the file that is written.
	"""
	# One slot more than there are feature names in the global allfeatnames
	dense = torch.zeros(len(allfeatnames)+1, dtype=torch.int64)
	for name, count in d.items():
		# Index labels of the entries of allfeatnames that equal this name
		positions = allfeatnames.loc[allfeatnames == name].index
		dense[positions] = count
	torch.save(dense.to_sparse(), f"./Data/tensors/vector{idx}.pt")

def id2vec(idx):
	"""Load the feature vector of a node by its node ID.

	Args:
		idx: Node ID that was used in the name of the saved tensor file.

	Returns:
		The saved vector as a dense tensor on DEVICE. If no file exists for the ID,
		"Bad index" is printed and a zero tensor of length `emb_size` is returned.
	"""
	path = f"./Data/tensors/vector{idx}.pt"
	if not os.path.isfile(path):
		print("Bad index")
		return torch.zeros(emb_size).to(DEVICE)
	return torch.load(path).to_dense().to(DEVICE)

# Huge computation, never run this again!
# The "Features" column is read explicitly: allfeats.loc[idx].item() only works while
# allfeats has a single column and fails once the Bag_of_Words column exists
for idx, feats in allfeats.Features.items():
	dict2vec(feats, idx)
