In [None]:
import torch
from torch import Tensor
# Show the installed PyTorch build; the install cell below exports this same
# version string as $TORCH to pick the matching PyG wheel pages.
print(torch.__version__)

2.5.1+cu124


In [None]:
# Install required packages.
import os
# Export the running PyTorch version so the shell commands below can expand ${TORCH}.
os.environ['TORCH'] = torch.__version__

# Each -f URL is an extra page pip searches for wheels; it is keyed by the
# PyTorch version exported above so the compiled extensions match this build.
!pip install torch-scatter -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install torch-sparse -f https://data.pyg.org/whl/torch-${TORCH}.html
!pip install pyg-lib -f https://data.pyg.org/whl/nightly/torch-${TORCH}.html
# PyTorch Geometric itself is installed from the GitHub repository (no pinned version).
!pip install git+https://github.com/pyg-team/pytorch_geometric.git

Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu124.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu124/torch_scatter-2.1.2%2Bpt25cu124-cp311-cp311-linux_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m49.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt25cu124
Looking in links: https://data.pyg.org/whl/torch-2.5.1+cu124.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.5.0%2Bcu124/torch_sparse-0.6.18%2Bpt25cu124-cp311-cp311-linux_x86_64.whl (5.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.2/5.2 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt25cu124
Looking in links: https://data.pyg.org/whl/nightly/torch-2.5.1+cu124.html
Collecting pyg-lib
  Downloading https://data

In [7]:
import networkx as nx

def create_atlas_graph(data):
    """
    Build the directed "Atlas" word-chain graph over a list of place names.

    Every name becomes a node, and an edge ``a -> b`` is added whenever the
    last character of ``a`` equals the first character of ``b``, compared
    case-insensitively. A name whose own first and last characters match
    therefore gets a self-loop, and duplicate names collapse into one node.

    Parameters:
        data (list): A list of place names (countries/cities). Empty strings
            are ignored because they have no first or last character.

    Returns:
        G (networkx.DiGraph): A directed graph.
    """
    G = nx.DiGraph()

    # Drop empty names up front; indexing them with [0] / [-1] would raise.
    names = [place for place in data if place]

    # Index the names by lower-cased first character once, instead of
    # rescanning the whole list for every place. Each bucket keeps input
    # order, so nodes and edges are inserted in the same order as a full
    # pairwise scan would insert them.
    starts_with = {}
    for candidate in names:
        starts_with.setdefault(candidate[0].lower(), []).append(candidate)

    for place in names:
        G.add_node(place)
        last_letter = place[-1].lower()  # last character of the name, case-insensitive comparison
        for candidate in starts_with.get(last_letter, ()):
            G.add_edge(place, candidate)

    return G

# Read the list of countries from file (one name per line; blank lines are skipped).
# The encoding is stated explicitly so names containing non-ASCII characters are
# decoded the same way on every platform instead of depending on the locale.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open("/content/countries.txt", "r", encoding="utf-8") as file:
    countries = [name for name in (line.strip() for line in file) if name]

country_graph = create_atlas_graph(countries)

In [37]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
from torch_geometric.data import Data
from torch_geometric.nn import Node2Vec, GCNConv, GAE
from sklearn.metrics import roc_auc_score

# 1. Convert the NetworkX Graph to a PyG Data Object with Node Features
# Give every country its position in `countries` as its numeric node id.
mapping = dict(zip(countries, range(len(countries))))

# Node features: a 2D vector [scaled(first_letter), scaled(last_letter)] per country.
def get_feature(country):
    """
    Return the two-element feature list for one country name.

    The name is lower-cased, then its first and last characters are mapped
    through ``(ord(ch) - ord('a')) / 25.0``, so 'a' becomes 0.0 and 'z'
    becomes 1.0. Characters outside a-z produce values outside [0, 1].
    """
    lowered = country.lower()
    base = ord('a')
    return [(ord(ch) - base) / 25.0 for ch in (lowered[0], lowered[-1])]

# Feature matrix (num_nodes x 2): one get_feature() row per country, in `countries` order.
num_nodes = len(countries)
x = torch.tensor(list(map(get_feature, countries)), dtype=torch.float)

# Translate every NetworkX edge (a pair of names) into a pair of numeric node ids.
edge_list = [
    [mapping[source], mapping[target]]
    for source, target in country_graph.edges()
]
# Transpose the list of pairs so that sources are row 0 and targets are row 1.
edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()

print("PyG Data object will have {} nodes and {} edges.".format(num_nodes, edge_index.size(1)))

# Bundle features and connectivity into a PyG Data object.
data = Data(x=x, edge_index=edge_index)

# 3. Unsupervised Link Prediction with Node2Vec (PyG Implementation)
# Seed the Python, NumPy and PyTorch RNGs so a fresh Restart & Run All draws the
# same random walks, negative samples and edge split each time.
# NOTE(review): some GPU kernels may still be non-deterministic — confirm if exact
# reproducibility on CUDA matters.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Define Node2Vec parameters.
embedding_dim = 13
walk_length = 10
context_size = 5
walks_per_node = 10
num_negative_samples = 1

nv_epochs = 1000
learning_rate_nv = 0.008
batch_size_nv = 128

# Create Node2Vec model (using the graph’s edge_index).
# num_nodes is passed explicitly so the embedding table has one row per country
# even if the highest-numbered nodes have no edges; the evaluation code samples
# node ids from range(num_nodes) and indexes the embeddings with them.
node2vec = Node2Vec(
    data.edge_index,
    embedding_dim=embedding_dim,
    walk_length=walk_length,
    context_size=context_size,
    walks_per_node=walks_per_node,
    num_negative_samples=num_negative_samples,
    p=1, q=1,
    num_nodes=num_nodes,
    sparse=True
)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
node2vec = node2vec.to(device)
# sparse=True above makes the embedding produce sparse gradients, which SparseAdam handles.
optimizer_n2v = torch.optim.SparseAdam(list(node2vec.parameters()), lr=learning_rate_nv)

def train_node2vec(num_epochs=None, batch_size=None, log_every=10):
    """
    Train the module-level ``node2vec`` model using ``optimizer_n2v``.

    Parameters:
        num_epochs (int, optional): Number of passes over the walk loader.
            Defaults to the module-level ``nv_epochs``.
        batch_size (int, optional): Batch size handed to ``node2vec.loader``.
            Defaults to the module-level ``batch_size_nv``.
        log_every (int, optional): Print the epoch's average loss every this
            many epochs. ``0`` or ``None`` disables logging.

    Returns:
        float: The mean, over all epochs, of each epoch's average batch loss.
        This is NOT the loss of the last epoch.

    Raises:
        ValueError: If ``num_epochs`` is smaller than 1 or the loader yields
            no batches (either would otherwise end in a division by zero).
    """
    if num_epochs is None:
        num_epochs = nv_epochs
    if batch_size is None:
        batch_size = batch_size_nv
    if num_epochs < 1:
        raise ValueError("num_epochs must be at least 1")

    node2vec.train()
    loader = node2vec.loader(batch_size=batch_size, shuffle=True, num_workers=0)
    num_batches = len(loader)
    if num_batches == 0:
        raise ValueError("the Node2Vec loader produced no batches")

    total_loss = 0
    for epoch in range(1, num_epochs + 1):
        epoch_loss = 0
        for pos_rw, neg_rw in loader:
            pos_rw = pos_rw.to(device)
            neg_rw = neg_rw.to(device)
            optimizer_n2v.zero_grad()
            loss = node2vec.loss(pos_rw, neg_rw)
            loss.backward()
            optimizer_n2v.step()
            epoch_loss += loss.item()
        avg_loss = epoch_loss / num_batches
        if log_every and epoch % log_every == 0:
            print(f"Node2Vec Epoch {epoch}, Loss: {avg_loss:.4f}")
        total_loss += avg_loss
    return total_loss / num_epochs

# train_node2vec() returns the loss averaged over all epochs, so it is reported
# as a mean rather than as the final-epoch loss.
loss_n2v = train_node2vec()
print("Mean Node2Vec training loss over all epochs:", loss_n2v)

# Extract the embeddings.
embeddings = node2vec.embedding.weight.detach().cpu()

# Define a link prediction evaluation function.
def evaluate_link_prediction(embeds, pos_edges, num_negatives=None):
    """
    Score link prediction with dot products of node embeddings; return ROC AUC.

    Positive examples are the columns of ``pos_edges``. Negative examples are
    node pairs drawn uniformly at random (with ``random.randint``) that are
    not present in the module-level ``edge_list``; the same pair may be drawn
    more than once.

    Parameters:
        embeds (torch.Tensor): Embedding matrix indexed by node id.
        pos_edges (torch.Tensor): Edge index whose row 0 holds sources and
            row 1 holds targets.
        num_negatives (int, optional): Number of negative pairs to sample.
            Defaults to the number of positive edges.

    Returns:
        float: ROC AUC of the dot-product scores.

    Raises:
        ValueError: If negatives are requested but every possible node pair
            is already an edge, so sampling could never finish.
    """
    num_pos = pos_edges.size(1)
    if num_negatives is None:
        num_negatives = num_pos

    # A set makes each membership test O(1); checking `[i, j] in edge_list`
    # rescans the whole edge list for every sampled pair.
    known_edges = {(src, dst) for src, dst in edge_list}
    if num_negatives > 0 and len(known_edges) >= num_nodes * num_nodes:
        raise ValueError("the graph has no non-edges to sample as negatives")

    neg_edges = []
    while len(neg_edges) < num_negatives:
        i = random.randint(0, num_nodes - 1)
        j = random.randint(0, num_nodes - 1)
        if (i, j) not in known_edges:
            neg_edges.append([i, j])
    neg_edges = torch.tensor(neg_edges, dtype=torch.long).t().contiguous()

    # Work on detached CPU tensors so .numpy() succeeds wherever the inputs live.
    embeds = embeds.detach().cpu()
    pos_edges = pos_edges.cpu()

    pos_scores = (embeds[pos_edges[0]] * embeds[pos_edges[1]]).sum(dim=1).numpy()
    neg_scores = (embeds[neg_edges[0]] * embeds[neg_edges[1]]).sum(dim=1).numpy()

    scores = np.concatenate([pos_scores, neg_scores])
    labels = np.concatenate([np.ones_like(pos_scores), np.zeros_like(neg_scores)])
    auc = roc_auc_score(labels, scores)
    return auc

# Score Node2Vec on every edge of the graph, i.e. the same edges it was trained on.
pos_edges = data.edge_index
auc_n2v = evaluate_link_prediction(embeds=embeddings, pos_edges=pos_edges)
print("Node2Vec link prediction AUC:", auc_n2v)

# 4. Unsupervised Link Prediction with a GNN (Graph Autoencoder using GCN)
# Encoder for the autoencoder: two GCN layers separated by a ReLU.
class GCNEncoder(nn.Module):
    """Two stacked ``GCNConv`` layers; the hidden layer is twice the output width."""

    def __init__(self, in_channels, out_channels):
        """Create the layers: ``in_channels -> 2 * out_channels -> out_channels``."""
        super().__init__()
        hidden_channels = 2 * out_channels
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Apply conv1, ReLU, then conv2 over ``edge_index``; return conv2's output."""
        hidden = F.relu(self.conv1(x, edge_index))
        return self.conv2(hidden, edge_index)

# GAE hyperparameters.
hidden_dim = 17
learning_rate_gnn = 0.005
split_ratio = 0.2
gae_epochs = 1000

# Wrap the GCN encoder in a GAE, then move the model and the graph to the same device.
encoder = GCNEncoder(data.num_node_features, hidden_dim)
model = GAE(encoder).to(device)
data = data.to(device)
optimizer_gae = torch.optim.Adam(params=model.parameters(), lr=learning_rate_gnn)

# Function to mask (remove) a portion of edges for testing.
def mask_edges(data, test_ratio=None):
    """
    Randomly split the edges of ``data`` into a training and a test set.

    The edge list is shuffled with ``random.shuffle``; the first
    ``int(test_ratio * num_edges)`` edges become the test set and the rest the
    training set.

    Parameters:
        data: Object exposing ``edge_index``, a ``(2, num_edges)`` tensor.
        test_ratio (float, optional): Fraction of edges held out for testing,
            in ``[0, 1]``. Defaults to the module-level ``split_ratio``, read
            when the function is called.

    Returns:
        tuple: ``(train_edge_index, test_edge_index)``. Both are
        ``(2, k)`` long tensors on the same device as ``data.edge_index``,
        including when a split is empty (``k == 0``).

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]``.
    """
    if test_ratio is None:
        test_ratio = split_ratio
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError("test_ratio must be between 0 and 1")

    source_index = data.edge_index
    # Convert edge_index to a list of [source, target] pairs.
    edges = source_index.t().tolist()
    num_test = int(test_ratio * len(edges))
    random.shuffle(edges)

    def _to_edge_index(pairs):
        # reshape(-1, 2) keeps an empty split two-dimensional, so .size(1) on
        # the result returns 0 instead of raising.
        pair_tensor = torch.tensor(pairs, dtype=torch.long).reshape(-1, 2)
        return pair_tensor.t().contiguous().to(source_index.device)

    train_edge_index = _to_edge_index(edges[num_test:])
    test_edge_index = _to_edge_index(edges[:num_test])
    return train_edge_index, test_edge_index

# Use the configured split_ratio rather than a second hard-coded 0.2, so the
# split that is applied always matches the split_ratio printed in the summary.
train_edge_index, test_edge_index = mask_edges(data, test_ratio=split_ratio)
print("GAE: Training edges:", train_edge_index.size(1), "Test edges:", test_edge_index.size(1))
data.train_edge_index = train_edge_index

# One optimisation step of the GAE (unsupervised reconstruction of the graph).
def train_gae():
    """
    Run a single full-graph training step and return its loss as a float.

    Node embeddings are computed from ``data.x`` over the training edges only,
    and the reconstruction loss is evaluated against those same training edges.
    """
    model.train()
    optimizer_gae.zero_grad()
    train_edges = data.train_edge_index
    node_embeddings = model.encode(data.x, train_edges)
    recon = model.recon_loss(node_embeddings, train_edges)
    recon.backward()
    optimizer_gae.step()
    return recon.item()

# Optimise the GAE for the configured number of epochs, logging every 20th one.
num_gae_epochs = gae_epochs
for epoch in range(1, num_gae_epochs + 1):
    loss = train_gae()
    if not epoch % 20:
        print(f"GAE Epoch {epoch}, Loss: {loss:.4f}")

# Evaluate GAE link prediction performance.
# Embeddings are computed in eval mode, without gradient tracking, from the
# training edges only (the test edges stay hidden from the encoder).
model.eval()
with torch.no_grad():
    z = model.encode(data.x, data.train_edge_index)

def evaluate_gae(z, pos_edge_index, num_negatives=None):
    """
    Score held-out edges with dot products of GAE embeddings; return ROC AUC.

    Positive examples are the columns of ``pos_edge_index``. Negative examples
    are node pairs drawn uniformly at random (with ``random.randint``) that
    are not present in the module-level ``edge_list``; the same pair may be
    drawn more than once.

    Parameters:
        z (torch.Tensor): Embedding matrix indexed by node id.
        pos_edge_index (torch.Tensor): Edge index whose row 0 holds sources
            and row 1 holds targets.
        num_negatives (int, optional): Number of negative pairs to sample.
            Defaults to the number of positive edges.

    Returns:
        float: ROC AUC of the dot-product scores.

    Raises:
        ValueError: If negatives are requested but every possible node pair
            is already an edge, so sampling could never finish.
    """
    num_pos = pos_edge_index.size(1)
    if num_negatives is None:
        num_negatives = num_pos

    # A set makes each membership test O(1); checking `[i, j] in edge_list`
    # rescans the whole edge list for every sampled pair.
    known_edges = {(src, dst) for src, dst in edge_list}
    if num_negatives > 0 and len(known_edges) >= num_nodes * num_nodes:
        raise ValueError("the graph has no non-edges to sample as negatives")

    neg_edges = []
    while len(neg_edges) < num_negatives:
        i = random.randint(0, num_nodes - 1)
        j = random.randint(0, num_nodes - 1)
        if (i, j) not in known_edges:
            neg_edges.append([i, j])
    # Put the sampled pairs on the embeddings' own device so indexing works.
    neg_edge_index = torch.tensor(neg_edges, dtype=torch.long).t().contiguous().to(z.device)

    pos_scores = (z[pos_edge_index[0]] * z[pos_edge_index[1]]).sum(dim=1).cpu().numpy()
    neg_scores = (z[neg_edge_index[0]] * z[neg_edge_index[1]]).sum(dim=1).cpu().numpy()

    scores = np.concatenate([pos_scores, neg_scores])
    labels = np.concatenate([np.ones_like(pos_scores), np.zeros_like(neg_scores)])
    auc = roc_auc_score(labels, scores)
    return auc

# Score the GAE on the held-out test edges.
auc_gae = evaluate_gae(z, test_edge_index)
print("GAE link prediction AUC:", auc_gae)

# Summary: each model's AUC followed by the hyperparameters that produced it.
print()
print("-----------------------------------------------------")
for label, value in [
    ("Node2Vec link prediction AUC:", auc_n2v),
    ("embedding_dim: ", embedding_dim),
    ("walk_length: ", walk_length),
    ("context_size: ", context_size),
    ("walks_per_node: ", walks_per_node),
    ("num_negative_samples: ", num_negative_samples),
    ("nv_epochs: ", nv_epochs),
    ("learning_rate: ", learning_rate_nv),
]:
    print(label, value)

print()

print("-----------------------------------------------------")
for label, value in [
    ("GAE link prediction AUC:", auc_gae),
    ("hidden_dim: ", hidden_dim),
    ("split_ratio: ", split_ratio),
    ("gae_epochs: ", gae_epochs),
    ("learning_rate: ", learning_rate_gnn),
]:
    print(label, value)

PyG Data object will have 196 nodes and 2055 edges.
Node2Vec Epoch 10, Loss: 2.4903
Node2Vec Epoch 20, Loss: 1.9478
Node2Vec Epoch 30, Loss: 1.6251
Node2Vec Epoch 40, Loss: 1.4046
Node2Vec Epoch 50, Loss: 1.2585
Node2Vec Epoch 60, Loss: 1.1743
Node2Vec Epoch 70, Loss: 1.1143
Node2Vec Epoch 80, Loss: 1.0877
Node2Vec Epoch 90, Loss: 1.0680
Node2Vec Epoch 100, Loss: 1.0395
Node2Vec Epoch 110, Loss: 1.0313
Node2Vec Epoch 120, Loss: 1.0196
Node2Vec Epoch 130, Loss: 1.0165
Node2Vec Epoch 140, Loss: 1.0007
Node2Vec Epoch 150, Loss: 1.0024
Node2Vec Epoch 160, Loss: 0.9972
Node2Vec Epoch 170, Loss: 0.9911
Node2Vec Epoch 180, Loss: 0.9971
Node2Vec Epoch 190, Loss: 0.9865
Node2Vec Epoch 200, Loss: 0.9848
Node2Vec Epoch 210, Loss: 0.9838
Node2Vec Epoch 220, Loss: 0.9826
Node2Vec Epoch 230, Loss: 0.9731
Node2Vec Epoch 240, Loss: 0.9755
Node2Vec Epoch 250, Loss: 0.9783
Node2Vec Epoch 260, Loss: 0.9775
Node2Vec Epoch 270, Loss: 0.9691
Node2Vec Epoch 280, Loss: 0.9774
Node2Vec Epoch 290, Loss: 0.9740


# Analysis & Intuition
1. Node Features

    I built a graph where each country is a node, and there's an edge from one country to another if the last letter of the first country's name matches the first letter of the second. To help represent each country, I created a very simple feature for each node: a two-number vector. The first number represents the normalized value of the country's first letter, and the second represents the normalized value of its last letter. This choice makes sense because our rule for connecting countries is based solely on these letters.

2. Node2Vec Approach

    For the Node2Vec model, I use a method that is a bit like teaching the computer to explore the graph. Here's how it works:

    - Random Walks: The model takes random walks through the graph. This gives the model lots of examples of which countries tend to be connected or appear near each other.

    - Learning by Context: While taking these walks, the model learns to predict which countries (nodes) appear together. This is called the skip-gram objective.

    - Negative Sampling: To help the model understand what “unrelated” looks like, for every positive example (countries that do appear together on a walk), the model is also shown a negative example built from randomly chosen countries, which are unlikely to appear together. In my setup (`num_negative_samples = 1`), one negative example is sampled for every positive example.

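    - Link Prediction: After training, each country has a vector (its embedding). I then use the dot product (a measure of similarity) between two country embeddings as the score for whether an edge should exist: the more similar two embeddings are, the more likely the countries are to be connected. Note that Node2Vec is scored on the same edges it was trained on, so its AUC is not directly comparable to the GAE's, which is measured on held-out edges.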

3. GNN (Graph Autoencoder) Approach

    For the GNN approach, I use a Graph Autoencoder (GAE) built with a simple two-layer Graph Convolutional Network (GCN):

    - Graph Convolution:
    The GCN learns new representations (or embeddings) for each country by “mixing” information from its neighbors. This way, each country's new features reflect both its own simple features (first and last letters) and the structure of the graph.

    - Masking Edges:
    Before training, I remove (mask) a portion of the edges from the graph. The idea is to force the model to learn enough about the overall graph structure so it can predict these missing connections.

    - Unsupervised Reconstruction: The GAE is trained to reconstruct the edges that remain after masking (the training edges); the masked-out edges are held out and never seen during training. This means the model adjusts its internal representations so that, when it tries to recreate the training graph, it comes as close as possible to that structure.

    - Link Prediction: Once the model is trained, I score node pairs with the dot product of their embeddings and check how well those scores separate the actual held-out edges (the ones that were masked for testing) from an equal number of randomly sampled non-edges, summarized as the ROC AUC. This tells me how well the model learned the underlying connectivity.

4. Unsupervised Learning

    Both approaches are unsupervised:
    - Node2Vec doesn't need any labels because it learns from the way nodes appear together in random walks.

    - GAE doesn't need labeled data either because it learns by trying to reconstruct the graph from which some edges have been removed.

    In both cases, the training objectives are all about capturing the structure of the graph—learning what makes two nodes likely to be connected—without needing a separate “correct answer” for each edge.