In [None]:
# Done: 
# Graph and data generation
# Network Architecture
# Training loop

In [None]:
#%pip install torch
#%pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121


In [1]:
import pandas as pd
import os
import glob
import torch
import torch.nn as nn
import torch.nn.functional as F

In [2]:
class Encoder(nn.Module):
    """Two-layer MLP that lifts raw node features into the embedding space.

    Args:
        in_dim: number of input features per node.
        hidden_dim: width of the hidden layer.
        out_dim: size of the embedding produced for each node.
    """

    def __init__(self, in_dim, hidden_dim=64, out_dim=32):
        super().__init__()
        # Layers are created in forward order so parameter initialization
        # consumes the RNG in the same sequence as the layers are applied.
        input_proj = nn.Linear(in_dim, hidden_dim)
        output_proj = nn.Linear(hidden_dim, out_dim)
        self.net = nn.Sequential(input_proj, nn.ReLU(), output_proj)

    def forward(self, x):
        """Map features of shape (N, in_dim) to embeddings of shape (N, out_dim)."""
        embeddings = self.net(x)
        return embeddings


In [3]:
class MPNNLayer(nn.Module):
    """One message-passing step with sum aggregation.

    For every directed edge ``src -> dst`` a message ``msg_mlp(H[src])`` is
    computed; all messages arriving at a node are summed; the node state is
    then replaced by ``update_mlp([H, aggregated])``.

    Args:
        dim: size of the node embeddings (input and output width of the layer).
    """

    def __init__(self, dim):
        super().__init__()
        # Message function φ: R^dim -> R^dim
        self.msg_mlp = nn.Sequential(
            nn.Linear(dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim)
        )
        # Update function ψ: R^(2*dim) -> R^dim
        self.update_mlp = nn.Sequential(
            nn.Linear(2 * dim, dim),
            nn.ReLU(),
            nn.Linear(dim, dim)
        )

    def forward(self, H, edge_index):
        """
        Run one round of message passing.

        H: (N, dim) node embeddings at current layer
        edge_index: (2, E) integer tensor with [src; dst]; each entry is a row
            index into H. Messages travel only from src to dst, so an
            undirected graph needs both directions listed.

        Returns: (N, dim) node embeddings for the next layer.
        """
        src, dst = edge_index  # each: (E,)

        # 1. Messages from src nodes along edges
        m = self.msg_mlp(H[src])    # (E, dim)

        # 2. Aggregate messages at dst nodes by summation.
        #    Nodes without incoming edges keep an all-zero aggregate.
        agg = torch.zeros_like(H)   # (N, dim)
        agg.index_add_(0, dst, m)   # sum messages into dst indices

        # 3. Update node states using previous state + aggregated message
        h_cat = torch.cat([H, agg], dim=-1)  # (N, 2*dim)
        return self.update_mlp(h_cat)        # (N, dim)


In [4]:
class Decoder(nn.Module):
    """Two-layer MLP head that turns each node embedding into a single logit.

    Args:
        in_dim: size of the incoming node embeddings.
        hidden_dim: width of the hidden layer.
    """

    def __init__(self, in_dim, hidden_dim=32):
        super().__init__()
        hidden_layer = nn.Linear(in_dim, hidden_dim)
        logit_layer = nn.Linear(hidden_dim, 1)   # one raw score per node
        self.net = nn.Sequential(hidden_layer, nn.ReLU(), logit_layer)

    def forward(self, H):
        """Map embeddings of shape (N, in_dim) to logits of shape (N,)."""
        per_node = self.net(H)          # (N, 1)
        return per_node.squeeze(-1)     # drop the trailing singleton axis


In [5]:
# MPNN model used to classify nodes as influencers vs. non-influencers
class SocialGNN(nn.Module):
    """Encoder -> stacked message-passing layers -> decoder.

    Args:
        in_dim: number of raw features per node.
        hidden_dim: hidden width used by both the encoder and the decoder.
        emb_dim: node embedding size used throughout message passing.
        num_layers: how many MPNNLayer steps are applied in sequence.
    """

    def __init__(self, in_dim, hidden_dim=64, emb_dim=32, num_layers=3):
        super().__init__()
        self.encoder = Encoder(in_dim, hidden_dim, emb_dim)

        message_passing_steps = [MPNNLayer(emb_dim) for _ in range(num_layers)]
        self.layers = nn.ModuleList(message_passing_steps)

        self.decoder = Decoder(emb_dim, hidden_dim=hidden_dim)

    def forward(self, x, edge_index):
        """
        x: (N, in_dim) node features
        edge_index: (2, E) edge list

        Returns one logit per node, shape (N,).
        """
        # Features -> latent space
        node_states = self.encoder(x)  # (N, emb_dim)

        # Every layer sees the same edge list; only the node states evolve
        for mp_step in self.layers:
            node_states = mp_step(node_states, edge_index)

        # Latent space -> per-node logits
        return self.decoder(node_states)  # (N,)


In [6]:
# Names of the node-feature columns read from each graph's nodes.csv.
# The list order fixes the column order of the feature matrix X built in
# load_graph_from_dir; the target is read separately from the "label" column.
feature_names = [
    'normalized_degree',
    'clustering_coefficient',
    'posts_per_day',
    'likes_per_post',
    'follower_ratio',
    'age',
    'years_riding',
    'miles_ridden',
    'bikes_owned',
    'avg_displacement',
    'avg_msrp',
]

In [7]:
def load_graph_from_dir(graph_dir, feature_cols=None, label_col="label"):
    """
    Load one graph (node features, edge list, node labels) from a directory.

    The directory must contain two CSV files:
      * nodes.csv - one row per node, holding the feature columns and the
        label column.
      * edges.csv - one row per edge, holding integer "src" and "dst" columns.
        Their values are used directly as row positions into the node table.

    graph_dir: path like './data/train/000'
    feature_cols: column names to use as node features, in order. Defaults to
        the module-level ``feature_names`` list.
    label_col: name of the column holding the node labels.

    Returns: X (N,F) float32, edge_index (2,E) int64, y (N,) float32

    Raises:
        ValueError: if any edge endpoint lies outside [0, N). Catching this
            here avoids an opaque indexing failure later inside the model.
    """
    if feature_cols is None:
        feature_cols = feature_names
    feature_cols = list(feature_cols)

    nodes_df = pd.read_csv(os.path.join(graph_dir, "nodes.csv"))
    edges_df = pd.read_csv(os.path.join(graph_dir, "edges.csv"))

    # Node features and labels
    X = torch.tensor(nodes_df[feature_cols].to_numpy(dtype="float32"))   # (N, F)
    y = torch.tensor(nodes_df[label_col].to_numpy(dtype="float32"))      # (N,)

    # Edge list
    src = torch.tensor(edges_df["src"].to_numpy(dtype="int64"))
    dst = torch.tensor(edges_df["dst"].to_numpy(dtype="int64"))

    # For now: one direction per edge; you can symmetrize later if you want
    edge_index = torch.stack([src, dst], dim=0)   # (2, E)

    # Every endpoint must address an existing node row
    num_nodes = X.size(0)
    if edge_index.numel() > 0:
        lowest = int(edge_index.min())
        highest = int(edge_index.max())
        if lowest < 0 or highest >= num_nodes:
            raise ValueError(
                f"{graph_dir}: edge endpoints span [{lowest}, {highest}] "
                f"but the graph has only {num_nodes} nodes"
            )

    return X, edge_index, y

In [77]:
train_root = "./data/train"
val_root = "./data/validate"
test_root = "./data/test"


def list_graph_dirs(root):
    """Return the immediate subdirectories of ``root``, sorted by path.

    Plain files are ignored. A root that does not exist or has no
    subdirectories yields an empty list.
    """
    return sorted(
        path for path in glob.glob(os.path.join(root, "*"))
        if os.path.isdir(path)
    )


# One graph per subdirectory of each split root
train_graph_dirs = list_graph_dirs(train_root)
val_graph_dirs = list_graph_dirs(val_root)
test_graph_dirs = list_graph_dirs(test_root)

# Report how many graphs each split contains, with the first few paths
for split_label, split_dirs in (
    ("training graphs", train_graph_dirs),
    ("validation graphs", val_graph_dirs),
    ("test graphs", test_graph_dirs),
):
    print("Found", len(split_dirs), split_label)
    print("Example:", split_dirs[:3])



Found 20 training graphs
Example: ['./data/train\\000', './data/train\\001', './data/train\\002']
Found 10 validation graphs
Example: ['./data/validate\\000', './data/validate\\001', './data/validate\\002']
Found 100 test graphs
Example: ['./data/test\\000', './data/test\\001', './data/test\\002']


In [46]:
# This cell checks the ratio between influencers and non-influencers
# by printing the label counts and the mean of the labels (positive rate).

# Toggle which of the two checks below should run
check_single_graph = True
check_all_graphs = False

# Check a single graph
if check_single_graph:
    check_graph = 4  # position of the graph inside train_graph_dirs

    X, edge_index, y = load_graph_from_dir(train_graph_dirs[check_graph])
    print("y shape:", y.shape)
    print("Unique labels:", torch.unique(y, return_counts=True))
    print("Positive rate:", y.float().mean().item())

# Check all graphs in a directory
if check_all_graphs:
    # Iterate over the list itself so the loop always matches the number of
    # graphs actually present. Swap in train_graph_dirs / test_graph_dirs to
    # inspect a different split.
    for graph_dir in val_graph_dirs:
        X, edge_index, y = load_graph_from_dir(graph_dir)
        print("y shape:", y.shape)
        print("Unique labels:", torch.unique(y, return_counts=True))
        print("Positive rate:", y.float().mean().item())

y shape: torch.Size([20])
Unique labels: (tensor([0., 1.]), tensor([17,  3]))
Positive rate: 0.15000000596046448


In [51]:
# Seed PyTorch's random number generator so the initial weights are the same
# after every kernel restart (change SEED to draw a different initialization).
SEED = 0
torch.manual_seed(SEED)

# Fail with a clear message instead of a bare IndexError when no data is found
if not train_graph_dirs:
    raise FileNotFoundError("No training graph directories found; cannot deduce the input dimension")

# Load one graph to deduce input dimension
X_example, edge_index_example, y_example = load_graph_from_dir(train_graph_dirs[0])
in_dim = X_example.shape[1]      # should be 11

model = SocialGNN(in_dim=in_dim, hidden_dim=64, emb_dim=32, num_layers=3)

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Binary classification on raw logits: the sigmoid is applied inside the loss
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)


In [48]:
# Check the input dimension deduced from the example graph. This should be 11,
# one per entry of feature_names, since each node has 11 features.
print(in_dim)

11


In [14]:
# Show which device holds the model parameters. This is a CUDA device only
# when torch.cuda.is_available() was true in the setup cell; otherwise it is cpu.
next(model.parameters()).device

device(type='cuda', index=0)

In [None]:
num_epochs = 10


def load_graphs_to_device(graph_dirs, device):
    """Load every graph directory once and move its tensors to ``device``.

    Returns a list of (X, edge_index, y) tuples in the same order as
    ``graph_dirs``.
    """
    graphs = []
    for graph_dir in graph_dirs:
        X, edge_index, y = load_graph_from_dir(graph_dir)
        graphs.append((X.to(device), edge_index.to(device), y.to(device)))
    return graphs


# The CSV files do not change between epochs, so read and parse them once
# here instead of on every pass through the epoch loop.
train_graphs = load_graphs_to_device(train_graph_dirs, device)
val_graphs = load_graphs_to_device(val_graph_dirs, device)

for epoch in range(1, num_epochs + 1):
    # ---------- TRAINING ----------
    model.train()

    train_total_loss = 0.0
    train_total_nodes = 0

    # One optimizer step per graph, visiting graphs in directory order
    for X, edge_index, y in train_graphs:
        optimizer.zero_grad()

        logits = model(X, edge_index)    # (N,)
        loss = criterion(logits, y)

        loss.backward()
        optimizer.step()  # the learning rate is configured on the optimizer

        train_total_loss += loss.item() * X.size(0)  # weight by #nodes
        train_total_nodes += X.size(0)

    # Guarded like the validation average so an empty split cannot divide by zero
    train_avg_loss = train_total_loss / train_total_nodes if train_total_nodes > 0 else 0.0

    # ---------- VALIDATION ----------
    model.eval()
    val_total_loss = 0.0
    val_total_nodes = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for X, edge_index, y in val_graphs:
            logits = model(X, edge_index)
            loss = criterion(logits, y)

            val_total_loss += loss.item() * X.size(0)
            val_total_nodes += X.size(0)

            # Logit -> probability -> hard 0/1 prediction at threshold 0.5
            probs = torch.sigmoid(logits)
            preds = (probs >= 0.5).float()

            correct += (preds == y).sum().item()
            total += y.numel()

    val_avg_loss = val_total_loss / val_total_nodes if val_total_nodes > 0 else 0.0
    val_acc = correct / total if total > 0 else 0.0

    print(
        f"Epoch {epoch:03d} | "
        f"Train loss/node: {train_avg_loss:.4f} | "
        f"Val loss/node: {val_avg_loss:.4f} | "
        f"Val acc: {val_acc:.3f}"
    )


Epoch 001 | Train loss/node: 0.5283 | Val loss/node: 0.4250 | Val acc: 0.795
Epoch 002 | Train loss/node: 0.4075 | Val loss/node: 0.2839 | Val acc: 0.870
Epoch 003 | Train loss/node: 0.2252 | Val loss/node: 0.3348 | Val acc: 0.860
Epoch 004 | Train loss/node: 0.2145 | Val loss/node: 0.1918 | Val acc: 0.925
Epoch 005 | Train loss/node: 0.1551 | Val loss/node: 0.1455 | Val acc: 0.940
Epoch 006 | Train loss/node: 0.1125 | Val loss/node: 0.1058 | Val acc: 0.955
Epoch 007 | Train loss/node: 0.0991 | Val loss/node: 0.0982 | Val acc: 0.950
Epoch 008 | Train loss/node: 0.1323 | Val loss/node: 0.1707 | Val acc: 0.915
Epoch 009 | Train loss/node: 0.1679 | Val loss/node: 0.1193 | Val acc: 0.960
Epoch 010 | Train loss/node: 0.0993 | Val loss/node: 0.0783 | Val acc: 0.965


In [85]:
# Re-initialize the model's weights with fresh random values
def reset_model(model):
    """Call ``reset_parameters()`` on every submodule of ``model`` that has one.

    Submodules without that method (activations, containers, ...) are skipped.
    The model is modified in place; nothing is returned.
    """
    # Lazy generator: each submodule is checked right before it is reset
    resettable = (
        submodule for submodule in model.modules()
        if hasattr(submodule, 'reset_parameters')
    )
    for submodule in resettable:
        submodule.reset_parameters()

In [90]:
# Resetting here would throw away the weights learned by the training cell
# above, so a top-to-bottom run would evaluate a randomly initialized model in
# the test cell below. Keep the flag False for a normal run; set it to True
# only to start a fresh experiment, then re-run the training cell before testing.
# NOTE: reset_model only re-initializes module parameters; the optimizer object
# is not touched, so recreate it as well if a completely clean restart is needed.
RESET_MODEL_WEIGHTS = False
if RESET_MODEL_WEIGHTS:
    reset_model(model)

In [94]:
# ---------- TESTING ----------
# Node-level accuracy of the current model over every graph in test_graph_dirs.

model.eval()          # evaluation mode
total_correct = 0
total_nodes = 0

num_graphs_tested = 0

with torch.no_grad():  # no gradient tracking
    # enumerate keeps the graph counter in step with the loop automatically
    for num_graphs_tested, graph_dir in enumerate(test_graph_dirs, start=1):
        X, edge_index, y = load_graph_from_dir(graph_dir)

        X = X.to(device)
        edge_index = edge_index.to(device)
        y = y.to(device)

        # Forward pass
        logits = model(X, edge_index)

        # Convert logits → probabilities → predicted class
        probs = torch.sigmoid(logits)
        preds = (probs >= 0.5).long()   # threshold at 0.5

        # Count accuracy
        total_correct += (preds == y.long()).sum().item()
        total_nodes += y.numel()

# Guarded like the validation accuracy so an empty test set cannot divide by zero
test_acc = total_correct / total_nodes if total_nodes > 0 else 0.0
print(f"Tested {num_graphs_tested} graphs")
print(f"Test Accuracy: {test_acc:.4f}")


Tested 100 graphs
Test Accuracy: 0.9715
