# High-school Social Network

We will now use a high-school contact network for a node classification task: predict each student’s gender (M/F).

- Categorical inputs: class and level (gender is the target and is not used as input).
- Structural scalars per node: degree, number of triangles, local clustering coefficient, betweenness centrality, and closeness centrality. Each scalar feature is min–max normalized across nodes.
- Model: project the input features to 10‑d, apply three GCN layers with ReLU to get x1, x2, x3 (each 10‑d), then concatenate them into a 30‑d vector and use a Linear head to produce 2 logits. Use CrossEntropyLoss.
- Training: use a 75%/25% train/test split of nodes, but keep the full graph for message passing. Compute the loss on the training nodes only.

## Exercise 5 
Load the node and edge CSVs, build a NetworkX graph `G` with node attributes, and remove nodes whose gender is “Unknown” so that only M/F nodes remain. The files are assumed to be at `data/nodes_full.csv` and `data/edges_full.csv`.

In [None]:
import pandas as pd
import networkx as nx

# Read the raw node and edge tables.
nodes_df = pd.read_csv("data/nodes_full.csv")
edges_df = pd.read_csv("data/edges_full.csv")

# Build an undirected graph keyed by the integer student id.
G = nx.Graph()
for _, node_row in nodes_df.iterrows():
  node_id = int(node_row["ID"])
  raw_level = node_row.get("level")
  # A missing level is stored as None rather than NaN.
  level = int(raw_level) if not pd.isna(raw_level) else None
  node_attrs = {
    "Id": node_id,
    "class": node_row.get("class"),
    "gender": node_row.get("gender"),
    "level": level,
  }
  G.add_node(node_id, **node_attrs)

# Add the contacts; an edge table without a "weight" column yields weight 1.
for _, edge_row in edges_df.iterrows():
  src = int(edge_row["ID1"])
  dst = int(edge_row["ID2"])
  weight = int(edge_row.get("weight", 1))
  G.add_edge(src, dst, weight=weight)

# Drop nodes whose gender is "Unknown", then keep only nodes labelled M or F.
# The second filter also removes any node that has no gender attribute at all
# (e.g. an id that appears only in the edge table).
unknown = [n for n, attrs in G.nodes(data=True) if str(attrs.get("gender")) == "Unknown"]
G.remove_nodes_from(unknown)
labelled = [n for n, attrs in G.nodes(data=True) if str(attrs.get('gender')) in ('M', 'F')]
G = G.subgraph(labelled).copy()

print(f"High-school graph -> Nodes: {G.number_of_nodes()}, Edges: {G.number_of_edges()}")

## Exercise 6 
Engineer features (categoricals and structural scalars), construct the corresponding tensors, and create 75%/25% train/test node masks.

In [None]:
import torch
import networkx as nx
from torch_geometric.utils import to_undirected

# --- Categorical features ---
# `class` values are indexed by their string form. `level` indices start at 1
# so that index 0 stays reserved for nodes whose level is missing (None).
classes = sorted({str(G.nodes[n].get('class')) for n in G.nodes()})
class_to_idx = {c: i for i, c in enumerate(classes)}
levels = [G.nodes[n].get('level') for n in G.nodes()]
unique_levels = sorted({int(l) for l in levels if l is not None})
level_to_idx = {l: i+1 for i, l in enumerate(unique_levels)}

# Fix one node ordering: row i of every per-node tensor refers to nodes[i].
nodes = sorted(G.nodes())
idx_of = {n: i for i, n in enumerate(nodes)}
N = len(nodes)

class_idx = torch.tensor([class_to_idx[str(G.nodes[n].get('class'))] for n in nodes], dtype=torch.long)
level_idx = torch.tensor([level_to_idx.get(G.nodes[n].get('level'), 0) for n in nodes], dtype=torch.long)

# --- Structural scalars (computed on the full graph) ---
deg = dict(G.degree())
tri = nx.triangles(G)
clu = nx.clustering(G)
bet = nx.betweenness_centrality(G, normalized=True)
clo = nx.closeness_centrality(G)

# One column per measure, in the order:
# degree, triangles, clustering, betweenness, closeness.
scalars = torch.stack(
  [
    torch.tensor([measure[n] for n in nodes], dtype=torch.float32)
    for measure in (deg, tri, clu, bet, clo)
  ],
  dim=1,
)

# Min–max normalize per column; the small epsilon avoids division by zero
# when a column is constant.
mins = scalars.min(dim=0).values
maxs = scalars.max(dim=0).values
scalars = (scalars - mins) / (maxs - mins + 1e-12)

# --- Labels: F=0, M=1 ---
gender_to_idx = {'F': 0, 'M': 1}
y = torch.tensor([gender_to_idx[str(G.nodes[n].get('gender'))] for n in nodes], dtype=torch.long)

# --- Edge tensors ---
edge_list = list(G.edges())
# reshape(-1, 2) keeps the (2, E) layout valid even when the graph has no
# edges; for a non-empty edge list it is a no-op.
edges = torch.tensor(
  [(idx_of[u], idx_of[v]) for u, v in edge_list], dtype=torch.long
).reshape(-1, 2).t().contiguous()
edge_w = torch.tensor([G[u][v].get('weight', 1) for u, v in edge_list], dtype=torch.float32)
# G.edges() lists each undirected edge once; to_undirected adds the reverse
# direction and carries the weight along.
# NOTE(review): if G ever contains self-loops, check how to_undirected
# combines the duplicated (u, u) entries' weights.
edge_index, edge_w = to_undirected(edges, edge_attr=edge_w, num_nodes=N)

# --- Train/test masks (75%/25%) ---
# A dedicated, seeded generator makes the split reproducible across runs
# without touching the global RNG used elsewhere.
SPLIT_SEED = 0
split_rng = torch.Generator().manual_seed(SPLIT_SEED)
perm = torch.randperm(N, generator=split_rng)
n_train = int(0.75 * N)
train_idx = perm[:n_train]
test_idx = perm[n_train:]
train_mask = torch.zeros(N, dtype=torch.bool)
train_mask[train_idx] = True
test_mask = torch.zeros(N, dtype=torch.bool)
test_mask[test_idx] = True

## Exercise 7
Define and train the 3‑hop GCN (with dropout) to predict gender using CrossEntropyLoss on the training nodes only. Report training loss and training accuracy periodically.

In [None]:
import torch
import torch.nn.functional as F
from torch import nn
from torch_geometric.nn import GCNConv

class HSNet(nn.Module):
  """Three-layer GCN node classifier over categorical and scalar node features.

  The two categorical inputs are embedded, concatenated with the scalar
  features, and projected to ``hidden_dim``. Three GCN layers are then applied
  in sequence; their outputs are concatenated and mapped to 2 logits per node.
  Dropout follows the input projection and every GCN layer.

  Args:
    num_classes: size of the embedding table for the categorical `class`
      feature (not the number of output logits, which is fixed at 2).
    num_levels: number of distinct known levels; one extra embedding row
      (index 0) is allocated for a missing level.
    scalar_dim: number of scalar feature columns per node.
    hidden_dim: width of the embeddings, the projection and each GCN layer.
    dropout: dropout probability.
  """

  def __init__(self, num_classes, num_levels, scalar_dim=5, hidden_dim=10, dropout=0.1):
    super().__init__()
    self.class_emb = nn.Embedding(num_classes, hidden_dim)
    self.level_emb = nn.Embedding(num_levels + 1, hidden_dim)  # row 0 = missing level
    fused_dim = 2 * hidden_dim + scalar_dim
    self.in_lin = nn.Linear(fused_dim, hidden_dim)
    self.conv1 = GCNConv(hidden_dim, hidden_dim, add_self_loops=True)
    self.conv2 = GCNConv(hidden_dim, hidden_dim, add_self_loops=True)
    self.conv3 = GCNConv(hidden_dim, hidden_dim, add_self_loops=True)
    self.drop = nn.Dropout(p=dropout)
    self.head = nn.Linear(3 * hidden_dim, 2)

  def forward(self, class_idx, level_idx, scalars, edge_index, edge_weight=None):
    """Return a tensor with 2 logits per node."""
    features = torch.cat(
      [self.class_emb(class_idx), self.level_emb(level_idx), scalars],
      dim=1,
    )
    h = self.drop(F.relu(self.in_lin(features)))

    # Each layer consumes the previous layer's (dropped-out) output; every
    # intermediate representation is kept for the final concatenation.
    hops = []
    for conv in (self.conv1, self.conv2, self.conv3):
      h = self.drop(F.relu(conv(h, edge_index, edge_weight=edge_weight)))
      hops.append(h)

    return self.head(torch.cat(hops, dim=1))

# Build the model from the feature vocabularies, plus its optimizer and loss.
model = HSNet(
  num_classes=len(class_to_idx),
  num_levels=len(unique_levels),
  scalar_dim=scalars.size(1),
  hidden_dim=10,
  dropout=0.3,
)
opt = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
crit = nn.CrossEntropyLoss()

# Full-batch training: the forward pass runs over the whole graph, but only
# the training nodes contribute to the loss.
for epoch in range(20000):
  model.train()
  opt.zero_grad()
  logits = model(class_idx, level_idx, scalars, edge_index, edge_weight=edge_w)
  loss = crit(logits[train_mask], y[train_mask])
  loss.backward()
  opt.step()

  if epoch % 1000 != 0:
    continue
  # Periodic report; accuracy is taken from the same train-mode forward pass
  # that produced the loss.
  with torch.no_grad():
    train_pred = logits[train_mask].argmax(dim=1)
    acc_tr = (train_pred == y[train_mask]).float().mean().item()
  print(f"epoch {epoch:4d} | train loss {loss.item():.4f} | train acc {acc_tr:.4f}")

# Final training accuracy with the model in eval mode.
model.eval()
with torch.no_grad():
  logits = model(class_idx, level_idx, scalars, edge_index, edge_weight=edge_w)
  train_pred = logits[train_mask].argmax(dim=1)
  acc_tr = (train_pred == y[train_mask]).float().mean().item()
print(f"Final training accuracy (train 75%): {acc_tr:.4f}")

## Exercise 8
Produce diagnostics: report parameter counts and the shapes of the key tensors, then evaluate on the held‑out 25% test nodes (accuracy and confusion matrix).

In [None]:
import numpy as np
import torch
import torch.nn.functional as F

def count_parameters(m):
  """Print a parameter-count report for a module and return the total.

  The report lists every parameter tensor with its shape and element count,
  then the counts aggregated per top-level submodule (largest first), then
  the grand total.

  Args:
    m: object exposing ``named_parameters()`` that yields ``(name, tensor)``
      pairs, where each tensor provides ``numel()`` and ``shape``.

  Returns:
    The total number of parameter elements across all yielded tensors.
  """
  total = 0
  per_module = {}
  print("Parameters by tensor:")
  for name, p in m.named_parameters():
    n = p.numel()
    total += n
    # Group by the first component of the dotted name ("head.weight" -> "head").
    top = name.split('.')[0]
    per_module[top] = per_module.get(top, 0) + n
    print(f"  {name:35s} {tuple(p.shape)} -> {n}")
  print("\nParameters by top-level module:")
  # Negated count sorts descending while keeping first-seen order for ties.
  for mod, cnt in sorted(per_module.items(), key=lambda item: -item[1]):
    print(f"  {mod:15s} -> {cnt}")
  print(f"Total parameters: {total}")
  return total

# Parameter report for the trained model.
count_parameters(model)

# Re-run the forward pass layer by layer (no dropout applied) so that every
# intermediate tensor is available for inspection.
with torch.no_grad():
  emb_c = model.class_emb(class_idx)
  emb_l = model.level_emb(level_idx)
  xcat = torch.cat([emb_c, emb_l, scalars], 1)
  x0 = F.relu(model.in_lin(xcat))
  x1 = F.relu(model.conv1(x0, edge_index, edge_weight=edge_w))
  x2 = F.relu(model.conv2(x1, edge_index, edge_weight=edge_w))
  x3 = F.relu(model.conv3(x2, edge_index, edge_weight=edge_w))
  z = torch.cat([x1, x2, x3], 1)
  logits = model.head(z)

print("\nTensor shapes:")
shape_report = [
  ("  class_emb:", emb_c),
  ("  level_emb:", emb_l),
  ("  concat [class_emb, level_emb, scalars]:", xcat),
  ("  after in_lin+ReLU:", x0),
  ("  x1 (GCN1+ReLU):", x1),
  ("  x2 (GCN2+ReLU):", x2),
  ("  x3 (GCN3+ReLU):", x3),
  ("  z = concat[x1,x2,x3]:", z),
  ("  logits:", logits),
]
for label, tensor in shape_report:
  print(label, tuple(tensor.shape))

# Held-out evaluation: accuracy on the test nodes.
with torch.no_grad():
  pred = logits.argmax(1)
  test_acc = (pred[test_mask] == y[test_mask]).float().mean().item()

# 2x2 confusion matrix indexed as [true label, predicted label].
cm = np.zeros((2, 2), dtype=int)
true_labels = y[test_mask].tolist()
pred_labels = pred[test_mask].tolist()
for t, p in zip(true_labels, pred_labels):
  cm[t, p] += 1

print(f"\nTest accuracy (25%): {test_acc:.4f}")
print("Confusion matrix (rows=true, cols=pred) on test set:")
print("          Pred F    Pred M")
print(f"True F   {cm[0,0]:7d}   {cm[0,1]:7d}")
print(f"True M   {cm[1,0]:7d}   {cm[1,1]:7d}")