# Lab E.1 Solutions: PyTorch Geometric Setup

Complete solutions to all exercises in Lab E.1.

---

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import to_networkx
import networkx as nx

## Exercise 1 Solution: Explore CiteSeer Dataset

In [None]:
# Step 1: Load CiteSeer through Planetoid (files are kept under /tmp/CiteSeer)
# and take the first graph object of the dataset.
citeseer = Planetoid(root='/tmp/CiteSeer', name='CiteSeer')
cs_data = citeseer[0]

# Step 2: Print the headline statistics between two separator lines.
separator = "=" * 50
# Ratio of stored edges to nodes, reported below as the average degree.
edges_per_node = cs_data.num_edges / cs_data.num_nodes

print(separator)
print("CITESEER DATASET")
print(separator)
print(f"Number of nodes: {cs_data.num_nodes}")
print(f"Number of edges: {cs_data.num_edges}")
print(f"Number of features: {citeseer.num_features}")
print(f"Number of classes: {citeseer.num_classes}")
print(f"Average degree: {edges_per_node:.2f}")

In [None]:
# Step 3: Compare to Cora
cora = Planetoid(root='/tmp/Cora', name='Cora')
cora_data = cora[0]

print("\n" + "=" * 50)
print("COMPARISON: Cora vs CiteSeer")
print("=" * 50)
print(f"{'Metric':<20} {'Cora':<15} {'CiteSeer':<15}")
print("-" * 50)

# One row per metric: (label, Cora value, CiteSeer value, extra format spec).
# The extra spec is empty for integer counts and ".2f" for the degree ratio.
comparison_rows = [
    ("Nodes", cora_data.num_nodes, cs_data.num_nodes, ""),
    ("Edges", cora_data.num_edges, cs_data.num_edges, ""),
    ("Features", cora.num_features, citeseer.num_features, ""),
    ("Classes", cora.num_classes, citeseer.num_classes, ""),
    ("Avg Degree",
     cora_data.num_edges / cora_data.num_nodes,
     cs_data.num_edges / cs_data.num_nodes,
     ".2f"),
]
for metric, cora_value, cs_value, spec in comparison_rows:
    print(f"{metric:<20} {cora_value:<15{spec}} {cs_value:<15{spec}}")

# Decide which dataset has more nodes from the loaded data itself rather
# than asserting it in a comment.
if cs_data.num_nodes > cora_data.num_nodes:
    print("\n💡 CiteSeer has MORE nodes than Cora!")
else:
    print("\n💡 Cora has MORE nodes than CiteSeer!")

In [None]:
# Step 4: Visualize subgraph
G = to_networkx(cs_data, to_undirected=True)
subgraph_nodes = list(range(50))
subgraph = G.subgraph(subgraph_nodes)

# networkx pairs each node_color entry with a node in the order the nodes are
# drawn, which is the graph's own iteration order unless nodelist is given.
# Fix that order once and index the labels with it, so every node is
# guaranteed to get the colour of its own class.
draw_order = list(subgraph.nodes())
node_colors = cs_data.y[draw_order].numpy()

plt.figure(figsize=(10, 8))
pos = nx.spring_layout(subgraph, seed=42)  # fixed seed keeps the layout reproducible
nx.draw(subgraph, pos,
        nodelist=draw_order,
        node_color=node_colors,
        cmap=plt.cm.Set3,
        node_size=200,
        with_labels=False,
        edge_color='lightgray')
plt.title("CiteSeer Subgraph (First 50 Nodes)")
plt.tight_layout()
plt.show()

## Exercise 2 Solution: Analyze Node Neighborhood

In [None]:
# Work on the Cora graph that was loaded in Exercise 1 (no reload happens here).
data = cora_data

# Step 1: Pick a node
target_node = 0

# Step 2: Find neighbors
edge_index = data.edge_index
source_row, dest_row = edge_index[0], edge_index[1]

# Edges leaving the target node: it appears in the source row.
out_mask = source_row == target_node
out_neighbors = dest_row[out_mask]

# Edges arriving at the target node: it appears in the destination row.
in_mask = dest_row == target_node
in_neighbors = source_row[in_mask]

# Merge both directions and drop duplicates; when every edge is stored in
# both directions the two sets coincide.
all_neighbors = torch.cat([out_neighbors, in_neighbors]).unique()

neighbor_count = len(all_neighbors)
print(f"Node {target_node} Analysis")
print("=" * 40)
print(f"Node label: {data.y[target_node].item()}")
print(f"Number of neighbors: {neighbor_count}")
print(f"Neighbors: {all_neighbors.tolist()}")

In [None]:
# Step 3: Get labels of neighbors
target_label = data.y[target_node].item()
neighbor_labels = data.y[all_neighbors]

print(f"\nNeighbor labels: {neighbor_labels.tolist()}")

# Step 4: Calculate homophily for this node
# Node homophily = fraction of the node's neighbors that share its label.
num_neighbors = len(all_neighbors)
same_label = (neighbor_labels == target_label).sum().item()

# A node without neighbors has no defined homophily; use None instead of
# dividing by zero so the cell still runs when target_node is changed to an
# isolated node.
node_homophily = same_label / num_neighbors if num_neighbors > 0 else None

print("\n📊 Homophily Analysis:")
if node_homophily is None:
    print(f"   Node {target_node} has no neighbors, so its homophily is undefined.")
else:
    print(f"   Neighbors with same label: {same_label} / {num_neighbors}")
    print(f"   Node homophily: {node_homophily:.2%}")

    if node_homophily > 0.5:
        print("\n💡 This node's neighbors are mostly the SAME class!")
        print("   GNNs will easily predict this node's class.")
    else:
        print("\n⚠️ This node's neighbors are mostly DIFFERENT class!")
        print("   This is a harder case for GNNs.")

## Challenge Solution: Graph-Wide Homophily

In [None]:
# Calculate graph-wide homophily
# Homophily = fraction of edges that connect same-class nodes

# Cut-offs used below to label the score as high / medium / low.
HIGH_HOMOPHILY_THRESHOLD = 0.7
LOW_HOMOPHILY_THRESHOLD = 0.3

edge_index = data.edge_index
labels = data.y

# Get labels for source and destination of each edge
src_labels = labels[edge_index[0]]
dst_labels = labels[edge_index[1]]

# Count edges with same class.
# NOTE(review): the counts are per column of edge_index; if each undirected
# edge is stored in both directions they are doubled, which leaves the ratio
# unchanged - confirm against the dataset.
same_class_edges = (src_labels == dst_labels).sum().item()
total_edges = edge_index.shape[1]

homophily = same_class_edges / total_edges

print("=" * 50)
print("GRAPH-WIDE HOMOPHILY ANALYSIS")
print("=" * 50)
print(f"Total edges: {total_edges}")
print(f"Same-class edges: {same_class_edges}")
print(f"Different-class edges: {total_edges - same_class_edges}")
print(f"\n🎯 Homophily Score: {homophily:.4f}")

# Interpretation
if homophily > HIGH_HOMOPHILY_THRESHOLD:
    print("\n✅ HIGH HOMOPHILY!")
    print("   Similar nodes are connected.")
    print("   Standard GNNs (GCN, GAT) will work great!")
elif homophily > LOW_HOMOPHILY_THRESHOLD:
    print("\n⚖️ MEDIUM HOMOPHILY")
    print("   Mixed connectivity pattern.")
    print("   GNNs should work reasonably well.")
else:
    print("\n⚠️ LOW HOMOPHILY (Heterophily)")
    print("   Different nodes are connected.")
    print("   Need specialized architectures (H2GCN, CPGNN).")

In [None]:
# Bonus: Compare homophily across datasets
def compute_homophily(data):
    """Return the fraction of edges whose two endpoints carry the same label.

    Args:
        data: Object exposing ``y`` (one label per node) and ``edge_index``
            (row 0 holds source node ids, row 1 holds destination node ids).

    Returns:
        The mean of the per-edge "labels match" indicator as a Python float.
    """
    sources, destinations = data.edge_index[0], data.edge_index[1]
    labels_match = data.y[sources].eq(data.y[destinations])
    return labels_match.float().mean().item()

# Load PubMed
pubmed = Planetoid(root='/tmp/PubMed', name='PubMed')
pubmed_data = pubmed[0]

print("\n" + "=" * 50)
print("HOMOPHILY COMPARISON")
print("=" * 50)

# One line per dataset; the label (name plus colon) is padded to 9 characters
# so the scores line up in a column.
datasets_to_compare = (
    ("Cora", cora_data),
    ("CiteSeer", cs_data),
    ("PubMed", pubmed_data),
)
for dataset_name, graph in datasets_to_compare:
    print(f"{dataset_name + ':':<9} {compute_homophily(graph):.4f}")

print("\n💡 All three citation networks have HIGH homophily!")
print("   Papers tend to cite papers in the same field.")

---

## Key Takeaways

1. **CiteSeer is larger than Cora** in terms of nodes and has more features
2. **Homophily is high** in citation networks (~0.8) - papers cite papers in similar fields
3. **This is why GNNs work so well** on these datasets - neighbor aggregation makes sense!
4. **For heterophilic graphs** (low homophily), you'd need specialized architectures