# Hierarchical GCN

In [18]:
pip install torch

Collecting torch
  Using cached torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting filelock (from torch)
  Using cached filelock-3.17.0-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.10.0 (from torch)
  Using cached typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Using ca

In [20]:
pip install torch_geometric

Collecting torch_geometric
  Using cached torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
Using cached torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1
Note: you may need to restart the kernel to use updated packages.


In [21]:
import re
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
from torch_geometric.data import Data, HeteroData
from torch_geometric.nn import HGTConv, GATConv
from sklearn.preprocessing import StandardScaler, LabelEncoder

## 01 Load Data

In [49]:
edge_list = pd.read_csv('../02_data/train_test/train_edge_list.csv') #  0 "chat_id" int64, 1 "domain_index" object

In [134]:
# Validated Projected Bipartite Network
edge_np = np.load("../02_data/train_test/validated_edges.npy", allow_pickle=True)
edge_nodes = set(edge_np.flatten()) 

# Convert edge list numpy array to DataFrame
edge_df = pd.DataFrame(edge_np, columns=["chat_id", "domain_index"])

In [106]:
chat_data = pd.read_csv('grouped_main_topics.csv')

In [107]:
train_data = pd.read_csv('../02_data/train_test/train_data.csv')
test_data = pd.read_csv('../02_data/train_test/test_data.csv')

In [136]:
print("edge_list info (train edges)")
print(edge_df.info())
print("-------------------------------------------------")

print("chat_data info")
print(chat_data.info())
print("-------------------------------------------------")

print("train_data info")
print(train_data.info())
print("-------------------------------------------------")

print("test_data info")
print(test_data.info())
print("-------------------------------------------------")

edge_list info (train edges)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194454 entries, 0 to 194453
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   chat_id       194454 non-null  int64
 1   domain_index  194454 non-null  int64
dtypes: int64(2)
memory usage: 3.0 MB
None
-------------------------------------------------
chat_data info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49412 entries, 0 to 49411
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   id                 49412 non-null  int64         
 1   name               49412 non-null  object        
 2   type               49412 non-null  object        
 3   db_index           49412 non-null  int64         
 4   description        49411 non-null  object        
 5   message_count      49412 non-null  float64       
 6   total_view_count   49412 non-nu

In [109]:
edge_np.shape

(194454, 2)

## 02 Preprocessing

In [141]:
# Keep only valid edges
filtered_edge_df = edge_df[
    edge_df["chat_id"].isin(chat_ids) & edge_df["domain_index"].isin(domain_indices)
]
edge_np = filtered_edge_df.to_numpy()
print(f"Filtered edge list shape: {edge_np.shape}")


Filtered edge list shape: (46699, 2)


In [142]:
# Remap chat_id and domain_index to a 0-based index
unique_chat_ids = np.unique(edge_np[:, 0])
unique_domain_ids = np.unique(edge_np[:, 1])

print("chats:", len(unique_chat_ids))
print("domains:", len(unique_domain_ids))

# Create mapping dictionaries
chat_id_map = {old_id: new_id for new_id, old_id in enumerate(unique_chat_ids)}
domain_id_map = {old_id: new_id for new_id, old_id in enumerate(unique_domain_ids)}

# Apply mapping
edge_np[:, 0] = np.vectorize(chat_id_map.get)(edge_np[:, 0])
edge_np[:, 1] = np.vectorize(domain_id_map.get)(edge_np[:, 1])

chats: 1103
domains: 3024


In [143]:
chat_data["oldest_post"] = pd.to_datetime(chat_data["oldest_post"], errors="coerce")

# Extract year
chat_data["oldest_post_year"] = chat_data["oldest_post"].dt.year.astype("Int64") 

In [145]:
# Categorize pc1
train_data["pc1_class"] = np.digitize(train_data["pc1"], bins=[0.33, 0.66])

# Normalize numerical features
scaler = StandardScaler()
train_features = scaler.fit_transform(train_data[['virality', 'avalanches', 'messages', 'year']])
chat_features = scaler.fit_transform(chat_data[['message_count', 'total_view_count', 'oldest_post_year', 'topic']])

In [146]:
chat_features

array([[-0.03910525, -0.04284129,  1.42687227, -0.72738467],
       [-0.03974649, -0.04651106,  0.68244295,  3.73440883],
       [-0.03860651, -0.0456818 ,  0.68244295, -0.72738467],
       ...,
       [-0.04052232, -0.04693159,         nan, -0.28120532],
       [-0.01308353,  0.06827296,         nan, -0.72738467],
       [-0.04052232, -0.04693159,         nan, -0.72738467]])

In [147]:
# Convert data to tensors
train_x = torch.tensor(train_features, dtype=torch.float)
chat_x = torch.tensor(chat_features, dtype=torch.float)
y = torch.tensor(train_data["pc1_class"].values, dtype=torch.long)

# Build edge index
edge_index = torch.tensor(edge_np.T, dtype=torch.long)

# Define Heterogeneous Graph Data
data = HeteroData()
data['chat'].x = chat_x
data['domain'].x = train_x
data['domain'].y = y

# Define edges
data['chat', 'interacts', 'domain'].edge_index = edge_index


In [148]:
edge_index.shape

torch.Size([2, 46699])

In [149]:
data

HeteroData(
  chat={ x=[49412, 4] },
  domain={
    x=[4753, 4],
    y=[4753],
  },
  (chat, interacts, domain)={ edge_index=[2, 46699] }
)

In [150]:
print("Node types in data:", data.node_types)

Node types in data: ['chat', 'domain']


In [151]:
print("Available x_dict keys:", data.x_dict.keys())

Available x_dict keys: dict_keys(['chat', 'domain'])


In [152]:
print(f"Chat feature size: {data['chat'].x.shape}")
print(f"Domain feature size: {data['domain'].x.shape}")

print(f"Max chat_id in edge_list: {len(unique_chat_ids)}")
print(f"Max domain_index in edge_list: {len(unique_domain_ids)}")

print(f"Max chat_id in chat_data: {chat_data['id'].max()}")
print(f"Max domain_index in train_data: {train_data['domain_index'].max()}")

Chat feature size: torch.Size([49412, 4])
Domain feature size: torch.Size([4753, 4])
Max chat_id in edge_list: 1103
Max domain_index in edge_list: 3024
Max chat_id in chat_data: 64981
Max domain_index in train_data: 5941


## 03 Model

In [160]:
import torch.nn.functional as F
from torch_geometric.nn import HGTConv, Linear
from torch_geometric.nn import global_mean_pool

In [170]:
class HGNN(torch.nn.Module):
    def __init__(self, hidden_dim, num_classes):
        super().__init__()
        self.chat_proj = Linear(data.x_dict['chat'].shape[1], hidden_dim)
        self.domain_proj = Linear(data.x_dict['domain'].shape[1], hidden_dim)

        self.conv1 = HGTConv(hidden_dim, hidden_dim, data.metadata()) 
        self.conv2 = HGTConv(hidden_dim, hidden_dim, data.metadata()) 

        self.fc = Linear(hidden_dim, num_classes)

    def forward(self, data):
        print("Available node types:", data.node_types)
        print("Available x_dict keys before projection:", data.x_dict.keys())

        if 'chat' not in data.x_dict:
            raise ValueError("❌ 'chat' node type is missing from x_dict! Ensure it's properly assigned.")

        # Initial projection
        data.x_dict['chat'] = self.chat_proj(data.x_dict['chat'])
        data.x_dict['domain'] = self.domain_proj(data.x_dict['domain'])

        print("Available x_dict keys after projection:", data.x_dict.keys())

        # Hierarchical message passing
        conv1_out = self.conv1(data.x_dict, data.edge_index_dict)
        print("conv1 output keys:", conv1_out.keys())  # Debugging step

        if 'chat' not in conv1_out:
            raise ValueError("❌ 'chat' is missing in conv1 output! Check HGTConv input.")

        data.x_dict['chat'] = F.relu(conv1_out['chat'])

        conv2_out = self.conv2(data.x_dict, data.edge_index_dict)
        print("conv2 output keys:", conv2_out.keys())  # Debugging step

        if 'domain' not in conv2_out:
            raise ValueError("❌ 'domain' is missing in conv2 output! Check HGTConv input.")

        data.x_dict['domain'] = F.relu(conv2_out['domain'])

        # Classification layer
        out = global_mean_pool(data.x_dict['domain'], torch.arange(data['domain'].num_nodes))
        return F.log_softmax(self.fc(out), dim=1)


In [171]:
import torch.optim as optim

# Define loss and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-4)

# Training Loop
def train():
    model.train()
    optimizer.zero_grad()
    out = model(data)
    loss = criterion(out, data['domain'].y)
    loss.backward()
    optimizer.step()
    return loss.item()

# Train for 50 epochs
for epoch in range(50):
    loss = train()
    print(f"Epoch {epoch+1}: Loss = {loss:.4f}")


Available node types: ['chat', 'domain']
Available x_dict keys: dict_keys(['chat', 'domain'])


KeyError: 'chat'

In [163]:
print(data.x_dict.keys())  # Should include 'chat' and 'domain'
print(data.edge_index_dict.keys())  # Should match data.metadata()

dict_keys(['chat', 'domain'])
dict_keys([('chat', 'interacts', 'domain')])


In [168]:
data.metadata()

(['chat', 'domain'], [('chat', 'interacts', 'domain')])

In [167]:
# Ensure bidirectional edges
data.edge_index_dict[('domain', 'interacts', 'chat')] = torch.flip(data.edge_index_dict[('chat', 'interacts', 'domain')], dims=[0])


# 04 Evaluation

In [None]:
def test():
    model.eval()
    with torch.no_grad():
        out = model(data)
        pred = out.argmax(dim=1)
        acc = (pred == data['domain'].y).sum().item() / data['domain'].y.size(0)
        print(f"Test Accuracy: {acc:.4f}")

test()
