# Toxic Comment Heterogeneous GNN Training Notebook




## 1. Load Context Configuration (Parse context.md)


In [1]:
import re, json, os, textwrap
from pathlib import Path
import pandas as pd

CONTEXT_FILE = Path('context.md')
assert CONTEXT_FILE.exists(), 'context.md not found in workspace.'
raw = CONTEXT_FILE.read_text(encoding='utf-8')

# heuristic extraction of CSV path
m = re.search(r'youtube_comments_with_toxicity_(\d+_\d+)\.csv', raw)
if m:
    csv_candidates = list(Path('Notebooks').glob(f'youtube_comments_with_toxicity_{m.group(1)}.csv'))
else:
    csv_candidates = list(Path('Notebooks').glob('youtube_comments_with_toxicity_*.csv'))
DATA_CSV = csv_candidates[0] if csv_candidates else None

config = {
    'data_csv': str(DATA_CSV) if DATA_CSV else None,
    'user_col': 'AuthorChannelID',
    'comment_col': 'CommentText',
    'comment_id_col': 'CommentID',
    'parent_col': 'ParentCommentID',
    'label_col': 'ToxicLabel',
    'score_col': 'ToxicScore',
    'binary_label_col': 'ToxicBinary',
    'embedding_model': 'all-MiniLM-L6-v2',
    'gnn_hidden': 128,
    'gnn_out_classes': 2,
    'gnn_dropout': 0.3,
    'gnn_num_layers': 2,
    'use_class_weights': True,
    'focal_loss': False,          # set True to experiment later
    'focal_gamma': 2.0,
    'add_knn_similarity': True,
    'knn_k': 5,
    'knn_max_nodes': 50000,       # cap for kNN (sampled subset when many comments)
    'train_val_test_split': [0.8,0.1,0.1],
    'primary_metric': 'f1',
    'epochs': 30,
    'lr': 1e-3,
    'weight_decay': 1e-5,
    'early_stopping_patience': 5,
    'batch_size_node_loader': 1024,
    'neighbors': 10,
    'community_min_degree': 2,    # filter users below this degree before community detection
    'community_max_nodes': 100000, # skip / downsample if exceeds
    'motif_max_nodes': 200000,    # skip detailed motif / triad census if bigger
    'edge_classifier_epochs': 20,
    'edge_classifier_balance': True,
    'seed': 42
}
print('CONFIG =>')
print(json.dumps(config, indent=2))

if not DATA_CSV:
    raise FileNotFoundError('Could not locate toxicity CSV. Please place it under Notebooks/.')

df = pd.read_csv(DATA_CSV)
print('Loaded dataframe shape:', df.shape)
df.head(2)

CONFIG =>
{
  "data_csv": "Notebooks/youtube_comments_with_toxicity_20250914_061551.csv",
  "user_col": "AuthorChannelID",
  "comment_col": "CommentText",
  "comment_id_col": "CommentID",
  "parent_col": "ParentCommentID",
  "label_col": "ToxicLabel",
  "score_col": "ToxicScore",
  "binary_label_col": "ToxicBinary",
  "embedding_model": "all-MiniLM-L6-v2",
  "gnn_hidden": 128,
  "gnn_out_classes": 2,
  "gnn_dropout": 0.3,
  "gnn_num_layers": 2,
  "use_class_weights": true,
  "focal_loss": false,
  "focal_gamma": 2.0,
  "add_knn_similarity": true,
  "knn_k": 5,
  "knn_max_nodes": 50000,
  "train_val_test_split": [
    0.8,
    0.1,
    0.1
  ],
  "primary_metric": "f1",
  "epochs": 30,
  "lr": 0.001,
  "weight_decay": 1e-05,
  "early_stopping_patience": 5,
  "batch_size_node_loader": 1024,
  "neighbors": 10,
  "community_min_degree": 2,
  "community_max_nodes": 100000,
  "motif_max_nodes": 200000,
  "edge_classifier_epochs": 20,
  "edge_classifier_balance": true,
  "seed": 42
}
Loaded d

Unnamed: 0,CommentID,VideoID,VideoTitle,AuthorName,AuthorChannelID,CommentText,Sentiment,Likes,Replies,PublishedAt,CountryCode,CategoryID,ToxicLabel,ToxicScore
0,UgyRjrEdJIPrf68uND14AaABAg,mcY4M9gjtsI,They killed my friend.#tales #movie #shorts,@OneWhoWandered,UC_-UEXaBL1dqqUPGkDll49A,Anyone know what movie this is?,Neutral,0,2,2025-01-15 00:54:55,NZ,1,non-toxic,0.998745
1,UgxXxEIySAwnMNw8D7N4AaABAg,2vuXcw9SZbA,Man Utd conceding first penalty at home in yea...,@chiefvon3068,UCZ1LcZESjYqzaQRhjdZJFwg,The fact they're holding each other back while...,Positive,0,0,2025-01-13 23:51:46,AU,17,non-toxic,0.996063


## 2. Environment Setup and Imports

In [2]:
import numpy as np
import torch, random
from pathlib import Path

REQUIRED = ['pandas','numpy','torch','sklearn','tqdm']
print('Python version OK')
print('Torch:', torch.__version__)

def set_seed(seed:int):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
set_seed(config.get('seed', 42))

device = torch.device('cpu')  # Force CPU to avoid MPS OOM
print('Using device:', device)

Python version OK
Torch: 2.8.0
Using device: cpu


## 3. Data Loading and Preprocessing

In [3]:
# Ensure required columns & create binary label
USER_COL=config['user_col']; COMMENT_COL=config['comment_col']; CID_COL=config['comment_id_col']
if 'ToxicBinary' not in df.columns:
    if df['ToxicLabel'].dtype==object:
        df['ToxicBinary']=df['ToxicLabel'].str.lower().str.startswith('toxic').astype(int)
    else:
        df['ToxicBinary']=(df['ToxicScore']>0.7).astype(int)

# Drop rows missing essentials
df = df.dropna(subset=[USER_COL, COMMENT_COL])

print('Rows after cleaning:', len(df))
print('Class balance:', df['ToxicBinary'].value_counts(normalize=True))

df.head(3)

Rows after cleaning: 1032225
Class balance: ToxicBinary
0    0.953549
1    0.046451
Name: proportion, dtype: float64


Unnamed: 0,CommentID,VideoID,VideoTitle,AuthorName,AuthorChannelID,CommentText,Sentiment,Likes,Replies,PublishedAt,CountryCode,CategoryID,ToxicLabel,ToxicScore,ToxicBinary
0,UgyRjrEdJIPrf68uND14AaABAg,mcY4M9gjtsI,They killed my friend.#tales #movie #shorts,@OneWhoWandered,UC_-UEXaBL1dqqUPGkDll49A,Anyone know what movie this is?,Neutral,0,2,2025-01-15 00:54:55,NZ,1,non-toxic,0.998745,0
1,UgxXxEIySAwnMNw8D7N4AaABAg,2vuXcw9SZbA,Man Utd conceding first penalty at home in yea...,@chiefvon3068,UCZ1LcZESjYqzaQRhjdZJFwg,The fact they're holding each other back while...,Positive,0,0,2025-01-13 23:51:46,AU,17,non-toxic,0.996063,0
2,UgxB0jh2Ur41mcXr5IB4AaABAg,papg2tsoFzg,Welcome to Javascript Course,@Abdulla-ip8qr,UCWBK35w5Swy1iF5xIbEyw3A,waiting next video will be?,Neutral,1,0,2020-07-06 13:18:16,IN,27,non-toxic,0.997976,0


## 4. Text Embeddings

In [4]:

try:
    from sentence_transformers import SentenceTransformer
    model_name = config['embedding_model']
    sbert = SentenceTransformer(model_name)
    texts = df[COMMENT_COL].astype(str).tolist()
    batch=256; embs=[]
    for i in range(0,len(texts),batch):
        embs.append(sbert.encode(texts[i:i+batch], show_progress_bar=False))
    import numpy as np
    embeddings = np.vstack(embs)
except Exception as e:
    print('Falling back to bag-of-words (hash) embeddings due to error:', e)
    import numpy as np, hashlib
    def hvec(t):
        h = hashlib.md5(t.encode()).hexdigest()
        return np.array([int(h[i:i+4],16)%10000 for i in range(0,16,4)],dtype=float)
    embeddings = np.vstack([hvec(t) for t in df[COMMENT_COL].astype(str)])

print('Embeddings shape:', embeddings.shape)

Falling back to bag-of-words (hash) embeddings due to error: No module named 'sentence_transformers'
Embeddings shape: (1032225, 4)
Embeddings shape: (1032225, 4)


## 5. Build Graph (HeteroData)

In [5]:
import torch_geometric
from torch_geometric.data import HeteroData
import torch, numpy as np

# Reply edges heuristic if parent col missing (normalize IDs to str)
if config['parent_col'] in df.columns and df[config['parent_col']].notna().any():
    reply_pairs = (
        df[df[config['parent_col']].notna()][[config['parent_col'], config['comment_id_col']]]
        .astype(str)
        .values
        .tolist()
    )
else:
    reply_pairs = []
    if 'VideoID' in df.columns and 'PublishedAt' in df.columns:
        df_sorted = df.sort_values(['VideoID','PublishedAt'])
        for vid, group in df_sorted.groupby('VideoID'):
            ids = group[config['comment_id_col']].astype(str).tolist()
            for i in range(1, len(ids)):
                reply_pairs.append((ids[i-1], ids[i]))

# Canonicalize IDs to strings for consistent indexing
comment_ids = df[CID_COL].astype(str).tolist()
comment_idx = {cid: i for i, cid in enumerate(comment_ids)}
users = df[USER_COL].astype(str).unique().tolist()
user_idx = {u: i for i, u in enumerate(users)}

# user->comment authored edges (normalize to str)
authored_edges = [(str(row[USER_COL]), str(row[CID_COL])) for _, row in df.iterrows()]

# Build features for users
user_deg = {u: 0 for u in users}; user_tox = {u: [] for u in users}
for _, r in df.iterrows():
    u = str(r[USER_COL]); user_deg[u] += 1; user_tox[u].append(r['ToxicBinary'])
user_feat = np.vstack([
    [user_deg[u] for u in users],
    [np.mean(user_tox[u]) if user_tox[u] else 0 for u in users]
]).T

hetero = HeteroData()
hetero['comment'].x = torch.tensor(embeddings, dtype=torch.float)
hetero['comment'].y = torch.tensor(df['ToxicBinary'].values, dtype=torch.long)
hetero['user'].x = torch.tensor(user_feat, dtype=torch.float)

# Build edge indices (guard for any stray IDs)
src = [user_idx[str(u)] for u, c in authored_edges if str(u) in user_idx and str(c) in comment_idx]
dst = [comment_idx[str(c)] for u, c in authored_edges if str(u) in user_idx and str(c) in comment_idx]
hetero['user','authored','comment'].edge_index = torch.tensor([src, dst], dtype=torch.long)

r_src = [comment_idx[str(p)] for p, c in reply_pairs if str(p) in comment_idx and str(c) in comment_idx]
r_dst = [comment_idx[str(c)] for p, c in reply_pairs if str(p) in comment_idx and str(c) in comment_idx]
hetero['comment','replies_to','comment'].edge_index = torch.tensor([r_src, r_dst], dtype=torch.long)

# Optional: add kNN similarity edges among comments to densify the graph
if config.get('add_knn_similarity', False):
    max_nodes = config.get('knn_max_nodes', 50000)
    emb_tensor = hetero['comment'].x
    total_nodes = emb_tensor.size(0)
    if total_nodes > 10:  # only if meaningful
        if total_nodes > max_nodes:
            # sample a subset for similarity graph; map back indices
            sample_idx = torch.randperm(total_nodes)[:max_nodes]
            emb_sample = emb_tensor[sample_idx]
            base_indices = sample_idx
        else:
            emb_sample = emb_tensor
            base_indices = torch.arange(total_nodes)
        # Normalize and compute approximate cosine similarity via inner product
        with torch.no_grad():
            normed = torch.nn.functional.normalize(emb_sample, p=2, dim=1)
            # chunked to control memory
            k = config.get('knn_k', 5)
            edges_sim_src = []
            edges_sim_dst = []
            chunk = 2048
            for start in range(0, normed.size(0), chunk):
                blk = normed[start:start+chunk]
                sim = blk @ normed.T  # [chunk, N]
                topk = torch.topk(sim, k=k+1, dim=1).indices  # include self then filter
                base_rows = base_indices[start:start+chunk]
                for row_i, neighs in zip(base_rows.tolist(), topk):
                    for n in neighs.tolist():
                        if base_indices[n] != row_i:  # skip self
                            edges_sim_src.append(row_i)
                            edges_sim_dst.append(base_indices[n].item())
            if edges_sim_src:
                hetero['comment','similar','comment'].edge_index = torch.tensor([edges_sim_src, edges_sim_dst], dtype=torch.long)
                print(f"Added similarity edges: {len(edges_sim_src)}")

print(hetero)

  from .autonotebook import tqdm as notebook_tqdm


Added similarity edges: 250257
HeteroData(
  comment={
    x=[1032225, 4],
    y=[1032225],
  },
  user={ x=[759619, 2] },
  (user, authored, comment)={ edge_index=[2, 1032225] },
  (comment, replies_to, comment)={ edge_index=[2, 1027662] },
  (comment, similar, comment)={ edge_index=[2, 250257] }
)


## 6. Train/Val/Test Split

In [6]:
import torch
num_comments = hetero['comment'].num_nodes
perm = torch.randperm(num_comments)
n_train=int(config['train_val_test_split'][0]*num_comments)
n_val=int(config['train_val_test_split'][1]*num_comments)
train_idx=perm[:n_train]; val_idx=perm[n_train:n_train+n_val]; test_idx=perm[n_train+n_val:]
train_mask=torch.zeros(num_comments,dtype=torch.bool); train_mask[train_idx]=True
val_mask=torch.zeros(num_comments,dtype=torch.bool); val_mask[val_idx]=True
test_mask=torch.zeros(num_comments,dtype=torch.bool); test_mask[test_idx]=True
hetero['comment'].train_mask=train_mask
hetero['comment'].val_mask=val_mask
hetero['comment'].test_mask=test_mask
print('Split sizes:', train_mask.sum().item(), val_mask.sum().item(), test_mask.sum().item())

Split sizes: 825780 103222 103223


## 7. Model Definition

In [7]:
from torch_geometric.nn import HeteroConv, SAGEConv, Linear
import torch.nn.functional as F
import torch.nn as nn

# Build metadata-driven multi-layer hetero GNN with dropout
class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden, out_classes, num_layers=2, dropout=0.3, edge_types=None):
        super().__init__()
        self.num_layers = num_layers
        self.dropout = dropout
        self.hidden = hidden
        self.out_classes = out_classes
        rels = set(edge_types or [])
        if not rels:
            raise ValueError('HeteroGNN requires at least one edge type; got none.')
        # Type-wise input projection to hidden dim (lazy Linear infers input dim on first use)
        self.in_lins = nn.ModuleDict({
            'user': Linear(-1, hidden),
            'comment': Linear(-1, hidden),
        })
        # Helper to build relation dict depending on available relations
        def build_rel_dict():
            rel_dict = {}
            if ('user','authored','comment') in rels:
                rel_dict[('user','authored','comment')] = SAGEConv((hidden, hidden), hidden)
            if ('comment','replies_to','comment') in rels:
                rel_dict[('comment','replies_to','comment')] = SAGEConv((hidden, hidden), hidden)
            if ('comment','similar','comment') in rels:
                rel_dict[('comment','similar','comment')] = SAGEConv((hidden, hidden), hidden)
            return rel_dict
        # All layers operate on hidden-sized features
        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            self.convs.append(HeteroConv(build_rel_dict(), aggr='mean'))
        self.lin = Linear(hidden, out_classes)
    def forward(self, x_dict, edge_index_dict):
        # Project raw node features to hidden size
        x_dict = {k: self.in_lins[k](v) if k in self.in_lins else v for k, v in x_dict.items()}
        for i, conv in enumerate(self.convs):
            out_dict = conv(x_dict, edge_index_dict)
            # Merge: keep previous embeddings for node types not updated in this layer
            merged = {}
            for k in x_dict.keys():
                v = out_dict.get(k, x_dict[k])
                merged[k] = F.dropout(v.relu(), p=self.dropout, training=self.training)
            x_dict = merged
        return self.lin(x_dict['comment'])

model = HeteroGNN(
    config['gnn_hidden'],
    config['gnn_out_classes'],
    config['gnn_num_layers'],
    config['gnn_dropout'],
    edge_types=hetero.edge_types
).to(device)
# Ensure hetero data is also on the same device as model
hetero = hetero.to(device)
print(model)
print(f'Model and data both on: {device}')

HeteroGNN(
  (in_lins): ModuleDict(
    (user): Linear(-1, 128, bias=True)
    (comment): Linear(-1, 128, bias=True)
  )
  (convs): ModuleList(
    (0-1): 2 x HeteroConv(num_relations=3)
  )
  (lin): Linear(128, 2, bias=True)
)
Model and data both on: cpu




## 8. Training Hyperparameters & Optimizer

In [8]:
import torch.optim as optim
from sklearn.metrics import f1_score, accuracy_score
import torch

# Optional focal loss implementation
class FocalLoss(torch.nn.Module):
    def __init__(self, gamma=2.0, weight=None, reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction
    def forward(self, logits, target):
        ce = torch.nn.functional.cross_entropy(logits, target, weight=self.weight, reduction='none')
        pt = torch.exp(-ce)
        loss = ((1-pt)**self.gamma) * ce
        if self.reduction=='mean':
            return loss.mean()
        elif self.reduction=='sum':
            return loss.sum()
        return loss

# Class weights for imbalance
if config.get('use_class_weights', False):
    y_all = hetero['comment'].y.detach().cpu().numpy()
    counts = np.bincount(y_all)
    total = counts.sum()
    weights = total / (len(counts) * counts)
    class_w_tensor = torch.tensor(weights, dtype=torch.float, device=device)
else:
    class_w_tensor = None

if config.get('focal_loss', False):
    criterion = FocalLoss(gamma=config.get('focal_gamma', 2.0), weight=class_w_tensor)
else:
    criterion = torch.nn.CrossEntropyLoss(weight=class_w_tensor)

optimizer = optim.Adam(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])

best_metric=-1.0
patience=0
best_path = Path('model_best.pt')

## 9. Training Loop with Validation & Early Stopping

## 8.5. Device Sync and Diagnostic Check

In [9]:
# Force everything to CPU and verify device consistency
print("=== Device Sync Check ===")
print(f"Target device: {device}")

# Move hetero graph to CPU (may already be there, but ensure consistency)
hetero = hetero.cpu()
print(f"hetero.x_dict devices: {[f'{k}:{v.device}' for k,v in hetero.x_dict.items()]}")

# Recreate model and optimizer on CPU (fresh start)
model = HeteroGNN(
    config['gnn_hidden'],
    config['gnn_out_classes'],
    config['gnn_num_layers'],
    config['gnn_dropout'],
    edge_types=hetero.edge_types
).to(device)

# Recreate optimizer and criterion for CPU
optimizer = optim.Adam(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
if config.get('focal_loss', False):
    criterion = FocalLoss(gamma=config.get('focal_gamma', 2.0), weight=class_w_tensor.to(device) if class_w_tensor is not None else None)
else:
    criterion = torch.nn.CrossEntropyLoss(weight=class_w_tensor.to(device) if class_w_tensor is not None else None)

print(f"Model device: {next(model.parameters()).device}")
print(f"Class weights device: {class_w_tensor.device if class_w_tensor is not None else 'None'}")

# Test forward pass
print("Testing forward pass...")
with torch.no_grad():
    try:
        test_out = model(hetero.x_dict, hetero.edge_index_dict)
        print(f"✅ Forward pass successful! Output shape: {test_out.shape}")
    except Exception as e:
        print(f"❌ Forward pass failed: {e}")
        raise

print("=== Ready for training ===")

=== Device Sync Check ===
Target device: cpu
hetero.x_dict devices: ['comment:cpu', 'user:cpu']
Model device: cpu
Class weights device: cpu
Testing forward pass...




✅ Forward pass successful! Output shape: torch.Size([1032225, 2])
=== Ready for training ===


In [10]:
from time import time
history = []
for epoch in range(1, config['epochs']+1):
    model.train()
    optimizer.zero_grad()
    out = model(hetero.x_dict, hetero.edge_index_dict)
    y = hetero['comment'].y
    loss = criterion(out[hetero['comment'].train_mask], y[hetero['comment'].train_mask])
    loss.backward(); optimizer.step()

    # validation
    model.eval()
    with torch.no_grad():
        logits_eval = model(hetero.x_dict, hetero.edge_index_dict)
        val_logits = logits_eval[hetero['comment'].val_mask]
        val_y = y[hetero['comment'].val_mask]
        preds = val_logits.argmax(dim=1).cpu().numpy()
        val_y_np = val_y.cpu().numpy()
        f1 = f1_score(val_y_np, preds, average='macro')
        acc = accuracy_score(val_y_np, preds)
    history.append({'epoch':epoch,'loss':float(loss.item()),'val_f1':float(f1),'val_acc':float(acc)})
    print(f"Epoch {epoch:02d} | loss {loss.item():.4f} | val_f1 {f1:.4f} | val_acc {acc:.4f}")

    if f1>best_metric:
        best_metric=f1; patience=0
        torch.save({'model_state': model.state_dict(), 'config': config}, best_path)
        print('  ✅ Saved new best model (F1={:.4f})'.format(f1))
    else:
        patience+=1
        if patience>=config['early_stopping_patience']:
            print('Early stopping triggered.')
            break

import pandas as _pd
hist_df = _pd.DataFrame(history)
hist_df.head()

Epoch 01 | loss 292.2709 | val_f1 0.4851 | val_acc 0.8205
  ✅ Saved new best model (F1=0.4851)
Epoch 02 | loss 605.9946 | val_f1 0.4878 | val_acc 0.9525
  ✅ Saved new best model (F1=0.4878)
Epoch 02 | loss 605.9946 | val_f1 0.4878 | val_acc 0.9525
  ✅ Saved new best model (F1=0.4878)
Epoch 03 | loss 372.6259 | val_f1 0.4503 | val_acc 0.6957
Epoch 03 | loss 372.6259 | val_f1 0.4503 | val_acc 0.6957
Epoch 04 | loss 557.4712 | val_f1 0.0454 | val_acc 0.0475
Epoch 04 | loss 557.4712 | val_f1 0.0454 | val_acc 0.0475
Epoch 05 | loss 255.8391 | val_f1 0.0454 | val_acc 0.0475
Epoch 05 | loss 255.8391 | val_f1 0.0454 | val_acc 0.0475
Epoch 06 | loss 458.1595 | val_f1 0.0454 | val_acc 0.0475
Epoch 06 | loss 458.1595 | val_f1 0.0454 | val_acc 0.0475
Epoch 07 | loss 216.5446 | val_f1 0.2234 | val_acc 0.2462
Early stopping triggered.
Epoch 07 | loss 216.5446 | val_f1 0.2234 | val_acc 0.2462
Early stopping triggered.


Unnamed: 0,epoch,loss,val_f1,val_acc
0,1,292.270905,0.485107,0.820542
1,2,605.994629,0.487834,0.952491
2,3,372.625854,0.450315,0.695724
3,4,557.471191,0.045354,0.047509
4,5,255.839096,0.045354,0.047509


## 10. Test Evaluation (Best Model)

In [11]:
# Load best and evaluate on test
state = torch.load(best_path, map_location=device)
model.load_state_dict(state['model_state'])
model.eval()
with torch.no_grad():
    logits = model(hetero.x_dict, hetero.edge_index_dict)
    test_logits = logits[hetero['comment'].test_mask]
    test_y = hetero['comment'].y[hetero['comment'].test_mask]
    preds = test_logits.argmax(dim=1).cpu().numpy()
    y_true = test_y.cpu().numpy()
    test_f1 = f1_score(y_true, preds, average='macro')
    test_acc = accuracy_score(y_true, preds)
print(f'Test F1={test_f1:.4f} Acc={test_acc:.4f}')

Test F1=0.4880 Acc=0.9532


## 11. Serialize Artifacts (Config + Model)

In [14]:
import json, hashlib
artifacts_dir=Path('artifacts'); artifacts_dir.mkdir(exist_ok=True)
json.dump(config, open(artifacts_dir/'config.json','w'), indent=2)
# copy model file
import shutil
shutil.copy(best_path, artifacts_dir/'model_best.pt')
# hash
h=hashlib.sha256(open(artifacts_dir/'model_best.pt','rb').read()).hexdigest()
print('Model SHA256:', h)

Model SHA256: da3f767d20b6cff63f19bfb4b74005afda95e406228e0cb20f703e5b3fc1d301


## 11b. Task A: Reply Edge Classification (Abusive vs Non-Abusive)

In [15]:
# Prepare edge dataset: label is child comment toxicity with balancing options
import torch, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import random

edge_src = np.array(r_src)
edge_dst = np.array(r_dst)

if edge_src.size == 0:
    print('No reply edges available; skipping edge classification.')
else:
    model.eval()
    with torch.no_grad():
        # Compute hidden embeddings consistent with model.forward (without final classifier)
        x_dict_emb = {k: model.in_lins[k](v) if k in model.in_lins else v for k, v in hetero.x_dict.items()}
        for conv in model.convs:
            out_dict = conv(x_dict_emb, hetero.edge_index_dict)
            x_dict_emb = {k: out_dict.get(k, x_dict_emb[k]).relu() for k in x_dict_emb.keys()}
        hidden_comment = x_dict_emb['comment']
    feat = hidden_comment.detach().cpu().numpy()

    X_edge = np.concatenate([feat[edge_src], feat[edge_dst]], axis=1)
    y_edge = hetero['comment'].y.detach().cpu().numpy()[edge_dst]

    # Balance edges if configured
    if config.get('edge_classifier_balance', True):
        idx_pos = np.where(y_edge==1)[0]
        idx_neg = np.where(y_edge==0)[0]
        if len(idx_pos) > 0 and len(idx_neg) > 0:
            neg_sample = np.random.choice(idx_neg, size=min(len(idx_neg), max(len(idx_pos)*3, 1)), replace=False)
            keep = np.concatenate([idx_pos, neg_sample])
            np.random.shuffle(keep)
            X_edge = X_edge[keep]
            y_edge = y_edge[keep]
            print(f'Edge balancing applied: pos {len(idx_pos)}, sampled neg {len(neg_sample)} -> total {len(keep)}')

    X_train, X_test, y_train, y_test = train_test_split(
        X_edge, y_edge, test_size=0.2, stratify=y_edge if len(np.unique(y_edge))>1 else None, random_state=42
    )

    import torch.nn as nn
    class EdgeMLP(nn.Module):
        def __init__(self, in_dim, hidden=256, dropout=0.3):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(in_dim, hidden), nn.ReLU(), nn.Dropout(dropout),
                nn.Linear(hidden, hidden//2), nn.ReLU(), nn.Dropout(dropout/2),
                nn.Linear(hidden//2, 2)
            )
        def forward(self, x):
            return self.net(x)

    in_dim = X_train.shape[1]
    clf = EdgeMLP(in_dim).to(device)
    opt = torch.optim.Adam(clf.parameters(), lr=1e-3, weight_decay=1e-4)
    lossf = nn.CrossEntropyLoss()

    Xtr = torch.tensor(X_train, dtype=torch.float, device=device); ytr = torch.tensor(y_train, dtype=torch.long, device=device)
    Xte = torch.tensor(X_test, dtype=torch.float, device=device); yte = torch.tensor(y_test, dtype=torch.long, device=device)

    epochs_edge = config.get('edge_classifier_epochs', 20)
    for epoch in range(1, epochs_edge+1):
        clf.train(); opt.zero_grad(); out = clf(Xtr); loss = lossf(out, ytr); loss.backward(); opt.step()
        if epoch%5==0 or epoch==1:
            clf.eval(); pred = clf(Xte).argmax(1); acc=(pred==yte).float().mean().item(); print(f'Edge Epoch {epoch} loss {loss.item():.4f} test_acc {acc:.4f}')

    clf.eval(); pred = clf(Xte).argmax(1).cpu().numpy()
    print(classification_report(y_test, pred, digits=3))

    from sklearn.metrics import precision_recall_fscore_support
    p, r, f1, s = precision_recall_fscore_support(y_test, pred, labels=[0,1], zero_division=0)
    edge_report = {
        'labels': ['non_abusive','abusive'],
        'precision': p.tolist(), 'recall': r.tolist(), 'f1': f1.tolist(), 'support': s.tolist()
    }
    Path('artifacts').mkdir(exist_ok=True)
    json.dump(edge_report, open('artifacts/edge_clf_report.json','w'), indent=2)
    print('Wrote artifacts/edge_clf_report.json')

Edge balancing applied: pos 47812, sampled neg 143436 -> total 191248
Edge Epoch 1 loss 41.5258 test_acc 0.7500
Edge Epoch 1 loss 41.5258 test_acc 0.7500
Edge Epoch 5 loss 23.4409 test_acc 0.2511
Edge Epoch 5 loss 23.4409 test_acc 0.2511
Edge Epoch 10 loss 21.0688 test_acc 0.7500
Edge Epoch 10 loss 21.0688 test_acc 0.7500
Edge Epoch 15 loss 9.1223 test_acc 0.4808
Edge Epoch 15 loss 9.1223 test_acc 0.4808
Edge Epoch 20 loss 5.2507 test_acc 0.7268
              precision    recall  f1-score   support

           0      0.750     0.954     0.840     28688
           1      0.245     0.045     0.075      9562

    accuracy                          0.727     38250
   macro avg      0.497     0.499     0.458     38250
weighted avg      0.624     0.727     0.649     38250

Wrote artifacts/edge_clf_report.json
Edge Epoch 20 loss 5.2507 test_acc 0.7268
              precision    recall  f1-score   support

           0      0.750     0.954     0.840     28688
           1      0.245     0.045  

## 11c. Motif Counting and k-core (Gang-up Indicators)

In [None]:
# Convert to NetworkX and compute motifs + k-core (with safety caps)
import networkx as nx
from collections import Counter
from pathlib import Path
import json, math

G = nx.DiGraph()
G.add_nodes_from(range(hetero['comment'].num_nodes))
if 'comment' in hetero.node_types and ('comment','replies_to','comment') in hetero.edge_types:
    eidx = hetero['comment','replies_to','comment'].edge_index.detach().cpu().numpy()
    for u,v in zip(eidx[0], eidx[1]):
        G.add_edge(int(u), int(v))

size_cap = config.get('motif_max_nodes', 200000)
large_graph = G.number_of_nodes() > size_cap

import numpy as np
y_np = hetero['comment'].y.detach().cpu().numpy()
abusive_edges = [(u,v) for u,v in G.edges() if y_np[v] == 1]

# If graph huge, downsample a subgraph for triadic census
def maybe_subgraph(Gd):
    if not large_graph:
        return Gd, False
    # sample nodes with activity bias: pick nodes with out-degree >0 first
    deg_nodes = [n for n,d in Gd.out_degree() if d>0]
    if len(deg_nodes) < size_cap:
        chosen = deg_nodes + [n for n in Gd.nodes() if n not in deg_nodes][:size_cap-len(deg_nodes)]
    else:
        chosen = deg_nodes[:size_cap]
    return Gd.subgraph(chosen).copy(), True

G_eval, is_sampled = maybe_subgraph(G)

abusive_only = nx.DiGraph()
if abusive_edges:
    abusive_only.add_nodes_from(G_eval.nodes())
    abusive_only.add_edges_from([e for e in abusive_edges if e[0] in G_eval and e[1] in G_eval])
else:
    abusive_only.add_nodes_from(G_eval.nodes())


def graph_summary(Gd: nx.DiGraph, label: str):
    if Gd.number_of_nodes() == 0:
        return {
            'n_nodes': 0, 'n_edges': 0, 'density': 0.0,
            'avg_clustering': 0.0, 'triadic_census': {}, 'kcore_max': 0,
            'reciprocity': 0.0, 'sampled': is_sampled if label=='all_edges' else False
        }
    und = Gd.to_undirected()
    dens = nx.density(und)
    try:
        avg_clust = nx.average_clustering(und) if und.number_of_edges() > 0 else 0.0
    except Exception:
        avg_clust = 0.0
    # Triadic census only if small enough (networkx triadic_census is O(n^3) worst-case)
    triad = {}
    if Gd.number_of_nodes() <= 50000 and Gd.number_of_edges() > 0:
        try:
            triad = nx.triadic_census(Gd)
        except Exception:
            tri = sum(nx.triangles(und).values()) // 3 if und.number_of_edges() > 0 else 0
            triad = {'triangles': int(tri)}
    else:
        triad = {'skipped': True}
    try:
        core_nums = nx.core_number(und) if und.number_of_edges() > 0 else {}
        kcore_max = int(max(core_nums.values())) if core_nums else 0
    except Exception:
        kcore_max = 0
    try:
        reciprocity = nx.reciprocity(Gd)
        reciprocity = float(reciprocity) if reciprocity is not None else 0.0
    except Exception:
        reciprocity = 0.0
    return {
        'n_nodes': Gd.number_of_nodes(),
        'n_edges': Gd.number_of_edges(),
        'density': float(dens),
        'avg_clustering': float(avg_clust),
        'triadic_census': {k:int(v) if isinstance(v,(int,float)) else v for k,v in triad.items()},
        'kcore_max': kcore_max,
        'reciprocity': reciprocity,
        'sampled': is_sampled if label=='all_edges' else False
    }

report = {}
report['all_edges'] = graph_summary(G_eval, 'all_edges')
report['abusive_only'] = graph_summary(abusive_only, 'abusive_only')

Path('artifacts').mkdir(exist_ok=True)
with open('artifacts/motifs_kcore_report.json','w') as f:
    json.dump(report, f, indent=2)
print('Wrote artifacts/motifs_kcore_report.json')

Wrote artifacts/motifs_kcore_report.json


## 12. Task B: User–User Graph, Communities, Polarization

In [None]:
# Build user-user graph from reply edges; run communities; compute metrics with filtering & caps
import networkx as nx
import json
from pathlib import Path
from collections import defaultdict

comment_user = df.set_index(CID_COL)[USER_COL].to_dict()
U = nx.DiGraph()
U.add_nodes_from(users)
for p, c in zip(r_src, r_dst):
    if p < len(comment_ids) and c < len(comment_ids):
        up = comment_user.get(comment_ids[p]); uc = comment_user.get(comment_ids[c])
        if up is None or uc is None or up == uc:
            continue
        U.add_edge(up, uc)

# Degree filter before community detection
min_deg = config.get('community_min_degree', 2)
if min_deg > 0:
    active_nodes = [n for n,d in U.degree() if d >= min_deg]
    U_sub = U.subgraph(active_nodes).copy()
else:
    U_sub = U

# Size cap
max_nodes_comm = config.get('community_max_nodes', 100000)
if U_sub.number_of_nodes() > max_nodes_comm:
    # sample nodes with highest degree
    deg_sorted = sorted(U_sub.degree(), key=lambda x: x[1], reverse=True)[:max_nodes_comm]
    keep = set(n for n,_ in deg_sorted)
    U_sub = U_sub.subgraph(keep).copy()
    sampled_flag = True
else:
    sampled_flag = False

# Toxic-only edges (based on child comment toxicity)
y_np = hetero['comment'].y.detach().cpu().numpy()
U_toxic = nx.DiGraph()
U_toxic.add_nodes_from(U_sub.nodes())
for p, c in zip(r_src, r_dst):
    if p < len(comment_ids) and c < len(comment_ids) and y_np[c] == 1:
        up = comment_user.get(comment_ids[p]); uc = comment_user.get(comment_ids[c])
        if up is None or uc is None or up == uc:
            continue
        if up in U_sub and uc in U_sub:
            U_toxic.add_edge(up, uc)

Und = U_sub.to_undirected()
Und_t = U_toxic.to_undirected()

try:
    from networkx.algorithms.community import greedy_modularity_communities
    comms = list(greedy_modularity_communities(Und)) if Und.number_of_edges()>0 else []
    comms_t = list(greedy_modularity_communities(Und_t)) if Und_t.number_of_edges()>0 else []
except Exception:
    comms, comms_t = [], []

part = {}
for i, com in enumerate(comms):
    for u in com:
        part[u] = i
part_t = {}
for i, com in enumerate(comms_t):
    for u in com:
        part_t[u] = i

mod = None; mod_t = None
try:
    from networkx.algorithms.community.quality import modularity
    if comms:
        mod = float(modularity(Und, comms))
    if comms_t:
        mod_t = float(modularity(Und_t, comms_t))
except Exception:
    pass

def ei_index(Gd: nx.DiGraph, partition: dict):
    if Gd.number_of_edges() == 0:
        return 0.0
    internal = external = 0
    for u, v in Gd.edges():
        cu = partition.get(u, -1); cv = partition.get(v, -1)
        if cu == -1 or cv == -1:
            continue
        if cu == cv:
            internal += 1
        else:
            external += 1
    denom = internal + external
    return float((external - internal) / denom) if denom > 0 else 0.0

ei = ei_index(U_sub, part)
ei_t = ei_index(U_toxic, part_t if part_t else part)

Path('artifacts').mkdir(exist_ok=True)
# partition export restricted to nodes in filtered graph
import pandas as pd
part_rows = [(u, part.get(u, -1)) for u in U_sub.nodes()]
part_t_rows = [(u, part_t.get(u, -1)) for u in U_sub.nodes()]

pd.DataFrame(part_rows, columns=['user','community']).to_csv('artifacts/user_partition.csv', index=False)
pd.DataFrame(part_t_rows, columns=['user','community_toxic']).to_csv('artifacts/user_partition_toxic.csv', index=False)

comm_report = {
    'n_users_original': int(len(users)),
    'n_users_filtered': int(U_sub.number_of_nodes()),
    'sampled': sampled_flag,
    'min_degree_filter': min_deg,
    'user_edges_all_filtered': int(U_sub.number_of_edges()),
    'user_edges_toxic_filtered': int(U_toxic.number_of_edges()),
    'n_communities_all': int(len(comms)),
    'n_communities_toxic': int(len(comms_t)),
    'modularity_all': mod,
    'modularity_toxic': mod_t,
    'ei_index_all': ei,
    'ei_index_toxic': ei_t
}
with open('artifacts/communities_report.json','w') as f:
    json.dump(comm_report, f, indent=2)
print('Wrote artifacts/user_partition.csv, user_partition_toxic.csv, communities_report.json')

Wrote artifacts/user_partition.csv, user_partition_toxic.csv, communities_report.json
