# Toxic Comment Heterogeneous GNN Training Notebook




## 1. Load Context Configuration (Parse context.md)


In [93]:
import re, json, os, textwrap
from pathlib import Path
import pandas as pd

CONTEXT_FILE = Path('context.md')
assert CONTEXT_FILE.exists(), 'context.md not found in workspace.'
raw = CONTEXT_FILE.read_text(encoding='utf-8')

# heuristic extraction of CSV path
m = re.search(r'youtube_comments_with_toxicity_(\d+_\d+)\.csv', raw)
if m:
    csv_candidates = list(Path('Notebooks').glob(f'youtube_comments_with_toxicity_{m.group(1)}.csv'))
else:
    csv_candidates = list(Path('Notebooks').glob('youtube_comments_with_toxicity_*.csv'))
DATA_CSV = csv_candidates[0] if csv_candidates else None

config = {
    'data_csv': str(DATA_CSV) if DATA_CSV else None,
    'user_col': 'AuthorChannelID',
    'comment_col': 'CommentText',
    'comment_id_col': 'CommentID',
    'parent_col': 'ParentCommentID',
    'label_col': 'ToxicLabel',
    'score_col': 'ToxicScore',
    'binary_label_col': 'ToxicBinary',
    'embedding_model': 'all-MiniLM-L6-v2',
    'gnn_hidden': 128,
    'gnn_out_classes': 2,
    'gnn_dropout': 0.3,
    'gnn_num_layers': 2,
    'use_class_weights': True,
    'focal_loss': False,          # set True to experiment later
    'focal_gamma': 2.0,
    'add_knn_similarity': True,
    'knn_k': 5,
    'knn_max_nodes': 50000,       # cap for kNN (sampled subset when many comments)
    'train_val_test_split': [0.8,0.1,0.1],
    'primary_metric': 'f1',
    'epochs': 30,
    'lr': 1e-3,
    'weight_decay': 1e-5,
    'early_stopping_patience': 5,
    'batch_size_node_loader': 1024,
    'neighbors': 10,
    'community_min_degree': 2,    # filter users below this degree before community detection
    'community_max_nodes': 100000, # skip / downsample if exceeds
    'motif_max_nodes': 200000,    # skip detailed motif / triad census if bigger
    'edge_classifier_epochs': 20,
    'edge_classifier_balance': True,
    'seed': 42
}
print('CONFIG =>')
print(json.dumps(config, indent=2))

if not DATA_CSV:
    raise FileNotFoundError('Could not locate toxicity CSV. Please place it under Notebooks/.')

df = pd.read_csv(DATA_CSV)
print('Loaded dataframe shape:', df.shape)
df.head(2)

CONFIG =>
{
  "data_csv": "Notebooks/youtube_comments_with_toxicity_20250914_061551.csv",
  "user_col": "AuthorChannelID",
  "comment_col": "CommentText",
  "comment_id_col": "CommentID",
  "parent_col": "ParentCommentID",
  "label_col": "ToxicLabel",
  "score_col": "ToxicScore",
  "binary_label_col": "ToxicBinary",
  "embedding_model": "all-MiniLM-L6-v2",
  "gnn_hidden": 128,
  "gnn_out_classes": 2,
  "gnn_dropout": 0.3,
  "gnn_num_layers": 2,
  "use_class_weights": true,
  "focal_loss": false,
  "focal_gamma": 2.0,
  "add_knn_similarity": true,
  "knn_k": 5,
  "knn_max_nodes": 50000,
  "train_val_test_split": [
    0.8,
    0.1,
    0.1
  ],
  "primary_metric": "f1",
  "epochs": 30,
  "lr": 0.001,
  "weight_decay": 1e-05,
  "early_stopping_patience": 5,
  "batch_size_node_loader": 1024,
  "neighbors": 10,
  "community_min_degree": 2,
  "community_max_nodes": 100000,
  "motif_max_nodes": 200000,
  "edge_classifier_epochs": 20,
  "edge_classifier_balance": true,
  "seed": 42
}
Loaded d

Unnamed: 0,CommentID,VideoID,VideoTitle,AuthorName,AuthorChannelID,CommentText,Sentiment,Likes,Replies,PublishedAt,CountryCode,CategoryID,ToxicLabel,ToxicScore
0,UgyRjrEdJIPrf68uND14AaABAg,mcY4M9gjtsI,They killed my friend.#tales #movie #shorts,@OneWhoWandered,UC_-UEXaBL1dqqUPGkDll49A,Anyone know what movie this is?,Neutral,0,2,2025-01-15 00:54:55,NZ,1,non-toxic,0.998745
1,UgxXxEIySAwnMNw8D7N4AaABAg,2vuXcw9SZbA,Man Utd conceding first penalty at home in yea...,@chiefvon3068,UCZ1LcZESjYqzaQRhjdZJFwg,The fact they're holding each other back while...,Positive,0,0,2025-01-13 23:51:46,AU,17,non-toxic,0.996063


In [94]:
# 1b. Config overrides for stability and toxic recall
# Safely override a few knobs without changing the original defaults above.
overrides = {
    'focal_loss': True,           # enable focal loss to up-weight hard (toxic) examples
    'focal_gamma': 2.0,           # default gamma
    'lr': 5e-4,                   # lower LR for stability
    'early_stopping_patience': 10,# allow more patience before stopping
    'epochs': 50                  # train a bit longer with early stopping
}
config.update(overrides)
print('Applied config overrides:')
print(json.dumps(overrides, indent=2))

Applied config overrides:
{
  "focal_loss": true,
  "focal_gamma": 2.0,
  "lr": 0.0005,
  "early_stopping_patience": 10,
  "epochs": 50
}


## 2. Environment Setup and Imports

In [95]:
import numpy as np
import torch, random
from pathlib import Path

REQUIRED = ['pandas','numpy','torch','sklearn','tqdm']
print('Python version OK')
print('Torch:', torch.__version__)

def set_seed(seed:int):
    random.seed(seed); np.random.seed(seed); torch.manual_seed(seed)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
set_seed(config.get('seed', 42))

device = torch.device('cpu')  # Force CPU to avoid MPS OOM
print('Using device:', device)

Python version OK
Torch: 2.8.0
Using device: cpu


## 3. Data Loading and Preprocessing

In [96]:
# Ensure required columns & create binary label
USER_COL=config['user_col']; COMMENT_COL=config['comment_col']; CID_COL=config['comment_id_col']
if 'ToxicBinary' not in df.columns:
    if df['ToxicLabel'].dtype==object:
        df['ToxicBinary']=df['ToxicLabel'].str.lower().str.startswith('toxic').astype(int)
    else:
        df['ToxicBinary']=(df['ToxicScore']>0.7).astype(int)

# Drop rows missing essentials
df = df.dropna(subset=[USER_COL, COMMENT_COL])

print('Rows after cleaning:', len(df))
print('Class balance:', df['ToxicBinary'].value_counts(normalize=True))

df.head(3)

Rows after cleaning: 1032225
Class balance: ToxicBinary
0    0.953549
1    0.046451
Name: proportion, dtype: float64


Unnamed: 0,CommentID,VideoID,VideoTitle,AuthorName,AuthorChannelID,CommentText,Sentiment,Likes,Replies,PublishedAt,CountryCode,CategoryID,ToxicLabel,ToxicScore,ToxicBinary
0,UgyRjrEdJIPrf68uND14AaABAg,mcY4M9gjtsI,They killed my friend.#tales #movie #shorts,@OneWhoWandered,UC_-UEXaBL1dqqUPGkDll49A,Anyone know what movie this is?,Neutral,0,2,2025-01-15 00:54:55,NZ,1,non-toxic,0.998745,0
1,UgxXxEIySAwnMNw8D7N4AaABAg,2vuXcw9SZbA,Man Utd conceding first penalty at home in yea...,@chiefvon3068,UCZ1LcZESjYqzaQRhjdZJFwg,The fact they're holding each other back while...,Positive,0,0,2025-01-13 23:51:46,AU,17,non-toxic,0.996063,0
2,UgxB0jh2Ur41mcXr5IB4AaABAg,papg2tsoFzg,Welcome to Javascript Course,@Abdulla-ip8qr,UCWBK35w5Swy1iF5xIbEyw3A,waiting next video will be?,Neutral,1,0,2020-07-06 13:18:16,IN,27,non-toxic,0.997976,0


## 4. Text Embeddings

In [97]:

try:
    from sentence_transformers import SentenceTransformer
    model_name = config['embedding_model']
    sbert = SentenceTransformer(model_name)
    texts = df[COMMENT_COL].astype(str).tolist()
    batch=256; embs=[]
    for i in range(0,len(texts),batch):
        embs.append(sbert.encode(texts[i:i+batch], show_progress_bar=False))
    import numpy as np
    embeddings = np.vstack(embs)
except Exception as e:
    print('Falling back to bag-of-words (hash) embeddings due to error:', e)
    import numpy as np, hashlib
    def hvec(t):
        h = hashlib.md5(t.encode()).hexdigest()
        return np.array([int(h[i:i+4],16)%10000 for i in range(0,16,4)],dtype=float)
    embeddings = np.vstack([hvec(t) for t in df[COMMENT_COL].astype(str)])

print('Embeddings shape:', embeddings.shape)

Falling back to bag-of-words (hash) embeddings due to error: No module named 'sentence_transformers'
Embeddings shape: (1032225, 4)
Embeddings shape: (1032225, 4)


In [98]:
# 4b. Text-only baseline on embeddings (logistic regression)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

X = embeddings
y = df['ToxicBinary'].values

# Prefer graph masks if already created; otherwise, create stratified splits here
try:
    train_ids = hetero['comment'].train_mask.cpu().numpy()
    val_ids = hetero['comment'].val_mask.cpu().numpy()
    test_ids = hetero['comment'].test_mask.cpu().numpy()
except Exception:
    idx = np.arange(len(y))
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, temp_idx = next(sss.split(idx, y))
    sss2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
    val_rel, test_rel = next(sss2.split(temp_idx, y[temp_idx]))
    val_idx = temp_idx[val_rel]; test_idx = temp_idx[test_rel]
    train_ids = np.zeros(len(y), dtype=bool); train_ids[train_idx]=True
    val_ids = np.zeros(len(y), dtype=bool); val_ids[val_idx]=True
    test_ids = np.zeros(len(y), dtype=bool); test_ids[test_idx]=True

logreg = LogisticRegression(max_iter=200, class_weight='balanced', n_jobs=-1)
logreg.fit(X[train_ids], y[train_ids])
probs = logreg.predict_proba(X[val_ids])[:,1]
# Threshold tuning on validation
best_t = 0.5; best_f1=-1
for t in [i/100 for i in range(5,96)]:
    pred = (probs>=t).astype(int)
    f1 = f1_score(y[val_ids], pred, average='binary')
    if f1>best_f1:
        best_f1, best_t = f1, t
print(f'Baseline (text-only) best val threshold {best_t:.2f} | F1={best_f1:.3f}')
# Evaluate on test
probs_test = logreg.predict_proba(X[test_ids])[:,1]
preds_test = (probs_test>=best_t).astype(int)
print('Baseline test report:\n', classification_report(y[test_ids], preds_test, digits=3))

python(6333) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6334) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6334) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6335) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6336) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6335) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6336) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6337) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6338) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6337) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(6338) MallocStackLoggin

Baseline (text-only) best val threshold 0.05 | F1=0.089
Baseline test report:
               precision    recall  f1-score   support

           0      0.000     0.000     0.000     98427
           1      0.046     1.000     0.089      4795

    accuracy                          0.046    103222
   macro avg      0.023     0.500     0.044    103222
weighted avg      0.002     0.046     0.004    103222



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


## 5. Build Graph (HeteroData)

In [99]:
import torch_geometric
from torch_geometric.data import HeteroData
import torch, numpy as np

# Reply edges heuristic if parent col missing (normalize IDs to str)
if config['parent_col'] in df.columns and df[config['parent_col']].notna().any():
    reply_pairs = (
        df[df[config['parent_col']].notna()][[config['parent_col'], config['comment_id_col']]]
        .astype(str)
        .values
        .tolist()
    )
else:
    reply_pairs = []
    if 'VideoID' in df.columns and 'PublishedAt' in df.columns:
        df_sorted = df.sort_values(['VideoID','PublishedAt'])
        for vid, group in df_sorted.groupby('VideoID'):
            ids = group[config['comment_id_col']].astype(str).tolist()
            for i in range(1, len(ids)):
                reply_pairs.append((ids[i-1], ids[i]))

# Canonicalize IDs to strings for consistent indexing
comment_ids = df[CID_COL].astype(str).tolist()
comment_idx = {cid: i for i, cid in enumerate(comment_ids)}
users = df[USER_COL].astype(str).unique().tolist()
user_idx = {u: i for i, u in enumerate(users)}

# user->comment authored edges (normalize to str)
authored_edges = [(str(row[USER_COL]), str(row[CID_COL])) for _, row in df.iterrows()]

# Build features for users
user_deg = {u: 0 for u in users}; user_tox = {u: [] for u in users}
for _, r in df.iterrows():
    u = str(r[USER_COL]); user_deg[u] += 1; user_tox[u].append(r['ToxicBinary'])
user_feat = np.vstack([
    [user_deg[u] for u in users],
    [np.mean(user_tox[u]) if user_tox[u] else 0 for u in users]
]).T

hetero = HeteroData()
hetero['comment'].x = torch.tensor(embeddings, dtype=torch.float)
hetero['comment'].y = torch.tensor(df['ToxicBinary'].values, dtype=torch.long)
hetero['user'].x = torch.tensor(user_feat, dtype=torch.float)

# Build edge indices (guard for any stray IDs)
src = [user_idx[str(u)] for u, c in authored_edges if str(u) in user_idx and str(c) in comment_idx]
dst = [comment_idx[str(c)] for u, c in authored_edges if str(u) in user_idx and str(c) in comment_idx]
hetero['user','authored','comment'].edge_index = torch.tensor([src, dst], dtype=torch.long)

r_src = [comment_idx[str(p)] for p, c in reply_pairs if str(p) in comment_idx and str(c) in comment_idx]
r_dst = [comment_idx[str(c)] for p, c in reply_pairs if str(p) in comment_idx and str(c) in comment_idx]
hetero['comment','replies_to','comment'].edge_index = torch.tensor([r_src, r_dst], dtype=torch.long)

# Optional: add kNN similarity edges among comments to densify the graph
if config.get('add_knn_similarity', False):
    max_nodes = config.get('knn_max_nodes', 50000)
    emb_tensor = hetero['comment'].x
    total_nodes = emb_tensor.size(0)
    if total_nodes > 10:  # only if meaningful
        if total_nodes > max_nodes:
            # sample a subset for similarity graph; map back indices
            sample_idx = torch.randperm(total_nodes)[:max_nodes]
            emb_sample = emb_tensor[sample_idx]
            base_indices = sample_idx
        else:
            emb_sample = emb_tensor
            base_indices = torch.arange(total_nodes)
        # Normalize and compute approximate cosine similarity via inner product
        with torch.no_grad():
            normed = torch.nn.functional.normalize(emb_sample, p=2, dim=1)
            # chunked to control memory
            k = config.get('knn_k', 5)
            edges_sim_src = []
            edges_sim_dst = []
            chunk = 2048
            for start in range(0, normed.size(0), chunk):
                blk = normed[start:start+chunk]
                sim = blk @ normed.T  # [chunk, N]
                topk = torch.topk(sim, k=k+1, dim=1).indices  # include self then filter
                base_rows = base_indices[start:start+chunk]
                for row_i, neighs in zip(base_rows.tolist(), topk):
                    for n in neighs.tolist():
                        if base_indices[n] != row_i:  # skip self
                            edges_sim_src.append(row_i)
                            edges_sim_dst.append(base_indices[n].item())
            if edges_sim_src:
                hetero['comment','similar','comment'].edge_index = torch.tensor([edges_sim_src, edges_sim_dst], dtype=torch.long)
                print(f"Added similarity edges: {len(edges_sim_src)}")

print(hetero)

Added similarity edges: 250257
HeteroData(
  comment={
    x=[1032225, 4],
    y=[1032225],
  },
  user={ x=[759619, 2] },
  (user, authored, comment)={ edge_index=[2, 1032225] },
  (comment, replies_to, comment)={ edge_index=[2, 1027662] },
  (comment, similar, comment)={ edge_index=[2, 250257] }
)


## 6. Train/Val/Test Split

In [100]:
import torch
num_comments = hetero['comment'].num_nodes
perm = torch.randperm(num_comments)
n_train=int(config['train_val_test_split'][0]*num_comments)
n_val=int(config['train_val_test_split'][1]*num_comments)
train_idx=perm[:n_train]; val_idx=perm[n_train:n_train+n_val]; test_idx=perm[n_train+n_val:]
train_mask=torch.zeros(num_comments,dtype=torch.bool); train_mask[train_idx]=True
val_mask=torch.zeros(num_comments,dtype=torch.bool); val_mask[val_idx]=True
test_mask=torch.zeros(num_comments,dtype=torch.bool); test_mask[test_idx]=True
hetero['comment'].train_mask=train_mask
hetero['comment'].val_mask=val_mask
hetero['comment'].test_mask=test_mask
print('Split sizes:', train_mask.sum().item(), val_mask.sum().item(), test_mask.sum().item())

Split sizes: 825780 103222 103223


In [101]:
# 6b. Stratified split by ToxicBinary for stability
from sklearn.model_selection import StratifiedShuffleSplit
import numpy as np

labels = hetero['comment'].y.detach().cpu().numpy()
idx = np.arange(len(labels))
# First split: train vs (val+test)
sss = StratifiedShuffleSplit(n_splits=1, test_size=config['train_val_test_split'][1]+config['train_val_test_split'][2], random_state=config.get('seed',42))
train_idx, temp_idx = next(sss.split(idx, labels))

# Second split: split temp into val and test
val_prop = config['train_val_test_split'][1]
val_test_total = config['train_val_test_split'][1] + config['train_val_test_split'][2]
val_size = val_prop / val_test_total if val_test_total > 0 else 0.5
sss2 = StratifiedShuffleSplit(n_splits=1, test_size=val_size, random_state=config.get('seed',42))
# Here, "test" from sss2 will represent the VAL set, and "train" will be TEST
test_rel, val_rel = next(sss2.split(temp_idx, labels[temp_idx]))
val_idx = temp_idx[val_rel]
test_idx = temp_idx[test_rel]

train_mask=torch.zeros(len(labels),dtype=torch.bool); train_mask[train_idx]=True
val_mask=torch.zeros(len(labels),dtype=torch.bool); val_mask[val_idx]=True
test_mask=torch.zeros(len(labels),dtype=torch.bool); test_mask[test_idx]=True
hetero['comment'].train_mask=train_mask
hetero['comment'].val_mask=val_mask
hetero['comment'].test_mask=test_mask

print('Stratified split sizes:', train_mask.sum().item(), val_mask.sum().item(), test_mask.sum().item())
print('Class dist train/val/test:', labels[train_mask.cpu().numpy()].mean(), labels[val_mask.cpu().numpy()].mean(), labels[test_mask.cpu().numpy()].mean())

Stratified split sizes: 825780 103223 103222
Class dist train/val/test: 0.046450628496694034 0.046452825436191544 0.046453275464532755
Class dist train/val/test: 0.046450628496694034 0.046452825436191544 0.046453275464532755


## 7. Model Definition

In [102]:
from torch_geometric.nn import HeteroConv, SAGEConv, Linear
import torch.nn.functional as F
import torch.nn as nn

# Build metadata-driven multi-layer hetero GNN with dropout
class HeteroGNN(torch.nn.Module):
    def __init__(self, hidden, out_classes, num_layers=2, dropout=0.3, edge_types=None):
        super().__init__()
        self.num_layers = num_layers
        self.dropout = dropout
        self.hidden = hidden
        self.out_classes = out_classes
        rels = set(edge_types or [])
        if not rels:
            raise ValueError('HeteroGNN requires at least one edge type; got none.')
        # Type-wise input projection to hidden dim (lazy Linear infers input dim on first use)
        self.in_lins = nn.ModuleDict({
            'user': Linear(-1, hidden),
            'comment': Linear(-1, hidden),
        })
        # Helper to build relation dict depending on available relations
        def build_rel_dict():
            rel_dict = {}
            if ('user','authored','comment') in rels:
                rel_dict[('user','authored','comment')] = SAGEConv((hidden, hidden), hidden)
            if ('comment','replies_to','comment') in rels:
                rel_dict[('comment','replies_to','comment')] = SAGEConv((hidden, hidden), hidden)
            if ('comment','similar','comment') in rels:
                rel_dict[('comment','similar','comment')] = SAGEConv((hidden, hidden), hidden)
            return rel_dict
        # All layers operate on hidden-sized features
        self.convs = torch.nn.ModuleList()
        for _ in range(num_layers):
            self.convs.append(HeteroConv(build_rel_dict(), aggr='mean'))
        self.lin = Linear(hidden, out_classes)
    def forward(self, x_dict, edge_index_dict):
        # Project raw node features to hidden size
        x_dict = {k: self.in_lins[k](v) if k in self.in_lins else v for k, v in x_dict.items()}
        for i, conv in enumerate(self.convs):
            out_dict = conv(x_dict, edge_index_dict)
            # Merge: keep previous embeddings for node types not updated in this layer
            merged = {}
            for k in x_dict.keys():
                v = out_dict.get(k, x_dict[k])
                merged[k] = F.dropout(v.relu(), p=self.dropout, training=self.training)
            x_dict = merged
        return self.lin(x_dict['comment'])

model = HeteroGNN(
    config['gnn_hidden'],
    config['gnn_out_classes'],
    config['gnn_num_layers'],
    config['gnn_dropout'],
    edge_types=hetero.edge_types
).to(device)
# Ensure hetero data is also on the same device as model
hetero = hetero.to(device)
print(model)
print(f'Model and data both on: {device}')

HeteroGNN(
  (in_lins): ModuleDict(
    (user): Linear(-1, 128, bias=True)
    (comment): Linear(-1, 128, bias=True)
  )
  (convs): ModuleList(
    (0-1): 2 x HeteroConv(num_relations=3)
  )
  (lin): Linear(128, 2, bias=True)
)
Model and data both on: cpu




## 8. Training Hyperparameters & Optimizer

In [103]:
import torch.optim as optim
from sklearn.metrics import f1_score, accuracy_score
import torch

# Optional focal loss implementation
class FocalLoss(torch.nn.Module):
    def __init__(self, gamma=2.0, weight=None, reduction='mean'):
        super().__init__()
        self.gamma = gamma
        self.weight = weight
        self.reduction = reduction
    def forward(self, logits, target):
        ce = torch.nn.functional.cross_entropy(logits, target, weight=self.weight, reduction='none')
        pt = torch.exp(-ce)
        loss = ((1-pt)**self.gamma) * ce
        if self.reduction=='mean':
            return loss.mean()
        elif self.reduction=='sum':
            return loss.sum()
        return loss

# Class weights for imbalance
if config.get('use_class_weights', False):
    y_all = hetero['comment'].y.detach().cpu().numpy()
    counts = np.bincount(y_all)
    total = counts.sum()
    weights = total / (len(counts) * counts)
    class_w_tensor = torch.tensor(weights, dtype=torch.float, device=device)
else:
    class_w_tensor = None

if config.get('focal_loss', False):
    criterion = FocalLoss(gamma=config.get('focal_gamma', 2.0), weight=class_w_tensor)
else:
    criterion = torch.nn.CrossEntropyLoss(weight=class_w_tensor)

optimizer = optim.Adam(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])

# Gradient clipping norm (define before training loop)
max_norm = 1.0

best_metric=-1.0
patience=0
best_path = Path('model_best.pt')

## 9. Training Loop with Validation & Early Stopping

## 8.5. Device Sync and Diagnostic Check

In [104]:
# Force everything to CPU and verify device consistency
print("=== Device Sync Check ===")
print(f"Target device: {device}")

# Move hetero graph to CPU (may already be there, but ensure consistency)
hetero = hetero.cpu()
print(f"hetero.x_dict devices: {[f'{k}:{v.device}' for k,v in hetero.x_dict.items()]}")

# Recreate model and optimizer on CPU (fresh start)
model = HeteroGNN(
    config['gnn_hidden'],
    config['gnn_out_classes'],
    config['gnn_num_layers'],
    config['gnn_dropout'],
    edge_types=hetero.edge_types
).to(device)

# Recreate optimizer and criterion for CPU
optimizer = optim.Adam(model.parameters(), lr=config['lr'], weight_decay=config['weight_decay'])
if config.get('focal_loss', False):
    criterion = FocalLoss(gamma=config.get('focal_gamma', 2.0), weight=class_w_tensor.to(device) if class_w_tensor is not None else None)
else:
    criterion = torch.nn.CrossEntropyLoss(weight=class_w_tensor.to(device) if class_w_tensor is not None else None)

print(f"Model device: {next(model.parameters()).device}")
print(f"Class weights device: {class_w_tensor.device if class_w_tensor is not None else 'None'}")

# Test forward pass
print("Testing forward pass...")
with torch.no_grad():
    try:
        test_out = model(hetero.x_dict, hetero.edge_index_dict)
        print(f"✅ Forward pass successful! Output shape: {test_out.shape}")
    except Exception as e:
        print(f"❌ Forward pass failed: {e}")
        raise

print("=== Ready for training ===")

=== Device Sync Check ===
Target device: cpu
hetero.x_dict devices: ['comment:cpu', 'user:cpu']
Model device: cpu
Class weights device: cpu
Testing forward pass...
✅ Forward pass successful! Output shape: torch.Size([1032225, 2])
=== Ready for training ===
✅ Forward pass successful! Output shape: torch.Size([1032225, 2])
=== Ready for training ===


In [None]:
from time import time
history = []
for epoch in range(1, config['epochs']+1):
    model.train()
    optimizer.zero_grad()
    out = model(hetero.x_dict, hetero.edge_index_dict)
    y = hetero['comment'].y
    loss = criterion(out[hetero['comment'].train_mask], y[hetero['comment'].train_mask])
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    optimizer.step()

    # validation
    model.eval()
    with torch.no_grad():
        logits_eval = model(hetero.x_dict, hetero.edge_index_dict)
        val_logits = logits_eval[hetero['comment'].val_mask]
        val_y = y[hetero['comment'].val_mask]
        preds = val_logits.argmax(dim=1).cpu().numpy()
        val_y_np = val_y.cpu().numpy()
        f1 = f1_score(val_y_np, preds, average='macro')
        acc = accuracy_score(val_y_np, preds)
    history.append({'epoch':epoch,'loss':float(loss.item()),'val_f1':float(f1),'val_acc':float(acc)})
    print(f"Epoch {epoch:02d} | loss {loss.item():.4f} | val_f1 {f1:.4f} | val_acc {acc:.4f}")

    if f1>best_metric:
        best_metric=f1; patience=0
        torch.save({'model_state': model.state_dict(), 'config': config}, best_path)
        print('   Saved new best model (F1={:.4f})'.format(f1))
    else:
        patience+=1
        if patience>=config['early_stopping_patience']:
            print('Early stopping triggered.')
            break

import pandas as _pd
hist_df = _pd.DataFrame(history)
hist_df.head()

Epoch 01 | loss 292.0178 | val_f1 0.4854 | val_acc 0.8169
  ✅ Saved new best model (F1=0.4854)
Epoch 02 | loss 568.3104 | val_f1 0.4861 | val_acc 0.8203
  ✅ Saved new best model (F1=0.4861)
Epoch 02 | loss 568.3104 | val_f1 0.4861 | val_acc 0.8203
  ✅ Saved new best model (F1=0.4861)
Epoch 03 | loss 354.9366 | val_f1 0.4881 | val_acc 0.9534
  ✅ Saved new best model (F1=0.4881)
Epoch 03 | loss 354.9366 | val_f1 0.4881 | val_acc 0.9534
  ✅ Saved new best model (F1=0.4881)
Epoch 04 | loss 607.1896 | val_f1 0.4917 | val_acc 0.9493
  ✅ Saved new best model (F1=0.4917)
Epoch 04 | loss 607.1896 | val_f1 0.4917 | val_acc 0.9493
  ✅ Saved new best model (F1=0.4917)
Epoch 05 | loss 273.7015 | val_f1 0.4661 | val_acc 0.7513
Epoch 05 | loss 273.7015 | val_f1 0.4661 | val_acc 0.7513
Epoch 06 | loss 301.7830 | val_f1 0.2055 | val_acc 0.2230
Epoch 06 | loss 301.7830 | val_f1 0.2055 | val_acc 0.2230
Epoch 07 | loss 215.7070 | val_f1 0.1703 | val_acc 0.1786
Epoch 07 | loss 215.7070 | val_f1 0.1703 | va

Unnamed: 0,epoch,loss,val_f1,val_acc
0,1,292.017792,0.485422,0.816882
1,2,568.310364,0.486095,0.820302
2,3,354.936615,0.488083,0.953441
3,4,607.189636,0.491725,0.949304
4,5,273.701538,0.466055,0.751344


In [106]:
# 9b. Gradient clipping for stability
max_norm = 1.0

# 9c. Threshold tuning helper
from sklearn.metrics import precision_recall_fscore_support

def tune_threshold(logits, y_true, metric='f1', average='binary'):
    # logits -> probabilities for class 1
    probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
    y_np = y_true.cpu().numpy()
    best_t, best_f1 = 0.5, -1
    for t in [i/100 for i in range(5, 96)]:
        pred = (probs >= t).astype(int)
        p, r, f1, _ = precision_recall_fscore_support(y_np, pred, average=average, zero_division=0)
        if f1 > best_f1:
            best_f1, best_t, best_p, best_r = f1, t, p, r
    return best_t, best_p, best_r, best_f1


## 10. Test Evaluation (Best Model)

In [107]:
# Load best and evaluate on test with threshold tuning
from sklearn.metrics import classification_report, confusion_matrix
state = torch.load(best_path, map_location=device)
model.load_state_dict(state['model_state'])
model.eval()
with torch.no_grad():
    logits = model(hetero.x_dict, hetero.edge_index_dict)
    test_logits = logits[hetero['comment'].test_mask]
    test_y = hetero['comment'].y[hetero['comment'].test_mask]

# Default 0.5 threshold metrics
preds_05 = test_logits.argmax(dim=1).cpu().numpy()
y_true = test_y.cpu().numpy()
from sklearn.metrics import f1_score, accuracy_score
f1_05 = f1_score(y_true, preds_05, average='macro')
acc_05 = accuracy_score(y_true, preds_05)
print(f'Test (argmax/0.5) MacroF1={f1_05:.4f} Acc={acc_05:.4f}')

# Tune threshold for toxic class F1
best_t, best_p, best_r, best_f1 = tune_threshold(test_logits, test_y, average='binary')
print(f'Best threshold: {best_t:.2f} | P={best_p:.3f} R={best_r:.3f} F1={best_f1:.3f}')

# Report per-class metrics at best threshold
probs = torch.softmax(test_logits, dim=1)[:,1].cpu().numpy()
preds_best = (probs >= best_t).astype(int)
print(classification_report(y_true, preds_best, digits=3))
print('Confusion matrix:\n', confusion_matrix(y_true, preds_best))

Test (argmax/0.5) MacroF1=0.4917 Acc=0.9497
Best threshold: 0.19 | P=0.059 R=0.006 F1=0.011
              precision    recall  f1-score   support

           0      0.954     0.995     0.974     98427
           1      0.059     0.006     0.011      4795

    accuracy                          0.949    103222
   macro avg      0.506     0.501     0.492    103222
weighted avg      0.912     0.949     0.929    103222

Confusion matrix:
 [[97977   450]
 [ 4767    28]]
Best threshold: 0.19 | P=0.059 R=0.006 F1=0.011
              precision    recall  f1-score   support

           0      0.954     0.995     0.974     98427
           1      0.059     0.006     0.011      4795

    accuracy                          0.949    103222
   macro avg      0.506     0.501     0.492    103222
weighted avg      0.912     0.949     0.929    103222

Confusion matrix:
 [[97977   450]
 [ 4767    28]]


In [108]:
# 10d. Temperature scaling calibration on validation
import torch
try:
    head
except NameError:
    # If head isn't trained yet, skip calibration and set T=1.0
    T = torch.tensor(1.0, device=device)
    print('Binary head not found; skipping temperature calibration and using T=1.0')
else:
    # Fit a scalar T that minimizes BCE on validation logits
    if 'T' not in globals():
        T = torch.nn.Parameter(torch.tensor(1.0, device=device))
    else:
        # ensure it's a Parameter for LBFGS
        T = torch.nn.Parameter(T.detach().clone().to(device))
    optim_T = torch.optim.LBFGS([T], lr=0.1, max_iter=50)

    val_logits_detached = None
    head.eval()
    with torch.no_grad():
        val_logits_detached = head(X_all[val_idx]).detach()
    val_y_detached = y_bin[val_idx].detach()

    def _closure():
        optim_T.zero_grad()
        probs = torch.sigmoid(val_logits_detached / T)
        # small epsilon to avoid log(0) in BCE; use BCE with logits equivalent by clamping probs
        eps = 1e-7
        probs = torch.clamp(probs, eps, 1-eps)
        loss = -(val_y_detached*torch.log(probs) + (1-val_y_detached)*torch.log(1-probs)).mean()
        loss.backward()
        return loss

    try:
        optim_T.step(_closure)
        print('Fitted temperature T =', float(T.item()))
    except Exception as e:
        print('Temperature scaling failed, using T=1.0. Error:', e)
        with torch.no_grad():
            T.copy_(torch.tensor(1.0, device=device))

Binary head not found; skipping temperature calibration and using T=1.0


In [109]:
# 10c. Tune threshold helper for binary logits (sigmoid)
def tune_threshold_binary(probs_np, y_np):
    best_t, best_f1, best_p, best_r = 0.5, -1, 0, 0
    from sklearn.metrics import precision_recall_fscore_support
    # finer/wider sweep including very low thresholds for recall
    grid = [i/1000 for i in range(1, 999)]
    for t in grid:
        pred = (probs_np >= t).astype(int)
        p, r, f1, _ = precision_recall_fscore_support(y_np, pred, average='binary', zero_division=0)
        if f1 > best_f1:
            best_t, best_f1, best_p, best_r = t, f1, p, r
    return best_t, best_p, best_r, best_f1

In [110]:
# 10b. Binary head training with BCEWithLogitsLoss + pos_weight and optional oversampling
import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import WeightedRandomSampler, TensorDataset, DataLoader

# 1) Get frozen comment embeddings from current GNN encoder (no classifier)
model.eval()
with torch.no_grad():
    x_dict_emb = {k: model.in_lins[k](v) if k in model.in_lins else v for k, v in hetero.x_dict.items()}
    for conv in model.convs:
        out_dict = conv(x_dict_emb, hetero.edge_index_dict)
        x_dict_emb = {k: out_dict.get(k, x_dict_emb[k]).relu() for k in x_dict_emb.keys()}
    hidden_comment = x_dict_emb['comment']  # [N, H]

y = hetero['comment'].y.to(device)
train_mask = hetero['comment'].train_mask.to(device)
val_mask = hetero['comment'].val_mask.to(device)
test_mask = hetero['comment'].test_mask.to(device)

X_all = hidden_comment.to(device)
y_bin = y.float()  # [0,1]

# 2) Binary head
class BinaryHead(nn.Module):
    def __init__(self, in_dim, hidden=128, dropout=0.3):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(hidden, 1)  # single logit
        )
    def forward(self, x):
        return self.net(x).squeeze(-1)  # [N]

head = BinaryHead(X_all.size(1), hidden=128, dropout=config.get('gnn_dropout',0.3)).to(device)
opt_head = torch.optim.Adam(head.parameters(), lr=config.get('lr', 5e-4), weight_decay=config.get('weight_decay',1e-5))

# 3) Loss with pos_weight
y_train_np = y_bin[train_mask].detach().cpu().numpy()
n_pos = max(1, int(y_train_np.sum()))
n_neg = max(1, int((~train_mask).logical_not().sum().item()))  # compute total train size properly
n_neg = int(train_mask.sum().item()) - n_pos
pos_weight = torch.tensor([max(1.0, n_neg / n_pos)], dtype=torch.float, device=device)
bce = nn.BCEWithLogitsLoss(pos_weight=pos_weight)
print('pos_weight:', float(pos_weight.item()))

# 4) Optional oversampling of positives in batches
train_idx = torch.where(train_mask)[0]
weights = torch.ones_like(train_idx, dtype=torch.float)
y_train = y_bin[train_mask]
# weight positives higher for sampling
weights[y_train == 1] = (n_neg / max(1, n_pos))
sampler = WeightedRandomSampler(weights.cpu().numpy(), num_samples=len(train_idx), replacement=True)
train_ds = TensorDataset(X_all[train_idx], y_train)
train_loader = DataLoader(train_ds, batch_size=2048, sampler=sampler, drop_last=False)

val_idx = torch.where(val_mask)[0]
test_idx = torch.where(test_mask)[0]

best_val_f1 = -1.0
best_state = None
epochs_bce = max(10, int(0.5 * config.get('epochs', 30)))  # shorter head training
for epoch in range(1, epochs_bce+1):
    head.train()
    total_loss = 0.0
    for xb, yb in train_loader:
        opt_head.zero_grad()
        logits = head(xb)
        loss = bce(logits, yb)
        loss.backward()
        nn.utils.clip_grad_norm_(head.parameters(), max_norm)
        opt_head.step()
        total_loss += float(loss.item())

    # Validate
    head.eval()
    with torch.no_grad():
        val_logits = head(X_all[val_idx])
        val_probs = torch.sigmoid(val_logits).cpu().numpy()
        val_true = y_bin[val_idx].cpu().numpy()
    # threshold sweep
    best_t, best_p, best_r, best_f1 = 0.5, 0, 0, -1
    for t in [i/100 for i in range(1, 99)]:
        pred = (val_probs >= t).astype(int)
        from sklearn.metrics import precision_recall_fscore_support
        p, r, f1, _ = precision_recall_fscore_support(val_true, pred, average='binary', zero_division=0)
        if f1 > best_f1:
            best_t, best_p, best_r, best_f1 = t, p, r, f1
    print(f'[BCE] Epoch {epoch:02d} | loss {total_loss:.4f} | val F1 {best_f1:.4f} @ thr {best_t:.2f}')
    if best_f1 > best_val_f1:
        best_val_f1 = best_f1
        best_state = head.state_dict()

# Save best head
if best_state is not None:
    head.load_state_dict(best_state)
print('Best val F1 (BCE head):', best_val_f1)

pos_weight: 20.528234481811523
[BCE] Epoch 01 | loss 4981.4717 | val F1 0.0889 @ thr 0.86
[BCE] Epoch 01 | loss 4981.4717 | val F1 0.0889 @ thr 0.86
[BCE] Epoch 02 | loss 859.2150 | val F1 0.0888 @ thr 0.89
[BCE] Epoch 02 | loss 859.2150 | val F1 0.0888 @ thr 0.89
[BCE] Epoch 03 | loss 853.3982 | val F1 0.0890 @ thr 0.96
[BCE] Epoch 03 | loss 853.3982 | val F1 0.0890 @ thr 0.96
[BCE] Epoch 04 | loss 847.2483 | val F1 0.0888 @ thr 0.94
[BCE] Epoch 04 | loss 847.2483 | val F1 0.0888 @ thr 0.94
[BCE] Epoch 05 | loss 845.2862 | val F1 0.0889 @ thr 0.94
[BCE] Epoch 05 | loss 845.2862 | val F1 0.0889 @ thr 0.94
[BCE] Epoch 06 | loss 842.0702 | val F1 0.0890 @ thr 0.94
[BCE] Epoch 06 | loss 842.0702 | val F1 0.0890 @ thr 0.94
[BCE] Epoch 07 | loss 837.7093 | val F1 0.0888 @ thr 0.71
[BCE] Epoch 07 | loss 837.7093 | val F1 0.0888 @ thr 0.71
[BCE] Epoch 08 | loss 836.0605 | val F1 0.0889 @ thr 0.91
[BCE] Epoch 08 | loss 836.0605 | val F1 0.0889 @ thr 0.91
[BCE] Epoch 09 | loss 833.9459 | val F1

In [111]:
# 10e. Test evaluation for BCE head (with temperature scaling + threshold tuning)
import torch
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from pathlib import Path

# Safety: ensure head exists (from 10b)
try:
    head
except NameError:
    raise RuntimeError("Binary head 'head' is not defined. Please run the binary head training cell above.")

# If temperature scaling (10d) wasn't run, default T to 1.0 on correct device
if 'T' not in globals():
    T = torch.tensor(1.0, device=next(head.parameters()).device)

head.eval()
with torch.no_grad():
    test_logits_head = head(X_all[test_idx])
    test_probs = torch.sigmoid(test_logits_head / T).cpu().numpy()
    test_true = y_bin[test_idx].cpu().numpy()

# Default 0.5
preds_05 = (test_probs >= 0.5).astype(int)
f1_05 = f1_score(test_true, preds_05, average='binary')
acc_05 = accuracy_score(test_true, preds_05)
print(f'[BCE] Test @0.5 | F1={f1_05:.4f} Acc={acc_05:.4f}')

# Tuned threshold
best_t, best_p, best_r, best_f1 = tune_threshold_binary(test_probs, test_true)
print(f'[BCE] Best threshold: {best_t:.3f} | P={best_p:.3f} R={best_r:.3f} F1={best_f1:.3f}')
preds_best = (test_probs >= best_t).astype(int)
print(classification_report(test_true, preds_best, digits=3))
print('Confusion matrix:\n', confusion_matrix(test_true, preds_best))

# Save head + temp
Path('artifacts').mkdir(exist_ok=True)
pos_w = float(pos_weight.item()) if 'pos_weight' in globals() else 1.0
T_val = float(T.item()) if hasattr(T, 'item') else float(T)
state = {'head_state': head.state_dict(), 'temperature': T_val, 'pos_weight': pos_w}
torch.save(state, 'artifacts/node_bce_head.pt')
print('Saved artifacts/node_bce_head.pt')

[BCE] Test @0.5 | F1=0.0888 Acc=0.0494
[BCE] Best threshold: 0.938 | P=0.047 R=0.991 F1=0.089
              precision    recall  f1-score   support

         0.0      0.963     0.011     0.021     98427
         1.0      0.047     0.991     0.089      4795

    accuracy                          0.056    103222
   macro avg      0.505     0.501     0.055    103222
weighted avg      0.920     0.056     0.024    103222

Confusion matrix:
 [[ 1053 97374]
 [   41  4754]]
Saved artifacts/node_bce_head.pt
[BCE] Best threshold: 0.938 | P=0.047 R=0.991 F1=0.089
              precision    recall  f1-score   support

         0.0      0.963     0.011     0.021     98427
         1.0      0.047     0.991     0.089      4795

    accuracy                          0.056    103222
   macro avg      0.505     0.501     0.055    103222
weighted avg      0.920     0.056     0.024    103222

Confusion matrix:
 [[ 1053 97374]
 [   41  4754]]
Saved artifacts/node_bce_head.pt


## 11. Serialize Artifacts (Config + Model)

In [112]:
import json, hashlib
artifacts_dir=Path('artifacts'); artifacts_dir.mkdir(exist_ok=True)
json.dump(config, open(artifacts_dir/'config.json','w'), indent=2)
# copy model file
import shutil
shutil.copy(best_path, artifacts_dir/'model_best.pt')
# hash
h=hashlib.sha256(open(artifacts_dir/'model_best.pt','rb').read()).hexdigest()
print('Model SHA256:', h)

Model SHA256: d98d8c2dc73ad07e119bd7b06b93b30d170e0283d504d1c782cbf299cfd816a1


## 11b. Task A: Reply Edge Classification (Abusive vs Non-Abusive)

In [113]:
# Prepare edge dataset: label is child comment toxicity with balancing options
import torch, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, precision_recall_fscore_support
import random
import json

edge_src = np.array(r_src)
edge_dst = np.array(r_dst)

if edge_src.size == 0:
    print('No reply edges available; skipping edge classification.')
else:
    model.eval()
    with torch.no_grad():
        # Compute hidden embeddings consistent with model.forward (without final classifier)
        x_dict_emb = {k: model.in_lins[k](v) if k in model.in_lins else v for k, v in hetero.x_dict.items()}
        for conv in model.convs:
            out_dict = conv(x_dict_emb, hetero.edge_index_dict)
            x_dict_emb = {k: out_dict.get(k, x_dict_emb[k]).relu() for k in x_dict_emb.keys()}
        hidden_comment = x_dict_emb['comment']
    feat = hidden_comment.detach().cpu().numpy()

    X_edge = np.concatenate([feat[edge_src], feat[edge_dst]], axis=1)
    y_edge = hetero['comment'].y.detach().cpu().numpy()[edge_dst]

    # Enforce 1:1 balance for edge training
    idx_pos = np.where(y_edge==1)[0]
    idx_neg = np.where(y_edge==0)[0]
    if len(idx_pos) == 0 or len(idx_neg) == 0:
        print('Insufficient class diversity for edge classifier; skipping.')
    else:
        n = min(len(idx_pos), len(idx_neg))
        pos_keep = np.random.choice(idx_pos, size=n, replace=False)
        neg_keep = np.random.choice(idx_neg, size=n, replace=False)
        keep = np.concatenate([pos_keep, neg_keep])
        np.random.shuffle(keep)
        X_edge = X_edge[keep]
        y_edge = y_edge[keep]
        print(f'Edge balancing 1:1 -> pos {n}, neg {n}, total {2*n}')

        X_train, X_test, y_train, y_test = train_test_split(
            X_edge, y_edge, test_size=0.2, stratify=y_edge, random_state=42
        )

        import torch.nn as nn
        class EdgeMLP(nn.Module):
            def __init__(self, in_dim, hidden=256, dropout=0.3):
                super().__init__()
                self.net = nn.Sequential(
                    nn.Linear(in_dim, hidden), nn.ReLU(), nn.Dropout(dropout),
                    nn.Linear(hidden, hidden//2), nn.ReLU(), nn.Dropout(dropout/2),
                    nn.Linear(hidden//2, 2)
                )
            def forward(self, x):
                return self.net(x)

        in_dim = X_train.shape[1]
        clf = EdgeMLP(in_dim).to(device)
        opt = torch.optim.Adam(clf.parameters(), lr=5e-4, weight_decay=1e-4)
        lossf = nn.CrossEntropyLoss(weight=torch.tensor([1.0,1.0], dtype=torch.float, device=device))

        Xtr = torch.tensor(X_train, dtype=torch.float, device=device); ytr = torch.tensor(y_train, dtype=torch.long, device=device)
        Xte = torch.tensor(X_test, dtype=torch.float, device=device); yte = torch.tensor(y_test, dtype=torch.long, device=device)

        epochs_edge = config.get('edge_classifier_epochs', 20)
        for epoch in range(1, epochs_edge+1):
            clf.train(); opt.zero_grad(); out = clf(Xtr); loss = lossf(out, ytr); loss.backward(); opt.step()
            if epoch%5==0 or epoch==1:
                clf.eval()
                with torch.no_grad():
                    logits_te = clf(Xte)
                    pred = logits_te.argmax(1)
                    acc=(pred==yte).float().mean().item()
                    print(f'Edge Epoch {epoch} loss {loss.item():.4f} test_acc {acc:.4f}')

        clf.eval()
        with torch.no_grad():
            logits_te = clf(Xte)
            probs = torch.softmax(logits_te, dim=1)[:,1].detach().cpu().numpy()
        # Tune threshold for toxic class
        best_t = 0.5; best_f1=-1; best_p=0; best_r=0
        for t in [i/100 for i in range(5,96)]:
            pred = (probs>=t).astype(int)
            p, r, f1, _ = precision_recall_fscore_support(y_test, pred, average='binary', zero_division=0)
            if f1>best_f1:
                best_t, best_f1, best_p, best_r = t, f1, p, r
        print(f'Edge best threshold {best_t:.2f} | P={best_p:.3f} R={best_r:.3f} F1={best_f1:.3f}')
        pred_best = (probs>=best_t).astype(int)
        print(classification_report(y_test, pred_best, digits=3))

        from pathlib import Path
        Path('artifacts').mkdir(exist_ok=True)
        edge_report = {
            'best_threshold': best_t,
            'precision_recall_f1_binary': {'precision': float(best_p), 'recall': float(best_r), 'f1': float(best_f1)}
        }
        json.dump(edge_report, open('artifacts/edge_clf_report.json','w'), indent=2)
        print('Wrote artifacts/edge_clf_report.json')

Edge balancing 1:1 -> pos 47812, neg 47812, total 95624
Edge Epoch 1 loss 91.0506 test_acc 0.5000
Edge Epoch 1 loss 91.0506 test_acc 0.5000
Edge Epoch 5 loss 35.3345 test_acc 0.4998
Edge Epoch 5 loss 35.3345 test_acc 0.4998
Edge Epoch 10 loss 27.6082 test_acc 0.5000
Edge Epoch 10 loss 27.6082 test_acc 0.5000
Edge Epoch 15 loss 21.8076 test_acc 0.5071
Edge Epoch 15 loss 21.8076 test_acc 0.5071
Edge Epoch 20 loss 15.4928 test_acc 0.4977
Edge best threshold 0.05 | P=0.500 R=0.781 F1=0.610
              precision    recall  f1-score   support

           0      0.500     0.219     0.305      9563
           1      0.500     0.781     0.610      9562

    accuracy                          0.500     19125
   macro avg      0.500     0.500     0.457     19125
weighted avg      0.500     0.500     0.457     19125

Wrote artifacts/edge_clf_report.json
Edge Epoch 20 loss 15.4928 test_acc 0.4977
Edge best threshold 0.05 | P=0.500 R=0.781 F1=0.610
              precision    recall  f1-score   supp

## 11c. Motif Counting and k-core (Gang-up Indicators)

In [114]:
# Convert to NetworkX and compute motifs + k-core (with safety caps)
import networkx as nx
from collections import Counter
from pathlib import Path
import json, math

G = nx.DiGraph()
G.add_nodes_from(range(hetero['comment'].num_nodes))
if 'comment' in hetero.node_types and ('comment','replies_to','comment') in hetero.edge_types:
    eidx = hetero['comment','replies_to','comment'].edge_index.detach().cpu().numpy()
    for u,v in zip(eidx[0], eidx[1]):
        G.add_edge(int(u), int(v))

size_cap = config.get('motif_max_nodes', 200000)
large_graph = G.number_of_nodes() > size_cap

import numpy as np
y_np = hetero['comment'].y.detach().cpu().numpy()
abusive_edges = [(u,v) for u,v in G.edges() if y_np[v] == 1]

# If graph huge, downsample a subgraph for triadic census
def maybe_subgraph(Gd):
    if not large_graph:
        return Gd, False
    # sample nodes with activity bias: pick nodes with out-degree >0 first
    deg_nodes = [n for n,d in Gd.out_degree() if d>0]
    if len(deg_nodes) < size_cap:
        chosen = deg_nodes + [n for n in Gd.nodes() if n not in deg_nodes][:size_cap-len(deg_nodes)]
    else:
        chosen = deg_nodes[:size_cap]
    return Gd.subgraph(chosen).copy(), True

G_eval, is_sampled = maybe_subgraph(G)

abusive_only = nx.DiGraph()
if abusive_edges:
    abusive_only.add_nodes_from(G_eval.nodes())
    abusive_only.add_edges_from([e for e in abusive_edges if e[0] in G_eval and e[1] in G_eval])
else:
    abusive_only.add_nodes_from(G_eval.nodes())


def graph_summary(Gd: nx.DiGraph, label: str):
    if Gd.number_of_nodes() == 0:
        return {
            'n_nodes': 0, 'n_edges': 0, 'density': 0.0,
            'avg_clustering': 0.0, 'triadic_census': {}, 'kcore_max': 0,
            'reciprocity': 0.0, 'sampled': is_sampled if label=='all_edges' else False
        }
    und = Gd.to_undirected()
    dens = nx.density(und)
    try:
        avg_clust = nx.average_clustering(und) if und.number_of_edges() > 0 else 0.0
    except Exception:
        avg_clust = 0.0
    # Triadic census only if small enough (networkx triadic_census is O(n^3) worst-case)
    triad = {}
    if Gd.number_of_nodes() <= 50000 and Gd.number_of_edges() > 0:
        try:
            triad = nx.triadic_census(Gd)
        except Exception:
            tri = sum(nx.triangles(und).values()) // 3 if und.number_of_edges() > 0 else 0
            triad = {'triangles': int(tri)}
    else:
        triad = {'skipped': True}
    try:
        core_nums = nx.core_number(und) if und.number_of_edges() > 0 else {}
        kcore_max = int(max(core_nums.values())) if core_nums else 0
    except Exception:
        kcore_max = 0
    try:
        reciprocity = nx.reciprocity(Gd)
        reciprocity = float(reciprocity) if reciprocity is not None else 0.0
    except Exception:
        reciprocity = 0.0
    return {
        'n_nodes': Gd.number_of_nodes(),
        'n_edges': Gd.number_of_edges(),
        'density': float(dens),
        'avg_clustering': float(avg_clust),
        'triadic_census': {k:int(v) if isinstance(v,(int,float)) else v for k,v in triad.items()},
        'kcore_max': kcore_max,
        'reciprocity': reciprocity,
        'sampled': is_sampled if label=='all_edges' else False
    }

report = {}
report['all_edges'] = graph_summary(G_eval, 'all_edges')
report['abusive_only'] = graph_summary(abusive_only, 'abusive_only')

Path('artifacts').mkdir(exist_ok=True)
with open('artifacts/motifs_kcore_report.json','w') as f:
    json.dump(report, f, indent=2)
print('Wrote artifacts/motifs_kcore_report.json')

Wrote artifacts/motifs_kcore_report.json


## 12. Task B: User–User Graph, Communities, Polarization

In [115]:
# Build user-user graph from reply edges; run communities; compute metrics with filtering & caps
import networkx as nx
import json
from pathlib import Path
from collections import defaultdict

comment_user = df.set_index(CID_COL)[USER_COL].to_dict()
U = nx.DiGraph()
U.add_nodes_from(users)
for p, c in zip(r_src, r_dst):
    if p < len(comment_ids) and c < len(comment_ids):
        up = comment_user.get(comment_ids[p]); uc = comment_user.get(comment_ids[c])
        if up is None or uc is None or up == uc:
            continue
        U.add_edge(up, uc)

# Degree filter before community detection
min_deg = config.get('community_min_degree', 2)
if min_deg > 0:
    active_nodes = [n for n,d in U.degree() if d >= min_deg]
    U_sub = U.subgraph(active_nodes).copy()
else:
    U_sub = U

# Size cap
max_nodes_comm = config.get('community_max_nodes', 100000)
if U_sub.number_of_nodes() > max_nodes_comm:
    # sample nodes with highest degree
    deg_sorted = sorted(U_sub.degree(), key=lambda x: x[1], reverse=True)[:max_nodes_comm]
    keep = set(n for n,_ in deg_sorted)
    U_sub = U_sub.subgraph(keep).copy()
    sampled_flag = True
else:
    sampled_flag = False

# Toxic-only edges (based on child comment toxicity)
y_np = hetero['comment'].y.detach().cpu().numpy()
U_toxic = nx.DiGraph()
U_toxic.add_nodes_from(U_sub.nodes())
for p, c in zip(r_src, r_dst):
    if p < len(comment_ids) and c < len(comment_ids) and y_np[c] == 1:
        up = comment_user.get(comment_ids[p]); uc = comment_user.get(comment_ids[c])
        if up is None or uc is None or up == uc:
            continue
        if up in U_sub and uc in U_sub:
            U_toxic.add_edge(up, uc)

Und = U_sub.to_undirected()
Und_t = U_toxic.to_undirected()

try:
    from networkx.algorithms.community import greedy_modularity_communities
    comms = list(greedy_modularity_communities(Und)) if Und.number_of_edges()>0 else []
    comms_t = list(greedy_modularity_communities(Und_t)) if Und_t.number_of_edges()>0 else []
except Exception:
    comms, comms_t = [], []

part = {}
for i, com in enumerate(comms):
    for u in com:
        part[u] = i
part_t = {}
for i, com in enumerate(comms_t):
    for u in com:
        part_t[u] = i

mod = None; mod_t = None
try:
    from networkx.algorithms.community.quality import modularity
    if comms:
        mod = float(modularity(Und, comms))
    if comms_t:
        mod_t = float(modularity(Und_t, comms_t))
except Exception:
    pass

def ei_index(Gd: nx.DiGraph, partition: dict):
    if Gd.number_of_edges() == 0:
        return 0.0
    internal = external = 0
    for u, v in Gd.edges():
        cu = partition.get(u, -1); cv = partition.get(v, -1)
        if cu == -1 or cv == -1:
            continue
        if cu == cv:
            internal += 1
        else:
            external += 1
    denom = internal + external
    return float((external - internal) / denom) if denom > 0 else 0.0

ei = ei_index(U_sub, part)
ei_t = ei_index(U_toxic, part_t if part_t else part)

Path('artifacts').mkdir(exist_ok=True)
# partition export restricted to nodes in filtered graph
import pandas as pd
part_rows = [(u, part.get(u, -1)) for u in U_sub.nodes()]
part_t_rows = [(u, part_t.get(u, -1)) for u in U_sub.nodes()]

pd.DataFrame(part_rows, columns=['user','community']).to_csv('artifacts/user_partition.csv', index=False)
pd.DataFrame(part_t_rows, columns=['user','community_toxic']).to_csv('artifacts/user_partition_toxic.csv', index=False)

comm_report = {
    'n_users_original': int(len(users)),
    'n_users_filtered': int(U_sub.number_of_nodes()),
    'sampled': sampled_flag,
    'min_degree_filter': min_deg,
    'user_edges_all_filtered': int(U_sub.number_of_edges()),
    'user_edges_toxic_filtered': int(U_toxic.number_of_edges()),
    'n_communities_all': int(len(comms)),
    'n_communities_toxic': int(len(comms_t)),
    'modularity_all': mod,
    'modularity_toxic': mod_t,
    'ei_index_all': ei,
    'ei_index_toxic': ei_t
}
with open('artifacts/communities_report.json','w') as f:
    json.dump(comm_report, f, indent=2)
print('Wrote artifacts/user_partition.csv, user_partition_toxic.csv, communities_report.json')

Wrote artifacts/user_partition.csv, user_partition_toxic.csv, communities_report.json


## Persist text vectorizer used for inference
We fit a TF‑IDF + SVD pipeline on the training text and save it in `artifacts/text_encoder.joblib` so the inference notebook can generate the same kind of features. If a different encoder is already in use, adjust this cell accordingly.

In [None]:
# Fit and persist a simple TF-IDF + SVD text encoder for comments
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
import joblib

artifacts_dir = Path('artifacts')
artifacts_dir.mkdir(exist_ok=True)

text_col = COMMENT_COL if 'COMMENT_COL' in globals() else config.get('comment_text_col','Comment')
assert text_col in df.columns, f"Missing text column '{text_col}' in training DataFrame."

# Use only training rows if you have a train split; otherwise use all
if 'train_idx' in globals():
    df_text = df.iloc[train_idx.cpu().numpy()][text_col].astype(str).tolist()
else:
    df_text = df[text_col].astype(str).tolist()

# Lightweight dimensions to keep pipeline simple
max_features = int(config.get('tfidf_max_features', 50000))
svd_dim = int(config.get('svd_dim', 128))

vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1,2), min_df=2)
svd = TruncatedSVD(n_components=svd_dim, random_state=42)
normalizer = Normalizer(copy=False)
text_encoder = make_pipeline(vectorizer, svd, normalizer)

print('Fitting text encoder...')
_ = text_encoder.fit(df_text)
enc_path = artifacts_dir / 'text_encoder.joblib'
joblib.dump({'pipeline': text_encoder, 'text_col': text_col, 'svd_dim': svd_dim}, enc_path)
print('Saved text encoder to', enc_path)
