In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/all-dataset/politifact_real.csv
/kaggle/input/all-dataset/politifact_fake.csv
/kaggle/input/all-dataset/gossipcop_real.csv
/kaggle/input/all-dataset/gossipcop_fake.csv
/kaggle/input/all-dataset/liar_dataset/test.tsv
/kaggle/input/all-dataset/liar_dataset/README
/kaggle/input/all-dataset/liar_dataset/train.tsv
/kaggle/input/all-dataset/liar_dataset/valid.tsv


In [2]:
import os
import random
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

try:
    import transformers
except ImportError:
    import subprocess, sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "transformers", "accelerate", "datasets"])
    import transformers

from transformers import AutoTokenizer, AutoModelForSequenceClassification


def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print("Setup complete")
print(f"PyTorch version     : {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")
print(f"Using device        : {device}")


Setup complete
PyTorch version     : 2.6.0+cu124
Transformers version: 4.53.3
Using device        : cuda


In [3]:
import os

print("Folders in /kaggle/input/:")
print(os.listdir("/kaggle/input/"))


Folders in /kaggle/input/:
['all-dataset']


In [4]:
import os

folder = "/kaggle/input/all-dataset"

print("Files inside all-dataset:")
print(os.listdir(folder))


Files inside all-dataset:
['politifact_real.csv', 'politifact_fake.csv', 'liar_dataset', 'gossipcop_real.csv', 'gossipcop_fake.csv']


In [5]:
import os

liar_folder = "/kaggle/input/all-dataset/liar_dataset"

print("Files inside liar_dataset:")
print(os.listdir(liar_folder))


Files inside liar_dataset:
['test.tsv', 'README', 'train.tsv', 'valid.tsv']


In [6]:

# Step 3: Loading LIAR + FakeNewsNet 


import pandas as pd
import os

base_path = "/kaggle/input/all-dataset"


# 1. Load LIAR dataset

liar_folder = os.path.join(base_path, "liar_dataset")

train_path = os.path.join(liar_folder, "train.tsv")
val_path   = os.path.join(liar_folder, "valid.tsv")  
test_path  = os.path.join(liar_folder, "test.tsv")

train_df = pd.read_csv(train_path, sep="\t", header=None, quoting=3, on_bad_lines='skip')
val_df   = pd.read_csv(val_path,   sep="\t", header=None, quoting=3, on_bad_lines='skip')
test_df  = pd.read_csv(test_path,  sep="\t", header=None, quoting=3, on_bad_lines='skip')

print("LIAR Dataset Loaded!")
print("Train:", train_df.shape)
print("Val  :", val_df.shape)
print("Test :", test_df.shape)


# 2. Loading FakeNewsNet datasets

gossip_fake = pd.read_csv(os.path.join(base_path, "gossipcop_fake.csv"))
gossip_real = pd.read_csv(os.path.join(base_path, "gossipcop_real.csv"))

politifact_fake = pd.read_csv(os.path.join(base_path, "politifact_fake.csv"))
politifact_real = pd.read_csv(os.path.join(base_path, "politifact_real.csv"))

print("\nFakeNewsNet Loaded!")
print("Gossip Fake:", gossip_fake.shape)
print("Gossip Real:", gossip_real.shape)
print("Politi Fake:", politifact_fake.shape)
print("Politi Real:", politifact_real.shape)

print("\nSample LIAR train row:")
display(train_df.head())

print("\nSample GossipCop Fake row:")
display(gossip_fake.head())


LIAR Dataset Loaded!
Train: (10269, 14)
Val  : (1284, 14)
Test : (1283, 14)

FakeNewsNet Loaded!
Gossip Fake: (5323, 4)
Gossip Real: (16817, 4)
Politi Fake: (432, 4)
Politi Real: (624, 4)

Sample LIAR train row:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,2635.json,false,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0,1,0,0,0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0,0,1,1,0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70,71,160,163,9,Denver
3,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7,19,3,5,44,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15,9,20,19,2,an interview on CNN



Sample GossipCop Fake row:


Unnamed: 0,id,news_url,title,tweet_ids
0,gossipcop-2493749932,www.dailymail.co.uk/tvshowbiz/article-5874213/...,Did Miley Cyrus and Liam Hemsworth secretly ge...,284329075902926848\t284332744559968256\t284335...
1,gossipcop-4580247171,hollywoodlife.com/2018/05/05/paris-jackson-car...,Paris Jackson & Cara Delevingne Enjoy Night Ou...,992895508267130880\t992897935418503169\t992899...
2,gossipcop-941805037,variety.com/2017/biz/news/tax-march-donald-tru...,Celebrities Join Tax March in Protest of Donal...,853359353532829696\t853359576543920128\t853359...
3,gossipcop-2547891536,www.dailymail.co.uk/femail/article-3499192/Do-...,Cindy Crawford's daughter Kaia Gerber wears a ...,988821905196158981\t988824206556172288\t988825...
4,gossipcop-5476631226,variety.com/2018/film/news/list-2018-oscar-nom...,Full List of 2018 Oscar Nominations – Variety,955792793632432131\t955795063925301249\t955798...


In [7]:

#Prepare LIAR columns and labels


# Assigning column names based on LIAR documentation
liar_columns = [
    "id",
    "label",
    "statement",
    "subject",
    "speaker",
    "speaker_job",
    "state",
    "party",
    "barely_true_counts",
    "false_counts",
    "half_true_counts",
    "mostly_true_counts",
    "pants_on_fire_counts",
    "context",
]

train_df.columns = liar_columns
val_df.columns   = liar_columns
test_df.columns  = liar_columns

print("Column names assigned.")
print("Train columns:", train_df.columns.tolist())

# 2. Createating a clean text column (we will use this for the model)
for df_name, df in [("train", train_df), ("val", val_df), ("test", test_df)]:
    df["text"] = df["statement"].astype(str)

# 3. Inspecting label values
print("\nUnique labels in LIAR:")
print(train_df["label"].unique())

print("\nLabel distribution (train):")
print(train_df["label"].value_counts())

# 4. Map string labels → numeric IDs (needed for modeling)
# We'll keep all 6 veracity classes:
# 'pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true'

unique_labels = sorted(train_df["label"].unique())
label_to_id = {lab: idx for idx, lab in enumerate(unique_labels)}
id_to_label = {idx: lab for lab, idx in label_to_id.items()}

print("\nLabel → ID mapping:")
for lab, idx in label_to_id.items():
    print(f"{lab:12s} -> {idx}")

train_df["label_id"] = train_df["label"].map(label_to_id)
val_df["label_id"]   = val_df["label"].map(label_to_id)

print("\n Added 'text' and 'label_id' columns to train/val.")
print(train_df[["text", "label", "label_id"]].head())


Column names assigned.
Train columns: ['id', 'label', 'statement', 'subject', 'speaker', 'speaker_job', 'state', 'party', 'barely_true_counts', 'false_counts', 'half_true_counts', 'mostly_true_counts', 'pants_on_fire_counts', 'context']

Unique labels in LIAR:
['false' 'half-true' 'mostly-true' 'true' 'barely-true' 'pants-fire']

Label distribution (train):
label
half-true      2123
false          1998
mostly-true    1966
true           1683
barely-true    1657
pants-fire      842
Name: count, dtype: int64

Label → ID mapping:
barely-true  -> 0
false        -> 1
half-true    -> 2
mostly-true  -> 3
pants-fire   -> 4
true         -> 5

 Added 'text' and 'label_id' columns to train/val.
                                                text        label  label_id
0  Says the Annies List political group supports ...        false         1
1  When did the decline of coal start? It started...    half-true         2
2  Hillary Clinton agrees with John McCain "by vo...  mostly-true         3
3  

In [8]:

# Step 5 : Tokenizer + Dataset + Loaders


import torch
from transformers import AutoTokenizer
from torch.utils.data import Dataset, DataLoader

# 1. Choosing model tokenizer
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

print("Loaded tokenizer:", model_name)

# 2. PyTorch Dataset
class LiarDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=128):
        self.texts = df["text"].tolist()
        self.labels = df["label_id"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

        # remove extra batch dim → (seq_len) instead of (1, seq_len)
        item = {key: val.squeeze(0) for key, val in enc.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)

        return item

# 3. Build datasets
max_length = 128
train_dataset = LiarDataset(train_df, tokenizer, max_length=max_length)
val_dataset   = LiarDataset(val_df, tokenizer, max_length=max_length)

print(f"Train dataset size: {len(train_dataset)}")
print(f"Val dataset size  : {len(val_dataset)}")

# 4. DataLoaders
batch_size = 16

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# 5. Inspect sample batch
batch = next(iter(train_loader))

print("\nBatch keys:", batch.keys())
print("input_ids shape:", batch["input_ids"].shape)
print("attention_mask shape:", batch["attention_mask"].shape)
print("labels shape:", batch["labels"].shape)

print("\n Dataloaders are ready.")



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Loaded tokenizer: distilbert-base-uncased
Train dataset size: 10269
Val dataset size  : 1284

Batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids shape: torch.Size([16, 128])
attention_mask shape: torch.Size([16, 128])
labels shape: torch.Size([16])

 Dataloaders are ready.


In [9]:

# Step 6A: taking Small subset for fast training


import torch
from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Small subset so we don't wait forever
train_small_df = train_df.sample(2000, random_state=42)  # 2k rows
val_small_df   = val_df.sample(500,  random_state=42)    # 500 rows

train_small_dataset = LiarDataset(train_small_df, tokenizer, max_length=128)
val_small_dataset   = LiarDataset(val_small_df, tokenizer, max_length=128)

train_small_loader = DataLoader(train_small_dataset, batch_size=16, shuffle=True)
val_small_loader   = DataLoader(val_small_dataset, batch_size=16, shuffle=False)

print(f"Small train size: {len(train_small_dataset)}")
print(f"Small val size  : {len(val_small_dataset)}")


Using device: cuda
Small train size: 2000
Small val size  : 500


In [10]:

# Step 6B: Manual training loop (1 epoch)


from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from tqdm.auto import tqdm
import numpy as np
import torch

num_labels = len(label_to_id)

# 1. Load model
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

model.train()
num_epochs = 1

for epoch in range(num_epochs):
    print(f"\n===== Epoch {epoch+1}/{num_epochs} =====")
    epoch_loss = 0.0
    step = 0

    for batch in tqdm(train_small_loader):
        # Move batch to GPU/CPU
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        step += 1

        if step % 20 == 0:
            print(f"Step {step} - Avg loss so far: {epoch_loss / step:.4f}")

    print(f"Epoch {epoch+1} finished. Avg loss: {epoch_loss / step:.4f}")

# Evaluation on small val set 
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for batch in tqdm(val_small_loader):
        labels = batch["labels"].numpy()
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        logits = outputs.logits.detach().cpu().numpy()
        preds = logits.argmax(axis=1)

        all_preds.append(preds)
        all_labels.append(labels)

all_preds = np.concatenate(all_preds)
all_labels = np.concatenate(all_labels)

from sklearn.metrics import accuracy_score, classification_report

acc = accuracy_score(all_labels, all_preds)
print(f"\nValidation accuracy (small subset): {acc:.4f}\n")

print("Classification report (small subset):")
print(classification_report(
    all_labels,
    all_preds,
    target_names=[id_to_label[i] for i in range(len(id_to_label))]
))


2025-11-30 20:30:30.328088: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764534630.511198      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764534630.566673      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== Epoch 1/1 =====


  0%|          | 0/125 [00:00<?, ?it/s]

Step 20 - Avg loss so far: 1.7848
Step 40 - Avg loss so far: 1.7774
Step 60 - Avg loss so far: 1.7711
Step 80 - Avg loss so far: 1.7622
Step 100 - Avg loss so far: 1.7553
Step 120 - Avg loss so far: 1.7515
Epoch 1 finished. Avg loss: 1.7526


  0%|          | 0/32 [00:00<?, ?it/s]


Validation accuracy (small subset): 0.2320

Classification report (small subset):
              precision    recall  f1-score   support

 barely-true       0.00      0.00      0.00        95
       false       0.28      0.36      0.32       108
   half-true       0.20      0.51      0.28       105
 mostly-true       0.26      0.24      0.25        96
  pants-fire       0.00      0.00      0.00        41
        true       0.00      0.00      0.00        55

    accuracy                           0.23       500
   macro avg       0.12      0.19      0.14       500
weighted avg       0.15      0.23      0.18       500



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:

# Step 7A: Full train/val dataloaders


from torch.utils.data import DataLoader
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)



full_train_dataset = LiarDataset(train_df, tokenizer, max_length=128)
full_val_dataset   = LiarDataset(val_df,   tokenizer, max_length=128)

batch_size = 16

full_train_loader = DataLoader(full_train_dataset, batch_size=batch_size, shuffle=True)
full_val_loader   = DataLoader(full_val_dataset,   batch_size=batch_size, shuffle=False)

print(f"Full train size: {len(full_train_dataset)}")
print(f"Full val size  : {len(full_val_dataset)}")


Using device: cuda
Full train size: 10269
Full val size  : 1284


In [12]:
# Step 7B: Full LIAR training + evaluation


import torch
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

num_labels = len(label_to_id)

# 1. Loading  a fresh model for full training
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels
).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 2

for epoch in range(num_epochs):
    print(f"\n===== Epoch {epoch+1}/{num_epochs} =====")
    model.train()
    epoch_loss = 0.0
    step = 0

    for batch in tqdm(full_train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        step += 1

        # print running avg loss occasionally
        if step % 100 == 0:
            print(f"Step {step} - Avg loss so far: {epoch_loss / step:.4f}")

    print(f"Epoch {epoch+1} finished. Avg train loss: {epoch_loss / step:.4f}")

    # Validation after each epoch 
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(full_val_loader):
            labels = batch["labels"].numpy()
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)
            logits = outputs.logits.detach().cpu().numpy()
            preds = logits.argmax(axis=1)

            all_preds.append(preds)
            all_labels.append(labels)

    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    acc = accuracy_score(all_labels, all_preds)
    print(f"\n[Epoch {epoch+1}] Validation accuracy: {acc:.4f}")

    print("Classification report:")
    print(classification_report(
        all_labels,
        all_preds,
        target_names=[id_to_label[i] for i in range(len(id_to_label))]
    ))

print("\n Full LIAR training + evaluation complete.")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Using device: cuda

===== Epoch 1/2 =====


  0%|          | 0/642 [00:00<?, ?it/s]

Step 100 - Avg loss so far: 1.7684
Step 200 - Avg loss so far: 1.7567
Step 300 - Avg loss so far: 1.7464
Step 400 - Avg loss so far: 1.7355
Step 500 - Avg loss so far: 1.7309
Step 600 - Avg loss so far: 1.7220
Epoch 1 finished. Avg train loss: 1.7208


  0%|          | 0/81 [00:00<?, ?it/s]


[Epoch 1] Validation accuracy: 0.2609
Classification report:
              precision    recall  f1-score   support

 barely-true       0.29      0.01      0.02       237
       false       0.28      0.40      0.33       263
   half-true       0.22      0.43      0.29       248
 mostly-true       0.29      0.41      0.34       251
  pants-fire       0.00      0.00      0.00       116
        true       0.25      0.11      0.16       169

    accuracy                           0.26      1284
   macro avg       0.22      0.23      0.19      1284
weighted avg       0.24      0.26      0.21      1284


===== Epoch 2/2 =====


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/642 [00:00<?, ?it/s]

Step 100 - Avg loss so far: 1.6171
Step 200 - Avg loss so far: 1.6283
Step 300 - Avg loss so far: 1.6220
Step 400 - Avg loss so far: 1.6254
Step 500 - Avg loss so far: 1.6210
Step 600 - Avg loss so far: 1.6181
Epoch 2 finished. Avg train loss: 1.6184


  0%|          | 0/81 [00:00<?, ?it/s]


[Epoch 2] Validation accuracy: 0.2710
Classification report:
              precision    recall  f1-score   support

 barely-true       0.25      0.34      0.29       237
       false       0.29      0.28      0.28       263
   half-true       0.26      0.21      0.23       248
 mostly-true       0.31      0.22      0.25       251
  pants-fire       0.34      0.29      0.31       116
        true       0.23      0.32      0.27       169

    accuracy                           0.27      1284
   macro avg       0.28      0.28      0.27      1284
weighted avg       0.28      0.27      0.27      1284


 Full LIAR training + evaluation complete.


In [13]:
# FakeNewsNet - Cell 1: Load + Clean + Merge


import re
import pandas as pd
from sklearn.model_selection import train_test_split

print("Merging FakeNewsNet datasets...")

def clean_text(text):
    text = str(text)
    text = re.sub(r"http\S+|www\.\S+", " ", text)  
    text = re.sub(r"<.*?>", " ", text)            
    text = re.sub(r"[^A-Za-z0-9\s]", " ", text)   
    text = re.sub(r"\s+", " ", text)              
    return text.strip().lower()




gossip_fake["label"] = 1
gossip_real["label"] = 0
politifact_fake["label"] = 1
politifact_real["label"] = 0


TEXT_COL = "text"  # change if needed

if TEXT_COL not in gossip_fake.columns:
    TEXT_COL = gossip_fake.columns[0]  # fallback

# Merge
fakenews_df = pd.concat([
    gossip_fake[[TEXT_COL, "label"]],
    gossip_real[[TEXT_COL, "label"]],
    politifact_fake[[TEXT_COL, "label"]],
    politifact_real[[TEXT_COL, "label"]],
], ignore_index=True)

# Clean text
fakenews_df["text"] = fakenews_df[TEXT_COL].astype(str).apply(clean_text)

# Convert label
fakenews_df["label_id"] = fakenews_df["label"].astype(int)

print("Label counts:\n", fakenews_df["label_id"].value_counts())

# Split
train_fakenet, val_fakenet = train_test_split(
    fakenews_df,
    test_size=0.2,
    random_state=42,
    stratify=fakenews_df["label_id"]
)

print("\nTrain size:", len(train_fakenet))
print("Val size  :", len(val_fakenet))

print("\nSample rows:")
display(train_fakenet.head())


Merging FakeNewsNet datasets...
Label counts:
 label_id
0    17441
1     5755
Name: count, dtype: int64

Train size: 18556
Val size  : 4640

Sample rows:


Unnamed: 0,id,label,text,label_id
4756,gossipcop-9800162301,1,gossipcop 9800162301,1
18824,gossipcop-857120,0,gossipcop 857120,0
3242,gossipcop-6176274941,1,gossipcop 6176274941,1
5435,gossipcop-944376,0,gossipcop 944376,0
21562,gossipcop-946635,0,gossipcop 946635,0


In [14]:

# FakeNewsNet - Cell 2: Dataset + DataLoaders


import torch
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

class FakeNewsTextDataset(Dataset):
    def __init__(self, df, tokenizer, max_length=256):
        self.texts = df["text"].tolist()
        self.labels = df["label_id"].tolist()
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = int(self.labels[idx])

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item

max_length_fakenet = 256

train_fakenet_ds = FakeNewsTextDataset(train_fakenet, tokenizer, max_length=max_length_fakenet)
val_fakenet_ds   = FakeNewsTextDataset(val_fakenet,   tokenizer, max_length=max_length_fakenet)

train_fakenet_loader = DataLoader(train_fakenet_ds, batch_size=16, shuffle=True)
val_fakenet_loader   = DataLoader(val_fakenet_ds,   batch_size=16, shuffle=False)

print("Train loader batches:", len(train_fakenet_loader))
print("Val loader batches  :", len(val_fakenet_loader))

# Peek at one batch
batch = next(iter(train_fakenet_loader))
print("\nBatch keys:", batch.keys())
print("input_ids shape:", batch["input_ids"].shape)
print("labels shape   :", batch["labels"].shape)

print("\n FakeNewsNet DataLoaders ready.")


Using device: cuda
Train loader batches: 1160
Val loader batches  : 290

Batch keys: dict_keys(['input_ids', 'attention_mask', 'labels'])
input_ids shape: torch.Size([16, 256])
labels shape   : torch.Size([16])

 FakeNewsNet DataLoaders ready.


In [15]:
# FakeNewsNet - Cell 3: Training DistilBERT

import torch
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from tqdm.auto import tqdm
import numpy as np
from sklearn.metrics import accuracy_score, classification_report

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# Binary classifier → 2 labels
model_fakenet = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
).to(device)

optimizer = AdamW(model_fakenet.parameters(), lr=2e-5)
criterion = torch.nn.CrossEntropyLoss()

num_epochs_fakenet = 2  # 2 epochs is usually enough for FakeNewsNet

for epoch in range(num_epochs_fakenet):
    print(f"\n===== FakeNewsNet Epoch {epoch+1}/{num_epochs_fakenet} =====")
    model_fakenet.train()
    epoch_loss = 0.0
    step = 0

    for batch in tqdm(train_fakenet_loader):
        labels = batch["labels"].to(device)
        batch_inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}

        outputs = model_fakenet(**batch_inputs)
        logits = outputs.logits
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        step += 1

        if step % 100 == 0:
            print(f"Step {step} - Avg loss: {epoch_loss / step:.4f}")

    print(f"\nEpoch {epoch+1} finished. Avg train loss: {epoch_loss / step:.4f}")

    #Validation
    model_fakenet.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in val_fakenet_loader:
            labels = batch["labels"].numpy()
            batch_inputs = {k: v.to(device) for k, v in batch.items() if k != "labels"}

            outputs = model_fakenet(**batch_inputs)
            logits = outputs.logits.detach().cpu().numpy()
            preds = logits.argmax(axis=1)

            all_preds.append(preds)
            all_labels.append(labels)

    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    acc = accuracy_score(all_labels, all_preds)
    print(f"\n[FakeNewsNet Epoch {epoch+1}] Validation Accuracy: {acc:.4f}")
    print("\nClassification Report:")
    print(classification_report(all_labels, all_preds, target_names=["real", "fake"]))

print("\n FakeNewsNet text model training complete.")


Using device: cuda


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== FakeNewsNet Epoch 1/2 =====


  0%|          | 0/1160 [00:00<?, ?it/s]

Step 100 - Avg loss: 0.1787
Step 200 - Avg loss: 0.1027
Step 300 - Avg loss: 0.0777
Step 400 - Avg loss: 0.0617
Step 500 - Avg loss: 0.0519
Step 600 - Avg loss: 0.0470
Step 700 - Avg loss: 0.0414
Step 800 - Avg loss: 0.0389
Step 900 - Avg loss: 0.0355
Step 1000 - Avg loss: 0.0329
Step 1100 - Avg loss: 0.0334

Epoch 1 finished. Avg train loss: 0.0325

[FakeNewsNet Epoch 1] Validation Accuracy: 0.9972

Classification Report:
              precision    recall  f1-score   support

        real       1.00      1.00      1.00      3489
        fake       0.99      1.00      0.99      1151

    accuracy                           1.00      4640
   macro avg       0.99      1.00      1.00      4640
weighted avg       1.00      1.00      1.00      4640


===== FakeNewsNet Epoch 2/2 =====


  0%|          | 0/1160 [00:00<?, ?it/s]

Step 100 - Avg loss: 0.0078
Step 200 - Avg loss: 0.0108
Step 300 - Avg loss: 0.0095
Step 400 - Avg loss: 0.0093
Step 500 - Avg loss: 0.0091
Step 600 - Avg loss: 0.0091
Step 700 - Avg loss: 0.0095
Step 800 - Avg loss: 0.0097
Step 900 - Avg loss: 0.0097
Step 1000 - Avg loss: 0.0094
Step 1100 - Avg loss: 0.0099

Epoch 2 finished. Avg train loss: 0.0100

[FakeNewsNet Epoch 2] Validation Accuracy: 0.9972

Classification Report:
              precision    recall  f1-score   support

        real       1.00      1.00      1.00      3489
        fake       1.00      0.99      0.99      1151

    accuracy                           1.00      4640
   macro avg       1.00      1.00      1.00      4640
weighted avg       1.00      1.00      1.00      4640


 FakeNewsNet text model training complete.


In [16]:
# Propagation - Cell 4: Download graph dataset


import os, subprocess, sys

repo_url = "https://github.com/mdepak/fake-news-propagation.git"
repo_dir = "/kaggle/working/fake-news-propagation"

# 1) Clone the repo if not already present
if not os.path.exists(repo_dir):
    print("Cloning propagation dataset repo...")
    try:
        subprocess.check_call(["git", "clone", repo_url, repo_dir])
        print(" Cloned repository.")
    except Exception as e:
        print(" Error cloning repository.")
        print(e)
else:
    print("Repo already exists at", repo_dir)

# 2) Unzip nx_network_data.zip if present
zip_path = os.path.join(repo_dir, "data", "nx_network_data.zip")
data_dir = os.path.join(repo_dir, "data")

if os.path.exists(zip_path):
    print("\nUnzipping nx_network_data.zip ...")
    try:
        import zipfile
        with zipfile.ZipFile(zip_path, "r") as z:
            z.extractall(data_dir)
        print(" Unzipped into:", data_dir)
    except Exception as e:
        print(" Error unzipping nx_network_data.zip")
        print(e)
else:
    print("\n nx_network_data.zip not found at:", zip_path)
    print("   Check the repo structure or download manually.")

# 3) List what’s inside data/
if os.path.exists(data_dir):
    print("\nContents of data directory:")
    print(os.listdir(data_dir))
else:
    print("\nData directory not found:", data_dir)


Cloning propagation dataset repo...


Cloning into '/kaggle/working/fake-news-propagation'...


 Cloned repository.

Unzipping nx_network_data.zip ...
 Unzipped into: /kaggle/working/fake-news-propagation/data

Contents of data directory:
['sample_ids', 'nx_network_data.zip', '__MACOSX', 'nx_network_data']


In [17]:
# Propagation - Install DGL (GPU)


print("Installing DGL for CUDA...")

!pip install dgl -f https://data.dgl.ai/wheels/cu118/repo.html

print("\n DGL installation complete!")


Installing DGL for CUDA...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in links: https://data.dgl.ai/wheels/cu118/repo.html
Collecting dgl
  Downloading https://data.dgl.ai/wheels/cu118/dgl-2.1.0%2Bcu118-cp311-cp311-manylinux1_x86_64.whl (748.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m748.2/748.2 MB[0m [31m952.7 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2->torchdata>=0.5.0->dgl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2->torchdata>=0.5.0->dgl)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2->torchdata>=0.5.0->dgl)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2->torchdata>=0.5.0->dgl)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-

In [18]:
# Propagation - Cell 5: Load + Convert graphs


import os
import json
import networkx as nx
import dgl
import torch

GRAPH_PATH = "/kaggle/working/fake-news-propagation/data/nx_network_data"

graph_files = sorted(os.listdir(GRAPH_PATH))

print("Total graph files found:", len(graph_files))
print("Example graph file:", graph_files[0])

sample_graph_file = os.path.join(GRAPH_PATH, graph_files[0])
print("\nLoading:", sample_graph_file)

with open(sample_graph_file, "r") as f:
    graph_json = json.load(f)

nx_graph = nx.node_link_graph(graph_json)

print("NetworkX graph loaded.")
print("Nodes:", nx_graph.number_of_nodes())
print("Edges:", nx_graph.number_of_edges())

dgl_graph = dgl.from_networkx(nx_graph)

num_nodes = dgl_graph.num_nodes()
dgl_graph.ndata["feat"] = torch.eye(num_nodes)

print("\nDGL graph created:")
print("Nodes:", dgl_graph.num_nodes())
print("Edges:", dgl_graph.num_edges())
print("Node feature shape:", dgl_graph.ndata["feat"].shape)

print("\n Graph conversion successful!")


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


DGL backend not selected or invalid.  Assuming PyTorch for now.


OSError: libcudart.so.11.0: cannot open shared object file: No such file or directory

In [19]:
# Propagation - Fix DGL install (CPU version)


print("Uninstalling existing DGL (if any)...")
!pip uninstall -y dgl

print("\nInstalling CPU-only DGL...")
!pip install dgl==1.1.2 -f https://data.dgl.ai/wheels/repo.html

print("\n DGL CPU installation complete.")


Uninstalling existing DGL (if any)...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Found existing installation: dgl 2.1.0+cu118
Uninstalling dgl-2.1.0+cu118:
  Successfully uninstalled dgl-2.1.0+cu118

Installing CPU-only DGL...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Looking in links: https://data.dgl.ai/wheels/repo.html
Collecting dgl==1.1.2
  Downloading dgl-1.1.2-cp311-cp311-manylinux1_x86_64.whl.metadata (530 bytes)
Downloading dgl-1.1.2-cp311-cp311-manylinux1_x86_64.whl (6.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.0/6.0 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: dgl
Successfully installed dgl-1.1.2

 DGL CPU installation complete. Now re-run the graph code.


In [20]:
# Propagation - Cell 5 (retry): Load + Convert


import os
import json
import networkx as nx
import dgl
import torch

GRAPH_PATH = "/kaggle/working/fake-news-propagation/data/nx_network_data"

graph_files = sorted(os.listdir(GRAPH_PATH))

print("Total graph files found:", len(graph_files))
print("Example graph file:", graph_files[0])

sample_graph_file = os.path.join(GRAPH_PATH, graph_files[0])
print("\nLoading:", sample_graph_file)

with open(sample_graph_file, "r") as f:
    graph_json = json.load(f)

nx_graph = nx.node_link_graph(graph_json)

print("NetworkX graph loaded.")
print("Nodes:", nx_graph.number_of_nodes())
print("Edges:", nx_graph.number_of_edges())

dgl_graph = dgl.from_networkx(nx_graph)

num_nodes = dgl_graph.num_nodes()
dgl_graph.ndata["feat"] = torch.eye(num_nodes)

print("\nDGL graph created:")
print("Nodes:", dgl_graph.num_nodes())
print("Edges:", dgl_graph.num_edges())
print("Node feature shape:", dgl_graph.ndata["feat"].shape)

print("\n Graph conversion successful!")


Total graph files found: 4
Example graph file: gossipcop_fake

Loading: /kaggle/working/fake-news-propagation/data/nx_network_data/gossipcop_fake


IsADirectoryError: [Errno 21] Is a directory: '/kaggle/working/fake-news-propagation/data/nx_network_data/gossipcop_fake'

In [21]:
# Propagation - Cell 5D: Inspect folder structure

import os

GRAPH_ROOT = "/kaggle/working/fake-news-propagation/data/nx_network_data"

print("Root contents:", os.listdir(GRAPH_ROOT))

# For each category, list its first few files
for folder in os.listdir(GRAPH_ROOT):
    folder_path = os.path.join(GRAPH_ROOT, folder)
    if os.path.isdir(folder_path):
        print(f"\n {folder} contains {len(os.listdir(folder_path))} graph files")
        print("First 5:", os.listdir(folder_path)[:5])


Root contents: ['gossipcop_real', 'politifact_real', 'politifact_fake', 'gossipcop_fake']

 gossipcop_real contains 6945 graph files
First 5: ['gossipcop-817047.json', 'gossipcop-849450.json', 'gossipcop-955278.json', 'gossipcop-878459.json', 'gossipcop-862744.json']

 politifact_real contains 277 graph files
First 5: ['politifact1519.json', 'politifact11761.json', 'politifact440.json', 'politifact8310.json', 'politifact1053.json']

 politifact_fake contains 351 graph files
First 5: ['politifact15383.json', 'politifact14644.json', 'politifact13816.json', 'politifact14755.json', 'politifact13973.json']

 gossipcop_fake contains 3684 graph files
First 5: ['gossipcop-328760913.json', 'gossipcop-4131669926.json', 'gossipcop-3484539870.json', 'gossipcop-2022735106.json', 'gossipcop-5237284121.json']


In [22]:

# Propagation - Cell 5E: Load 1 graph → DGL


import os
import json
import networkx as nx
import dgl
import torch

GRAPH_ROOT = "/kaggle/working/fake-news-propagation/data/nx_network_data"

# 1. Choose a category and pick one file
category = "gossipcop_fake"  # you can change to gossipcop_real / politifact_fake / politifact_real
folder_path = os.path.join(GRAPH_ROOT, category)

files_in_folder = sorted(os.listdir(folder_path))
print(f"Category: {category}, files: {len(files_in_folder)}")

sample_file = files_in_folder[0]
file_path = os.path.join(folder_path, sample_file)

print("Loading graph file:", file_path)

# 2. Load JSON → NetworkX
with open(file_path, "r") as f:
    graph_json = json.load(f)

nx_graph = nx.node_link_graph(graph_json)

print("NetworkX graph loaded.")
print("Nodes:", nx_graph.number_of_nodes())
print("Edges:", nx_graph.number_of_edges())

# 3. NetworkX → DGL
dgl_graph = dgl.from_networkx(nx_graph)

print("\nDGL graph created.")
print("Nodes:", dgl_graph.num_nodes())
print("Edges:", dgl_graph.num_edges())

# 4. Add simple node features: in-degree as a scalar feature
deg = dgl_graph.in_degrees().float().unsqueeze(1)  # shape [N, 1]
dgl_graph.ndata["feat"] = deg

print("Node feature shape:", dgl_graph.ndata["feat"].shape)

print("\n Single propagation graph conversion successful!")



Category: gossipcop_fake, files: 3684
Loading graph file: /kaggle/working/fake-news-propagation/data/nx_network_data/gossipcop_fake/gossipcop-1000240645.json


The default value will be changed to `edges="edges" in NetworkX 3.6.


  nx.node_link_graph(data, edges="links") to preserve current behavior, or
  nx.node_link_graph(data, edges="edges") for forward compatibility.


KeyError: 'nodes'

In [23]:
# Show contents of one JSON graph file


import os

GRAPH_ROOT = "/kaggle/working/fake-news-propagation/data/nx_network_data"

# pick a folder
folder = "gossipcop_fake"
folder_path = os.path.join(GRAPH_ROOT, folder)

# pick a JSON file inside the folder
files = sorted(os.listdir(folder_path))
json_file = files[0]  # first file
json_path = os.path.join(folder_path, json_file)

print("Showing file:", json_path, "\n")

# print the first 400 characters
with open(json_path, "r") as f:
    content = f.read()

print(content[:400])


Showing file: /kaggle/working/fake-news-propagation/data/nx_network_data/gossipcop_fake/gossipcop-1000240645.json 

{"time": null, "type": 1, "user": 3849187751, "tweet_id": "gossipcop-1000240645", "id": 9303106605, "children": [{"time": 1250565338, "type": 2, "user": 4472255223, "tweet_id": 3375455802, "id": 7822188508}, {"time": 1259094442, "type": 2, "user": 5467375659, "tweet_id": 6018193721, "id": 6122009681}, {"time": 1275517044, "type": 2, "user": 1305540368, "tweet_id": 15280521439, "id": 6151658638}, {


In [24]:
# Propagation - Cell 5F: Tree JSON → DGL graph


import os
import json
import dgl
import torch

GRAPH_ROOT = "/kaggle/working/fake-news-propagation/data/nx_network_data"

def load_tree_json(path):
    with open(path, "r") as f:
        return json.load(f)

def tree_to_edges_and_types(node, edges, node_types):
    """
    Recursively traverse the tree and collect:
    - edges: (parent_id, child_id)
    - node_types: node_id -> type (1 = source, 2 = retweet, etc.)
    """
    nid = node["id"]
    ntype = node.get("type", 0)
    node_types[nid] = ntype

    children = node.get("children", [])
    for child in children:
        cid = child["id"]
        ctype = child.get("type", 0)
        node_types[cid] = ctype
        edges.append((nid, cid))
        tree_to_edges_and_types(child, edges, node_types)

def build_dgl_from_tree_json(path):
    data = load_tree_json(path)

    edges = []
    node_types = {}

    # Root of the propagation tree
    tree_to_edges_and_types(data, edges, node_types)

    # Collect all node IDs
    node_ids = sorted(node_types.keys())
    id_map = {nid: i for i, nid in enumerate(node_ids)}

    # Map edges to 0..N-1
    if len(edges) > 0:
        src = [id_map[u] for (u, v) in edges]
        dst = [id_map[v] for (u, v) in edges]
    else:
        # single-node graph with no edges
        src, dst = [], []

    g = dgl.graph((src, dst), num_nodes=len(node_ids))

    # Add self-loops for stability
    g = dgl.add_self_loop(g)

    # Node features:
    #   feature[0] = type (1,2,...) normalized
    #   feature[1] = degree (in + out)
    types_tensor = torch.zeros(g.num_nodes(), 1)
    for nid, orig_id in enumerate(node_ids):
        types_tensor[nid, 0] = float(node_types.get(orig_id, 0))

    degrees = (g.in_degrees() + g.out_degrees()).float().unsqueeze(1)

    # Normalize type slightly (optional)
    feat = torch.cat([types_tensor, degrees], dim=1)  # shape [N, 2]
    g.ndata["feat"] = feat

    return g

# Test on a sample file 
category = "gossipcop_fake"  # you can change to 'gossipcop_real', 'politifact_fake', etc.
folder_path = os.path.join(GRAPH_ROOT, category)
files = sorted(os.listdir(folder_path))

print(f"Category: {category}, num files: {len(files)}")
sample_file = files[0]
sample_path = os.path.join(folder_path, sample_file)
print("Sample file:", sample_path)

g = build_dgl_from_tree_json(sample_path)

print("\n DGL graph built from tree JSON")
print("Nodes:", g.num_nodes())
print("Edges:", g.num_edges())
print("Node feature shape:", g.ndata["feat"].shape)
print("First 5 node features:\n", g.ndata["feat"][:5])


Category: gossipcop_fake, num files: 3684
Sample file: /kaggle/working/fake-news-propagation/data/nx_network_data/gossipcop_fake/gossipcop-1000240645.json

 DGL graph built from tree JSON
Nodes: 130
Edges: 259
Node feature shape: torch.Size([130, 2])
First 5 node features:
 tensor([[2., 4.],
        [2., 3.],
        [2., 5.],
        [2., 3.],
        [2., 3.]])


In [25]:
# Propagation - Cell 6: Build Graph Dataset + Loader


import os
import dgl
import torch
from torch.utils.data import Dataset, DataLoader

GRAPH_ROOT = "/kaggle/working/fake-news-propagation/data/nx_network_data"

# Use the graph builder function from Cell 5F
# build_dgl_from_tree_json(path)

class PropagationGraphDataset(Dataset):
    def __init__(self, graph_root):
        self.samples = []
        
        # Define label mapping
        # fake -> 1, real -> 0
        folder_to_label = {
            "gossipcop_fake": 1,
            "politifact_fake": 1,
            "gossipcop_real": 0,
            "politifact_real": 0,
        }

        for folder, label in folder_to_label.items():
            folder_path = os.path.join(graph_root, folder)
            files = os.listdir(folder_path)

            for fname in files:
                if fname.endswith(".json"):
                    full_path = os.path.join(folder_path, fname)
                    self.samples.append((full_path, label))

        print(f"Total graphs collected: {len(self.samples)}")

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        path, label = self.samples[idx]
        g = build_dgl_from_tree_json(path)
        return g, torch.tensor(label, dtype=torch.long)


# Build dataset 
graph_dataset = PropagationGraphDataset(GRAPH_ROOT)

# Test a single sample
g, label = graph_dataset[0]
print("One sample graph:")
print("Nodes:", g.num_nodes())
print("Edges:", g.num_edges())
print("Label:", label.item())

#  Collate function (DGL batch graphs)
def collate_graphs(batch):
    graphs, labels = zip(*batch)
    batched_graph = dgl.batch(graphs)
    labels = torch.stack(labels)
    return batched_graph, labels

#  DataLoader 
graph_loader = DataLoader(
    graph_dataset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_graphs
)

print("\nGraph DataLoader ready.")


Total graphs collected: 11257
One sample graph:
Nodes: 5
Edges: 9
Label: 1

Graph DataLoader ready.


In [26]:
# Propagation - Cell 7: GNN classifier training


import torch
import torch.nn as nn
from torch.optim import Adam
from torch.utils.data import Subset, random_split, DataLoader
import dgl
import dgl.nn.pytorch as dglnn
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score

# We'll run GNN on CPU to avoid DGL/CUDA issues
device = torch.device("cpu")
print("Using device for GNN:", device)

# Use a subset of graphs for faster training
total_graphs = len(graph_dataset)
max_graphs = min(2000, total_graphs)   # cap at 2000 for speed

indices = torch.randperm(total_graphs)[:max_graphs]
subset = Subset(graph_dataset, indices)

train_size = int(0.8 * max_graphs)
val_size = max_graphs - train_size

train_subset, val_subset = random_split(subset, [train_size, val_size])

train_graph_loader = DataLoader(
    train_subset,
    batch_size=16,
    shuffle=True,
    collate_fn=collate_graphs
)

val_graph_loader = DataLoader(
    val_subset,
    batch_size=16,
    shuffle=False,
    collate_fn=collate_graphs
)

print(f"Graphs used: {max_graphs} (train {train_size}, val {val_size})")


#Define GNN model
class GraphClassifier(nn.Module):
    def __init__(self, in_feats=2, hidden_dim=64, num_classes=2):
        super().__init__()
        self.conv1 = dglnn.GraphConv(in_feats, hidden_dim)
        self.conv2 = dglnn.GraphConv(hidden_dim, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def forward(self, g):
        # g.ndata["feat"] is shape [N, in_feats]
        h = g.ndata["feat"]
        h = self.conv1(g, h)
        h = torch.relu(h)
        h = self.conv2(g, h)
        h = torch.relu(h)
        g.ndata["h"] = h
        # Graph-level representation: mean of node embeddings
        hg = dgl.mean_nodes(g, "h")
        logits = self.classifier(hg)
        return logits


model_gnn = GraphClassifier(in_feats=2, hidden_dim=64, num_classes=2).to(device)
optimizer = Adam(model_gnn.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

num_epochs_gnn = 3

for epoch in range(num_epochs_gnn):
    print(f"\n===== GNN Epoch {epoch+1}/{num_epochs_gnn} =====")
    model_gnn.train()
    epoch_loss = 0.0
    step = 0

    # ---- Training loop ----
    for batched_graph, labels in tqdm(train_graph_loader):
        batched_graph = batched_graph.to(device)
        labels = labels.to(device)

        logits = model_gnn(batched_graph)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        step += 1

        if step % 20 == 0:
            print(f"Step {step} - Avg loss: {epoch_loss / step:.4f}")

    print(f"Epoch {epoch+1} finished. Avg train loss: {epoch_loss / max(step,1):.4f}")

    # ---- Validation ----
    model_gnn.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batched_graph, labels in val_graph_loader:
            batched_graph = batched_graph.to(device)
            labels_np = labels.numpy()

            logits = model_gnn(batched_graph)
            preds = logits.argmax(dim=1).cpu().numpy()

            all_preds.append(preds)
            all_labels.append(labels_np)

    import numpy as np
    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    acc = accuracy_score(all_labels, all_preds)
    print(f"[GNN Epoch {epoch+1}] Validation Accuracy: {acc:.4f}")

print("\n GNN training on propagation graphs complete.")


Using device for GNN: cpu
Graphs used: 2000 (train 1600, val 400)

===== GNN Epoch 1/3 =====


  0%|          | 0/100 [00:00<?, ?it/s]

Step 20 - Avg loss: 0.6805
Step 40 - Avg loss: 0.6738
Step 60 - Avg loss: 0.6596
Step 80 - Avg loss: 0.6523
Step 100 - Avg loss: 0.6461
Epoch 1 finished. Avg train loss: 0.6461
[GNN Epoch 1] Validation Accuracy: 0.6575

===== GNN Epoch 2/3 =====


  0%|          | 0/100 [00:00<?, ?it/s]

Step 20 - Avg loss: 0.5999
Step 40 - Avg loss: 0.5991
Step 60 - Avg loss: 0.6018
Step 80 - Avg loss: 0.5867
Step 100 - Avg loss: 0.5871
Epoch 2 finished. Avg train loss: 0.5871
[GNN Epoch 2] Validation Accuracy: 0.7150

===== GNN Epoch 3/3 =====


  0%|          | 0/100 [00:00<?, ?it/s]

Step 20 - Avg loss: 0.5589
Step 40 - Avg loss: 0.5569
Step 60 - Avg loss: 0.5511
Step 80 - Avg loss: 0.5491
Step 100 - Avg loss: 0.5542
Epoch 3 finished. Avg train loss: 0.5542
[GNN Epoch 3] Validation Accuracy: 0.7500

 GNN training on propagation graphs complete.


In [28]:
# ============================================
# Multimodal - Cell 8: BERT + GNN Fusion Model
# ============================================

import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device for multimodal:", device)

# ---------------------------
# Fusion Dataset
# ---------------------------

class MultimodalDataset(torch.utils.data.Dataset):
    def __init__(self, df_text, graph_root, tokenizer, max_len=256):
        """
        df_text: dataframe with text and label_id
        graph_root: propagation graph directory
        """
        self.texts = df_text["text"].tolist()
        self.labels = df_text["label_id"].tolist()
        self.ids = df_text.get("id", df_text.index).tolist()  # we will assume filename contains ID
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.graph_root = graph_root

    def __len__(self):
        return len(self.texts)

    def get_graph_path(self, article_id):
        # find matching graph file
        for folder in os.listdir(self.graph_root):
            folder_path = os.path.join(self.graph_root, folder)
            if not os.path.isdir(folder_path):
                continue
            
            # guess possible filenames:
            for name in os.listdir(folder_path):
                if article_id in name:
                    return os.path.join(folder_path, name)
        
        return None  # graph not found

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = int(self.labels[idx])

        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        # find graph file
        article_id = str(self.ids[idx])
        graph_path = self.get_graph_path(article_id)

        if graph_path is None:
            # fallback: return small dummy graph
            g = dgl.graph(([], []), num_nodes=1)
            g = dgl.add_self_loop(g)
            g.ndata["feat"] = torch.zeros(1, 2)
        else:
            g = build_dgl_from_tree_json(graph_path)

        # remove batch dimension on BERT inputs
        item = {
            "input_ids": enc["input_ids"].squeeze(0),
            "attention_mask": enc["attention_mask"].squeeze(0),
            "graph": g,
            "labels": torch.tensor(label, dtype=torch.long)
        }
        return item


# Fusion Model


class FusionModel(nn.Module):
    def __init__(self, bert_name="distilbert-base-uncased", graph_in=2, graph_hidden=64, graph_out=128, num_classes=2):
        super().__init__()
        self.text_encoder = AutoModel.from_pretrained(bert_name)

        self.gnn1 = dglnn.GraphConv(graph_in, graph_hidden)
        self.gnn2 = dglnn.GraphConv(graph_hidden, graph_out)

        # DistilBERT hidden size = 768
        self.classifier = nn.Sequential(
            nn.Linear(768 + graph_out, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_classes)
        )

    def forward(self, input_ids, attention_mask, g):
        # Text encoding
        out = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        h_text = out.last_hidden_state[:, 0, :]     # CLS-like

        # GNN encoding
        h = g.ndata["feat"]
        h = torch.relu(self.gnn1(g, h))
        h = torch.relu(self.gnn2(g, h))
        g.ndata["h"] = h
        h_graph = dgl.mean_nodes(g, "h")

        # Align batch dims
        if h_graph.shape[0] != h_text.shape[0]:
            h_graph = h_graph.unsqueeze(0)

        # Fusion
        fused = torch.cat([h_text, h_graph], dim=1)
        logits = self.classifier(fused)
        return logits.to(device)


Using device for multimodal: cuda


In [31]:

# Multimodal - Cell 9 (FIXED): Train FusionModel on CPU


import torch
from torch.utils.data import DataLoader, Subset
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Force CPU because DGL is CPU-only here
device = torch.device("cpu")
print("Using device:", device)

graph_root = "/kaggle/working/fake-news-propagation/data/nx_network_data"

#  Build multimodal datasets (same as before)
multimodal_train_ds = MultimodalDataset(train_fakenet, graph_root, tokenizer, max_len=256)
multimodal_val_ds   = MultimodalDataset(val_fakenet,   graph_root, tokenizer, max_len=256)

# Subsample for speed
max_mm_train = 1500
max_mm_val   = 400

train_indices = torch.randperm(len(multimodal_train_ds))[:max_mm_train]
val_indices   = torch.randperm(len(multimodal_val_ds))[:max_mm_val]

train_mm_subset = Subset(multimodal_train_ds, train_indices)
val_mm_subset   = Subset(multimodal_val_ds,   val_indices)

print("Multimodal train samples:", len(train_mm_subset))
print("Multimodal val samples  :", len(val_mm_subset))


def collate_multimodal(batch):
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.stack([item["labels"] for item in batch])
    graphs = [item["graph"] for item in batch]
    batched_graph = dgl.batch(graphs)  # stays on CPU
    return input_ids, attention_mask, batched_graph, labels  # all CPU tensors

multimodal_train_loader = DataLoader(
    train_mm_subset, batch_size=8, shuffle=True, collate_fn=collate_multimodal
)
multimodal_val_loader = DataLoader(
    val_mm_subset, batch_size=8, shuffle=False, collate_fn=collate_multimodal
)

# Init fusion model on CPU
fusion_model = FusionModel(
    bert_name=model_name,
    graph_in=2,
    graph_hidden=64,
    graph_out=128,
    num_classes=2
).to(device)

optimizer = torch.optim.Adam(fusion_model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

num_epochs_fusion = 2

for epoch in range(num_epochs_fusion):
    print(f"\n Fusion Epoch {epoch+1}/{num_epochs_fusion} =====")
    fusion_model.train()
    epoch_loss = 0.0
    step = 0

    for input_ids, attention_mask, graphs, labels in tqdm(multimodal_train_loader):
        # move ONLY tensors, graphs stay CPU but model is also on CPU
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        # graphs already on CPU; no .to(device) here

        logits = fusion_model(input_ids, attention_mask, graphs)
        loss = criterion(logits, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        step += 1

        if step % 20 == 0:
            print(f"Step {step} - Avg loss: {epoch_loss / step:.4f}")

    print(f"Epoch {epoch+1} train loss: {epoch_loss / step:.4f}")

    # Validation
    fusion_model.eval()
    all_preds, all_labels = [], []

    with torch.no_grad():
        for input_ids, attention_mask, graphs, labels in multimodal_val_loader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels_np = labels.numpy()  # keep on CPU

            logits = fusion_model(input_ids, attention_mask, graphs)
            preds = logits.argmax(dim=1).cpu().numpy()

            all_preds.append(preds)
            all_labels.append(labels_np)

    all_preds = np.concatenate(all_preds)
    all_labels = np.concatenate(all_labels)

    acc = accuracy_score(all_labels, all_preds)
    print(f"[Fusion Epoch {epoch+1}] Validation Accuracy: {acc:.4f}")
    print("Classification report:")
    print(classification_report(all_labels, all_preds, target_names=["real", "fake"]))

print("\nMultimodal BERT+GNN training complete!")


Using device: cpu
Multimodal train samples: 1500
Multimodal val samples  : 400

 Fusion Epoch 1/2 =====


  0%|          | 0/188 [00:00<?, ?it/s]

Step 20 - Avg loss: 0.5785
Step 40 - Avg loss: 0.5614
Step 60 - Avg loss: 0.5133
Step 80 - Avg loss: 0.4295
Step 100 - Avg loss: 0.3693
Step 120 - Avg loss: 0.3238
Step 140 - Avg loss: 0.2864
Step 160 - Avg loss: 0.2605
Step 180 - Avg loss: 0.2350
Epoch 1 train loss: 0.2277
[Fusion Epoch 1] Validation Accuracy: 0.9875
Classification report:
              precision    recall  f1-score   support

        real       1.00      0.98      0.99       306
        fake       0.95      1.00      0.97        94

    accuracy                           0.99       400
   macro avg       0.97      0.99      0.98       400
weighted avg       0.99      0.99      0.99       400


 Fusion Epoch 2/2 =====


  0%|          | 0/188 [00:00<?, ?it/s]

Step 20 - Avg loss: 0.0325
Step 40 - Avg loss: 0.0289
Step 60 - Avg loss: 0.0295
Step 80 - Avg loss: 0.0277
Step 100 - Avg loss: 0.0303
Step 120 - Avg loss: 0.0356
Step 140 - Avg loss: 0.0369
Step 160 - Avg loss: 0.0398
Step 180 - Avg loss: 0.0367
Epoch 2 train loss: 0.0356
[Fusion Epoch 2] Validation Accuracy: 0.9825
Classification report:
              precision    recall  f1-score   support

        real       1.00      0.98      0.99       306
        fake       0.93      1.00      0.96        94

    accuracy                           0.98       400
   macro avg       0.97      0.99      0.98       400
weighted avg       0.98      0.98      0.98       400


Multimodal BERT+GNN training complete!
