<a href="https://colab.research.google.com/github/SamDarkKnight/Data-Mining-Project/blob/main/Fake_News_Propagation_using_machine_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fake News Propagation


In [None]:

# Install the deep-learning stack only when importing it fails. The lines
# starting with `!` are IPython shell magics, so this cell only runs inside a
# notebook (e.g. Colab), not as a plain Python script.
try:
    import torch, torch_geometric
except Exception:
    !pip install -q torch torchvision torchaudio
    # NOTE(review): the wheel index below is pinned to torch 2.2.0 (CPU) while the
    # command above installs an unpinned torch — the two may disagree; TODO confirm.
    !pip install -q torch_geometric torch_scatter torch_sparse -f https://data.pyg.org/whl/torch-2.2.0+cpu.html

# Installed unconditionally: text-embedding model library and progress bars.
!pip install -q sentence-transformers tqdm

import os, random, math, time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import matplotlib.animation as animation

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

from torch_geometric.data import Data as GeometricData
from torch_geometric.nn import GCNConv, global_mean_pool
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Name of the SentenceTransformer model used to embed event text.
SENT_EMB_MODEL = 'all-MiniLM-L6-v2'
# Default width of one snapshot window, in seconds (default for `window_seconds`).
TIME_WINDOW_SECONDS = 3600
# Default number of snapshots built per article (default for `snapshot_count`).
SNAPSHOT_COUNT = 8

def generate_synthetic_events(n_articles=300, max_events_per_article=150, seed=42):
    """Generate a synthetic retweet-cascade event log.

    Each article gets one root event (``text='original'``, no parent) followed
    by ``n - 1`` retweet events, where ``n`` is drawn uniformly from
    ``[10, max_events_per_article)``. Event ``i`` is posted by a fresh user and
    attached to the user of a uniformly chosen earlier event index.

    Note that retweet timestamps are drawn independently within 24 hours of
    the article start, so a child event may be timestamped before its parent.

    Args:
        n_articles: Number of articles (cascades) to generate.
        max_events_per_article: Exclusive upper bound on events per article;
            must be greater than 10.
        seed: Seed for the NumPy random generator. The default of 42 yields
            the same data as the previously hard-coded seed.

    Returns:
        A DataFrame sorted by ``timestamp`` with columns ``article_id``,
        ``event_id``, ``user_id``, ``parent_user_id``, ``timestamp`` and
        ``text``. It is empty (but has these columns) when ``n_articles`` is 0.

    Raises:
        ValueError: If ``max_events_per_article`` is not greater than 10.
    """
    if max_events_per_article <= 10:
        raise ValueError(
            "max_events_per_article must be greater than 10 "
            f"(got {max_events_per_article})"
        )

    columns = ['article_id', 'event_id', 'user_id', 'parent_user_id',
               'timestamp', 'text']
    rows = []
    rng = np.random.RandomState(seed)
    for a in range(n_articles):
        article_id = f'article_{a}'
        start = 1600000000 + rng.randint(0, 1000000)
        n = rng.randint(10, max_events_per_article)
        # One distinct user per event; only indices 0..n-1 are ever used.
        users = [f'u{article_id}_{i}' for i in range(n)]
        rows.append({'article_id': article_id, 'event_id': f'{article_id}_0',
                     'user_id': users[0], 'parent_user_id': None,
                     'timestamp': start, 'text': 'original'})
        for i in range(1, n):
            # Parent is any earlier event index, so the cascade is a tree.
            parent_idx = rng.randint(0, i)
            ts = start + rng.randint(1, 3600 * 24)
            rows.append({'article_id': article_id, 'event_id': f'{article_id}_{i}',
                         'user_id': users[i], 'parent_user_id': users[parent_idx],
                         'timestamp': ts, 'text': 'retweet'})
    # Passing `columns` keeps the schema (and the sort key) present even when
    # no rows were generated.
    return pd.DataFrame(rows, columns=columns).sort_values('timestamp')


def build_snapshots_for_article(events, snapshot_count=SNAPSHOT_COUNT, window_seconds=TIME_WINDOW_SECONDS, embedder=None):
    """Build cumulative propagation-graph snapshots for one article.

    Snapshot ``i`` contains every event whose timestamp is at most
    ``t0 + (i + 1) * window_seconds``, where ``t0`` is the earliest timestamp
    in ``events``. Nodes are the users seen in that slice (as poster or as
    parent); each event with a parent adds a directed edge parent -> poster.
    A node's feature vector is the mean text embedding of the events it posted
    in the slice (zeros if it only appears as a parent) followed by its degree
    (incoming plus outgoing edges).

    Args:
        events: DataFrame with ``timestamp``, ``text``, ``user_id`` and
            ``parent_user_id`` columns. It is not modified.
        snapshot_count: Number of snapshots to build.
        window_seconds: Time added to the cut-off for each successive snapshot.
        embedder: Object providing ``encode(list_of_str, show_progress_bar=...)``
            and ``get_sentence_embedding_dimension()``. When ``None``, every
            event gets a 16-dimensional zero embedding.

    Returns:
        ``(graphs, n_events)`` where ``graphs`` has ``snapshot_count`` entries,
        each a ``GeometricData`` or ``None`` for a slice without events, and
        ``n_events`` is the total number of rows in ``events``. Returns
        ``([], 0)`` for an empty frame.
    """
    if events.empty:
        return [], 0

    # Work on a private copy: the embedding column added below must not leak
    # into the caller's DataFrame (which is typically a slice of a larger one).
    events = events.reset_index(drop=True)
    t0 = events['timestamp'].min()
    if embedder is not None:
        emb = embedder.encode(events['text'].fillna('').astype(str).tolist(), show_progress_bar=False)
        events['text_emb'] = list(emb)
        emb_dim = embedder.get_sentence_embedding_dimension()
    else:
        events['text_emb'] = [np.zeros(16)] * len(events)
        emb_dim = 16

    graphs = []
    for i in range(snapshot_count):
        end_t = t0 + (i + 1) * window_seconds
        e_slice = events[events['timestamp'] <= end_t]
        if e_slice.empty:
            graphs.append(None)
            continue

        # Node ordering follows first appearance, reading each row as
        # (user_id, parent_user_id); missing parents are dropped.
        users = pd.unique(e_slice[['user_id', 'parent_user_id']].values.ravel())
        users = [u for u in users if pd.notna(u)]
        node_id_map = {u: idx for idx, u in enumerate(users)}

        child_ids = e_slice['user_id'].tolist()
        parent_ids = e_slice['parent_user_id'].tolist()

        # Directed edges parent -> poster, one per event that has a parent.
        src, dst = [], []
        for child, parent in zip(child_ids, parent_ids):
            if pd.notna(parent) and child in node_id_map and parent in node_id_map:
                src.append(node_id_map[parent])
                dst.append(node_id_map[child])
        edge_index = torch.tensor([src, dst], dtype=torch.long) if src else torch.empty((2, 0), dtype=torch.long)

        # Degree counts both endpoints of every edge.
        deg = np.zeros(len(users))
        for s, d in zip(src, dst):
            deg[s] += 1
            deg[d] += 1

        # Group embeddings by poster in a single pass over the slice instead
        # of scanning the whole slice once per user.
        embs_by_user = {}
        for uid, vec in zip(child_ids, e_slice['text_emb']):
            embs_by_user.setdefault(uid, []).append(vec)

        feats = []
        for u in users:
            vecs = embs_by_user.get(u)
            if vecs:
                emb_u = np.mean(np.stack(vecs), axis=0)
            else:
                # User appears only as a parent in this slice.
                emb_u = np.zeros(emb_dim)
            feats.append(np.concatenate([emb_u, [deg[node_id_map[u]]]]))
        x = torch.tensor(np.stack(feats), dtype=torch.float)
        graphs.append(GeometricData(x=x, edge_index=edge_index))
    return graphs, len(events)


class CascadeSequenceDataset(Dataset):
    """Dataset yielding one snapshot sequence and size target per article.

    Item ``idx`` is ``(graphs, target)`` where ``graphs`` is the list of
    snapshots returned by ``build_snapshots_for_article`` (with ``None``
    entries replaced by zero-node graphs) and ``target`` is a 1-element float
    tensor holding ``log1p`` of the article's total event count.

    Args:
        df: Event DataFrame containing an ``article_id`` column plus the
            columns required by ``build_snapshots_for_article``.
        embedder: Text embedder forwarded to ``build_snapshots_for_article``;
            may be ``None``.
        snapshot_count: Number of snapshots per article.
        window_seconds: Snapshot window width in seconds.
    """

    def __init__(self, df, embedder, snapshot_count=SNAPSHOT_COUNT, window_seconds=TIME_WINDOW_SECONDS):
        self.embedder = embedder
        self.snapshot_count = snapshot_count
        self.window_seconds = window_seconds
        # A single groupby pass replaces one full-frame boolean mask per
        # article; sort=False keeps articles in order of first appearance.
        self.groups = {a: g for a, g in df.groupby('article_id', sort=False)}
        # Cached key order so __getitem__ does not rebuild the list per item.
        self._article_ids = list(self.groups)

    def __len__(self):
        return len(self.groups)

    def _empty_snapshot(self):
        """Return a zero-node graph whose feature width matches real snapshots."""
        dim = self.embedder.get_sentence_embedding_dimension() + 1 if self.embedder is not None else 17
        return GeometricData(x=torch.zeros((0, dim)), edge_index=torch.empty((2, 0), dtype=torch.long))

    def __getitem__(self, idx):
        a = self._article_ids[idx]
        graphs, final_size = build_snapshots_for_article(self.groups[a],
                                                         self.snapshot_count,
                                                         self.window_seconds,
                                                         self.embedder)
        # Replace missing snapshots so every sequence entry is a graph object.
        processed_graphs = [self._empty_snapshot() if g is None else g for g in graphs]
        target = np.log1p(final_size)  # log-scale target
        return processed_graphs, torch.tensor([target], dtype=torch.float)


def collate_with_counts(batch):
    """Collate ``(graphs, target)`` samples into per-time-step batched graphs.

    For every time step the node features of all samples are concatenated
    along dim 0 and each sample's edge indices are shifted by the number of
    nodes placed before it, yielding one disjoint-union graph per step.

    Args:
        batch: Non-empty list of ``(graphs, target)`` pairs. All ``graphs``
            lists are indexed up to the length of the first sample's list.

    Returns:
        ``(batched_per_t, batch_node_counts, targets)`` where
        ``batched_per_t[t]`` is the merged graph for step ``t``,
        ``batch_node_counts[t][k]`` is the node count of sample ``k`` at step
        ``t``, and ``targets`` is the per-sample targets concatenated along
        dim 0.
    """
    seq_len = len(batch[0][0])
    batched_per_t, batch_node_counts = [], []
    for t in range(seq_len):
        xs, edge_idxs, counts = [], [], []
        offset = 0  # nodes already placed in the merged graph for this step
        for graphs, _ in batch:
            g = graphs[t]
            n = g.x.size(0)
            counts.append(n)
            xs.append(g.x)
            if g.edge_index.numel() > 0:
                edge_idxs.append(g.edge_index + offset)
            offset += n
        # `xs` holds one entry per sample, so it is never empty here.
        x = torch.cat(xs, dim=0)
        edge_index = torch.cat(edge_idxs, dim=1) if edge_idxs else torch.empty((2, 0), dtype=torch.long)
        batched_per_t.append(GeometricData(x=x, edge_index=edge_index))
        batch_node_counts.append(counts)
    targets = torch.cat([target for _, target in batch], dim=0)
    return batched_per_t, batch_node_counts, targets


class SnapshotEncoder(nn.Module):
    """Two-layer GCN producing one embedding per node of a snapshot graph.

    Args:
        in_dim: Width of the input node features.
        hidden_dim: Output width of the first graph convolution.
        out_dim: Output width of the second graph convolution.
        dropout: Dropout probability applied between the two convolutions.
    """

    def __init__(self, in_dim, hidden_dim=256, out_dim=128, dropout=0.3):
        super().__init__()
        self.conv1 = GCNConv(in_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, out_dim)
        self.dropout = dropout

    def forward(self, x, edge_index):
        """Encode node features ``x`` over ``edge_index``.

        A graph without nodes yields a single all-zero row of width
        ``conv2.out_channels`` on ``x``'s device instead of running the
        convolutions.
        """
        has_nodes = x.size(0) != 0
        if has_nodes:
            hidden = self.conv1(x, edge_index)
            hidden = F.relu(hidden)
            hidden = F.dropout(hidden, p=self.dropout, training=self.training)
            return self.conv2(hidden, edge_index)
        placeholder_shape = (1, self.conv2.out_channels)
        return torch.zeros(placeholder_shape, device=x.device)


class TemporalGNN(nn.Module):
    """Snapshot GCN encoder followed by a GRU and an MLP regression head.

    Each time step's merged graph is encoded node-wise, mean-pooled per
    sample, and the resulting sequence of sample embeddings is fed to a GRU;
    the last GRU output is mapped to a single value per sample.

    Args:
        in_dim: Width of the input node features.
        seq_len: Expected number of snapshots per sample (stored only).
        emb_dim: Width of the node embeddings, the GRU state and the hidden
            MLP layer. The default of 128 matches the previous fixed size.
    """

    def __init__(self, in_dim, seq_len=SNAPSHOT_COUNT, emb_dim=128):
        super().__init__()
        self.emb_dim = emb_dim
        # The encoder output width must equal the GRU input width, so both are
        # derived from the same value instead of two independent literals.
        self.encoder = SnapshotEncoder(in_dim, out_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=emb_dim, batch_first=True)
        self.mlp = nn.Sequential(
            nn.Linear(emb_dim, emb_dim), nn.ReLU(),
            nn.Linear(emb_dim, 1)
        )
        self.seq_len = seq_len

    def forward(self, batched_graphs_per_t, batch_node_counts):
        """Predict one value per sample.

        Args:
            batched_graphs_per_t: One merged graph per time step, exposing
                ``x`` and ``edge_index`` tensors.
            batch_node_counts: For each time step, the list of node counts
                per sample, in the order the samples were merged.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        # Follow the model's own parameters rather than a module-level
        # constant, so the model works on whichever device it was moved to.
        device = next(self.parameters()).device
        seq_embs = []
        for g, counts in zip(batched_graphs_per_t, batch_node_counts):
            h_nodes = self.encoder(g.x.to(device), g.edge_index.to(device))
            pooled, start = [], 0
            for c in counts:
                # Samples without nodes at this step contribute a zero vector.
                if c == 0:
                    pooled.append(h_nodes.new_zeros(self.emb_dim))
                else:
                    pooled.append(h_nodes[start:start + c].mean(0))
                start += c
            seq_embs.append(torch.stack(pooled))
        X = torch.stack(seq_embs, dim=1)  # (batch, time, emb_dim)
        out_rnn, _ = self.rnn(X)
        return self.mlp(out_rnn[:, -1, :])


# Build the synthetic event log, the text embedder and the per-article dataset.
print("Generating synthetic data...")
df = generate_synthetic_events()
embedder = SentenceTransformer(SENT_EMB_MODEL)
dataset = CascadeSequenceDataset(df, embedder)

# 80/20 train/validation split over articles. The split is seeded so the same
# articles land in each subset on every run, matching the seeded data above.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_ds, val_ds = random_split(
    dataset, [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)
train_loader = DataLoader(train_ds, batch_size=8, shuffle=True, collate_fn=collate_with_counts)
val_loader = DataLoader(val_ds, batch_size=8, shuffle=False, collate_fn=collate_with_counts)

# Node features are the text embedding plus one degree value.
in_dim = embedder.get_sentence_embedding_dimension() + 1
model = TemporalGNN(in_dim=in_dim).to(DEVICE)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)
# The learning rate is multiplied by 0.7 every 10 scheduler steps.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.7)
criterion = nn.MSELoss()

EPOCHS = 5
for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for batched_per_t, batch_node_counts, targets in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        targets = targets.to(DEVICE)
        preds = model(batched_per_t, batch_node_counts)
        # The model emits (batch, 1) while the collated targets are (batch,).
        # Flatten both so MSELoss compares element-wise instead of silently
        # broadcasting to a (batch, batch) matrix.
        loss = criterion(preds.reshape(-1), targets.reshape(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    # ---- validation pass (no gradients) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batched_per_t, batch_node_counts, targets in val_loader:
            preds = model(batched_per_t, batch_node_counts)
            val_loss += criterion(preds.reshape(-1), targets.to(DEVICE).reshape(-1)).item()
    scheduler.step()
    # Losses are averaged over the number of batches in each loader.
    print(f"Epoch {epoch+1:02d} | Train Loss: {train_loss/len(train_loader):.4f} | Val Loss: {val_loss/len(val_loader):.4f}")

torch.save(model.state_dict(), "temporal_gnn_model.pth")
print("Model saved to temporal_gnn_model.pth")

# Predict on the first validation batch and undo the log1p target transform.
model.eval()
with torch.no_grad():
    batched_per_t, batch_node_counts, targets = next(iter(val_loader))
    preds = model(batched_per_t, batch_node_counts)
    # Flatten to 1-D so predictions and targets have the same rank and each
    # element is a true scalar (the model output carries a trailing dim of 1).
    preds_denorm = np.expm1(preds.cpu().numpy()).ravel()
    targets_denorm = np.expm1(targets.cpu().numpy()).ravel()

plt.figure(figsize=(6,6))
plt.scatter(targets_denorm, preds_denorm, alpha=0.7)
# Dashed diagonal marks perfect predictions over the range of true sizes.
diag = [targets_denorm.min(), targets_denorm.max()]
plt.plot(diag, diag, 'r--')
plt.xlabel("True Cascade Size")
plt.ylabel("Predicted Cascade Size")
plt.title("Improved Temporal GNN Predictions")
plt.show()

# Print at most the first eight prediction/target pairs.
for pred, true in zip(preds_denorm[:8], targets_denorm[:8]):
    print(f"Predicted: {float(pred):.1f} | True: {float(true):.1f}")

def visualize_fake_news_propagation(num_nodes=20, infection_prob=0.2, steps=10):
    """Animate a simple infection-style spread over a random graph.

    A random Erdos-Renyi graph (edge probability 0.2) is created and one node
    is picked as the source. In every step each node infected in the previous
    step infects each of its not-yet-infected neighbours with probability
    ``infection_prob``. Infected nodes are drawn red, others light gray.

    Args:
        num_nodes: Number of nodes in the random graph.
        infection_prob: Probability that a newly infected node infects a
            given uninfected neighbour in one step.
        steps: Number of spreading steps (animation frames).

    Returns:
        The ``FuncAnimation`` object. The caller should keep a reference to
        it for as long as the animation is supposed to run.
    """
    G = nx.erdos_renyi_graph(num_nodes, 0.2)
    source = random.choice(list(G.nodes()))
    infected = {source}
    newly_infected = {source}
    pos = nx.spring_layout(G, seed=42)

    fig, ax = plt.subplots(figsize=(6,6))

    def draw(step):
        """Redraw the graph for the current infection state."""
        ax.clear()
        nx.draw(G, pos, node_color=['red' if n in infected else 'lightgray' for n in G.nodes()],
                with_labels=True, node_size=500, ax=ax)
        ax.set_title(f"Fake News Propagation (Step {step})")

    def init():
        """Reset to the source-only state so every (re)play starts at step 0."""
        nonlocal newly_infected
        infected.clear()
        infected.add(source)
        newly_infected = {source}
        draw(0)

    def update(frame):
        """Run one spreading round, then draw it."""
        nonlocal newly_infected
        new_inf = set()
        for n in newly_infected:
            for nb in G.neighbors(n):
                if nb not in infected and random.random() < infection_prob:
                    new_inf.add(nb)
        infected.update(new_inf)
        newly_infected = new_inf
        # Frame 0 is the first spreading round, i.e. step 1.
        draw(frame + 1)

    # Static backends still get the initial picture.
    draw(0)
    # Without an explicit init_func, FuncAnimation draws the first frame for
    # initialisation by calling `update`, which would advance the state an
    # extra round before the animation starts.
    ani = animation.FuncAnimation(fig, update, frames=steps, init_func=init,
                                  interval=1000, repeat=False)
    plt.show()
    return ani

# Keep a reference so the animation object is not discarded while it runs.
anim = visualize_fake_news_propagation(num_nodes=25, infection_prob=0.3, steps=8)

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.5/511.5 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m27.5 MB/s[0m eta [36m0:00:00[0m
[?25h

  import torch_geometric.typing
  import torch_geometric.typing


Generating synthetic data...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(inpu

Epoch 01 | Train Loss: 17.3234 | Val Loss: 15.5126


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(inpu

In [None]:
import matplotlib.pyplot as plt
import networkx as nx
import random
import matplotlib.animation as animation

def visualize_fake_news_propagation(num_nodes=20, infection_prob=0.2, steps=10):
    """Build an animation of an infection-style spread over a random graph.

    A random Erdos-Renyi graph (edge probability 0.2) is created and one node
    is picked as the source. In every step each node infected in the previous
    step infects each of its not-yet-infected neighbours with probability
    ``infection_prob``. Infected nodes are drawn red, others light gray.

    Args:
        num_nodes: Number of nodes in the random graph.
        infection_prob: Probability that a newly infected node infects a
            given uninfected neighbour in one step.
        steps: Number of spreading steps (animation frames).

    Returns:
        The ``FuncAnimation`` object; it is not shown by this function.
    """
    G = nx.erdos_renyi_graph(num_nodes, 0.2)
    source = random.choice(list(G.nodes()))
    infected = {source}
    newly_infected = {source}
    pos = nx.spring_layout(G, seed=42)

    fig, ax = plt.subplots(figsize=(6,6))

    def draw(step):
        """Redraw the graph for the current infection state."""
        ax.clear()
        nx.draw(G, pos, node_color=['red' if n in infected else 'lightgray' for n in G.nodes()],
                with_labels=True, node_size=500, ax=ax)
        ax.set_title(f"Fake News Propagation (Step {step})")

    def init():
        """Reset to the source-only state so every (re)play starts at step 0."""
        nonlocal newly_infected
        infected.clear()
        infected.add(source)
        newly_infected = {source}
        draw(0)

    def update(frame):
        """Run one spreading round, then draw it."""
        nonlocal newly_infected
        new_inf = set()
        for n in newly_infected:
            for nb in G.neighbors(n):
                if nb not in infected and random.random() < infection_prob:
                    new_inf.add(nb)
        infected.update(new_inf)
        newly_infected = new_inf
        # Frame 0 is the first spreading round, i.e. step 1.
        draw(frame + 1)

    # The figure shows the initial state until the animation is rendered.
    draw(0)
    # Without an explicit init_func, FuncAnimation draws the first frame for
    # initialisation by calling `update`, which would advance the state an
    # extra round before the animation starts.
    ani = animation.FuncAnimation(fig, update, frames=steps, init_func=init,
                                  interval=1000, repeat=False)
    return ani

# Build the animation, keeping a module-level reference to it, and embed it
# in the notebook output as an interactive JavaScript/HTML player.
anim = visualize_fake_news_propagation(num_nodes=25, infection_prob=0.3, steps=8)
from IPython.display import HTML
HTML(anim.to_jshtml())



In [None]:
# -----------------------------
# Print Dataset Information
# -----------------------------
# Relies on the `df` event DataFrame created in an earlier cell.
print("\n=== Dataset Summary ===")
print(df.head())                 # Show first few rows
print("\nTotal Events:", len(df))
print("Total Articles:", df['article_id'].nunique())
print("\nColumns:", list(df.columns))

# Count number of events per article (value_counts lists the largest first)
print("\nEvents per article:")
print(df['article_id'].value_counts().head())

# Example of one article cascade: the first article id encountered in `df`,
# printing at most its first 300 events
example_article = df['article_id'].unique()[0]
print(f"\n=== Example cascade for {example_article} ===")
print(df[df['article_id'] == example_article].head(300))
