# Fake News Notarizer: AI + Decentralized Proof

# Setup & Installation

In [1]:
%pip install torch datasets pandas scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
%pip install --upgrade transformers[torch] accelerate 

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install nlpaug nltk

Note: you may need to restart the kernel to use updated packages.


# Imports & Environment

In [4]:
import os

# Set the Hugging Face Hub flag before any hub-backed library is imported.
# NOTE(review): presumably this silences the hub's symlink-cache warning
# (commonly seen on Windows) — confirm against the huggingface_hub docs.
os.environ.update({"HF_HUB_DISABLE_SYMLINKS_WARNING": "1"})

In [5]:
import nltk

# NLTK resources fetched before the WordNet-based augmenter is built.
for resource in ("wordnet", "omw-1.4", "averaged_perceptron_tagger_eng"):
    nltk.download(resource)

import nlpaug.augmenter.word as naw

# Smoke test: run one headline through the synonym augmenter and show the result.
aug = naw.SynonymAug(aug_src="wordnet")
new_text = aug.augment("Moon landing was filmed in a studio.")
print(new_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rohin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rohin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\rohin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


['Moon landing place live film in a studio.']


# Creating Dataset with Synonym‑Based Data Augmentation

In [6]:
import pandas as pd
import nlpaug.augmenter.word as naw

# Number of synonym-swapped variants generated for each FAKE example.
# (The loop previously hard-coded 100 while its comment claimed 5.)
N_AUGMENTS_PER_FAKE = 100

# Load the original CSV.
# NOTE: "fake_news.csv" is written by the CSV-creation cell further down, so that
# cell must have been run at least once before this one on a fresh checkout.
df = pd.read_csv("fake_news.csv")[["text", "label"]]

# Initialize the WordNet synonym augmenter
aug = naw.SynonymAug(aug_src='wordnet')

# Only the FAKE examples (label == 0) are augmented
fake_df = df[df.label == 0]

# Skip missing / blank texts: the augmenter cannot paraphrase them and would
# otherwise hand back nothing usable.
fake_texts = [t for t in fake_df["text"].dropna().astype(str) if t.strip()]

# Generate N_AUGMENTS_PER_FAKE paraphrases per fake example
augmented_rows = []
for text in fake_texts:
    for _ in range(N_AUGMENTS_PER_FAKE):
        aug_text = aug.augment(text)

        # augment() may return a list (as the demo output above shows);
        # unwrap it, and skip the attempt if the list came back empty.
        if isinstance(aug_text, list):
            if not aug_text:
                continue
            aug_text = aug_text[0]

        augmented_rows.append({"text": aug_text, "label": 0})

# Build a DataFrame of augmented fakes (explicit columns keep the schema even when empty)
aug_df = pd.DataFrame(augmented_rows, columns=["text", "label"])

# Combine with the original data.
# NOTE(review): the later dataset/training cells re-read fake_news.csv, so
# df_expanded is not what the model is trained on as the notebook stands.
df_expanded = pd.concat([df, aug_df], ignore_index=True)

print("Original rows:", len(df), "→ Expanded rows:", len(df_expanded))

Original rows: 10 → Expanded rows: 610


In [7]:
import pandas as pd

# Ten hand-written headlines, each paired with its label.
# Label 0 marks the rows this notebook treats as FAKE; label 1 the others.
labelled_headlines = [
    ("Scientists prove coffee cures Monday blues!", 0),
    ("Moon landing was filmed in a studio.", 0),
    ("New vaccine reduces flu risk by 70% according to study.", 1),
    ("Ancient pyramids discovered on Mars!", 0),
    ("Local bakery introduces zero-calorie croissants.", 0),
    ("Study finds listening to music improves plant growth.", 1),
    ("Government announces free public transport for all citizens.", 1),
    ("Celebrity adopts a pet dragon from a rescue shelter.", 0),
    ("Researchers develop battery that charges in 10 seconds.", 1),
    ("City installs levitating bikes for commuters.", 0),
]

# Column-oriented view of the same records, as expected by the DataFrame constructor.
data = {
    "text": [headline for headline, _ in labelled_headlines],
    "label": [label for _, label in labelled_headlines],
}

# Persist the toy dataset next to the notebook (no index column in the file).
df = pd.DataFrame(data)
df.to_csv("fake_news.csv", index=False)
print("fake_news.csv created with 10 samples.")

fake_news.csv created with 10 samples.


# Build & Split Hugging‑Face Dataset

In [8]:
# ─── Step 1: Load CSV into a Hugging Face Dataset ─────────────────────────────────
import pandas as pd
from datasets import Dataset

# Keep only the two columns used downstream; change the path if the CSV lives elsewhere.
df = pd.read_csv("fake_news.csv").loc[:, ["text", "label"]]
dataset = Dataset.from_pandas(df)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
splits = dataset.train_test_split(test_size=0.2, seed=42)
train_ds, eval_ds = splits["train"], splits["test"]

# ─── Step 2: Tokenize with DistilBERT ───────────────────────────────────────────────
from transformers import DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

def tokenize_batch(batch):
    """Tokenize the ``text`` entries of ``batch``.

    Every item is padded to, and truncated at, 128 tokens so all rows end up
    with the same length.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch["text"], **tokenizer_options)

# Tokenize both splits in batched mode; the existing ``label`` column is kept.
train_ds = train_ds.map(tokenize_batch, batched=True)
eval_ds = eval_ds.map(tokenize_batch, batched=True)

# ─── Step 3: Remove raw text, set PyTorch format ──────────────────────────────────
# The raw strings are dropped now that the token columns exist.
train_ds, eval_ds = (split.remove_columns(["text"]) for split in (train_ds, eval_ds))

# Expose only the model inputs and the label, in "torch" format.
for split in (train_ds, eval_ds):
    split.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# ─── Step 4: Create DataLoaders ───────────────────────────────────────────────────
import torch
from torch.utils.data import DataLoader

# Training batches (size 8) are shuffled; evaluation batches (size 16) keep dataset order.
train_loader = DataLoader(dataset=train_ds, batch_size=8, shuffle=True)
eval_loader = DataLoader(dataset=eval_ds, batch_size=16, shuffle=False)

# ─── Quick Sanity Check ────────────────────────────────────────────────────────────
# Show the dataset schema, then the shape of every tensor in one training batch.
print("Train columns:", train_ds.features)
batch = next(iter(train_loader))
for column_name in batch:
    column_shape = tuple(batch[column_name].shape)
    print(f"{column_name:15s} → {column_shape}")

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Train columns: {'label': Value('int64'), 'input_ids': List(Value('int32')), 'attention_mask': List(Value('int8'))}
label           → (8,)
input_ids       → (8, 128)
attention_mask  → (8, 128)


# Model Training

In [9]:
import torch
from torch.nn import CrossEntropyLoss
from transformers import DistilBertForSequenceClassification

# Fix torch's RNG state *before* the model is built. The classification head is
# newly initialised rather than loaded from the checkpoint, and the shuffling
# DataLoader draws its order from torch's global generator when none is given,
# so without a seed every run reports different losses and accuracies.
# NOTE(review): some GPU kernels can still be non-deterministic; this seeds torch only.
SEED = 42
NUM_EPOCHS = 10
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
criterion = CrossEntropyLoss()

for epoch in range(NUM_EPOCHS):
    # ── Training pass ──
    model.train()
    total_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids      = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels         = batch["label"].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.logits, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # max(..., 1) avoids ZeroDivisionError if the training loader is empty
    print(f"Epoch {epoch+1} train loss: {total_loss/max(len(train_loader), 1):.4f}")

    # ── Evaluation pass (no gradients, dropout disabled via eval mode) ──
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in eval_loader:
            input_ids      = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels         = batch["label"].to(device)
            preds = model(input_ids, attention_mask=attention_mask).logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total   += labels.size(0)
    # Report NaN instead of crashing when the evaluation split is empty
    accuracy = correct / total if total else float("nan")
    print(f"Epoch {epoch+1} eval accuracy: {accuracy:.2f}")

# Persist the weights of the final epoch together with the tokenizer files
model.save_pretrained("./model_out")
tokenizer.save_pretrained("./model_out")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 train loss: 0.7547
Epoch 1 eval accuracy: 1.00
Epoch 2 train loss: 0.6489
Epoch 2 eval accuracy: 0.00
Epoch 3 train loss: 0.5563
Epoch 3 eval accuracy: 0.00
Epoch 4 train loss: 0.4798
Epoch 4 eval accuracy: 0.00
Epoch 5 train loss: 0.4010
Epoch 5 eval accuracy: 0.00
Epoch 6 train loss: 0.3277
Epoch 6 eval accuracy: 0.00
Epoch 7 train loss: 0.2500
Epoch 7 eval accuracy: 0.00
Epoch 8 train loss: 0.1996
Epoch 8 eval accuracy: 0.00
Epoch 9 train loss: 0.1614
Epoch 9 eval accuracy: 0.00
Epoch 10 train loss: 0.1210
Epoch 10 eval accuracy: 0.00


('./model_out\\tokenizer_config.json',
 './model_out\\special_tokens_map.json',
 './model_out\\vocab.txt',
 './model_out\\added_tokens.json',
 './model_out\\tokenizer.json')

# Inference: Classification & Text Hashing

In [10]:
import hashlib

import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast

# 1) Reload the tokenizer and the fine-tuned weights saved under ./model_out,
#    switching the model to evaluation mode straight away.
tokenizer = DistilBertTokenizerFast.from_pretrained("./model_out")
model = DistilBertForSequenceClassification.from_pretrained("./model_out").eval()

# 2) Classification helper
def classify(text):
    """Classify one piece of text and fingerprint it.

    Returns a dict with:
      - "text":       the input, unchanged
      - "label":      "FAKE" when class 0 is strictly more probable than class 1, else "REAL"
      - "confidence": the larger class probability, rounded to 2 decimals
      - "text_hash":  SHA-256 hex digest of the encoded text
    """
    # Encode to a fixed length of 128 tokens, as PyTorch tensors
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=128,
    )

    # Forward pass without tracking gradients
    with torch.no_grad():
        scores = model(**encoded).logits

    # Probabilities of the first (and only) row
    probabilities = torch.softmax(scores, dim=1)[0].tolist()
    fake_prob, real_prob = probabilities[0], probabilities[1]

    verdict = "FAKE" if fake_prob > real_prob else "REAL"
    digest = hashlib.sha256(text.encode()).hexdigest()

    return {
        "text": text,
        "label": verdict,
        "confidence": round(max(probabilities), 2),
        "text_hash": digest,
    }

# 3) Try the helper on a couple of hand-picked headlines
samples = [
    "Study shows that coffee improves memory by 50%.",
    "Moon landing was filmed in a studio."
]

for sample in samples:
    result = classify(sample)
    # One report per sample: a blank line, then four aligned fields
    # (only the first 10 hex characters of the hash are shown).
    report_lines = (
        f"\nText     : {result['text']}",
        f"Label    : {result['label']}",
        f"Confidence: {result['confidence']}",
        f"Text Hash: {result['text_hash'][:10]}…",
    )
    print(*report_lines, sep="\n")


Text     : Study shows that coffee improves memory by 50%.
Label    : REAL
Confidence: 0.75
Text Hash: de3f4ff40a…

Text     : Moon landing was filmed in a studio.
Label    : FAKE
Confidence: 0.93
Text Hash: 682f7401f4…


# Blockchain‑Style Notarization

In [11]:
import hashlib
import json


class Block:
    """One entry of the notarization chain.

    Attributes:
        index:      position of the block in the chain
        timestamp:  creation time supplied by the caller
        data:       payload, e.g. {"text_hash":…, "label":…, "conf":…}
        prev_hash:  hash of the preceding block
        hash:       SHA-256 hex digest over the four fields above, computed at construction
    """

    def __init__(self, index, timestamp, data, prev_hash):
        self.index = index
        self.timestamp = timestamp
        self.data = data
        self.prev_hash = prev_hash
        self.hash = self.compute_hash()

    def compute_hash(self):
        """Return the SHA-256 hex digest of this block's contents.

        The fields are serialised as a JSON array so that field boundaries are
        unambiguous (plain concatenation made index=1/timestamp=23.5 collide
        with index=12/timestamp=3.5). Dict keys are sorted so the digest does
        not depend on insertion order. Values JSON cannot encode are
        stringified via ``str``.
        """
        payload = json.dumps(
            [self.index, self.timestamp, self.data, self.prev_hash],
            sort_keys=True,
            default=str,
            separators=(",", ":"),
        )
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

import time  # imported here so the class does not rely on a later cell's import


class SimpleChain:
    """Minimal in-memory, append-only chain of ``Block`` objects.

    ``self.chain`` is a list whose first element is a genesis block with
    ``prev_hash == "0"``; every later block stores the hash of its predecessor.
    """

    def __init__(self):
        # genesis block
        genesis = Block(0, time.time(), {"note": "genesis"}, "0")
        self.chain = [genesis]

    def add_block(self, data):
        """Append a block carrying ``data`` and return it.

        The new block is indexed by the current chain length, timestamped with
        ``time.time()``, and linked to the hash of the current last block.
        """
        prev = self.chain[-1]
        block = Block(len(self.chain), time.time(), data, prev.hash)
        self.chain.append(block)
        return block

    def is_valid(self):
        """Return True when no block has been altered and all links are intact.

        Checks, for every block: its index matches its position, its stored
        hash equals a freshly recomputed one, and (after genesis) its
        ``prev_hash`` equals the stored hash of the block before it.
        """
        for position, block in enumerate(self.chain):
            if block.index != position:
                return False
            if block.hash != block.compute_hash():
                return False
            if position > 0 and block.prev_hash != self.chain[position - 1].hash:
                return False
        return True

# Classify & Notarize Function

In [12]:
import numpy as np
import time, hashlib
import pandas as pd
# Start from a fresh chain and an empty list of display rows;
# re-running this cell discards anything notarized so far.
chain, ledger = SimpleChain(), []

def classify_and_notarize(text):
    """Classify ``text``, record the verdict on the chain, and log a display row.

    The full classification dict becomes the payload of a new block. A
    shortened, table-friendly summary of that block is appended to ``ledger``
    and returned (the returned dict is the very object stored in ``ledger``).
    """
    verdict = classify(text)  # dict produced by the classifier helper
    new_block = chain.add_block(verdict)

    def _abbrev(hex_digest):
        # First 10 hex characters followed by an ellipsis, for compact display
        return hex_digest[:10] + "…"

    # Block timestamp rendered via gmtime, to whole seconds
    stamp = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime(new_block.timestamp))

    row = {
        "Index": new_block.index,
        "Timestamp": stamp,
        "Text_Hash": _abbrev(verdict["text_hash"]),
        "Label": verdict["label"],
        "Conf": verdict["confidence"],
        "Block_Hash": _abbrev(new_block.hash),
        "Prev_Hash": _abbrev(new_block.prev_hash),
    }
    ledger.append(row)
    return row

# Try it out: notarize one headline and render its ledger row as a one-row table.
# NOTE: this rebinds `df`, which earlier cells used for the headline dataset.
df = pd.DataFrame([ classify_and_notarize("Scientists prove coffee cures Monday blues!") ])
df

Unnamed: 0,Index,Timestamp,Text_Hash,Label,Conf,Block_Hash,Prev_Hash
0,1,2025-07-20 15:21:11,9f13c2450c…,FAKE,0.93,17a72d4a76…,753bb536ad…


# Final Demo & Results

In [13]:
# Notarize a second headline, then display every ledger row recorded so far.
# Each row's Prev_Hash is taken from the block that precedes it on the chain.
more = classify_and_notarize("Moon landing was filmed in a studio.")
pd.DataFrame(ledger)

Unnamed: 0,Index,Timestamp,Text_Hash,Label,Conf,Block_Hash,Prev_Hash
0,1,2025-07-20 15:21:11,9f13c2450c…,FAKE,0.93,17a72d4a76…,753bb536ad…
1,2,2025-07-20 15:21:12,682f7401f4…,FAKE,0.93,344642b4d6…,17a72d4a76…
