<a href="https://colab.research.google.com/github/Abhishek-1-Kumar/fake_news_detection/blob/main/Fake_News_Detection_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Install Dependencies & Import Libraries

In [None]:
# Install the third-party packages for this notebook (quiet mode; versions are not pinned,
# so results may vary with whatever releases are current at run time).
# NOTE(review): streamlit is not imported in any cell visible here — confirm it is still needed.
!pip install -q transformers datasets scikit-learn streamlit


In [None]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# 2. Load and Preprocess Data

In [None]:
# Mount Google Drive at /content/drive so later cells can read the CSV files
# and write the trained model under /content/drive/MyDrive/...
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Read both raw CSV files from the mounted Drive folder (fake articles first, then true ones).
fake_df_full, true_df_full = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/fake_news_detection/Fake.csv',
        '/content/drive/MyDrive/fake_news_detection/True.csv',
    )
)

In [None]:
def _drop_blank_text(frame):
    """Return the rows of ``frame`` whose 'text' is neither NaN nor empty/whitespace-only."""
    with_text = frame.dropna(subset=["text"])
    has_content = with_text["text"].str.strip() != ""
    return with_text[has_content]


# Discard unusable rows from each source
true_df_clean = _drop_blank_text(true_df_full)
fake_df_clean = _drop_blank_text(fake_df_full)

# Take the first 100 clean rows of each class and attach the target: 1 = true, 0 = fake
true_df = true_df_clean.head(100).assign(label=1)
fake_df = fake_df_clean.head(100).assign(label=0)

# Stack both classes, shuffle reproducibly, and keep only the model inputs
df = (
    pd.concat([true_df, fake_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .loc[:, ["text", "label"]]
)

# Confirm how many samples were loaded
print(f"Loaded {len(df)} total samples (True: {len(true_df)}, Fake: {len(fake_df)})")


# 3. Tokenize and Prepare Dataset

In [None]:
from transformers import BertTokenizerFast
from datasets import Dataset
from sklearn.model_selection import train_test_split

# Load the pretrained WordPiece tokenizer for bert-base-uncased
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Step 1: First split into train (80%) and temp (20%), stratified so both keep the class balance
train_df, temp_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Step 2: Split temp into validation (10%) and test (10%) of the full data
val_df, test_df = train_test_split(temp_df, test_size=0.5, stratify=temp_df['label'], random_state=42)

# Convert to Hugging Face datasets.
# preserve_index=False: after the splits the frames carry a shuffled, non-range pandas index,
# which from_pandas would otherwise store as an extra "__index_level_0__" column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
val_dataset = Dataset.from_pandas(val_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

def tokenize_function(example):
    """Tokenize the "text" field of ``example`` with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 512 tokens, so all
    encoded examples have the same length.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(example["text"], **encode_options)

# Tokenize every split in batched mode (tokenize_function receives many rows per call)
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)


# 4. Safe Conversion (Avoid NumPy 2.0 Bug)

In [None]:
def _extract_fields(split):
    """Collect the model inputs of ``split`` as plain Python lists.

    Walks the split once and returns a dict with the keys "input_ids",
    "attention_mask" and "labels", each holding one entry per example in
    iteration order.
    """
    fields = {"input_ids": [], "attention_mask": [], "labels": []}
    for ex in split:
        fields["input_ids"].append(ex["input_ids"])
        fields["attention_mask"].append(ex["attention_mask"])
        # The dataset column is "label"; it is stored under the key "labels"
        fields["labels"].append(ex["label"])
    return fields


# Convert to Python format so iteration yields plain Python objects
train_list = train_dataset.with_format("python")
val_list = val_dataset.with_format("python")
test_list = test_dataset.with_format("python")

# Extract fields (one pass per split instead of one pass per field)
train_encodings = _extract_fields(train_list)
val_encodings = _extract_fields(val_list)
test_encodings = _extract_fields(test_list)


# 5. Define PyTorch Dataset

In [None]:
import torch
from torch.utils.data import Dataset

class FakeNewsDataset(Dataset):
    """Map-style PyTorch dataset over pre-tokenized examples.

    Args:
        encodings: Mapping with the keys "input_ids", "attention_mask" and
            "labels". Each value is an indexable sequence holding one entry
            per example.

    Raises:
        ValueError: If the three fields do not contain the same number of
            entries (checked at construction instead of failing with an
            IndexError in the middle of training).
    """

    def __init__(self, encodings):
        self.input_ids = encodings["input_ids"]
        self.attention_mask = encodings["attention_mask"]
        self.labels = encodings["labels"]

        sizes = (len(self.input_ids), len(self.attention_mask), len(self.labels))
        if len(set(sizes)) != 1:
            raise ValueError(
                "input_ids, attention_mask and labels must have the same length, "
                f"got {sizes[0]}, {sizes[1]} and {sizes[2]}"
            )

    def __len__(self):
        """Return the number of examples."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of int64 tensors."""
        # dtype is explicit so the result does not depend on whether the stored
        # values happen to be Python ints, floats or NumPy scalars.
        return {
            "input_ids": torch.tensor(self.input_ids[idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.attention_mask[idx], dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Wrap each split's extracted fields in a FakeNewsDataset
train_dataset, val_dataset, test_dataset = (
    FakeNewsDataset(split_encodings)
    for split_encodings in (train_encodings, val_encodings, test_encodings)
)


# 6. Load Model and Define Metrics

In [None]:
from transformers import BertForSequenceClassification

# Pretrained BERT encoder with a freshly initialised two-class classification head
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
)

from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for one evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        Dict with the keys "accuracy", "precision", "recall" and "f1".
        Precision, recall and F1 use binary averaging.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 makes the undefined case (for example, no positive
    # predictions on a small split) an explicit 0 instead of a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


# 7. Training Arguments

In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Checkpoint / output location
    output_dir="./results",
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes per device
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint with the best F1
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    # Disable external experiment trackers (wandb/tensorboard)
    report_to="none",
)


# 8. Train

In [None]:
from transformers import Trainer

# Bundle the model, hyperparameters, data splits and metric function into one Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=val_dataset,
    train_dataset=train_dataset,
)

# Fine-tune on the training split; the validation split is used for evaluation
trainer.train()


# 9. Evaluate on Test Set

In [None]:
# Score the held-out test split. The "test" prefix keeps these metrics distinguishable
# from the validation metrics, which are reported under the default "eval" prefix.
trainer.evaluate(test_dataset, metric_key_prefix="test")


# 10. Save Model

In [None]:
# Destination folder on the mounted Drive for the fine-tuned artifacts
model_path = "/content/drive/MyDrive/fake_news_detection/model"

# Write the model first, then the tokenizer, into the same folder
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)

print(f"Model saved to: {model_path}")
