In [2]:
# %pip (rather than !pip) runs pip for the interpreter backing this kernel,
# so packages land in the environment the notebook actually imports from.
# Every third-party package imported by the notebook is installed before the
# freeze so that requirements.txt captures the complete dependency set.
%pip install pandas scikit-learn numpy nltk matplotlib seaborn transformers torch
%pip freeze > requirements.txt



In [3]:
# %pip targets the running kernel's environment (plain !pip may hit another Python).
%pip install transformers torch

Collecting transformers
  Downloading transformers-4.53.0-py3-none-any.whl.metadata (39 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Using cached huggingface_hub-0.33.2-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Downloading transformers-4.53.0-py3-none-any.whl (10.8 MB)
   ---------------------------------------- 0.0/10.8 MB ? eta -:--:--
   ---- ----------------------------------- 1.3/10.8 MB 9.1 MB/s eta 0:00:02
   ------------ --------------------------- 3.4/10.8 MB 10.0 MB/s eta 0:00:01
   ----------------------- ---------------- 6.3/10.8 MB 11.4 MB/s eta 0:00:01
   ----------------------------------- ---- 9.7/10.8 MB 13.1 MB/s eta 0:00:01
   ---------------------------------------- 10.8/10.8 MB 12.8 MB/s eta 0:00:00
Using

In [4]:
import pandas as pd
import numpy as np
import seaborn as sns

In [15]:
# %pip targets the running kernel's environment; restart the kernel after upgrading.
%pip install --upgrade transformers




In [23]:
import pandas as pd
import torch
import time
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report
import os

# ===========================
# 1. Load and Prepare Dataset
# ===========================
SAMPLE_SIZE = 500  # ⚡ Keep only a small subset for fast training
SAMPLE_SEED = 42   # fixed seed so the same subset is drawn on every run

df_true = pd.read_csv("data/True.csv")
df_fake = pd.read_csv("data/Fake.csv")

df_true['label'] = 1  # Real news
df_fake['label'] = 0  # Fake news

# Combine both sources and keep only the columns the model needs.
df = pd.concat([df_true, df_fake], ignore_index=True)
df = df[['text', 'label']].dropna()

# Drop repeated articles so an identical text cannot end up in both the
# training and the validation split later on.
df = df.drop_duplicates(subset='text')

# Shuffle + subsample in one step. min() avoids the ValueError that
# DataFrame.sample raises when asked for more rows than are available.
df = (
    df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
      .reset_index(drop=True)
)

texts = df['text'].tolist()
labels = df['label'].tolist()

# ===========================
# 2. Tokenization using BERT
# ===========================
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Encode every article in a single call: inputs longer than 512 tokens are
# truncated, shorter ones are padded, and the result comes back as PyTorch tensors.
tokenizer_options = dict(
    truncation=True,
    padding=True,
    max_length=512,
    return_tensors='pt',
)
encodings = tokenizer(texts, **tokenizer_options)

input_ids, attention_masks = encodings['input_ids'], encodings['attention_mask']

# The 0/1 label list becomes a tensor so it can be bundled with the inputs.
labels = torch.tensor(labels)

# ===========================
# 3. Dataset and DataLoaders
# ===========================
dataset = TensorDataset(input_ids, attention_masks, labels)

# 80/20 train/validation split; val_size takes the remainder so the two
# sizes always add up to len(dataset).
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# A dedicated, seeded generator makes the split identical on every run
# without touching the global RNG state.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset,
    [train_size, val_size],
    generator=split_generator,
)

train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=2)

# ===========================
# 4. Load BERT Model
# ===========================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("🖥️ Using device:", device)

# The classification head (classifier.weight / classifier.bias) is not part of
# the pretrained checkpoint and is randomly initialised, so seed the global RNG
# first. This also fixes the shuffle order and dropout masks used afterwards.
torch.manual_seed(42)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2  # two classes: 0 = Fake, 1 = Real
)
model.to(device)

# ===========================
# 5. Training Loop
# ===========================
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 1  # ✅ Just 1 epoch for testing

for epoch in range(epochs):
    start_time = time.time()
    total_loss = 0
    model.train()  # switch to training mode before iterating over batches

    for step, batch in enumerate(train_loader):
        # Each batch holds (input ids, attention mask, labels) in the order
        # they were passed to TensorDataset; move all three to the device.
        batch_ids, batch_mask, batch_targets = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Supplying labels makes the model return the loss directly.
        loss = model(
            batch_ids,
            attention_mask=batch_mask,
            labels=batch_targets,
        ).loss

        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Progress line every 50 steps.
        if step % 50 == 0:
            print(f"🌀 Epoch {epoch + 1} Step {step}/{len(train_loader)} - Loss: {loss.item():.4f}")

    elapsed = time.time() - start_time
    avg_loss = total_loss / len(train_loader)
    print(f"✅ Epoch {epoch + 1} finished in {elapsed:.2f} sec. Avg Loss: {avg_loss:.4f}")

# ===========================
# 6. Evaluation
# ===========================
# NOTE(review): the recorded run reports 100% validation accuracy after a single
# epoch on a few hundred samples. That may point to a source-specific artifact
# in the texts (or train/validation overlap) rather than genuine generalisation
# — inspect the data before trusting this number.
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():  # inference only: no gradients needed
    for batch in val_loader:
        b_input_ids, b_attention_mask, b_labels = [x.to(device) for x in batch]

        # No labels passed here, so the model returns logits only.
        outputs = model(
            b_input_ids,
            attention_mask=b_attention_mask
        )

        preds = torch.argmax(outputs.logits, dim=1)
        # .tolist() yields plain Python ints instead of numpy scalars.
        all_preds.extend(preds.cpu().tolist())
        all_labels.extend(b_labels.cpu().tolist())

accuracy = accuracy_score(all_labels, all_preds)
print(f"🎯 Validation Accuracy: {accuracy:.2%}")

# Optional: classification report
# labels=[0, 1] pins the class order to match target_names even if one class is
# absent from the validation split (otherwise sklearn raises a ValueError);
# zero_division=0 reports 0 instead of warning for a class with no predictions.
print("\n📊 Classification Report:")
print(
    classification_report(
        all_labels,
        all_preds,
        labels=[0, 1],
        target_names=["Fake", "Real"],
        zero_division=0,
    )
)


🖥️ Using device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🌀 Epoch 1 Step 0/200 - Loss: 0.8391
🌀 Epoch 1 Step 50/200 - Loss: 0.4342
🌀 Epoch 1 Step 100/200 - Loss: 0.0703
🌀 Epoch 1 Step 150/200 - Loss: 0.1492
✅ Epoch 1 finished in 1716.73 sec. Avg Loss: 0.2645
🎯 Validation Accuracy: 100.00%

📊 Classification Report:
              precision    recall  f1-score   support

        Fake       1.00      1.00      1.00        56
        Real       1.00      1.00      1.00        44

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [32]:
def predict_news(text, model, tokenizer, device):
    """Classify one news text, or a batch of texts, as real or fake.

    Args:
        text: A single string, or a sequence (e.g. list) of strings that are
            classified together in one forward pass.
        model: Sequence-classification model. It is put into eval mode, called
            as ``model(input_ids, attention_mask=...)`` and must return an
            object exposing ``logits``.
        tokenizer: Callable that returns ``input_ids`` and ``attention_mask``
            tensors for a list of strings.
        device: Device the encoded tensors are moved to; should be the device
            the model lives on.

    Returns:
        ``"🟢 Real News"`` when the predicted class index is 1, otherwise
        ``"🔴 Fake News"``. A single string input yields a single string; a
        sequence input yields a list with one verdict per text (an empty
        sequence yields an empty list).
    """
    model.eval()

    # Normalise to a list so single texts and batches share one code path.
    is_single = isinstance(text, str)
    texts = [text] if is_single else list(text)
    if not texts:
        return []

    # Tokenize the input text(s); padding aligns texts of different lengths.
    encoding = tokenizer(
        texts,
        truncation=True,
        padding=True,
        max_length=512,
        return_tensors="pt"
    )

    input_ids = encoding["input_ids"].to(device)
    attention_mask = encoding["attention_mask"].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask=attention_mask)
        # One predicted class index per input row.
        preds = torch.argmax(output.logits, dim=1).tolist()

    verdicts = ["🟢 Real News" if pred == 1 else "🔴 Fake News" for pred in preds]
    return verdicts[0] if is_single else verdicts
# ===========================
# 8. Try Out a Prediction
# ===========================
# Quick smoke test: run a single headline through the classifier.
sample_text = "Election Results Reversed After Mysterious USB Drive Discovered"
result = predict_news(
    text=sample_text,
    model=model,
    tokenizer=tokenizer,
    device=device,
)

# Header and verdict are emitted on separate lines.
print("\n📢 Prediction for sample input:", result, sep="\n")



📢 Prediction for sample input:
🔴 Fake News
