In [None]:
!pip install transformers torch nltk seaborn




In [None]:
!pip install -U transformers



In [None]:
from google.colab import files
import io
import pandas as pd

# Open Colab's upload dialog; the result is indexed by uploaded filename below.
uploaded = files.upload()

# Wrap each file's uploaded content in an in-memory buffer and parse it as CSV.
fake_buffer = io.BytesIO(uploaded['Fake.csv'])
fake_df = pd.read_csv(fake_buffer)
true_buffer = io.BytesIO(uploaded['True.csv'])
true_df = pd.read_csv(true_buffer)

# Quick sanity check: (rows, columns) of each frame.
print("Fake shape:", fake_df.shape)
print("True shape:", true_df.shape)

Saving True.csv to True.csv
Saving Fake.csv to Fake.csv
Fake shape: (23481, 4)
True shape: (21417, 4)


In [None]:
import nltk, re
nltk.download("stopwords")
from nltk.corpus import stopwords

# Tag every article with its class before merging: 1 marks fake, 0 marks real.
fake_df["label"], true_df["label"] = 1, 0

# Stack both sources, renumber the rows, keep only the columns used below and
# drop any row that has a missing value in one of them.
df = (
    pd.concat([fake_df, true_df])
    .reset_index(drop=True)
    .loc[:, ["title", "text", "label"]]
    .dropna()
)

# The model input is the headline followed by the article body.
titles = df["title"].astype(str)
bodies = df["text"].astype(str)
df["content"] = titles + " " + bodies

# English stop words, held in a set for fast membership checks in clean_text.
english_stopwords = stopwords.words("english")
STOPWORDS = set(english_stopwords)

def clean_text(text):
    """Normalise a raw string for the classifier.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased, and whitespace-separated words found in the
    module-level ``STOPWORDS`` collection are removed.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining words joined by single spaces (possibly an empty string).
    """
    letters_only = re.sub(r"[^a-zA-Z]", " ", text).lower()
    # split() with no argument also collapses runs of whitespace.
    kept_words = filter(lambda token: token not in STOPWORDS, letters_only.split())
    return " ".join(kept_words)

# Run clean_text over every entry of the "content" column, overwriting the column.
df["content"] = df["content"].map(clean_text)
# Show the first five rows as the cell's output.
df.head(5)


In [None]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
features, targets = df["content"], df["label"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    random_state=42,
)

# Tokenizer that matches the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Both splits are encoded with the same settings: truncation at max_length=128 plus padding.
encode_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(list(X_train), **encode_kwargs)
test_encodings = tokenizer(list(X_test), **encode_kwargs)

# One list of input ids per sample, so these counts are the split sizes.
print("Train samples:", len(train_encodings["input_ids"]))
print("Test samples:", len(test_encodings["input_ids"]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Train samples: 35918
Test samples: 8980


In [None]:
import torch
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification

# Dataset Class
class NewsDataset(Dataset):
    """Torch ``Dataset`` that pairs tokenizer encodings with class labels.

    Attributes:
        encodings: Mapping from field name to a per-sample sequence of values.
        labels: Sequence of labels, index-aligned with the encodings.
    """

    def __init__(self, encodings, labels):
        """Store the encodings and labels without copying them."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``"labels"``
        entry carrying the sample's label.
        """
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Pair each split's encodings with its labels (labels are converted to plain lists).
train_dataset = NewsDataset(encodings=train_encodings, labels=list(y_train))
test_dataset = NewsDataset(encodings=test_encodings, labels=list(y_test))

# Pretrained BERT checkpoint loaded with a two-class sequence-classification head.
checkpoint_name = "bert-base-uncased"
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=2)

print("✅ Dataset & model ready!")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Dataset & model ready!


In [14]:
from transformers import Trainer, TrainingArguments

# Options for the fine-tuning run, collected in one mapping so they are easy to tweak.
training_options = {
    "output_dir": "./results",            # where the model is saved
    "per_device_train_batch_size": 8,     # training batch size per device
    "per_device_eval_batch_size": 8,      # evaluation batch size per device
    "num_train_epochs": 1,                # try 2-3 for better results
    "weight_decay": 0.01,                 # L2 regularization
    "logging_dir": "./logs",              # where logs are saved
    "eval_strategy": "epoch",             # evaluate every epoch
    "save_strategy": "epoch",             # save a checkpoint every epoch
    "report_to": "none",                  # disable wandb integration
}
training_args = TrainingArguments(**training_options)

# The Trainer ties together the model, the options above and both datasets.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)

print("✅ Trainer is ready! You can now train the model.")

✅ Trainer is ready! You can now train the model.


In [None]:
import os

# Set the WANDB_DISABLED environment variable before training starts
# (presumably to keep Weights & Biases logging switched off).
os.environ.update({"WANDB_DISABLED": "true"})

# Start fine-tuning; the call's return value is shown as the cell output.
trainer.train()



Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Metrics computed by the trainer on the evaluation dataset it was given.
eval_results = trainer.evaluate()
print("📊 Eval Results:", eval_results)

# Raw model outputs for the held-out set; the arg-max along axis 1 is the predicted class id.
predictions = trainer.predict(test_dataset)
logits = predictions.predictions
pred_labels = np.argmax(logits, axis=1)

# Share of held-out samples whose predicted class equals the true label.
acc = accuracy_score(y_test, pred_labels)
print(f"✅ Final Accuracy: {acc:.4f}")

# Per-class report; class id 0 is displayed as "Real" and 1 as "Fake".
print("\n📑 Classification Report:")
report = classification_report(y_test, pred_labels, target_names=["Real", "Fake"])
print(report)

# Confusion matrix of true labels against predicted labels.
print("\n🔍 Confusion Matrix:")
matrix = confusion_matrix(y_test, pred_labels)
print(matrix)


In [None]:
# Hand-written headlines intended as fake-news examples.
_fake_examples = (
    "Aliens have landed in New York City, eyewitnesses claim.",
    "Celebrity endorses miracle pill that guarantees weight loss in one week.",
    "Government to give $10,000 to every citizen next month.",
    "Scientists confirm chocolate can prevent all types of cancer.",
    "New study proves that the earth is flat and NASA has been lying.",
)

# Hand-written headlines intended as real-news examples.
_real_examples = (
    "NASA successfully launches James Webb Space Telescope into orbit.",
    "UN reports global poverty rates declining over the past decade.",
    "Local school district announces new policy to improve student safety.",
    "Stock markets rally as economic growth exceeds expectations.",
    "New vaccine shows effectiveness against seasonal flu in clinical trials.",
)

# Demo inputs for the classifier: the fake examples first, then the real ones.
news_samples = [*_fake_examples, *_real_examples]


In [None]:
def predict_news(text):
    """Classify one news string with the fine-tuned model.

    The input goes through the same preparation as the training data:
    ``clean_text`` followed by the notebook's ``tokenizer`` with truncation
    at ``max_length=128``.

    Args:
        text: Raw headline or article text.

    Returns:
        ``"Fake"`` when the model predicts class id 1, otherwise ``"Real"``.
        This matches the labels assigned when the data frame was built
        (fake articles are 1, real articles are 0).
    """
    cleaned = clean_text(text)

    # The model may have been moved to an accelerator during training, so the
    # encoded inputs are placed on whatever device its parameters live on.
    device = next(model.parameters()).device
    encoded = tokenizer(
        cleaned, truncation=True, max_length=128, return_tensors="pt"
    ).to(device)

    # Inference only: switch off dropout and skip gradient tracking.
    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits

    predicted_class = int(logits.argmax(dim=-1).item())
    return "Fake" if predicted_class == 1 else "Real"


# Classify every demo headline and print it next to the model's verdict.
for news in news_samples:
    result = predict_news(news)
    print(f"News: {news}\nPrediction: {result}\n")
