In [None]:
!pip install transformers torch nltk seaborn



In [None]:
!pip install -U transformers

Collecting transformers
  Downloading transformers-4.55.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m739.7 kB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.55.4-py3-none-any.whl (11.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.3/11.3 MB[0m [31m58.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.55.2
    Uninstalling transformers-4.55.2:
      Successfully uninstalled transformers-4.55.2
Successfully installed transformers-4.55.4


In [1]:
from google.colab import files
import io
import pandas as pd

uploaded = files.upload()

# Read CSVs
fake_df = pd.read_csv(io.BytesIO(uploaded['Fake.csv']))
true_df = pd.read_csv(io.BytesIO(uploaded['True.csv']))

print("Fake shape:", fake_df.shape)
print("True shape:", true_df.shape)

Saving True.csv to True.csv
Saving Fake.csv to Fake.csv
Fake shape: (9085, 4)
True shape: (1999, 4)


In [2]:
import nltk, re
nltk.download("stopwords")
from nltk.corpus import stopwords

fake_df["label"] = 1
true_df["label"] = 0

df = pd.concat([fake_df, true_df]).reset_index(drop=True)
df = df[["title", "text", "label"]].dropna()
df["content"] = df["title"].astype(str) + " " + df["text"].astype(str)

STOPWORDS = set(stopwords.words("english"))

def clean_text(text):
    text = re.sub(r"[^a-zA-Z]", " ", text)
    text = text.lower()
    text = " ".join([w for w in text.split() if w not in STOPWORDS])
    return text

df["content"] = df["content"].apply(clean_text)
df.head()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,title,text,label,content
0,Donald Trump Sends Out Embarrassing New Year’...,Donald Trump just couldn t wish all Americans ...,1,donald trump sends embarrassing new year eve m...
1,Drunk Bragging Trump Staffer Started Russian ...,House Intelligence Committee Chairman Devin Nu...,1,drunk bragging trump staffer started russian c...
2,Sheriff David Clarke Becomes An Internet Joke...,"On Friday, it was revealed that former Milwauk...",1,sheriff david clarke becomes internet joke thr...
3,Trump Is So Obsessed He Even Has Obama’s Name...,"On Christmas day, Donald Trump announced that ...",1,trump obsessed even obama name coded website i...
4,Pope Francis Just Called Out Donald Trump Dur...,Pope Francis used his annual Christmas Day mes...,1,pope francis called donald trump christmas spe...


In [3]:
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    df["content"], df["label"], test_size=0.2, random_state=42
)

# Tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)

print("Train samples:", len(train_encodings["input_ids"]))
print("Test samples:", len(test_encodings["input_ids"]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Train samples: 3198
Test samples: 800


In [4]:
import torch
from torch.utils.data import Dataset
from transformers import BertForSequenceClassification

# Dataset Class
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Create train & test datasets
train_dataset = NewsDataset(train_encodings, list(y_train))
test_dataset = NewsDataset(test_encodings, list(y_test))

# Load BERT model
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

print("✅ Dataset & model ready!")


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Dataset & model ready!


In [5]:
from transformers import Trainer, TrainingArguments

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",          # save model here
    per_device_train_batch_size=8,   # batch size per device
    per_device_eval_batch_size=8,
    num_train_epochs=1,              # try 2-3 for better results
    weight_decay=0.01,               # L2 regularization
    logging_dir="./logs",            # save logs
    eval_strategy="epoch",     # evaluate every epoch
    save_strategy="epoch",            # save checkpoints
    report_to="none"                 # Disable wandb integration
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

print("✅ Trainer is ready! You can now train the model.")

✅ Trainer is ready! You can now train the model.


In [6]:
import os
os.environ["WANDB_DISABLED"] = "true"

trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.005634


TrainOutput(global_step=400, training_loss=0.06081190586090088, metrics={'train_runtime': 4362.1372, 'train_samples_per_second': 0.733, 'train_steps_per_second': 0.092, 'total_flos': 210357288760320.0, 'train_loss': 0.06081190586090088, 'epoch': 1.0})

In [7]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Run evaluation
eval_results = trainer.evaluate()
print("📊 Eval Results:", eval_results)

# Predictions on test dataset
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Accuracy
acc = accuracy_score(y_test, pred_labels)
print(f"✅ Final Accuracy: {acc:.4f}")

# Classification Report
print("\n📑 Classification Report:")
print(classification_report(y_test, pred_labels, target_names=["Real", "Fake"]))

# Confusion Matrix
print("\n🔍 Confusion Matrix:")
print(confusion_matrix(y_test, pred_labels))




📊 Eval Results: {'eval_loss': 0.005634307395666838, 'eval_runtime': 335.0609, 'eval_samples_per_second': 2.388, 'eval_steps_per_second': 0.298, 'epoch': 1.0}
✅ Final Accuracy: 0.9988

📑 Classification Report:
              precision    recall  f1-score   support

        Real       1.00      1.00      1.00       385
        Fake       1.00      1.00      1.00       415

    accuracy                           1.00       800
   macro avg       1.00      1.00      1.00       800
weighted avg       1.00      1.00      1.00       800


🔍 Confusion Matrix:
[[385   0]
 [  1 414]]


In [8]:
news_samples = [
    # Fake news examples
    "Aliens have landed in New York City, eyewitnesses claim.",
    "Celebrity endorses miracle pill that guarantees weight loss in one week.",
    "Government to give $10,000 to every citizen next month.",
    "Scientists confirm chocolate can prevent all types of cancer.",
    "New study proves that the earth is flat and NASA has been lying.",

    # Real news examples
    "NASA successfully launches James Webb Space Telescope into orbit.",
    "UN reports global poverty rates declining over the past decade.",
    "Local school district announces new policy to improve student safety.",
    "Stock markets rally as economic growth exceeds expectations.",
    "New vaccine shows effectiveness against seasonal flu in clinical trials."
]


In [10]:
def predict_news(news_text):
    # Preprocess the news text
    cleaned_text = clean_text(news_text)

    # Tokenize and encode the text
    encoding = tokenizer(cleaned_text, truncation=True, padding=True, max_length=128, return_tensors='pt')

    # Move tensors to the same device as the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    encoding = {key: val.to(device) for key, val in encoding.items()}

    # Make prediction
    model.eval()
    with torch.no_grad():
        outputs = model(**encoding)
        logits = outputs.logits
        probabilities = torch.softmax(logits, dim=1)
        prediction = torch.argmax(probabilities, dim=1).item()

    # Return label (0 for Real, 1 for Fake)
    return "Fake" if prediction == 1 else "Real"


for news in news_samples:
    result = predict_news(news)
    print(f"News: {news}\nPrediction: {result}\n")

News: Aliens have landed in New York City, eyewitnesses claim.
Prediction: Fake

News: Celebrity endorses miracle pill that guarantees weight loss in one week.
Prediction: Fake

News: Government to give $10,000 to every citizen next month.
Prediction: Real

News: Scientists confirm chocolate can prevent all types of cancer.
Prediction: Fake

News: New study proves that the earth is flat and NASA has been lying.
Prediction: Fake

News: NASA successfully launches James Webb Space Telescope into orbit.
Prediction: Real

News: UN reports global poverty rates declining over the past decade.
Prediction: Real

News: Local school district announces new policy to improve student safety.
Prediction: Real

News: Stock markets rally as economic growth exceeds expectations.
Prediction: Real

News: New vaccine shows effectiveness against seasonal flu in clinical trials.
Prediction: Real

