In [1]:
import os
os.environ["WANDB_DISABLED"] = "true"
# Install necessary libraries
!pip install torch torchvision torchaudio transformers datasets numpy pandas scikit-learn --quiet

# Imports
import torch
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import precision_score

# Load dataset
raw_df = pd.read_csv('News_Dataset.csv')

# Clean before splitting. The order of the steps matters:
#   1. drop rows with a missing text or label -- `astype(str)` would otherwise
#      turn a missing text into the literal string "nan" and keep it as data;
#   2. lower-case the text;
#   3. only then remove duplicates, so articles that differ only in letter
#      case collapse into one row and cannot land on both sides of the split.
df = (
    raw_df
    .dropna(subset=['text', 'label'])
    .assign(text=lambda frame: frame['text'].astype(str).str.lower())
    .drop_duplicates(subset=['text'], keep='first')
)

# Train-test split (directly on DataFrame): 80/20, fixed seed, stratified so
# both parts keep the label proportions of the cleaned frame.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

# Extract text and labels after splitting
X_train, y_train = train_df['text'].to_numpy(), train_df['label'].to_numpy()
X_test, y_test = test_df['text'].to_numpy(), test_df['label'].to_numpy()

# Define class names
# NOTE(review): presumably label 0 = Fake and 1 = Real -- confirm against the CSV.
# CLASS_NAMES = ["Fake", "Real"]

# Leakage check, done before the (slow) tokenization so a bad split fails fast:
# no text may appear in both the training and the test split.
train_texts = set(X_train)
test_texts = set(X_test)
overlap = train_texts.intersection(test_texts)
print(f"Number of overlapping texts: {len(overlap)}")  # Should be **0**
assert not overlap, f"{len(overlap)} texts appear in both the train and the test split"

# Load BERT tokenizer
MODEL_NAME = "bert-base-uncased"
MAX_LENGTH = 512  # sequences longer than this many tokens are truncated
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize data. Each split is encoded in a single call, so `padding=True`
# pads every sequence to the longest one in that split.
train_encodings = tokenizer(list(X_train), padding=True, truncation=True, max_length=MAX_LENGTH)
test_encodings = tokenizer(list(X_test), padding=True, truncation=True, max_length=MAX_LENGTH)

# Convert to dataset format
def _as_hf_dataset(texts, encodings, labels):
    """Bundle raw texts, tokenizer output and labels into one `Dataset`.

    The resulting columns are 'text', 'input_ids', 'attention_mask' and
    'labels'; `labels` is materialised as a plain list first.
    """
    columns = {
        'text': texts,
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': list(labels),
    }
    return Dataset.from_dict(columns)


train_dataset = _as_hf_dataset(X_train, train_encodings, y_train)
test_dataset = _as_hf_dataset(X_test, test_encodings, y_test)

# Seed torch before the model is built. The classification head is not part of
# the checkpoint and is initialised randomly when the model is loaded, which
# happens before the Trainer exists, so it needs its own seed to be repeatable.
SEED = 42
torch.manual_seed(SEED)

# Load BERT model for classification (two output classes)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Current name of the former `evaluation_strategy` argument: evaluate on
    # `eval_dataset` at the end of every epoch.
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    # Mixed precision needs a CUDA device; fall back to full precision without one.
    fp16=torch.cuda.is_available(),
    gradient_checkpointing=True,
    optim="adamw_torch",
    seed=SEED,
    # Explicitly disable external experiment trackers (Weights & Biases etc.).
    report_to="none",
)


# Trainer
# NOTE(review): the test split doubles as the per-epoch evaluation set here;
# nothing in this cell selects a checkpoint based on it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Train model
trainer.train()

# Evaluate model on the evaluation dataset the Trainer was built with
results = trainer.evaluate()
# Six decimals: the validation losses logged during training are of the order
# of 1e-5, which a two-decimal format would print as 0.00.
# NOTE(review): a loss this close to zero is unusual for a real classification
# task -- check the texts for label-revealing artefacts before trusting it.
print(f"Eval loss: {results['eval_loss']:.6f}")

# Save model and tokenizer into the same directory so they can be reloaded together
SAVE_DIR = "bert_fake_news_detector"
model.save_pretrained(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m35.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Number of overlapping texts: 0


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,0.0144,0.000327
2,0.001,1.6e-05


Epoch,Training Loss,Validation Loss
1,0.0144,0.000327
2,0.001,1.6e-05
3,0.0004,1.1e-05


Eval loss: 0.00


('bert_fake_news_detector/tokenizer_config.json',
 'bert_fake_news_detector/special_tokens_map.json',
 'bert_fake_news_detector/vocab.txt',
 'bert_fake_news_detector/added_tokens.json',
 'bert_fake_news_detector/tokenizer.json')

In [2]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predicted class per test example: arg-max over the last axis of the raw
# prediction array returned by the Trainer.
prediction_output = trainer.predict(test_dataset)
preds = prediction_output.predictions.argmax(axis=-1)
true_labels = y_test

# Accuracy takes no averaging mode; the other three scores use binary averaging.
accuracy = accuracy_score(true_labels, preds)
precision, recall, f1 = (
    scorer(true_labels, preds, average="binary")
    for scorer in (precision_score, recall_score, f1_score)
)

# Report every score with four decimals.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value:.4f}")


Accuracy: 1.0000
Precision: 1.0000
Recall: 1.0000
F1 Score: 1.0000
