In [10]:
# Cell 1: Imports and Setup
import pandas as pd
import numpy as np
import re
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback
from datasets import Dataset

print("Imports complete.")
# Pick the compute device once: CUDA when torch reports it as available,
# otherwise the CPU. The GPU name is only queried when CUDA is present.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(f"Using device: {device}")
if cuda_available:
    print(f"Device name: {torch.cuda.get_device_name(0)}")

Imports complete.
Using device: cuda
Device name: NVIDIA GeForce RTX 3050 Laptop GPU


In [None]:
# Cell 2: Define Cleaning Function and Load Data
# Leading dateline + source tag, e.g. "WASHINGTON (Reuters) - " or a bare
# "(Reuters) - ". The dateline part is optional so that articles starting
# directly with the parenthesised tag are cleaned as well.
_SOURCE_TAG_RE = re.compile(r'^[\w\s/]*\([\w\s]+\)\s*-\s*')


def clean_source_tags(text):
    """Strip a leading source tag such as ``WASHINGTON (Reuters) - `` from text.

    Args:
        text: Article body. Any value is accepted; it is converted with
            ``str()`` before matching.

    Returns:
        The text as a string with the leading tag removed. If the string does
        not start with a matching tag it is returned unchanged.

    Only a prefix at the very start of the string is removed. The dateline may
    contain word characters, whitespace and ``/`` (for "CITY/CITY" datelines)
    and may be empty; the tag itself is a parenthesised run of word characters
    and whitespace followed by a hyphen.
    """
    return _SOURCE_TAG_RE.sub('', str(text), count=1)

print("Loading datasets...")
# All four CSVs are read from this directory (relative to the notebook).
DATA_DIR = "../data"
try:
    real1 = pd.read_csv(f"{DATA_DIR}/Real.csv")
    real2 = pd.read_csv(f"{DATA_DIR}/Real_2.csv")
    # Using on_bad_lines='skip' for robustness: malformed rows in this file
    # are dropped instead of aborting the read.
    fake1 = pd.read_csv(f"{DATA_DIR}/Fake.csv", on_bad_lines='skip', low_memory=False)
    fake2 = pd.read_csv(f"{DATA_DIR}/Fake_2.csv")
except FileNotFoundError as e:
    print(f"Error: {e}. Make sure the CSV files are in '{DATA_DIR}' relative to the notebook.")
    # Re-raise: every later cell needs these frames, so continuing would only
    # surface as a confusing NameError further down.
    raise
else:
    print("All CSV files loaded successfully.")

Loading datasets...
All CSV files loaded successfully.


In [12]:
# Cell 3: Process, Clean, and Merge DataFrames
# Seed for the row shuffle so the order of `df` (and the saved CSV) is the
# same on every run.
SHUFFLE_SEED = 42

# Assign labels BEFORE merging: 1 = real, 0 = fake
for frame, label_value in ((real1, 1), (real2, 1), (fake1, 0), (fake2, 0)):
    frame['label'] = label_value

# Merge all into one DataFrame, keep only the necessary columns and drop rows
# with missing data. Building a fresh frame (no inplace ops on a slice) avoids
# pandas' SettingWithCopyWarning.
df = (
    pd.concat([real1, real2, fake1, fake2], ignore_index=True, sort=False)
    .loc[:, ['title', 'text', 'label']]
    .dropna()
    .copy()
)

# --- APPLY THE FIX FOR DATA LEAKAGE ---
print("Applying cleaning function to text data...")
df['text'] = df['text'].apply(clean_source_tags)

# Combine title and text into a single 'content' column. The title is cast to
# str so a non-string title cannot break the concatenation (the text is
# already a str after cleaning).
df['content'] = df['title'].astype(str) + " " + df['text']

# Final cleanup: keep model inputs only, de-duplicate, then shuffle with a
# fixed seed and renumber the rows.
df = (
    df[['content', 'label']]
    .drop_duplicates()
    .sample(frac=1, random_state=SHUFFLE_SEED)
    .reset_index(drop=True)
)

print("\nData cleaning and merging complete.")
print(f"Total articles: {len(df)}")
print("Class distribution:\n", df['label'].value_counts())
print("\nSample of cleaned data:")
display(df.head())

# This line is optional, but good practice: keep a copy of the cleaned data
# on disk.
df.to_csv("cleaned_final_data.csv", index=False)
print("\nCleaned data has been saved to 'cleaned_final_data.csv'")

Applying cleaning function to text data...

Data cleaning and merging complete.
Total articles: 39661
Class distribution:
 label
1    21698
0    17963
Name: count, dtype: int64

Sample of cleaned data:


Unnamed: 0,content,label
0,WATCH: Trump Just Accidentally Admitted He St...,0
1,Does face yoga actually work? Experts weigh in...,1
2,Tax overhaul drama moves to Senate as House ap...,1
3,House committee considering tax relief for hur...,1
4,'Grave question' on House intelligence chairma...,1



Cleaned data has been saved to 'cleaned_final_data.csv'


In [13]:
# Cell 4: Convert to Dataset and Tokenize
print("Converting to Hugging Face Dataset and tokenizing...")

# Checkpoint name; the tokenizer is loaded from it here.
model_checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Wrap the cleaned pandas DataFrame in a Hugging Face Dataset.
hg_dataset = Dataset.from_pandas(df)

# Tokenize the 'content' column
def tokenize_function(examples):
    """Tokenize the ``content`` field of ``examples``.

    The texts are passed to the module-level ``tokenizer`` with
    ``truncation=True`` and ``max_length=512``; its result is returned as-is.
    """
    texts = examples["content"]
    return tokenizer(texts, truncation=True, max_length=512)

# Apply the tokenizer in batched mode and drop the raw 'content' column from
# the result.
tokenized_dataset = hg_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['content'],
)
print("Tokenization complete.")

Converting to Hugging Face Dataset and tokenizing...




Map:   0%|          | 0/39661 [00:00<?, ? examples/s]

Tokenization complete.


In [14]:
# Cell 5: Split into Training and Test Sets
# Hold out 20% of the tokenized examples as the "test" split; a fixed seed is
# passed to the splitter.
final_datasets = tokenized_dataset.train_test_split(
    test_size=0.2,
    seed=42,
)

print("Dataset split into training and testing sets:")
print(final_datasets)

Dataset split into training and testing sets:
DatasetDict({
    train: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 31728
    })
    test: Dataset({
        features: ['label', 'input_ids', 'attention_mask'],
        num_rows: 7933
    })
})


In [15]:
# Cell 6: Configure the Trainer (Corrected)

# Load the pre-trained checkpoint with a sequence-classification head sized
# for two labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=2,
)

# Define the function to compute metrics during evaluation
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for one evaluation pass.

    ``eval_pred`` is unpacked into ``(logits, labels)``. The predicted class is
    the argmax over the last axis of ``logits``; precision, recall and F1 are
    computed with ``average='binary'``.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    prf = precision_recall_fscore_support(labels, predicted, average='binary')
    return {
        'accuracy': accuracy_score(labels, predicted),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

# Data collator handles batching and padding; it is built from the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Early stopping is supplied to the Trainer as a callback, with a patience
# of 2.
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=2)

# Training arguments.
training_args = TrainingArguments(
    output_dir="./cleaned_results",
    report_to="none",
    # Optimisation: batch size 4 with 2 accumulation steps, i.e. 8 examples
    # per optimizer step on each device.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    # Logging every 50 steps; evaluation and checkpointing once per epoch.
    logging_steps=50,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Reload the checkpoint with the best F1 when training ends.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# NOTE(review): the held-out "test" split is also the evaluation set that
# drives best-checkpoint selection and early stopping, so metrics reported on
# it later are optimistic. Consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=final_datasets["train"],
    eval_dataset=final_datasets["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping_callback],
)

print("Trainer configured successfully with early stopping.")

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Trainer configured successfully with early stopping.


In [16]:
# Cell 7: Train the Model
print("Starting model training...")

# Run fine-tuning with the Trainer configured above. The return value is kept
# in a variable so it can be inspected afterwards.
train_output = trainer.train()

print("Training complete.")

Starting model training...


  0%|          | 0/11898 [00:00<?, ?it/s]

{'loss': 0.2596, 'grad_norm': 0.07416758686304092, 'learning_rate': 4.978988065221046e-05, 'epoch': 0.01}
{'loss': 0.0965, 'grad_norm': 0.031324438750743866, 'learning_rate': 4.957976130442091e-05, 'epoch': 0.03}
{'loss': 0.1381, 'grad_norm': 11.813939094543457, 'learning_rate': 4.936964195663137e-05, 'epoch': 0.04}
{'loss': 0.0661, 'grad_norm': 29.78945541381836, 'learning_rate': 4.9159522608841824e-05, 'epoch': 0.05}
{'loss': 0.0442, 'grad_norm': 0.01289269607514143, 'learning_rate': 4.894940326105228e-05, 'epoch': 0.06}
{'loss': 0.0854, 'grad_norm': 33.22032165527344, 'learning_rate': 4.8739283913262734e-05, 'epoch': 0.08}
{'loss': 0.0433, 'grad_norm': 0.05260776728391647, 'learning_rate': 4.852916456547319e-05, 'epoch': 0.09}
{'loss': 0.0782, 'grad_norm': 0.046225883066654205, 'learning_rate': 4.8319045217683645e-05, 'epoch': 0.1}
{'loss': 0.0687, 'grad_norm': 0.011060580611228943, 'learning_rate': 4.81089258698941e-05, 'epoch': 0.11}
{'loss': 0.006, 'grad_norm': 0.0097336759790778

  0%|          | 0/992 [00:00<?, ?it/s]

{'eval_loss': 0.01037507876753807, 'eval_accuracy': 0.9977309971007186, 'eval_f1': 0.9979166666666667, 'eval_precision': 0.9979166666666667, 'eval_recall': 0.9979166666666667, 'eval_runtime': 140.1496, 'eval_samples_per_second': 56.604, 'eval_steps_per_second': 7.078, 'epoch': 1.0}
{'loss': 0.0001, 'grad_norm': 0.0015129001112654805, 'learning_rate': 3.3190452176836446e-05, 'epoch': 1.01}
{'loss': 0.0127, 'grad_norm': 0.002271753503009677, 'learning_rate': 3.29803328290469e-05, 'epoch': 1.02}
{'loss': 0.0402, 'grad_norm': 0.006044188514351845, 'learning_rate': 3.277021348125736e-05, 'epoch': 1.03}
{'loss': 0.0003, 'grad_norm': 0.00409852247685194, 'learning_rate': 3.256009413346781e-05, 'epoch': 1.05}
{'loss': 0.0305, 'grad_norm': 0.0010938247432932258, 'learning_rate': 3.234997478567827e-05, 'epoch': 1.06}
{'loss': 0.0191, 'grad_norm': 0.0020509427413344383, 'learning_rate': 3.213985543788872e-05, 'epoch': 1.07}
{'loss': 0.0211, 'grad_norm': 0.002401539124548435, 'learning_rate': 3.19

  0%|          | 0/992 [00:00<?, ?it/s]

{'eval_loss': 0.006436367053538561, 'eval_accuracy': 0.998991554266986, 'eval_f1': 0.999073859689743, 'eval_precision': 0.9993052339045855, 'eval_recall': 0.9988425925925926, 'eval_runtime': 140.2425, 'eval_samples_per_second': 56.566, 'eval_steps_per_second': 7.073, 'epoch': 2.0}
{'loss': 0.0, 'grad_norm': 8.859809167915955e-05, 'learning_rate': 1.659102370146243e-05, 'epoch': 2.0}
{'loss': 0.003, 'grad_norm': 9.774068166734651e-05, 'learning_rate': 1.6380904353672887e-05, 'epoch': 2.02}
{'loss': 0.0005, 'grad_norm': 7.229402399389073e-05, 'learning_rate': 1.6170785005883342e-05, 'epoch': 2.03}
{'loss': 0.0248, 'grad_norm': 0.00017230129742529243, 'learning_rate': 1.5960665658093797e-05, 'epoch': 2.04}
{'loss': 0.0087, 'grad_norm': 8.562341690063477, 'learning_rate': 1.5750546310304252e-05, 'epoch': 2.05}
{'loss': 0.0, 'grad_norm': 0.001984846079722047, 'learning_rate': 1.5540426962514708e-05, 'epoch': 2.07}
{'loss': 0.0, 'grad_norm': 0.00015360457473434508, 'learning_rate': 1.5330307

  0%|          | 0/992 [00:00<?, ?it/s]

{'eval_loss': 0.0033583189360797405, 'eval_accuracy': 0.999495777133493, 'eval_f1': 0.9995369298448715, 'eval_precision': 0.9997684113015285, 'eval_recall': 0.9993055555555556, 'eval_runtime': 139.4014, 'eval_samples_per_second': 56.908, 'eval_steps_per_second': 7.116, 'epoch': 3.0}
{'train_runtime': 5657.1338, 'train_samples_per_second': 16.825, 'train_steps_per_second': 2.103, 'train_loss': 0.012865702102552586, 'epoch': 3.0}
Training complete.


In [17]:
# Cell 8: Evaluate and Save
# Directory the fine-tuned model is written to.
model_save_path = "./final_fake_news_model_fixed"

print("Evaluating final model on the test set...")

# No dataset is passed, so this relies on the evaluation set the Trainer was
# constructed with.
evaluation_results = trainer.evaluate()
print("\nFinal Evaluation Results:")
print(evaluation_results)

# Save the fine-tuned model through the Trainer.
trainer.save_model(model_save_path)
print(f"\n✅ Model saved to: {model_save_path}")

Evaluating final model on the test set...


  0%|          | 0/992 [00:00<?, ?it/s]


Final Evaluation Results:
{'eval_loss': 0.0033583189360797405, 'eval_accuracy': 0.999495777133493, 'eval_f1': 0.9995369298448715, 'eval_precision': 0.9997684113015285, 'eval_recall': 0.9993055555555556, 'eval_runtime': 139.0723, 'eval_samples_per_second': 57.042, 'eval_steps_per_second': 7.133, 'epoch': 3.0}

✅ Model saved to: ./final_fake_news_model_fixed


In [20]:
# Cell 9: Inference Test
from transformers import pipeline

# Same directory the model was saved to; repeated here so this cell can be
# run on its own for inference.
model_save_path = "./final_fake_news_model_fixed"
print(f"Loading model from: {model_save_path}")

# Load the model using the pipeline for easy inference
pipe = pipeline("text-classification", model=model_save_path)

# The model's default labels are 'LABEL_0' and 'LABEL_1'.
# We'll map them to our desired "FAKE" and "REAL" names for clarity.
# Built once, outside the loop, since it never changes.
label_map = {'LABEL_1': 'REAL', 'LABEL_0': 'FAKE'}

# Test sentences
test_texts = [
    "Virat Kohli Gave test cricket retirement",
    "Nasa says the Sun is gonna blast",
    "sky is white",
    "ind vs pak war ended",
    "trump is the president of america"
]

print("\n--- Testing the new, corrected model ---")
for text in test_texts:
    # Truncate exactly as during training (truncation=True, max_length=512) so
    # an over-long input cannot exceed the model's maximum sequence length.
    result = pipe(text, truncation=True, max_length=512)
    top_prediction = result[0]
    predicted_label = label_map[top_prediction['label']]

    print(f"\n📰 Text: {text}")
    print(f"✅ Prediction: {predicted_label} (Confidence: {top_prediction['score']:.2%})")

Loading model from: ./final_fake_news_model_fixed

--- Testing the new, corrected model ---

📰 Text: Virat Kohli Gave test cricket retirement
✅ Prediction: REAL (Confidence: 96.10%)

📰 Text: Nasa says the Sun is gonna blast
✅ Prediction: FAKE (Confidence: 100.00%)

📰 Text: sky is white
✅ Prediction: FAKE (Confidence: 99.99%)

📰 Text: ind vs pak war ended
✅ Prediction: FAKE (Confidence: 99.95%)

📰 Text: trump is the president of america
✅ Prediction: FAKE (Confidence: 99.99%)


In [21]:
# Custom test: classify an article of your own with the fine-tuned model.

my_own_news_article = """
A new study published in the journal 'Nature' has found that certain species of deep-sea microbes
are capable of biodegrading plastics at a rate previously thought impossible. The discovery
could open new avenues for tackling plastic pollution in the world's oceans.
"""

# Use the same pipeline from the previous cell. Pasted articles can be long,
# so truncate exactly as during training (truncation=True, max_length=512) to
# stay within the model's maximum sequence length.
result = pipe(my_own_news_article, truncation=True, max_length=512)

# Map the label and print the result
label_map = {'LABEL_1': 'REAL', 'LABEL_0': 'FAKE'}
predicted_label = label_map[result[0]['label']]

print(f"✅ Prediction: {predicted_label} (Confidence: {result[0]['score']:.2%})")

✅ Prediction: REAL (Confidence: 100.00%)
