In [1]:
import torch
import pandas as pd
import numpy as np
import re
import nltk
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Download NLTK stopwords (if needed)
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vinaybabujatla\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Define dataset path (adjust as needed)
dataset_path = "D:/nagababujatla/FakeNewsDetection/liar/"

# Load LIAR dataset files (assumes .tsv files)
train_df = pd.read_csv(dataset_path + "train.tsv", sep="\t", header=None)
valid_df = pd.read_csv(dataset_path + "valid.tsv", sep="\t", header=None)
test_df  = pd.read_csv(dataset_path + "test.tsv", sep="\t", header=None)

# Define column names based on LIAR dataset structure
column_names = ["id", "label", "statement", "subject", "speaker", "job", "state", "party", 
                "barely_true", "false", "half_true", "mostly_true", "pants_on_fire", "context"]

# Assign column names
train_df.columns = column_names
valid_df.columns = column_names
test_df.columns = column_names

# Keep only the necessary columns: 'statement' and 'label'
train_df = train_df[["statement", "label"]]
valid_df = valid_df[["statement", "label"]]
test_df  = test_df[["statement", "label"]]

# Display the first few rows of the training set
train_df.head()


Unnamed: 0,statement,label
0,Says the Annies List political group supports ...,false
1,When did the decline of coal start? It started...,half-true
2,"Hillary Clinton agrees with John McCain ""by vo...",mostly-true
3,Health care reform legislation is likely to ma...,false
4,The economic turnaround started at the end of ...,half-true


In [3]:
# Define label mapping: "true" & "mostly-true" as Real (1); the rest as Fake (0)
label_mapping = {
    "true": 1,
    "mostly-true": 1,
    "half-true": 1,
    "barely-true": 0,
    "false": 0,
    "pants-on-fire": 0
}

# Apply mapping to all datasets
train_df["label"] = train_df["label"].map(label_mapping)
valid_df["label"] = valid_df["label"].map(label_mapping)
test_df["label"]  = test_df["label"].map(label_mapping)

# Drop any rows with missing values (if any)
train_df.dropna(inplace=True)
valid_df.dropna(inplace=True)
test_df.dropna(inplace=True)

# Check the updated training data
train_df.head()


Unnamed: 0,statement,label
0,Says the Annies List political group supports ...,0.0
1,When did the decline of coal start? It started...,1.0
2,"Hillary Clinton agrees with John McCain ""by vo...",1.0
3,Health care reform legislation is likely to ma...,0.0
4,The economic turnaround started at the end of ...,1.0


In [4]:
# Define the local TinyBERT model path (adjust as needed)
tinybert_path = "D:/nagababujatla/FakeNewsDetection/TinyBERT_General_4L_312D/"

# Load TinyBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained(tinybert_path)
model = BertForSequenceClassification.from_pretrained(
    tinybert_path, 
    num_labels=2,
    problem_type="single_label_classification"  # ✅ Force CrossEntropyLoss (correct for binary classification)
)

# Extract text from dataframes
train_texts = train_df["statement"].tolist()
valid_texts = valid_df["statement"].tolist()
test_texts  = test_df["statement"].tolist()

# Tokenize texts (with truncation and padding)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
valid_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=128)
test_encodings  = tokenizer(test_texts,  truncation=True, padding=True, max_length=128)


  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at D:/nagababujatla/FakeNewsDetection/TinyBERT_General_4L_312D/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
print("Real News Count:", sum(train_df["label"] == 1))
print("Fake News Count:", sum(train_df["label"] == 0))


Real News Count: 5752
Fake News Count: 3649


In [6]:
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": train_df["label"].astype(int).tolist()  # ✅ Ensure integer labels
})

valid_dataset = Dataset.from_dict({
    "input_ids": valid_encodings["input_ids"],
    "attention_mask": valid_encodings["attention_mask"],
    "labels": valid_df["label"].astype(int).tolist()
})


test_dataset = Dataset.from_dict({
    "input_ids": test_encodings["input_ids"],
    "attention_mask": test_encodings["attention_mask"],
    "labels": torch.tensor(test_df["label"].astype(int).tolist(), dtype=torch.long).tolist(),
})


In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,  # ⬆️ Increase to 20
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True
)


In [10]:
import shutil
shutil.rmtree("results/", ignore_errors=True)


In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset
)

# Begin training TinyBERT
trainer.train()


Epoch,Training Loss,Validation Loss


In [96]:
eval_results = trainer.evaluate(test_dataset)
print("Test Evaluation Results:", eval_results)


Test Evaluation Results: {'eval_loss': 0.7235797047615051, 'eval_runtime': 23.4364, 'eval_samples_per_second': 50.136, 'eval_steps_per_second': 3.157, 'epoch': 5.0}


In [11]:
from sklearn.metrics import accuracy_score, classification_report

# Get raw model predictions
predictions = trainer.predict(test_dataset)

# Extract logits (raw output before softmax)
logits = predictions.predictions

# Convert logits to predicted labels (argmax selects the highest probability class)
predicted_labels = np.argmax(logits, axis=1)

# Get actual labels from test dataset
true_labels = test_df["label"].tolist()

# Calculate accuracy
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Test Accuracy: {accuracy * 100:.2f}%")

# Print detailed classification report (Precision, Recall, F1-score)
print(classification_report(true_labels, predicted_labels, target_names=["Fake News", "Real News"]))


NameError: name 'trainer' is not defined

In [97]:
model.save_pretrained("D:/nagababujatla/FakeNewsDetection/fine_tuned_tinybert")
tokenizer.save_pretrained("D:/nagababujatla/FakeNewsDetection/fine_tuned_tinybert")


('D:/nagababujatla/FakeNewsDetection/fine_tuned_tinybert\\tokenizer_config.json',
 'D:/nagababujatla/FakeNewsDetection/fine_tuned_tinybert\\special_tokens_map.json',
 'D:/nagababujatla/FakeNewsDetection/fine_tuned_tinybert\\vocab.txt',
 'D:/nagababujatla/FakeNewsDetection/fine_tuned_tinybert\\added_tokens.json')

In [98]:
def predict_fixed(statement):
    inputs = tokenizer(statement, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=1).item()
    
    # Swap prediction mapping
    return "Fake News" if prediction == 0 else "Real News"

# Test with swapped mapping
print(predict_fixed("You have won a gift, click on this link!"))
print(predict_fixed("Sun rises in the East."))


Fake News
Real News


In [99]:
test_statements = [
    "The government has approved a new economic policy.",  # Real News
    "NASA has confirmed the discovery of a new planet.",  # Real News
    "Click here to claim your free iPhone now!",  # Fake News
    "COVID-19 vaccines are effective and safe.",  # Real News
    "Breaking: The president has resigned amid corruption charges.",  # Real News
    "Congratulations! You've won $1,000,000! Claim now!",  # Fake News
    "Climate change is causing rising sea levels.",  # Real News
    "A celebrity was spotted using this secret weight-loss pill!",  # Fake News
    "Aliens have been found in the Amazon rainforest!",  # Fake News
    "Stock market sees a 10% rise after positive economic reports.",  # Real News
    "This miracle cure can remove all diseases!",  # Fake News
    "Scientists discover a new species of dinosaur in Argentina.",  # Real News
    "Lottery winner reveals the secret trick to winning!",  # Fake News
    "Fake news alert: A dangerous email scam is going around!",  # Fake News
    "Sports: The national team wins the championship!",  # Real News
    "Experts warn about the rise of misinformation on social media.",  # Real News
    "A man claims he traveled through time and met his future self!",  # Fake News
    "The central bank announces a new interest rate hike.",  # Real News
    "Shocking! A woman finds gold hidden in her backyard!",  # Fake News
    "New study finds that daily exercise improves mental health.",  # Real News
]

# Run predictions
for statement in test_statements:
    print(f"Input: {statement}")
    print(f"Prediction: {predict_fixed(statement)}\n")


Input: The government has approved a new economic policy.
Prediction: Fake News

Input: NASA has confirmed the discovery of a new planet.
Prediction: Fake News

Input: Click here to claim your free iPhone now!
Prediction: Fake News

Input: COVID-19 vaccines are effective and safe.
Prediction: Fake News

Input: Breaking: The president has resigned amid corruption charges.
Prediction: Real News

Input: Congratulations! You've won $1,000,000! Claim now!
Prediction: Real News

Input: Climate change is causing rising sea levels.
Prediction: Fake News

Input: A celebrity was spotted using this secret weight-loss pill!
Prediction: Fake News

Input: Aliens have been found in the Amazon rainforest!
Prediction: Fake News

Input: Stock market sees a 10% rise after positive economic reports.
Prediction: Real News

Input: This miracle cure can remove all diseases!
Prediction: Real News

Input: Scientists discover a new species of dinosaur in Argentina.
Prediction: Real News

Input: Lottery winner r

In [77]:
import torch.nn.functional as F

def predict_fixed(statement):
    inputs = tokenizer(statement, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    logits = outputs.logits

    # Convert logits to probabilities
    probs = F.softmax(logits, dim=1)
    
    # Adjust threshold: Instead of argmax, use a probability threshold
    prediction = 1 if probs[0][1] > 0.6 else 0  # Adjust the 0.6 threshold as needed

    print(f"Logits: {logits}")
    print(f"Probabilities: {probs}")
    print(f"Predicted Class: {prediction}")

    return "Fake News" if prediction == 0 else "Real News"

# Run new tests
for statement in test_statements:
    print(f"Input: {statement}")
    print(f"Prediction: {predict_fixed(statement)}\n")


Input: The government has approved a new economic policy.
Logits: tensor([[ 0.7740, -0.7078]], grad_fn=<AddmmBackward0>)
Probabilities: tensor([[0.8148, 0.1852]], grad_fn=<SoftmaxBackward0>)
Predicted Class: 0
Prediction: Fake News

Input: NASA has confirmed the discovery of a new planet.
Logits: tensor([[ 0.9731, -0.9009]], grad_fn=<AddmmBackward0>)
Probabilities: tensor([[0.8669, 0.1331]], grad_fn=<SoftmaxBackward0>)
Predicted Class: 0
Prediction: Fake News

Input: Click here to claim your free iPhone now!
Logits: tensor([[ 1.0024, -0.9274]], grad_fn=<AddmmBackward0>)
Probabilities: tensor([[0.8732, 0.1268]], grad_fn=<SoftmaxBackward0>)
Predicted Class: 0
Prediction: Fake News

Input: COVID-19 vaccines are effective and safe.
Logits: tensor([[ 0.9515, -0.8805]], grad_fn=<AddmmBackward0>)
Probabilities: tensor([[0.8620, 0.1380]], grad_fn=<SoftmaxBackward0>)
Predicted Class: 0
Prediction: Fake News

Input: Breaking: The president has resigned amid corruption charges.
Logits: tensor([[ 