In [1]:
import pandas as pd
import numpy as np
import re 
import torch 

from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support 

from transformers import(
    DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
)

from datasets import Dataset

# Load the CSV data into DataFrames

In [3]:
# Read the fake and the true articles into separate frames, then print the
# (rows, columns) shape of each as a quick sanity check.
fake_df, true_df = (
    pd.read_csv(csv_path)
    for csv_path in ("dataset/fake.csv", "dataset/true.csv")
)

print(fake_df.shape, true_df.shape)

(23481, 4) (21417, 4)


# Label encoding before concatenation

In [4]:
# Tag each source frame in place with its class id: 0 for the fake articles,
# 1 for the true ones.
for frame, class_id in ((fake_df, 0), (true_df, 1)):
    frame["label"] = class_id

In [5]:
# Stack both labelled frames row-wise (axis 0 is the default for pd.concat)
# and display five random rows.
labelled_frames = [fake_df, true_df]
df = pd.concat(labelled_frames)
df.sample(5)

Unnamed: 0,title,text,subject,date,label
9522,APOLOGY ISSUED After LA TIMES and NY TIMES Col...,Wow! What a couple of hypocrites and haters! W...,politics,"Nov 3, 2017",0
19607,DANIEL HANNAN Tells “The Generation Of The Saf...,,left-news,"Nov 15, 2016",0
14764,FOX News Just Announced Moderators For Next GO...,Will Megyn Kelly be working towards a gotcha ...,politics,"Dec 22, 2015",0
11230,BREAKING: A Third Democrat Senator To Vote For...,"Assuming all Republicans support Gorsuch, the ...",politics,"Apr 2, 2017",0
381,"Kid Rock Just Had His First Political Rally, ...","Robert Ritchie, AKA Kid Rock, has shown quite ...",News,"September 14, 2017",0


In [6]:
# Shuffle every row with a fixed seed, renumber the index from zero, and keep
# only the model input ("text") and the target ("label").
df = (
    df.sample(frac=1, random_state=42)
    .reset_index(drop=True)[["text", "label"]]
)

# Class counts after the shuffle.
df["label"].value_counts()

label
0    23481
1    21417
Name: count, dtype: int64

In [7]:
# Text cleaning

def clean_text(text):
    """Normalize one article body before tokenization.

    Steps, in order: lowercase, remove ``http...`` URL tokens, remove every
    character that is neither an ASCII letter nor whitespace, collapse each
    whitespace run to a single space, and trim both ends.

    Args:
        text: Raw article text. Any non-string value (for example the float
            NaN pandas produces for an empty CSV field, or None) is treated
            as empty text.

    Returns:
        The cleaned string; "" when the input is not a string or nothing
        remains after cleaning.
    """
    # A missing CSV field arrives as float NaN, which has no .lower(); return
    # empty text instead of letting .apply() die with AttributeError.
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r"http\S+", "", text)      # drop URLs before punctuation is stripped
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # keep letters and whitespace only
    text = re.sub(r"\s+", " ", text)         # collapse whitespace runs
    return text.strip()

# Run the cleaner over every article body, element by element.
df["text"] = df["text"].map(clean_text)

In [8]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both splits, and the seed fixes the split.
features, targets = df["text"], df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    features,
    targets,
    test_size=0.2,
    stratify=targets,
    random_state=42,
)

In [9]:
# Convert to Huggingface dataset

train_df = pd.DataFrame({"text": X_train, "label": y_train})
test_df = pd.DataFrame({"text": X_test, "label": y_test})

# After train_test_split both frames carry a shuffled, non-contiguous index.
# By default from_pandas copies such an index into an extra
# "__index_level_0__" column; preserve_index=False keeps the datasets limited
# to the "text" and "label" columns.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

In [10]:
# Load DistilBERT tokenizer

# Tokenizer belonging to the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

In [11]:
# Tokenization Function

def tokenize(batch):
    """Encode ``batch["text"]`` into fixed-length tokenizer output.

    Every sequence is padded up to, and truncated down to, 128 tokens.

    Args:
        batch: Mapping with a "text" entry holding the texts to encode.

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 128,
    }
    return tokenizer(batch["text"], **encode_options)

# Tokenize both splits, handing the tokenizer 512 rows at a time.
train_dataset = train_dataset.map(tokenize, batched=True, batch_size=512)
test_dataset = test_dataset.map(tokenize, batched=True, batch_size=512)

# Expose only the model inputs and the target, formatted as torch tensors.
torch_columns = ("input_ids", "attention_mask", "label")
for split in (train_dataset, test_dataset):
    split.set_format(type="torch", columns=list(torch_columns))

Map:   0%|          | 0/35918 [00:00<?, ? examples/s]

Map:   0%|          | 0/8980 [00:00<?, ? examples/s]

In [12]:
# Load DistilBERT Model

# Pretrained "distilbert-base-uncased" weights with a two-class
# sequence-classification head on top.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
import torch

# Report whether a CUDA device is usable. Asking for a device name when CUDA
# is absent raises, so the name is only queried when a GPU is present; this
# keeps the cell runnable on CPU-only machines.
cuda_ready = torch.cuda.is_available()
print(cuda_ready)
if cuda_ready:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected; running on CPU.")

True
NVIDIA GeForce GTX 1050 Ti


In [29]:
import sys
import torch

# Print interpreter, PyTorch and CUDA details to help diagnose environment
# problems (wrong kernel, CPU-only torch build, ...).
environment_report = (
    ("Python:", sys.executable),
    ("Torch version:", torch.__version__),
    ("CUDA available:", torch.cuda.is_available()),
    ("CUDA version:", torch.version.cuda),
)
for caption, value in environment_report:
    print(caption, value)


Python: C:\ProgramData\anaconda3\python.exe
Torch version: 2.5.1+cu121
CUDA available: True
CUDA version: 12.1


In [30]:
# Evaluation Metrics 
def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy, precision, recall and F1.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The logits are arg-maxed over
            axis 1 to obtain the predicted class ids.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1";
        precision, recall and F1 are computed with ``average="binary"``.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    # The fourth element of the sklearn result is not reported, so zip()
    # against three names drops it.
    prf_scores = precision_recall_fscore_support(
        references, predicted, average="binary"
    )
    metrics = {"accuracy": accuracy_score(references, predicted)}
    metrics.update(zip(("precision", "recall", "f1"), prf_scores))
    return metrics

In [31]:
# Training Arguments

training_args = TrainingArguments(
    output_dir= "./results",
    # Evaluate and checkpoint once per epoch; the two strategies must match
    # for load_best_model_at_end to work.
    eval_strategy= "epoch",
    save_strategy= "epoch",
    learning_rate= 2e-5,
    per_device_train_batch_size= 16,
    per_device_eval_batch_size= 16,
    num_train_epochs= 1,
    weight_decay= 0.01,
    # Mixed precision needs a CUDA device; a hard-coded True makes this cell
    # fail on CPU-only machines, so follow what the runtime actually offers.
    fp16= torch.cuda.is_available(),
    dataloader_num_workers= 3,
    dataloader_pin_memory= True,
    logging_dir= "./logs",
    logging_steps= 500,
    # Reload the checkpoint with the highest evaluation F1 when training ends.
    load_best_model_at_end= True,
    metric_for_best_model= "f1"
)

In [32]:
# Trainer Setup 

# Bundle model, hyper-parameters, data and metric callback into one Trainer.
# NOTE(review): test_dataset is used as the evaluation set, and
# training_args selects the best checkpoint on evaluation F1, so the held-out
# split also drives model selection -- consider a separate validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [33]:
# Train model
# Fine-tunes `model` on train_dataset with the settings in training_args.
# The cell output shows the per-epoch evaluation table followed by the
# returned TrainOutput (global step, training loss, runtime metrics).

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0051,0.004521,0.998998,0.9986,0.9993,0.99895


TrainOutput(global_step=2245, training_loss=0.015028969588417785, metrics={'train_runtime': 1936.945, 'train_samples_per_second': 18.544, 'train_steps_per_second': 1.159, 'total_flos': 1189491006231552.0, 'train_loss': 0.015028969588417785, 'epoch': 1.0})

In [34]:
# Evaluate on the Trainer's eval_dataset (the held-out test split) and print
# the raw metrics dict; keys are prefixed with "eval_".
results = trainer.evaluate()
print(results)

{'eval_loss': 0.004520554095506668, 'eval_accuracy': 0.9989977728285078, 'eval_precision': 0.9986004198740378, 'eval_recall': 0.9992997198879552, 'eval_f1': 0.9989499474973749, 'eval_runtime': 138.594, 'eval_samples_per_second': 64.794, 'eval_steps_per_second': 4.055, 'epoch': 1.0}


In [35]:
# Write the model and the tokenizer files into one directory so both can be
# reloaded later with from_pretrained on that same path.
export_dir = "fake_news_distilbert_modelep1"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

('fake_news_distilbert_modelep1\\tokenizer_config.json',
 'fake_news_distilbert_modelep1\\special_tokens_map.json',
 'fake_news_distilbert_modelep1\\vocab.txt',
 'fake_news_distilbert_modelep1\\added_tokens.json',
 'fake_news_distilbert_modelep1\\tokenizer.json')

In [36]:
# Show which device currently holds the model parameters.
device = model.device
device

device(type='cuda', index=0)

In [56]:
def predict_news(text, max_length=128):
    """Classify one news text as "REAL" or "FAKE" with the fine-tuned model.

    The text is passed through ``clean_text`` and truncated to 128 tokens by
    default, matching how the training data was prepared in this notebook, so
    inference inputs look like the inputs the model was fine-tuned on.

    Args:
        text: Raw article text (a single string).
        max_length: Token limit used for truncation. Defaults to 128, the
            limit applied when the training set was tokenized.

    Returns:
        "REAL" when the predicted class id is 1, otherwise "FAKE".
    """
    model.eval()  # inference mode

    device = model.device  # gpu or cpu

    inputs = tokenizer(
        clean_text(text),  # same normalization as the training texts
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length,
    )

    # move inputs to same device as model
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    pred = torch.argmax(outputs.logits, dim=1).item()
    return "REAL" if pred == 1 else "FAKE"

In [59]:
sample_real = """
The Government of India on Monday announced a new education policy
aimed at improving access to higher education and digital learning.
Officials said the policy will be implemented nationwide next year.
"""

sample_fake = """
A shocking revelation has emerged claiming aliens helped create the
COVID-19 virus. Social media users are spreading unverified reports
without any scientific evidence.
"""

# Smoke-test the helper on one plausible and one implausible snippet,
# printing one verdict per line.
for sample in (sample_real, sample_fake):
    print(predict_news(sample))

REAL
FAKE


In [60]:
import re

import torch
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# NOTE(review): this cell rebinds `tokenizer`, `model` and `predict_news`.
# After it runs, the plain-string predict_news helper defined earlier in the
# notebook is replaced by the endpoint below, and `model` is the copy loaded
# from disk. Consider moving this cell to its own module.

# Initialize app
app = FastAPI(title="Fake News Detection API")

# Load tokenizer & model
MODEL_PATH = "fake_news_distilbert_modelep1"
# Token limit applied when the training set was tokenized; inference matches it.
MAX_LENGTH = 128

tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_PATH)
model = DistilBertForSequenceClassification.from_pretrained(MODEL_PATH)
model.eval()


def _normalize_text(text):
    """Mirror the clean_text steps that were applied to the training texts."""
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


# Request schema
class NewsRequest(BaseModel):
    # Raw article text to classify.
    text: str


# Prediction endpoint
@app.post("/predict")
def predict_news(news: NewsRequest):
    """Classify the submitted text and return {"prediction": "REAL" | "FAKE"}."""
    inputs = tokenizer(
        _normalize_text(news.text),  # same preprocessing as during training
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=MAX_LENGTH
    )
    # Keep the tensors on whatever device the model lives on.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    prediction = torch.argmax(outputs.logits, dim=1).item()

    return {
        "prediction": "REAL" if prediction == 1 else "FAKE"
    }

In [43]:
# Class balance as fractions rather than raw counts.
df["label"].value_counts(normalize=True)

label
0    0.522985
1    0.477015
Name: proportion, dtype: float64

In [51]:
# Whitespace-split word count of a short, headline-style sentence.
print(len("Government announces new education policy".split()))

5


In [52]:
# Distribution of whitespace-split word counts per cleaned article.
# NOTE(review): the median shown below (358 words) is well above the
# 128-token limit used in tokenize(), so most articles are truncated.
df["text"].str.split().str.len().describe()

count    44898.000000
mean       399.588690
std        346.856578
min          0.000000
25%        200.000000
50%        358.000000
75%        506.000000
max       8044.000000
Name: text, dtype: float64