In [257]:
import pandas as pd
import numpy as np
import re 
import torch 
import torch.nn as nn

from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight

from transformers import(
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification, 
    Trainer, 
    TrainingArguments
)

from datasets import Dataset

In [258]:
df = pd.read_parquet('dataset/news_dataset.parquet')
df

Unnamed: 0,date,category,country,country_code,text,source,hash
0,2026-01-18,general,India,in,"Women outnumber men in 151-member NMC House, b...",times of india,02d4db3abf9d3ae52e304b44c5079248bd7b88c6cfdb41...
1,2026-01-18,general,India,in,Punjab: 227 drug smugglers arrested in statewi...,hindustan times,aee700b752c3ad413321b7b5d07ccfe608553bff074b39...
2,2026-01-18,general,India,in,Sat calls for strong public support to realize...,daily excelsior,15b8454b537eda0484985c579d35e2dd2a2f8b2e5b88bc...
3,2026-01-18,general,India,in,NC‚Äôs agenda to place J&K on path of developmen...,daily excelsior,4d8771bfbfb094d99a6c4bec7c017a552fd82aed4ad535...
4,2026-01-18,general,India,in,"DIG, SSP decorate newly promoted Selection Gra...",daily excelsior,b9f79908f6c92672b0363fdb850d45ff2d180e9b68b168...
...,...,...,...,...,...,...,...
1554,2026-01-20,health,Philippines,ph,2.8M Mindanao kids eyed for measles vaccine. C...,sunstar philippines,37876200e2e94b0ef26efccc01eb64a0a235e592b01cdb...
1555,2026-01-20,health,Philippines,ph,Sharjah Achieves Historic WHO Age-Friendly Sta...,the manila times,f008c780b764cc658427a18535a761c2b8b0ec06d244c5...
1556,2026-01-20,health,Philippines,ph,iCare HMO transitions to independent operation...,manilastandard.net,606a1e8dfd390b93e01acf8a64ceb047301597f46d642e...
1557,2026-01-20,health,Philippines,ph,CNP Global Reignites the K-Kids Red Ginseng Be...,the manila times,2438543359263ddc1f9052e9c6d1b00f3fb2a164c0657b...


In [259]:
df.shape

(1559, 7)

In [260]:
df["source"].value_counts()/100*100

source
the star             89.0
dhaka tribune        89.0
the manila times     72.0
marketscreener       72.0
the straits times    60.0
                     ... 
the herald            1.0
sky news              1.0
toronto star          1.0
derby telegraph       1.0
daily echo            1.0
Name: count, Length: 245, dtype: float64

In [261]:
len(df["source"].unique())

245

In [262]:
SUSPICIOUS_KEYWORDS = [
    "shocking", "breaking", "exposed",
    "you won't believe", "miracle",
    "secret", "banned", "truth revealed", 
    "aliens", "miracle cure", "5g causes",
    "secret government", "illuminati",
    "banned by media", "truth they hide",
    "hoax exposed", "fake vaccine"
]

def is_strong_fake(text):
    text = text.lower()
    return any(k in text for k in SUSPICIOUS_KEYWORDS)

In [263]:
def assign_label(source, text):
    source = str(source).lower().strip()

    if is_strong_fake(text):
        return 0

    return 1  # drop

In [264]:
df["label"] = df.apply(
    lambda row: assign_label(row["source"], row["text"]),
    axis=1
)

In [265]:
df["label"].value_counts()

label
1    1486
0      73
Name: count, dtype: int64

In [266]:
df["category"].value_counts()

category
world            223
business         191
technology       181
general          179
health           170
nation           165
sports           159
entertainment    147
science          144
Name: count, dtype: int64

In [267]:
df.head(2)

Unnamed: 0,date,category,country,country_code,text,source,hash,label
0,2026-01-18,general,India,in,"Women outnumber men in 151-member NMC House, b...",times of india,02d4db3abf9d3ae52e304b44c5079248bd7b88c6cfdb41...,1
1,2026-01-18,general,India,in,Punjab: 227 drug smugglers arrested in statewi...,hindustan times,aee700b752c3ad413321b7b5d07ccfe608553bff074b39...,1


In [268]:
df_real = df[df["label"]== 1].reset_index(drop= True)

In [269]:
df_real

Unnamed: 0,date,category,country,country_code,text,source,hash,label
0,2026-01-18,general,India,in,"Women outnumber men in 151-member NMC House, b...",times of india,02d4db3abf9d3ae52e304b44c5079248bd7b88c6cfdb41...,1
1,2026-01-18,general,India,in,Punjab: 227 drug smugglers arrested in statewi...,hindustan times,aee700b752c3ad413321b7b5d07ccfe608553bff074b39...,1
2,2026-01-18,general,India,in,"DIG, SSP decorate newly promoted Selection Gra...",daily excelsior,b9f79908f6c92672b0363fdb850d45ff2d180e9b68b168...,1
3,2026-01-18,general,India,in,DGCA slaps IndiGo with fine of Rs 22 crore for...,times of india,ec133cd208c1b40e18bf2c0a89052654c5325dce5b6cb0...,1
4,2026-01-18,general,India,in,Cop dragged on moving SUV in Hisar; two arrest...,times of india,dbb04a9d42517b171b3eafc7d9ea7f73cb2ef18430b96c...,1
...,...,...,...,...,...,...,...,...
1481,2026-01-20,health,Philippines,ph,2.8M Mindanao kids eyed for measles vaccine. C...,sunstar philippines,37876200e2e94b0ef26efccc01eb64a0a235e592b01cdb...,1
1482,2026-01-20,health,Philippines,ph,Sharjah Achieves Historic WHO Age-Friendly Sta...,the manila times,f008c780b764cc658427a18535a761c2b8b0ec06d244c5...,1
1483,2026-01-20,health,Philippines,ph,iCare HMO transitions to independent operation...,manilastandard.net,606a1e8dfd390b93e01acf8a64ceb047301597f46d642e...,1
1484,2026-01-20,health,Philippines,ph,CNP Global Reignites the K-Kids Red Ginseng Be...,the manila times,2438543359263ddc1f9052e9c6d1b00f3fb2a164c0657b...,1


In [270]:
df_fake = df[df["label"]== 0].reset_index(drop= True)

In [271]:
df_fake

Unnamed: 0,date,category,country,country_code,text,source,hash,label
0,2026-01-18,general,India,in,Sat calls for strong public support to realize...,daily excelsior,15b8454b537eda0484985c579d35e2dd2a2f8b2e5b88bc...,0
1,2026-01-18,general,India,in,NC‚Äôs agenda to place J&K on path of developmen...,daily excelsior,4d8771bfbfb094d99a6c4bec7c017a552fd82aed4ad535...,0
2,2026-01-18,general,India,in,Marg Darshan corner stone for rehabilitation o...,daily excelsior,7a98f75483d2690966132b9a6bd0a8417f003c2cd07956...,0
3,2026-01-18,general,India,in,"EPS announces free bus travel for men, ‚Çπ2,000 ...",times of india,f49ba7b6a9aa97571c0d8195d4cc29ac1ffc1cea333dda...,0
4,2026-01-18,general,India,in,CAG urges EDC to overhaul loan management afte...,times of india,9bfb6abb4d31473646afffacb06090acaa011551380564...,0
...,...,...,...,...,...,...,...,...
68,2026-01-20,general,Malaysia,my,"‚ÄúNo Team A, B or C, but only Team DAP,‚Äù Loke d...",the star,eebf5b423c9ec3b6e04f0f4dff77f87838c737db8f514a...,0
69,2026-01-20,technology,Malaysia,my,Philippines seeks to block access to Grok on c...,the star,16676a24f7b7ad365a6a598df7149a1b3b38273db28a7c...,0
70,2026-01-20,health,Malaysia,my,Record-breaking Hong Kong powerlifter and diet...,the star,3c676e6da95c94075516adcd84311d95e670074e1ece1e...,0
71,2026-01-20,general,Philippines,ph,Secretary-General on UN at 80: Humanity strong...,the manila times,01298639921f5cc13a56d74aabd771982c153ea616232f...,0


In [272]:
# Introducing news from a fake_dataset

df_fake_news = pd.read_parquet("dataset/fake_news.parquet")

In [273]:
df_fake_news.shape

(33405, 5)

In [274]:
df_fake_news.head(1)

Unnamed: 0,source,text,label,title,url
0,unknown,Donald Trump just couldn t wish all Americans ...,0,Donald Trump Sends Out Embarrassing New Year‚Äô...,


In [275]:
df_fake_sampled = df_fake_news.sample(
    n=2 * len(df_real),
    random_state=42
)

In [276]:
df_fake_sampled.head(2)

Unnamed: 0,source,text,label,title,url
26240,unknown,says its a statistical reality that most singl...,0,says its a statistical reality that most singl...,
7831,unknown,Imagining what Donald Trump would be like as t...,0,Bill Maher Acts Out Trump SOTU Address And PE...,


In [277]:
df_fake_sampled = df_fake_sampled[["text", "label"]]
df_fake = df_fake[["text", "label"]]
df_real = df_real[["text", "label"]]

In [278]:
df_balanced = pd.concat(
    [df_real, df_fake_sampled, df_fake],
    ignore_index=True
)

In [279]:
df_balanced.head(2)

Unnamed: 0,text,label
0,"Women outnumber men in 151-member NMC House, b...",1
1,Punjab: 227 drug smugglers arrested in statewi...,1


In [280]:
df_balanced.shape

(4531, 2)

In [281]:
df_balanced["label"].value_counts()

label
0    3045
1    1486
Name: count, dtype: int64

In [282]:
df = df_balanced.copy()

In [283]:
# Text cleaning
def clean_text(text): 
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text) 
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

df["text"] = df["text"].apply(clean_text)

In [284]:
X_train, X_test, y_train, y_test = train_test_split(
    df["text"], 
    df["label"], 
    test_size= 0.2, 
    random_state= 42, 
    stratify= df["label"]
)

In [285]:
# Convert to Huggingface dataset

train_df = pd.DataFrame({"text": X_train, "label": y_train})
test_df = pd.DataFrame({"text": X_test, "label": y_test})

train_dataset = Dataset.from_pandas(train_df) 
test_dataset = Dataset.from_pandas(test_df)

In [286]:
# Load DistilBERT tokenizer 

tokenizer = DistilBertTokenizerFast.from_pretrained(
    "distilbert-base-uncased"
)

'(ReadTimeoutError("HTTPSConnectionPool(host='huggingface.co', port=443): Read timed out. (read timeout=10)"), '(Request ID: f8e07825-051f-445e-be52-8fd807af2426)')' thrown while requesting HEAD https://huggingface.co/distilbert-base-uncased/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].


In [287]:
# Tokenization Function

def tokenize(batch): 
    return tokenizer(
        batch["text"], 
        padding = "max_length", 
        truncation = True, 
        max_length = 128
    )

train_dataset = train_dataset.map(tokenize, batched= True, batch_size= 512)
test_dataset = test_dataset.map(tokenize, batched= True, batch_size= 512) 

train_dataset.set_format(
    type= "torch", 
    columns= ["input_ids", "attention_mask", "label"]
)
test_dataset.set_format(
    type= "torch", 
    columns= ["input_ids", "attention_mask", "label"]
)

Map:   0%|          | 0/3624 [00:00<?, ? examples/s]

Map:   0%|          | 0/907 [00:00<?, ? examples/s]

In [288]:
# Load DistilBERT Model 

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels= 2
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [289]:
import torch

print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

True
NVIDIA GeForce GTX 1050 Ti


In [290]:
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),
    y=df_balanced["label"].values
)

class_weights = torch.tensor(class_weights, dtype=torch.float)

In [291]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        loss_fn = nn.CrossEntropyLoss(
            weight=class_weights.to(logits.device)
        )
        loss = loss_fn(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [292]:
# Evaluation Metrics 
def compute_metrics(eval_pred): 
    logits, labels = eval_pred
    preds = np.argmax(logits, axis= 1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average= "macro"
    )
    acc = accuracy_score(labels, preds)

    return{
        "accuracy": acc, 
        "precision": precision, 
        "recall": recall, 
        "f1": f1
    }

In [293]:
# Training Arguments

training_args = TrainingArguments(
    output_dir= "./results", 
    eval_strategy= "epoch", 
    save_strategy= "epoch", 
    learning_rate= 2e-5, 
    per_device_train_batch_size= 16, 
    per_device_eval_batch_size= 16, 
    num_train_epochs= 2, 
    weight_decay= 0.01, 
    fp16= True,
    dataloader_num_workers= 3,
    dataloader_pin_memory= True,
    logging_dir= "./logs", 
    logging_steps= 500, 
    load_best_model_at_end= True, 
    metric_for_best_model= "f1"
)

In [294]:
# Trainer Setup 

trainer = WeightedTrainer(
    model= model, 
    args= training_args, 
    train_dataset= train_dataset, 
    eval_dataset= test_dataset, 
    processing_class= tokenizer, 
    compute_metrics= compute_metrics
)

In [295]:
# Train model 

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.108639,0.969129,0.958264,0.974458,0.965587
2,No log,0.111264,0.970232,0.959717,0.975277,0.966789


TrainOutput(global_step=454, training_loss=0.14355957770662686, metrics={'train_runtime': 626.256, 'train_samples_per_second': 11.574, 'train_steps_per_second': 0.725, 'total_flos': 240030926364672.0, 'train_loss': 0.14355957770662686, 'epoch': 2.0})

In [303]:
results = trainer.evaluate()
print("\nEvaluation results:")
print(results)


Evaluation results:
{'eval_loss': 0.11126358807086945, 'eval_accuracy': 0.9702315325248071, 'eval_precision': 0.9597174616394912, 'eval_recall': 0.9752773638019541, 'eval_f1': 0.966789397668789, 'eval_runtime': 85.8518, 'eval_samples_per_second': 10.565, 'eval_steps_per_second': 0.664, 'epoch': 2.0}


In [297]:
model.save_pretrained("fake_news_distilbert_modelep1")
tokenizer.save_pretrained("fake_news_distilbert_modelep1")

('fake_news_distilbert_modelep1\\tokenizer_config.json',
 'fake_news_distilbert_modelep1\\special_tokens_map.json',
 'fake_news_distilbert_modelep1\\vocab.txt',
 'fake_news_distilbert_modelep1\\added_tokens.json',
 'fake_news_distilbert_modelep1\\tokenizer.json')

In [298]:
device = model.device
device

device(type='cuda', index=0)

In [304]:
def predict_news(text: str, threshold: float = 0.9):
    """
    Predicts ONLY REAL or FAKE.
    Uses confidence thresholding with forced fallback.
    """

    model.eval()
    device = next(model.parameters()).device  # safer than model.device

    # Tokenize input
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=256
    )

    # Move tensors to model device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over logits
    probs = torch.softmax(outputs.logits, dim=1).squeeze(0)

    # IMPORTANT: label mapping
    # 0 -> FAKE
    # 1 -> REAL
    prob_fake = probs[0].item()
    prob_real = probs[1].item()

    # High-confidence decision
    if prob_fake >= threshold:
        label = "FAKE"
        confidence = prob_fake

    elif prob_real >= threshold:
        label = "REAL"
        confidence = prob_real

    else:
        # Forced fallback (still ONLY 2 labels)
        if prob_real >= prob_fake:
            label = "REAL"
            confidence = prob_real
        else:
            label = "FAKE"
            confidence = prob_fake

    return {
        "prediction": label,
        "confidence": round(confidence, 4),
        "prob_fake": round(prob_fake, 4),
        "prob_real": round(prob_real, 4)
    }

In [308]:
sample_real = """
The Government of India on Monday announced a new education policy
aimed at improving access to higher education and digital learning.
Officials said the policy will be implemented nationwide next year.
"""

sample_fake = """
A shocking revelation has emerged claiming aliens helped create the
COVID-19 virus. Social media users are spreading unverified reports
without any scientific evidence.
"""

print(predict_news(sample_real))
print(predict_news(sample_fake))

{'prediction': 'REAL', 'confidence': 0.7736, 'prob_fake': 0.2264, 'prob_real': 0.7736}
{'prediction': 'FAKE', 'confidence': 0.991, 'prob_fake': 0.991, 'prob_real': 0.009}
