In [1]:
import pandas as pd
import re
import string
import torch
import joblib
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split




In [2]:
# --- Load Dataset ---
# Read both CSVs and tag every row with its class id in one step:
# rows from Fake.csv get class 0, rows from True.csv get class 1.
fake = pd.read_csv('Datasets/Fake.csv').assign(**{'class': 0})
true = pd.read_csv('Datasets/True.csv').assign(**{'class': 1})

In [3]:
# Combine and shuffle
# Stack both frames, shuffle with a fixed seed, build one `content` string per
# row (title + text + subject, missing values treated as empty strings), then
# drop the source columns and renumber the rows from 0.
# NOTE(review): `subject` is appended to the model input. If subject values
# differ systematically between Fake.csv and True.csv this leaks the label into
# the features -- confirm against the data before trusting validation scores.
df = (
    pd.concat([fake, true])
    .sample(frac=1, random_state=42)
    .assign(
        content=lambda frame: frame['title'].fillna('') + " " + frame['text'].fillna('') + " " + frame['subject'].fillna('')
    )
    .drop(columns=["title", "subject", "date", "text"])
    .reset_index(drop=True)
)

In [4]:
# --- Clean Text ---
def clean_text(text):
    """Normalize a raw article string into lowercase, space-separated words.

    Steps, in order:
      1. lowercase the text;
      2. remove square-bracketed spans such as ``[video]``;
      3. remove ``http(s)://`` and ``www.`` URLs;
      4. remove angle-bracketed spans such as HTML tags;
      5. remove every character in ``string.punctuation``;
      6. remove any token that contains a digit;
      7. collapse every whitespace run (spaces, tabs, newlines) into a single
         space and strip the ends.

    Args:
        text: the raw string to clean. Must be a ``str``.

    Returns:
        The cleaned string; empty if nothing survives the cleaning.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r"https?://\S+|www\.\S+", "", text)
    text = re.sub(r"<.*?>+", "", text)
    text = re.sub(r"[%s]" % re.escape(string.punctuation), "", text)
    # Newlines are deliberately NOT deleted here. Deleting them glued the last
    # word of one line to the first word of the next ("hello\nworld" ->
    # "helloworld"). The whitespace collapse below turns them into a single
    # space, which also matches how "\r\n" input was already treated.
    text = re.sub(r"\w*\d\w*", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [5]:
# Clean the combined `content` column into `text`, keep only the model input
# and its target, drop rows with missing values, and rename `class` to `label`.
df = (
    df.assign(text=df["content"].apply(clean_text))
    .loc[:, ["text", "class"]]
    .dropna()
    .rename(columns={"class": "label"})
)

In [6]:
# 2. Tokenizer and Dataset
# Fast DistilBERT tokenizer loaded from the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained(
    pretrained_model_name_or_path="distilbert-base-uncased",
)

In [7]:
class NewsDataset(Dataset):
    """Dataset of pre-tokenized texts paired with integer labels.

    All texts are tokenized once, eagerly, in ``__init__`` with truncation and
    padding enabled. Each item is a dict of tensors: one entry per key returned
    by the tokenizer, plus a ``"labels"`` entry.

    Args:
        texts: iterable of strings to tokenize.
        labels: iterable of class ids, one per text.
        tok: optional tokenizer callable invoked as
            ``tok(list_of_texts, truncation=True, padding=True, max_length=...)``.
            When omitted, the notebook-level ``tokenizer`` is used.
        max_length: truncation limit forwarded to the tokenizer (default 512).

    Raises:
        ValueError: if ``texts`` and ``labels`` have different lengths.
    """

    def __init__(self, texts, labels, tok=None, max_length=512):
        texts = list(texts)
        self.labels = list(labels)
        # Fail fast: a mismatch would otherwise surface later as an IndexError
        # or as texts that are never served by the dataset.
        if len(texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(self.labels)}"
            )
        if tok is None:
            # Fall back to the tokenizer defined at notebook level.
            tok = tokenizer
        self.encodings = tok(texts, truncation=True, padding=True, max_length=max_length)

    def __getitem__(self, idx):
        """Return the encoded fields and the label for row ``idx`` as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)


In [8]:
# Split and create Dataloaders
# 80/20 train/validation split. `random_state` is fixed (42, the same seed the
# shuffle uses) so the partition is identical on every run, and `stratify`
# keeps the label ratio the same in both parts.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)
train_dataset = NewsDataset(train_texts, train_labels)
val_dataset = NewsDataset(val_texts, val_labels)

# Only the training batches are shuffled; validation keeps dataset order.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)


In [9]:
from torch.optim import AdamW


In [10]:
# 3. Model Setup
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained DistilBERT with a 2-class sequence-classification head, moved to
# the chosen device before the optimizer is built from its parameters.
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)
model = model.to(device)
optimizer = AdamW(params=model.parameters(), lr=2e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from tqdm import tqdm

In [13]:
# 4. Training Loop (capped run: each epoch stops after its first 10 batches)
epochs = 3
model.train()
for epoch in range(epochs):
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for step, batch in enumerate(loop):
        # Stop the epoch once 10 batches (steps 0..9) have been processed.
        if step >= 10:
            break
        # Move every tensor in the batch to the model's device.
        batch = dict((name, tensor.to(device)) for name, tensor in batch.items())
        outputs = model(**batch)
        loss = outputs.loss
        # Backpropagate, apply the update, then clear the gradients so they
        # do not accumulate into the next step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Show the current batch loss next to the progress bar.
        loop.set_postfix(loss=loss.item())

Epoch 1:   0%|          | 10/4490 [06:38<49:32:53, 39.82s/it, loss=0.653]
Epoch 2:   0%|          | 10/4490 [07:16<54:18:25, 43.64s/it, loss=0.0381]
Epoch 3:   0%|          | 10/4490 [06:35<49:15:22, 39.58s/it, loss=0.0761]


In [None]:
# 4. Training Loop (full run: every batch of every epoch)
epochs = 3
model.train()
for epoch in range(epochs):
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
    for batch in loop:
        # Move every tensor in the batch to the model's device.
        batch = dict((name, tensor.to(device)) for name, tensor in batch.items())
        outputs = model(**batch)
        loss = outputs.loss
        # Backpropagate, apply the update, then clear the gradients so they
        # do not accumulate into the next step.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        # Show the current batch loss next to the progress bar.
        loop.set_postfix(loss=loss.item())

In [14]:
# 5. Save model + tokenizer
# Both artifacts are written to the same directory so they can be reloaded
# together from a single path.
model.save_pretrained(save_directory="./bert_fakenews")
tokenizer.save_pretrained(save_directory="./bert_fakenews")
print("✅ Model and tokenizer saved at './bert_fakenews'")

✅ Model and tokenizer saved at './bert_fakenews'


In [18]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Reload the fine-tuned DistilBERT model and its tokenizer from the directory
# they were saved to, then switch the model to evaluation mode.
tokenizer_bert = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="./bert_fakenews",
)
bert_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="./bert_fakenews",
)
bert_model.eval()



DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
