In [None]:
# Environment setup: install the Kaggle download helper, then fetch the dataset.
# NOTE(review): the package version is not pinned, so a rerun may pick up a newer release.
!pip install opendatasets --quiet
import opendatasets as od
# NOTE(review): presumably this prompts for Kaggle credentials and saves the CSVs under
# ./fake-and-real-news-dataset (later cells read them from /content/...) — confirm.
od.download('https://www.kaggle.com/datasets/clmentbisaillon/fake-and-real-news-dataset')

In [None]:
import torch
import pandas as pd
import os
import numpy as np

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device available: ", device)

# Load the Dataset

In [None]:
# od.download() saves the dataset in a folder under the current working directory,
# so resolve the CSVs relative to it instead of hard-coding Colab's /content prefix.
DATA_DIR = "fake-and-real-news-dataset"
true = pd.read_csv(os.path.join(DATA_DIR, "True.csv"))  # real articles
fake = pd.read_csv(os.path.join(DATA_DIR, "Fake.csv"))  # fabricated articles

In [None]:
true.head()

In [None]:
true["label"] = 1

In [None]:
true.head()

In [None]:
fake['label'] = 0

In [None]:
fake.head()

In [None]:
fake.drop(columns=["title","date","subject"],inplace=True)
true.drop(columns=["title","date","subject"],inplace=True)

In [None]:
true.head()

In [None]:
fake.head()

In [None]:
# Stack the fake (label 0) and real (label 1) articles into one frame with a fresh 0..n-1 index.
data=pd.concat([fake,true],ignore_index=True)

In [None]:
data.head()

In [None]:
data.info()

In [None]:
# Missing values per column.
data.isnull().sum()

In [None]:
# Number of fully duplicated rows before de-duplication.
data.duplicated().sum()

In [None]:
# Drop repeated rows in place; surviving rows keep their original index labels,
# so the index is no longer contiguous after this cell.
data.drop_duplicates(inplace=True)

In [None]:
# Re-check the duplicate count after the drop above.
data.duplicated().sum()

In [None]:
# Class balance after de-duplication.
data['label'].value_counts()

# Pre-Process

In [None]:
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
stop_words = set(stopwords.words('english'))
context_words = {'not', 'no', 'nor', 'never', 'very', 'against', 'without', 'hardly', 'barely'}
custom_stopwords = stop_words - context_words

In [None]:
lemm = nltk.stem.WordNetLemmatizer()

In [None]:
def clean(text):
  """Normalize one article into a space-separated string of lemmatized tokens.

  Steps, in order: lowercase, turn hyphens into spaces, strip every character
  that is neither a word character nor whitespace, strip digit runs, collapse
  whitespace, drop tokens found in `custom_stopwords`, and lemmatize the
  remaining tokens as verbs with the module-level `lemm`.

  Args:
    text: raw article text (a str).

  Returns:
    The cleaned text; an empty string when no token survives.
  """
  normalized = text.lower().replace('-', ' ')
  # Strip punctuation/special characters first, then digit runs.
  for pattern in (r'[^\w\s]', r'\d+'):
    normalized = re.sub(pattern, '', normalized)
  # Collapse whitespace runs to single spaces and trim both ends.
  normalized = re.sub(r'\s+', ' ', normalized).strip()

  kept = []
  for word in normalized.split():
    if word in custom_stopwords:
      continue
    kept.append(lemm.lemmatize(word, pos='v'))

  return ' '.join(kept)

In [None]:
data['cleaned'] = data['text'].apply(lambda x: clean(x))

In [None]:
data.head()

In [None]:
max_len = 0
for text in data['cleaned']:
  length = len(text.split(' '))
  if length>max_len:
    max_len = length
print(max_len)

# Split

In [None]:
x = data['cleaned'].values  # numpy
y = data['label'].values

In [None]:
y[:5]

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.5)

In [None]:
x_train.shape, x_val.shape, x_test.shape

In [None]:
y_train[:5]

# Encoding

In [None]:
from torch.utils.data import Dataset, DataLoader

In [None]:
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer and the encoder from the same checkpoint so token ids line up.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)  # each token's embedding vector is of size 768
bert_model = AutoModel.from_pretrained(checkpoint)

In [None]:
class dataset(Dataset):
  """Pre-tokenized text classification dataset.

  Every text is tokenized once, at construction time, with the module-level
  `tokenizer`, and both encodings and labels are moved to the module-level
  `device`. Because each text is tokenized on its own with
  `return_tensors='pt'`, every stored tensor keeps a leading dimension of 1;
  consumers remove it after batching (the training loop uses `squeeze(1)`).
  """

  def __init__(self,X,y,max_length=300):
    """Tokenize all texts and store them next to their labels.

    Args:
      X: iterable of text strings.
      y: integer class labels, one per text (converted to a long tensor).
      max_length: number of tokens every text is padded or truncated to.
        Defaults to 300, the value that used to be hard-coded here.
    """
    self.max_length = max_length
    self.X = [
        tokenizer(text,
                  max_length=max_length,
                  truncation=True,
                  padding='max_length',
                  return_tensors='pt').to(device)
        for text in X
    ]
    # CrossEntropyLoss expects class indices as int64, hence torch.long.
    self.y = torch.tensor(y,dtype=torch.long).to(device)

  def __len__(self):
    """Number of samples."""
    return len(self.X)

  def __getitem__(self,idx):
    """Return the `(encoding, label)` pair stored at position `idx`."""
    return  self.X[idx],self.y[idx]

In [None]:
training_data = dataset(x_train, y_train)
validation_data = dataset(x_val, y_val)
testing_data = dataset(x_test, y_test)

In [None]:
BATCH_SIZE = 32
EPOCHS = 20
LR = 1e-4

In [None]:
train_dataloader = DataLoader(training_data, batch_size=BATCH_SIZE, shuffle= True)
validation_dataloader = DataLoader(validation_data, batch_size=BATCH_SIZE, shuffle= False)
testing_dataloader = DataLoader(testing_data, batch_size=BATCH_SIZE, shuffle= False)

# Model

In [None]:
import torch.nn as nn

In [None]:
class Model(nn.Module):
  """Encoder + GRU + linear head producing 2-class logits.

  The encoder's first-position ([CLS]) hidden state is fed to a 2-layer GRU as
  a sequence of length one; the GRU output goes through dropout and a linear
  layer that emits one logit per class.

  NOTE(review): because the GRU only ever sees a single time step, it does not
  model the token sequence — confirm this is intended.
  """

  def __init__(self,bert):
    """
    Args:
      bert: encoder module; called as `bert(input_ids, attention_mask,
        return_dict=False)`, and element 0 of its result is indexed as
        (batch, position, 768).
    """
    super().__init__()

    self.bert = bert
    self.gru1 = nn.GRU(
        input_size=768,
        hidden_size=128,
        num_layers=2,
        dropout=0.3,
        batch_first=True,
    )
    self.dropout = nn.Dropout(0.25)
    self.linear1 = nn.Linear(128,2)

  def forward(self,input_ids,attention_mask):
    """Return logits of shape (batch, 2) for a batch of token ids and masks."""
    token_states = self.bert(input_ids,attention_mask,return_dict=False)[0]
    cls_state = token_states[:,0]        # hidden state at position 0 of every sample
    gru_input = cls_state.unsqueeze(1)   # add the sequence-length dimension the GRU expects
    gru_states, _ = self.gru1(gru_input)
    last_state = gru_states[:, -1, :]
    return self.linear1(self.dropout(last_state))

In [None]:
for param in bert_model.parameters():
    param.requires_grad = False

In [None]:
model = Model(bert_model).to(device)

In [None]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW([
    {"params": model.bert.parameters(), "lr": 1e-5},   # BERT (low LR)
    {"params": model.gru1.parameters(), "lr": 1e-3},    # GRU (higher LR)
    {"params": model.linear1.parameters(), "lr": 1e-3},     # Linear layer
])

# Training

In [None]:
EPOCHS = 5

In [None]:
for epoch in range(EPOCHS):

    train_loss = 0
    val_loss = 0
    train_acc = 0
    val_acc = 0
    correct_pred = 0
    total_pred = 0

    model.train()
    for batch in train_dataloader:
      input,label = batch
      input = input.to(device)
      label = label.to(device)

      optimizer.zero_grad() #reset gradient
      output = model(input['input_ids'].squeeze(1),input['attention_mask'].squeeze(1))
      loss = criterion(output,label)
      train_loss += loss.item()
      loss.backward() #gradient
      optimizer.step() #update wts

      #accuracy
      pred = torch.argmax(output,dim=1)
      correct_pred += (pred==label).sum().item()
      total_pred += len(label)

    train_loss = train_loss/len(train_dataloader)  # avg loss per batch
    train_acc = correct_pred/total_pred
    model.eval()

    with torch.no_grad():
      for input,label in validation_dataloader:
        input = input.to(device)
        label = label.to(device)

        output = model(input['input_ids'].squeeze(1),input['attention_mask'].squeeze(1))
        loss = criterion(output,label)
        val_loss += loss.item()

        # accuracy
        pred = torch.argmax(output,dim=1)
        correct_pred += (pred==label).sum().item()
        total_pred += len(label)

      val_loss = val_loss/len(validation_dataloader)
      val_acc = correct_pred/total_pred

    print(f"Epoch: {epoch+1}/{EPOCHS},Training Loss: {train_loss:.4f},Validation Loss: {val_loss:.4f},Training Accuracy: {train_acc*100:.2f},Validation Accuracy: {val_acc*100:.2f}" )
    print('-'*125)


# Testing

In [None]:
# Evaluate on the held-out test split.
# model.eval() is called explicitly so dropout is disabled regardless of which
# cell ran last; previously this relied on the training loop leaving the model
# in evaluation mode.
model.eval()
test_loss = 0.0
test_acc = 0
correct_pred = 0
total_pred = 0

with torch.no_grad():
    for inputs, labels in testing_dataloader:
        inputs = inputs.to(device)
        labels = labels.to(device)

        # squeeze(1) drops the singleton dimension that per-sample tokenization
        # leaves in each batched tensor.
        logits = model(inputs['input_ids'].squeeze(1), inputs['attention_mask'].squeeze(1))
        test_loss += criterion(logits, labels).item()

        # accuracy
        correct_pred += (torch.argmax(logits, dim=1) == labels).sum().item()
        total_pred += len(labels)

test_loss = test_loss/len(testing_dataloader)  # average loss per batch
test_acc = correct_pred/total_pred

print(f"Testing Loss is : {test_loss:.4f} and Testing Accuracy is {test_acc*100:.2f}")

# Inference

In [None]:
def predict_text(text, model, tokenizer, device, max_length=300):
    """Classify a single piece of text and return the predicted class index.

    Args:
        text: the text to classify.
        model: callable module taking `(input_ids, attention_mask)` and
            returning logits of shape (1, num_classes); it is switched to
            evaluation mode by this function.
        tokenizer: callable returning a mapping with 'input_ids' and
            'attention_mask' tensors when called with `return_tensors='pt'`.
        device: device the encoded tensors are moved to before the forward pass.
        max_length: number of tokens the text is padded or truncated to.
            Defaults to 300 to match the length used when the training data was
            encoded; the previous hard-coded 128 fed the model inputs of a
            different length than it was trained on.

    Returns:
        The index of the highest-scoring class, as a Python int.
    """
    model.eval()

    # tokenize with the same padding/truncation settings as the training data
    encoded = tokenizer(
        text,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )

    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    # forward pass without gradient tracking
    with torch.no_grad():
        logits = model(input_ids, attention_mask)

    # highest logit wins
    return torch.argmax(logits, dim=1).item()


In [None]:
# Class index -> human-readable verdict (1 was assigned to real articles, 0 to fake ones).
label_map = {1: "REAL", 0: "FAKE"}

In [None]:
text = "Scientists at the North Atlantic Research Institute announced a shocking breakthrough yesterday, claiming they have discovered a massive underwater city beneath the Arctic ice. According to the team, the structures appear far older than any known human civilization and contain advanced metallic inscriptions that remain undeciphered. The researchers stated the city emits a faint electromagnetic field, suggesting unknown technology may still be functioning. Government officials reportedly sealed off the area and restricted satellite imagery within hours of the announcement. Critics argue the claims lack peer-reviewed evidence, but speculation online has exploded, with some suggesting it could rewrite human history entirely."
prediction = predict_text(text, model, tokenizer, device)
print("Prediction:", label_map[prediction])

In [None]:
text = "The Ministry of Education announced on Tuesday that it will increase funding for public schools by 8 percent starting next year, focusing on expanding digital learning resources and upgrading classroom technology. Officials stated that the budget will prioritize rural districts where access to devices and high-speed internet remains limited. The initiative also includes professional training programs to help teachers integrate new tools into their lessons. According to the ministry, the policy aims to reduce educational gaps highlighted during the pandemic and ensure equal learning opportunities for all students. Local school administrators welcomed the decision, calling it an important step forward."
prediction = predict_text(text, model, tokenizer, device)
print("Prediction:", label_map[prediction])