In [1]:
import os
import re
import string
import pickle
import numpy as np
import pandas as pd
import torch
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
BATCH_SIZE = 16
MAX_LENGTH = 512
RANDOM_STATE = 42

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [4]:
true_df = pd.read_csv('Dataset/true.csv')
true_df['label'] = 1

In [5]:
false_df = pd.read_csv('Dataset/Fake.csv')
false_df['label'] = 0

In [6]:
df = pd.concat([true_df, false_df], ignore_index=True)
df = df.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
print("Data loaded. Label distribution:")
print(df['label'].value_counts())

Data loaded. Label distribution:
label
0    23481
1    21417
Name: count, dtype: int64


In [7]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\gujar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gujar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
stop_words = set(stopwords.words('english'))

In [10]:
def clean_text(text):
    text = re.sub(r'http\S+|www.\S+', '', text)
    text = re.sub(r'\d+', '', text)
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

In [11]:
df['clean_text'] = df['text'].apply(clean_text)

In [12]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

In [13]:
tfidf_features = tfidf_vectorizer.fit_transform(df['clean_text'])

In [14]:
print("TF-IDF feature shape:", tfidf_features.shape)

TF-IDF feature shape: (44898, 5000)


In [15]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [16]:
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [17]:
bert_model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [18]:
bert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [19]:
def get_batch_bert_embeddings(texts, batch_size=BATCH_SIZE):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i : i + batch_size]
        inputs = tokenizer(
            batch_texts,
            return_tensors='pt',
            truncation=True,
            max_length=MAX_LENGTH,
            padding=True
        )
        
        inputs = {key: value.to(device) for key, value in inputs.items()}
        with torch.no_grad():
            outputs = bert_model(**inputs)
        
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        embeddings.append(batch_embeddings)
        
        torch.cuda.empty_cache()
    return np.vstack(embeddings)

In [20]:
texts = df['clean_text'].tolist()

In [21]:
bert_features = get_batch_bert_embeddings(texts, batch_size=BATCH_SIZE)

In [22]:
print("BERT feature shape:", bert_features.shape)

BERT feature shape: (44898, 768)


In [23]:
tfidf_dense = tfidf_features.toarray()

In [24]:
combined_features = np.hstack([tfidf_dense, bert_features])

In [25]:
print("Combined feature shape:", combined_features.shape)

Combined feature shape: (44898, 5768)


In [26]:
X = combined_features
y = df['label'].values

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

In [28]:
clf = LogisticRegression(max_iter=1000)

In [29]:
clf.fit(X_train, y_train)

In [31]:
y_pred = clf.predict(X_test)

In [32]:
accuracy = accuracy_score(y_test, y_pred)

In [33]:
print(f"Test Accuracy: {accuracy * 100:.2f}%")

Test Accuracy: 99.41%


In [34]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4669
           1       0.99      0.99      0.99      4311

    accuracy                           0.99      8980
   macro avg       0.99      0.99      0.99      8980
weighted avg       0.99      0.99      0.99      8980



In [35]:
with open('fake_news_clf.pkl', 'wb') as f:
    pickle.dump(clf, f)
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf_vectorizer, f)

print("Model and TF-IDF vectorizer saved successfully.")

Model and TF-IDF vectorizer saved successfully.
