In [1]:
# Fake News Detection Model Training
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import joblib

In [2]:
# Fetch the NLTK resources this notebook relies on: the English stop-word
# list and the WordNet data used by WordNetLemmatizer.
nltk.download(info_or_id='stopwords')
nltk.download(info_or_id='wordnet')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pranj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pranj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load dataset
# Both CSV files are read from the current working directory, True.csv first.
true_df, fake_df = (pd.read_csv(csv_name) for csv_name in ('True.csv', 'Fake.csv'))

In [4]:
# Label and combine
# Rows from True.csv get label 0 and rows from Fake.csv get label 1; the
# column is added to the source frames in place.
for frame, label_value in ((true_df, 0), (fake_df, 1)):
    frame['label'] = label_value

# Stack the two frames and shuffle every row (frac=1.0) with a fixed seed so
# the order is reproducible. The original per-file index labels are kept.
df = pd.concat((true_df, fake_df)).sample(frac=1.0, random_state=42)

In [5]:
# Text preprocessing
# Module-level helpers read by preprocess_text(); built once here rather than
# once per document.
stop_words = set(stopwords.words('english'))  # set gives O(1) membership tests
lemmatizer = WordNetLemmatizer()


In [6]:
def preprocess_text(text):
    """Normalize one raw document into a space-joined string of lemmas.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is deleted, the remainder is split on whitespace, tokens found
    in the module-level ``stop_words`` set are dropped, and the surviving
    tokens are lemmatized with the module-level ``lemmatizer``.

    Args:
        text: The raw document. Non-string values (for example ``None`` or the
            float NaN that pandas yields for an empty CSV cell) are treated as
            an empty document instead of raising ``AttributeError``.

    Returns:
        str: The processed tokens joined by single spaces, or ``''`` when the
        input is not a string or no token survives the filtering.
    """
    # Missing values have no .lower(); map them to an empty document so a
    # single blank cell cannot abort a whole-column apply().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Characters are deleted, not replaced by a space, so punctuation-joined
    # words are merged (e.g. "well-known" -> "wellknown").
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    words = text.split()
    # Stop words are filtered before lemmatization, i.e. on the surface form.
    words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(words)

In [7]:
# Run preprocess_text over every value of the 'text' column and keep the
# result in a new column next to the raw text.
raw_text = df['text']
df['processed_text'] = raw_text.apply(preprocess_text)

In [8]:
# Train-test split
# 30% of the rows are held out for testing; the fixed random_state makes the
# partition reproducible across runs.
features = df['processed_text']
labels = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=42
)

In [9]:
# Vectorization
# Unigrams and bigrams, capped at 5000 features.
tfidf_options = dict(max_features=5000, ngram_range=(1, 2))
vectorizer = TfidfVectorizer(**tfidf_options)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [10]:
# Model training
# fit() returns the estimator itself, so construction and training are chained.
model = MultinomialNB().fit(X_train_vec, y_train)
# Bare expression so the cell still displays the fitted estimator.
model

In [11]:
# Evaluation
# Predict once and reuse the labels for both metrics. Calling model.score()
# and then model.predict() would run inference over the test matrix twice.
y_pred = model.predict(X_test_vec)

# Accuracy is the fraction of predictions equal to the true label, which is
# what model.score() reports for a classifier. float() keeps the printed
# value a plain Python number.
test_accuracy = float(np.mean(y_pred == np.asarray(y_test)))

print("Test Accuracy:", test_accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Test Accuracy: 0.9407572383073497

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      6448
           1       0.94      0.94      0.94      7022

    accuracy                           0.94     13470
   macro avg       0.94      0.94      0.94     13470
weighted avg       0.94      0.94      0.94     13470



In [12]:
# Save model and vectorizer
# Both objects are written to the current working directory. The fitted
# vectorizer is saved alongside the classifier because the model was trained
# on the features this vectorizer produces.
joblib.dump(value=model, filename='fake_news_model.pkl')
joblib.dump(value=vectorizer, filename='tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']