# 📘 Fake News Detection Project - EDA + ML Model

In [None]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
# Step 2: Load Dataset
# Read the labelled news articles and discard every row that has a missing
# value in any column, so later cells can assume complete records.
df = pd.read_csv('data/fake_or_real_news.csv').dropna()

In [None]:
# Step 3: Exploratory Data Analysis (EDA)
# Textual overview: dataset dimensions, per-column dtypes, and class balance.
print("\nShape of Dataset:", df.shape)
print("\nData Types:\n", df.dtypes)
print("\nLabel Distribution:\n", df['label'].value_counts())

# Bar chart of how many articles carry each label.
label_ax = sns.countplot(x='label', data=df)
label_ax.set_title("Fake vs Real News Distribution")
plt.show()

# Whitespace-delimited word count per article, stored as a new column so the
# histogram below can break it down by label.
df['text_len'] = df['text'].map(lambda article: len(article.split()))
length_ax = sns.histplot(x='text_len', hue='label', bins=50, data=df)
length_ax.set_title("Word Count Distribution by Label")
plt.show()

In [None]:
# Step 4: Text Preprocessing
# Built once and reused for every article: previously the stemmer was
# re-created and the stop-word list re-read on each call, i.e. once per row.
_STEMMER = PorterStemmer()
_STOP_WORDS = frozenset(stopwords.words('english'))
# Translation table that deletes every character in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalise one article for vectorisation.

    Lower-cases the text, deletes the characters in ``string.punctuation``,
    splits on whitespace, drops English stop words and Porter-stems the
    remaining tokens.

    Args:
        text: Raw article text (a ``str``).

    Returns:
        The stemmed tokens joined with single spaces; an empty string when
        no token survives the filtering.
    """
    # NOTE(review): punctuation is removed before the stop-word check, so any
    # stop-list entry that itself contains punctuation (e.g. a contraction)
    # cannot match a token here. Left as-is to keep the features unchanged.
    tokens = text.lower().translate(_PUNCT_TABLE).split()
    return ' '.join(
        _STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS
    )


df['clean_text'] = df['text'].apply(clean_text)

In [None]:
# Step 5: Train-Test Split
# Features are the cleaned article texts; targets are their labels.
X, y = df['clean_text'], df['label']
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
splits = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = splits

In [None]:
# Step 6: TF-IDF + Logistic Regression Pipeline
# Two stages, both with library-default settings: turn each cleaned text into
# a TF-IDF vector, then classify it with logistic regression.
pipeline_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression()),
]
model = Pipeline(steps=pipeline_steps)
# Fit the vectorizer and the classifier together on the training portion only.
model.fit(X_train, y_train)

In [None]:
# Step 7: Evaluate the Model
# Predict labels for the held-out texts, then report each metric in turn.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# F1 is computed with 'REAL' treated as the positive class.
real_f1 = f1_score(y_test, y_pred, pos_label='REAL')
print("F1 Score:", real_f1)

matrix = confusion_matrix(y_test, y_pred)
print(matrix)

report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Step 8: Save the Model
from pathlib import Path

# Create the output directory first: opening a file in 'wb' mode does not
# create missing parent directories and would raise FileNotFoundError when
# 'models/' is absent.
model_path = Path('models/fake_news_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)

# Serialise the whole fitted pipeline (vectorizer + classifier) in one file.
# NOTE: only unpickle this file from a trusted location; loading a pickle can
# execute arbitrary code.
with model_path.open('wb') as f:
    pickle.dump(model, f)
print("\n✅ Model saved successfully as 'fake_news_model.pkl'")