In [1]:
# Fake News Detection – Baseline Models

In [2]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [3]:
 #Load Data
try:
    train_df = pd.read_csv("../data/processed/train_clean.tsv", sep="\t")
    valid_df = pd.read_csv("../data/processed/valid_clean.tsv", sep="\t")
    test_df  = pd.read_csv("../data/processed/test_clean.tsv", sep="\t")
except:
    # Fallback: load raw and assume "clean_statement" already exists or fallback to "statement"
    train_df = pd.read_csv("../data/raw/train.tsv", sep="\t")
    valid_df = pd.read_csv("../data/raw/valid.tsv", sep="\t")
    test_df  = pd.read_csv("../data/raw/test.tsv", sep="\t")
    if "clean_statement" not in train_df.columns:
        train_df["clean_statement"] = train_df["statement"]
        valid_df["clean_statement"] = valid_df["statement"]
        test_df["clean_statement"]  = test_df["statement"]

print("Train shape:", train_df.shape)
print("Labels:", train_df['label'].unique())
train_df.head(3)


KeyError: 'statement'

In [None]:
#Split Variables
X_train = train_df['clean_statement']
y_train = train_df['label']

X_valid = valid_df['clean_statement']
y_valid = valid_df['label']

X_test  = test_df['clean_statement']
y_test  = test_df['label']

print("Training Samples:", len(X_train))
print("Validation Samples:", len(X_valid))
print("Test Samples:", len(X_test))


In [None]:
#TF-IDF Vectorization
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words="english")

X_train_tfidf = tfidf.fit_transform(X_train)
X_valid_tfidf = tfidf.transform(X_valid)
X_test_tfidf  = tfidf.transform(X_test)

print("TF-IDF shape (Train):", X_train_tfidf.shape)


In [None]:
 #Logistic Regression 
log_reg = LogisticRegression(max_iter=1000, class_weight="balanced", random_state=42)

param_grid = {
    "C": [0.1, 1, 10]
}

log_reg_cv = GridSearchCV(log_reg, param_grid, cv=5, scoring="f1_weighted", n_jobs=-1)
log_reg_cv.fit(X_train_tfidf, y_train)

print("Best Logistic Regression Parameters:", log_reg_cv.best_params_)


In [None]:
#Logistic Regression Evaluation
y_pred_valid = log_reg_cv.predict(X_valid_tfidf)

print("Logistic Regression Validation Results:")
print(classification_report(y_valid, y_pred_valid))

cm = confusion_matrix(y_valid, y_pred_valid, labels=log_reg_cv.classes_)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=log_reg_cv.classes_, yticklabels=log_reg_cv.classes_)
plt.title("Confusion Matrix - Logistic Regression")
plt.savefig("../results/confusion_logistic.png")
plt.show()


In [None]:
#Logistic Regression Evaluation
y_pred_valid = log_reg_cv.predict(X_valid_tfidf)

print("Logistic Regression Validation Results:")
print(classification_report(y_valid, y_pred_valid))

cm = confusion_matrix(y_valid, y_pred_valid, labels=log_reg_cv.classes_)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=log_reg_cv.classes_, yticklabels=log_reg_cv.classes_)
plt.title("Confusion Matrix - Logistic Regression")
plt.savefig("../results/confusion_logistic.png")
plt.show()


In [None]:
#Random Forest Evaluation
y_pred_valid_rf = rf_cv.predict(X_valid_tfidf)

print("Random Forest Validation Results:")
print(classification_report(y_valid, y_pred_valid_rf))

cm = confusion_matrix(y_valid, y_pred_valid_rf, labels=rf_cv.classes_)
plt.figure(figsize=(8,6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Greens", xticklabels=rf_cv.classes_, yticklabels=rf_cv.classes_)
plt.title("Confusion Matrix - Random Forest")
plt.savefig("../results/confusion_rf.png")
plt.show()

In [None]:
#Test Evaluation (
print("===== Final Evaluation on Test Set =====")

print("\n[Logistic Regression]")
y_pred_test = log_reg_cv.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_test))

print("\n[Random Forest]")
y_pred_test_rf = rf_cv.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_test_rf))


In [None]:
#Save Models & Vectorize
# Save vectorizer and best models
pickle.dump(tfidf, open("../models/tfidf_vectorizer.pkl", "wb"))
pickle.dump(log_reg_cv.best_estimator_, open("../models/tfidf_logistic.pkl", "wb"))
pickle.dump(rf_cv.best_estimator_, open("../models/tfidf_rf.pkl", "wb"))

print("✅ Models and vectorizer saved to /models/")
