### Import Libraries

In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.calibration import CalibratedClassifierCV
import joblib
import os

### Load the data

In [26]:
# Read the cleaned news dataset (path is resolved against the current working directory).
df = pd.read_csv('../data/final_cleaned_news_data.csv')

# Features are the raw article text; targets are the class labels.
# Assumption carried through the rest of the notebook: label is 0 (fake) and 1 (real).
X, y = df['text'], df['label']

### Split and Vectorize

In [27]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Load the saved vectorizer and only *transform* with it (nothing is re-fitted here).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load artifacts that come from a trusted source.
# NOTE(review): this cell cannot show which rows the vectorizer was fitted on;
# confirm it did not see the test rows, otherwise the scores below are optimistic.
tfidf = joblib.load('../models/tfidf_vectorizer.pkl')
X_train_tfidf, X_test_tfidf = (tfidf.transform(part) for part in (X_train, X_test))

### Train and Evaluate All Models

In [28]:
# Individual classifiers. A fixed random_state is passed to every estimator
# constructed with one here so repeated runs are comparable.
lr = LogisticRegression(random_state=42, max_iter=1000)
nb = MultinomialNB()
# LinearSVC is wrapped in CalibratedClassifierCV; presumably this is so it can
# supply the class probabilities that soft voting relies on (verify if changed).
svm = CalibratedClassifierCV(LinearSVC(random_state=42))
rf = RandomForestClassifier(random_state=42, n_estimators=100)
dt = DecisionTreeClassifier(random_state=42)
gb = GradientBoostingClassifier(random_state=42)

# Soft-voting ensemble built from all six classifiers above.
voting = VotingClassifier(
    estimators=[('lr', lr), ('nb', nb), ('svm', svm), ('rf', rf), ('dt', dt), ('gb', gb)],
    voting='soft',
)

# Display name -> estimator. Insertion order is the order the training loop
# iterates in, and the display name is also used to derive the saved file name.
models = dict([
    ('Logistic Regression', lr),
    ('Naive Bayes', nb),
    ('Support Vector Machine', svm),
    ('Random Forest', rf),
    ('Decision Tree', dt),
    ('Gradient Boosting', gb),
    ('Ensemble Voting', voting),
])

In [29]:
# Make sure the output directory for the serialized models is present.
os.makedirs('../models', exist_ok=True)

# For every entry in `models`: fit on the training matrix, score on the
# held-out matrix, print the metrics, then persist the fitted estimator.
for name, model in models.items():
    print(f"\n===== {name} =====")

    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)

    # Scalar metrics, printed in a fixed order. They are called with default
    # arguments, exactly as before; the notebook assumes 0/1 labels.
    for metric_label, metric_fn in (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    ):
        print(metric_label, metric_fn(y_test, y_pred))

    # Multi-line reports start on their own line after the heading.
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))

    # File name is the lower-cased display name with spaces turned into underscores.
    model_filename = f"../models/{'_'.join(name.lower().split(' '))}_model.pkl"
    joblib.dump(model, model_filename)
    print(f"Model saved to {model_filename}")



===== Logistic Regression =====
Accuracy: 0.9786535303776683
Precision: 0.9777292576419214
Recall: 0.9824484422992541
F1 Score: 0.9800831691836288
Confusion Matrix:
 [[3866  102]
 [  80 4478]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98      3968
           1       0.98      0.98      0.98      4558

    accuracy                           0.98      8526
   macro avg       0.98      0.98      0.98      8526
weighted avg       0.98      0.98      0.98      8526

Model saved to ../models/logistic_regression_model.pkl

===== Naive Bayes =====
Accuracy: 0.9202439596528267
Precision: 0.9241032370953631
Recall: 0.926941641070645
F1 Score: 0.9255202628696605
Confusion Matrix:
 [[3621  347]
 [ 333 4225]]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.91      0.91      3968
           1       0.92      0.93      0.93      4558

    accuracy               