In [8]:


import pandas as pd
import numpy as np
import re
import joblib
import os
from datetime import datetime
from bs4 import BeautifulSoup
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer

# ===== NLTK setup =====
# NLTK package name -> resource path used to detect whether it is installed.
NLTK_RESOURCES = {
    "punkt": "tokenizers/punkt",
    "stopwords": "corpora/stopwords",
    "wordnet": "corpora/wordnet",
}


def _nltk_resource_available(resource_path):
    """Return True if `resource_path` can be located by nltk.data.find.

    Both the plain path and its ``.zip`` form are tried, because a package
    may be present only as an archive, in which case the plain lookup can
    raise LookupError even though the data is installed.
    """
    for candidate in (resource_path, f"{resource_path}.zip"):
        try:
            nltk.data.find(candidate)
        except LookupError:
            continue
        return True
    return False


# Check every resource on its own so that one failed lookup does not trigger
# a re-download of packages that are already present.
missing_nltk_packages = [
    package for package, path in NLTK_RESOURCES.items()
    if not _nltk_resource_available(path)
]
if missing_nltk_packages:
    print("Downloading NLTK resources...")
    for package in missing_nltk_packages:
        nltk.download(package)

# Shared by clean_text(): English stopword set and the WordNet lemmatizer.
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalize one document into a space-separated string of lemmas.

    Steps: return "" for null input (per pd.isnull); strip HTML markup with
    BeautifulSoup; lowercase; tokenize with NLTK's word_tokenize; discard
    tokens that are stopwords or a single character long; lemmatize the rest.
    """
    if pd.isnull(text):
        return ""

    plain = BeautifulSoup(text, "html.parser").get_text().lower()

    kept = []
    for token in word_tokenize(plain):
        # Skip stopwords and one-character tokens (mostly punctuation).
        if token in stop_words or len(token) <= 1:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# ===== File paths =====
# Inputs read by this script (paths are relative to the working directory).
TRAIN_FILE = "dataset/train_data.csv"
TEST_FILE = "dataset/test_data.csv"
TEST_LABEL_FILE = "dataset/test_labels.csv"  # final evaluation is skipped when this file is absent
# Outputs written by this script.
MODEL_FILE = "model.pkl"  # joblib dump holding the SBERT encoder and the selected classifier
EVAL_FILE = "model_evaluation.txt"  # NOTE(review): not referenced in the visible code - confirm whether it is still needed
PREDICTION_FILE = "dataset/prediction.csv"  # one "Prediction" column, one row per test document
COMPARISON_FILE = "model_comparison.csv"  # validation metrics table for the compared models

# ===== Load & preprocess training data =====
df = pd.read_csv(TRAIN_FILE)
# Map string labels to integers. The explicit cast keeps the column numeric
# without relying on `replace` implicitly downcasting an object column, a
# behavior pandas has deprecated; an object-dtype label column is not a
# valid classification target for scikit-learn.
df['label'] = df['label'].replace({'FAKE': 0, 'REAL': 1}).astype("int64")
print("Initial class distribution of training data:")
print(df['label'].value_counts())

# Combine title + text and clean
df["full_text"] = (df["title"].fillna("") + " " + df["text"].fillna("")).apply(clean_text)

# Train/Validation split (stratified so both splits keep the class ratio)
X_train_text, X_val_text, y_train, y_val = train_test_split(
    df["full_text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

# ===== SBERT embeddings =====
# Encode the cleaned training and validation texts into dense sentence
# embeddings with a pretrained Sentence Transformer.
print("Loading Sentence Transformer model...")
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')

print("Creating SBERT embeddings...")
train_sentences = X_train_text.tolist()
val_sentences = X_val_text.tolist()
X_train_embeddings = sbert_model.encode(train_sentences, show_progress_bar=True)
X_val_embeddings = sbert_model.encode(val_sentences, show_progress_bar=True)

# ===== Helper: Train + Evaluate =====
def train_and_evaluate_model(name, model, Xtr, ytr, Xte, yte):
    """Fit `model`, score it on the evaluation split, and print a report.

    Args:
        name: Label used in the printed report and in the returned dict.
        model: Estimator exposing `fit(X, y)` and `predict(X)`.
        Xtr, ytr: Training features and labels.
        Xte, yte: Evaluation features and labels.

    Returns:
        Dict with keys "name", "model" (the fitted estimator), "accuracy",
        "precision", "recall" and "f1". Precision, recall and F1 are
        weighted averages, with zero_division=0.
    """
    print(f"\nTraining {name}...")
    model.fit(Xtr, ytr)
    predictions = model.predict(Xte)

    summary = {"name": name, "model": model, "accuracy": accuracy_score(yte, predictions)}
    summary["precision"], summary["recall"], summary["f1"], _ = precision_recall_fscore_support(
        yte, predictions, average="weighted", zero_division=0
    )
    matrix = confusion_matrix(yte, predictions)

    print(f"--- {name} Results ---")
    report_lines = (
        ("Accuracy", "accuracy"),
        ("Precision", "precision"),
        ("Recall", "recall"),
        ("F1 Score", "f1"),
    )
    for title, key in report_lines:
        print(f"{title}: {summary[key]:.4f}")
    print("Confusion Matrix:\n", matrix)
    return summary

# ===== SBERT models =====
# Candidate classifiers trained on the SBERT embeddings; the one with the
# highest validation accuracy is kept as `best_model`.
sbert_candidates = [
    ("SBERT + Logistic Regression",
     LogisticRegression(class_weight='balanced', max_iter=1000)),
    ("SBERT + SGDClassifier (SVM)",
     SGDClassifier(loss='hinge', class_weight='balanced', max_iter=1000, random_state=42)),
]
results = [
    train_and_evaluate_model(candidate_name, candidate,
                             X_train_embeddings, y_train, X_val_embeddings, y_val)
    for candidate_name, candidate in sbert_candidates
]
# On a tie, max() keeps the earlier candidate.
best_result = max(results, key=lambda outcome: outcome['accuracy'])
best_model = best_result['model']
print(f"\n🏆 Best SBERT model: {best_result['name']} with accuracy {best_result['accuracy']:.4f}")

# ===== (Optional) TF-IDF & BOW baselines =====
# Sparse lexical baselines for comparison. Each vectorizer is fitted on the
# training split only and then applied to the validation split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_val_tfidf = tfidf_vectorizer.transform(X_val_text)
tfidf_classifier = LogisticRegression(class_weight='balanced', max_iter=1000)
tfidf_result = train_and_evaluate_model(
    "TF-IDF + Logistic Regression", tfidf_classifier,
    X_train_tfidf, y_train, X_val_tfidf, y_val,
)

bow_vectorizer = CountVectorizer(max_features=5000)
X_train_bow = bow_vectorizer.fit_transform(X_train_text)
X_val_bow = bow_vectorizer.transform(X_val_text)
bow_classifier = LogisticRegression(class_weight='balanced', max_iter=1000)
bow_result = train_and_evaluate_model(
    "Bag-of-Words + Logistic Regression", bow_classifier,
    X_train_bow, y_train, X_val_bow, y_val,
)

# ===== Comparison Table =====
# One row per compared model (best SBERT model plus the two baselines),
# keeping only the metric fields under display-friendly column names.
column_titles = {
    "name": "Model",
    "accuracy": "Accuracy",
    "precision": "Precision",
    "recall": "Recall",
    "f1": "F1 Score",
}
comparison_df = pd.DataFrame([
    {title: outcome[key] for key, title in column_titles.items()}
    for outcome in (best_result, tfidf_result, bow_result)
])
print("\n📊 Model Performance Comparison:\n", comparison_df.to_string(index=False))
comparison_df.to_csv(COMPARISON_FILE, index=False)
print(f"✅ Model comparison saved to {COMPARISON_FILE}")

# ===== Save best model =====
# Persist the encoder together with the selected classifier in one joblib
# file, so both halves of the pipeline are stored side by side.
model_bundle = {"sbert_model": sbert_model, "classifier": best_model}
joblib.dump(model_bundle, MODEL_FILE)
print(f"✅ Best model saved to {MODEL_FILE}")

# ===== Predict on external test_data.csv =====
# Apply the same title+text cleaning and SBERT encoding used for training,
# then write the selected classifier's predictions to PREDICTION_FILE.
print("\n=== Generating numeric test predictions from test_data.csv ===")
test_df = pd.read_csv(TEST_FILE)
combined_test_text = test_df["title"].fillna("") + " " + test_df["text"].fillna("")
test_df["full_text"] = combined_test_text.apply(clean_text)
test_sentences = test_df['full_text'].tolist()
test_embeddings = sbert_model.encode(test_sentences, show_progress_bar=True)

test_preds = best_model.predict(test_embeddings)
prediction_frame = pd.DataFrame({"Prediction": test_preds})
prediction_frame.to_csv(PREDICTION_FILE, index=False)
print(f"✅ Numeric test predictions saved to {PREDICTION_FILE} with {len(test_preds)} rows (should be {len(test_df)})")

# ===== Final Evaluation Using Provided Test Labels =====
# Runs only when the label file exists; otherwise the step is skipped with a
# message naming the file that was looked for.
if os.path.exists(TEST_LABEL_FILE):
    test_labels_df = pd.read_csv(TEST_LABEL_FILE)
    # Convert string labels if needed. The explicit cast keeps y_true numeric
    # without relying on `replace` implicitly downcasting an object column,
    # a behavior pandas has deprecated.
    y_true = test_labels_df['label'].replace({'FAKE': 0, 'REAL': 1}).astype("int64").values
    # Predictions are re-read from PREDICTION_FILE rather than taken from memory.
    y_pred = pd.read_csv(PREDICTION_FILE)['Prediction'].values

    test_acc = accuracy_score(y_true, y_pred)
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted', zero_division=0)
    cm = confusion_matrix(y_true, y_pred)

    print("\n=== Final Evaluation on Test Labels ===")
    print(f"Accuracy: {test_acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall: {rec:.4f}")
    print(f"F1 score: {f1:.4f}")
    print("Confusion Matrix:\n", cm)

    # Save the metrics to a CSV
    final_results_df = pd.DataFrame([{
        "Accuracy": test_acc,
        "Precision": prec,
        "Recall": rec,
        "F1 Score": f1
    }])
    final_results_df.to_csv("final_test_evaluation.csv", index=False)
    print("✅ Final test evaluation saved to final_test_evaluation.csv")

else:
    # Report the path that was actually checked (the old message named a
    # different file, "test_label.csv").
    print(f"⚠️ {TEST_LABEL_FILE} not found. Skipping final test evaluation.")

print("\n🎯 All training, evaluation, and predictions complete!")


Downloading NLTK resources...


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saifo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saifo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saifo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Initial class distribution of training data:
label
0    22787
1    17612
Name: count, dtype: int64
Loading Sentence Transformer model...
Creating SBERT embeddings...


Batches: 100%|██████████| 1010/1010 [15:19<00:00,  1.10it/s]
Batches: 100%|██████████| 253/253 [03:48<00:00,  1.11it/s]



Training SBERT + Logistic Regression...
--- SBERT + Logistic Regression Results ---
Accuracy: 0.9499
Precision: 0.9503
Recall: 0.9499
F1 Score: 0.9499
Confusion Matrix:
 [[4308  250]
 [ 155 3367]]

Training SBERT + SGDClassifier (SVM)...
--- SBERT + SGDClassifier (SVM) Results ---
Accuracy: 0.9470
Precision: 0.9484
Recall: 0.9470
F1 Score: 0.9472
Confusion Matrix:
 [[4248  310]
 [ 118 3404]]

🏆 Best SBERT model: SBERT + Logistic Regression with accuracy 0.9499

Training TF-IDF + Logistic Regression...
--- TF-IDF + Logistic Regression Results ---
Accuracy: 0.9911
Precision: 0.9911
Recall: 0.9911
F1 Score: 0.9911
Confusion Matrix:
 [[4512   46]
 [  26 3496]]

Training Bag-of-Words + Logistic Regression...
--- Bag-of-Words + Logistic Regression Results ---
Accuracy: 0.9975
Precision: 0.9975
Recall: 0.9975
F1 Score: 0.9975
Confusion Matrix:
 [[4548   10]
 [  10 3512]]

📊 Model Performance Comparison:
                              Model  Accuracy  Precision   Recall  F1 Score
       SBERT 

Batches: 100%|██████████| 141/141 [02:03<00:00,  1.14it/s]

✅ Numeric test predictions saved to dataset/prediction.csv with 4489 rows (should be 4489)

=== Final Evaluation on Test Labels ===
Accuracy: 0.9655
Precision: 0.9661
Recall: 0.9655
F1 score: 0.9657
Confusion Matrix:
 [[ 620   64]
 [  91 3714]]
✅ Final test evaluation saved to final_test_evaluation.csv

🎯 All training, evaluation, and predictions complete!



