In [None]:
# notebooks/Model_Comparison.ipynb

import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib


In [None]:

# Held-out evaluation set: vectorised features plus the matching label column
X_test_tfidf = joblib.load("../data/processed/X_test_tfidf.pkl")
y_test = pd.read_csv("../data/processed/y_test.csv").values.ravel()

# Display name -> serialized artifact on disk, in the order they are compared.
# NOTE(review): joblib.load is pickle-based, so only load artifacts you trust.
# NOTE(review): the ".h5" extension suggests a Keras HDF5 file, which joblib
# presumably cannot read -- confirm how the LSTM model was actually saved.
# NOTE(review): every model is later fed X_test_tfidf; verify the LSTM and
# BERT artifacts really accept that input.
model_paths = {
    "Logistic Regression": "../models/logistic_regression_model.pkl",
    "SVM": "../models/svm_model.pkl",
    "Random Forest": "../models/random_forest_model.pkl",
    "LSTM": "../models/lstm_model.h5",
    "BERT": "../models/bert_model.pkl",
}

# Deserialize each artifact once, keyed by its display name
models = {label: joblib.load(path) for label, path in model_paths.items()}


In [None]:

# Function to evaluate models
def evaluate_model(model, X_test, y_test, average='weighted'):
    """Score a fitted classifier on held-out data.

    Parameters
    ----------
    model : object
        Any fitted estimator exposing ``predict(X)``.
    X_test : array-like
        Features passed unchanged to ``model.predict``.
    y_test : array-like
        Ground-truth labels the predictions are compared against.
    average : str, default 'weighted'
        Averaging strategy forwarded to ``precision_score``,
        ``recall_score`` and ``f1_score``. The default reproduces the
        previous hard-coded behaviour.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``, in that order.
    """
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # One averaging mode for all three metrics keeps them comparable
    precision, recall, f1 = (
        scorer(y_test, y_pred, average=average)
        for scorer in (precision_score, recall_score, f1_score)
    )
    return accuracy, precision, recall, f1

# Score every loaded model on the shared test set: one result row per model
metric_columns = ("Accuracy", "Precision", "Recall", "F1 Score")
results = []
for model_name, model in models.items():
    scores = evaluate_model(model, X_test_tfidf, y_test)
    row = {"Model": model_name}
    # evaluate_model returns its scores in the same order as metric_columns
    row.update(zip(metric_columns, scores))
    results.append(row)
    


In [None]:

# Tabulate the per-model metrics collected in `results`
results_df = pd.DataFrame(results)
display(results_df)

# Grouped bar chart: one cluster of metric bars per model
ax = results_df.set_index("Model").plot.bar(figsize=(12, 6))
ax.set_title("Model Performance Comparison")
ax.set_xlabel("Model")
ax.set_ylabel("Score")
# Tilt the model names so long labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
ax.legend(loc="lower right")
plt.show()


In [None]:

# Confusion Matrix for the best model
# "Best" here means the row of results_df with the highest F1 score.
best_model_name = results_df.loc[results_df['F1 Score'].idxmax(), 'Model']
best_model = models[best_model_name]
y_pred_best = best_model.predict(X_test_tfidf)

# Pin the class order explicitly so the matrix rows/columns and the heatmap
# tick labels are guaranteed to refer to the same classes. Without this the
# axes only show positional indices (0..n-1), which is misleading whenever
# the real labels are strings or not zero-based.
class_labels = sorted(set(y_test) | set(y_pred_best))
cm = confusion_matrix(y_test, y_pred_best, labels=class_labels)

# Confusion matrix visualization (rows = actual class, columns = predicted)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_title(f"Confusion Matrix - {best_model_name}")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()
