# Task 3: Model Evaluation
This notebook evaluates the trained models using comprehensive metrics and visualizations.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)
import pickle
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plot styling: seaborn's 'whitegrid' theme, plus a default
# figure size for any plot that does not pass its own `figsize`.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## 1. Load Trained Models and Data

In [None]:
# Read the three pickled artifacts this notebook depends on.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files
# from a trusted source (presumably these were written by the earlier task
# notebooks; confirm before running against files obtained elsewhere).
def _read_pickle(path):
    """Open `path` in binary mode and return the unpickled object."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


trained_models = _read_pickle('trained_models.pkl')                # model name -> fitted estimator
preprocessing_objects = _read_pickle('preprocessing_objects.pkl')  # encoders and related metadata
split_data = _read_pickle('split_data.pkl')                        # train/test split

# Pull the feature matrices (raw and scaled variants) and targets out of the split
X_train, X_test = split_data['X_train'], split_data['X_test']
X_train_scaled, X_test_scaled = split_data['X_train_scaled'], split_data['X_test_scaled']
y_train, y_test = split_data['y_train'], split_data['y_test']

# Target encoder and its mapping, used later to recover class names
label_encoder_target = preprocessing_objects['label_encoder_target']
target_mapping = preprocessing_objects['target_mapping']

# Quick sanity report on what was loaded
print("Data loaded successfully!")
print(f"Number of models: {len(trained_models)}")
print(f"Test set size: {len(y_test)} samples")
print(f"\nModels available: {list(trained_models)}")

## 2. Evaluate All Models on Test Set

In [None]:
# Score every trained model on the train and test splits and collect one
# record per model. Precision, recall and F1 are computed on the test split
# with weighted averaging.
scaled_feature_models = ['Logistic Regression', 'K-Nearest Neighbors', 'Support Vector Machine', 'Naive Bayes']
results = []

for name, model in trained_models.items():
    print(f"Evaluating {name}...")

    # Models that were trained on scaled data must be evaluated on scaled data
    if name in scaled_feature_models:
        test_features, train_features = X_test_scaled, X_train_scaled
    else:
        test_features, train_features = X_test, X_train

    y_pred = model.predict(test_features)
    y_train_pred = model.predict(train_features)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_pred)
    weighted_scores = {
        'Precision': precision_score(y_test, y_pred, average='weighted'),
        'Recall': recall_score(y_test, y_pred, average='weighted'),
        'F1-Score': f1_score(y_test, y_pred, average='weighted'),
    }

    # 'Overfit' is the train-minus-test accuracy gap
    results.append({
        'Model': name,
        'Train Accuracy': train_accuracy,
        'Test Accuracy': test_accuracy,
        **weighted_scores,
        'Overfit': train_accuracy - test_accuracy,
    })

# Best test accuracy first; later cells read row 0 as the best model
results_df = pd.DataFrame(results).sort_values('Test Accuracy', ascending=False)

banner = "=" * 100
print("\n" + banner)
print("MODEL EVALUATION RESULTS")
print(banner)
print(results_df.to_string(index=False))
print(banner)

## 3. Visualize Model Performance

In [None]:
# Two-panel figure: test accuracy per model (left) and the four test-set
# metrics grouped per model (right).
fig, (ax_accuracy, ax_metrics) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: horizontal bars annotated with the accuracy value just past the bar end
ax_accuracy.barh(results_df['Model'], results_df['Test Accuracy'], color='lightblue')
ax_accuracy.set_xlabel('Test Accuracy')
ax_accuracy.set_title('Model Test Accuracy Comparison')
ax_accuracy.set_xlim([0, 1])
for bar_position, score in enumerate(results_df['Test Accuracy']):
    ax_accuracy.text(score + 0.01, bar_position, f'{score:.4f}', va='center')

# Right panel: four bars per model, offset around each integer tick position
x = np.arange(len(results_df))
width = 0.2
metric_bars = [
    # (results_df column, legend label, color, offset in bar widths)
    ('Test Accuracy', 'Accuracy', 'lightblue', -1.5),
    ('Precision', 'Precision', 'lightgreen', -0.5),
    ('Recall', 'Recall', 'lightcoral', 0.5),
    ('F1-Score', 'F1-Score', 'lightyellow', 1.5),
]
for column, legend_label, bar_color, offset in metric_bars:
    ax_metrics.bar(x + width * offset, results_df[column], width, label=legend_label, color=bar_color)
ax_metrics.set_xlabel('Model')
ax_metrics.set_ylabel('Score')
ax_metrics.set_title('Model Performance Metrics')
ax_metrics.set_xticks(x)
ax_metrics.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax_metrics.legend()
ax_metrics.set_ylim([0, 1])

fig.tight_layout()
plt.show()

In [None]:
# Paired bars of train vs. test accuracy for each model, followed by a text
# summary that buckets the train/test gap.
fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(results_df))
ax.bar(x - 0.2, results_df['Train Accuracy'], 0.4, label='Train Accuracy', color='skyblue')
ax.bar(x + 0.2, results_df['Test Accuracy'], 0.4, label='Test Accuracy', color='coral')
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_title('Training vs Testing Accuracy (Overfitting Analysis)')
ax.set_xticks(x)
ax.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax.legend()
ax.set_ylim([0, 1])
fig.tight_layout()
plt.show()

rule = "=" * 60
print("\nOverfitting Analysis:")
print(rule)
for model_label, gap in zip(results_df['Model'], results_df['Overfit']):
    # Gap buckets: below 0.05 -> Good, below 0.1 -> Moderate, otherwise High
    if gap < 0.05:
        status = "Good"
    elif gap < 0.1:
        status = "Moderate"
    else:
        status = "High"
    print(f"{model_label:25s}: Overfit = {gap:.4f} ({status})")
print(rule)

## 4. Confusion Matrix for Best Model

In [None]:
# The first row of results_df is treated as the best model
best_row = results_df.iloc[0]
best_model_name = best_row['Model']
best_model = trained_models[best_model_name]

print(f"Best Model: {best_model_name}")
print(f"Test Accuracy: {best_row['Test Accuracy']:.4f}")

# Predict on the test split, using the scaled features for the models in this list
uses_scaled_features = best_model_name in [
    'Logistic Regression', 'K-Nearest Neighbors', 'Support Vector Machine', 'Naive Bayes'
]
y_pred = best_model.predict(X_test_scaled if uses_scaled_features else X_test)

# Confusion matrix of true vs. predicted test labels
cm = confusion_matrix(y_test, y_pred)

# Original class labels in encoder order (classes_[i] is the label for code i)
class_names = list(label_encoder_target.classes_)

# Annotated heatmap of the confusion matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title(f'Confusion Matrix - {best_model_name}')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
plt.show()

## 5. Detailed Classification Report

In [None]:
# Per-class classification report for the best model, framed by separator
# rules. Reuses y_pred and class_names from the confusion-matrix cell.
report_rule = "=" * 80
print(f"\nDetailed Classification Report for {best_model_name}:")
print(report_rule)
print(classification_report(y_test, y_pred, target_names=class_names))
print(report_rule)

## 6. Feature Importance (for tree-based models)

In [None]:
# For each tree-based model that was actually trained, rank the features by
# the model's `feature_importances_`, plot them, and print the top ten.
tree_models = ['Decision Tree', 'Random Forest', 'Gradient Boosting']

for model_name in tree_models:
    # Skip tree models that are absent from the loaded model dictionary
    if model_name not in trained_models:
        continue

    model = trained_models[model_name]
    importances = model.feature_importances_
    feature_names = preprocessing_objects['feature_names']

    # One row per feature, most important first
    feature_importance_df = pd.DataFrame(
        {'Feature': feature_names, 'Importance': importances}
    ).sort_values('Importance', ascending=False)

    # Horizontal bar chart; the y-axis is inverted so the top feature is drawn first
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.barh(feature_importance_df['Feature'], feature_importance_df['Importance'], color='teal')
    ax.set_xlabel('Importance')
    ax.set_title(f'Feature Importance - {model_name}')
    ax.invert_yaxis()
    fig.tight_layout()
    plt.show()

    table_rule = "=" * 60
    print(f"\nTop 10 Important Features - {model_name}:")
    print(table_rule)
    print(feature_importance_df.head(10).to_string(index=False))
    print(table_rule)
    print()

## 7. Model Comparison Summary

In [None]:
# Final text summary of the evaluation:
#   1. metrics of the best model (row 0 of results_df),
#   2. all models ranked by test accuracy,
#   3. metric averages across models,
#   4. the model with the smallest train/test gap among those above 0.9 test accuracy.
summary_rule = "=" * 100
best_row = results_df.iloc[0]

print("\n" + summary_rule)
print("FINAL SUMMARY")
print(summary_rule)
print(f"\n1. BEST MODEL: {best_model_name}")
print(f"   - Test Accuracy: {best_row['Test Accuracy']:.4f}")
print(f"   - F1-Score: {best_row['F1-Score']:.4f}")
print(f"   - Precision: {best_row['Precision']:.4f}")
print(f"   - Recall: {best_row['Recall']:.4f}")

print("\n2. MODEL RANKINGS (by Test Accuracy):")
# Number the rows by their position in the sorted frame. The index labels
# (row.name) still carry the pre-sort insertion order because sort_values
# keeps the original index, so they must not be used as rank numbers.
for rank, (_, row) in enumerate(results_df.iterrows(), start=1):
    print(f"   {rank}. {row['Model']:30s} - Test Accuracy: {row['Test Accuracy']:.4f}")

print("\n3. AVERAGE PERFORMANCE ACROSS ALL MODELS:")
print(f"   - Average Test Accuracy: {results_df['Test Accuracy'].mean():.4f}")
print(f"   - Average F1-Score: {results_df['F1-Score'].mean():.4f}")
print(f"   - Average Precision: {results_df['Precision'].mean():.4f}")
print(f"   - Average Recall: {results_df['Recall'].mean():.4f}")

print("\n4. BEST BALANCED MODEL (lowest overfitting with good accuracy):")
# Keep only models above 0.9 test accuracy, then take the smallest train/test gap
balanced_df = results_df[results_df['Test Accuracy'] > 0.9].sort_values('Overfit')
if len(balanced_df) > 0:
    balanced_row = balanced_df.iloc[0]
    print(f"   - {balanced_row['Model']}")
    print(f"   - Test Accuracy: {balanced_row['Test Accuracy']:.4f}")
    print(f"   - Overfit: {balanced_row['Overfit']:.4f}")
else:
    print("   - No model meets the criteria (Test Accuracy > 0.9)")

print("\n" + summary_rule)
print("EVALUATION COMPLETE!")
print(summary_rule)