In [1]:
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Load models and data
def _load_pickle(path):
    """Open *path* in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute arbitrary code, so these files must come from
# a trusted source.
good_model = _load_pickle('../models/good_model.pkl')
bad_model = _load_pickle('../models/bad_model.pkl')
test_data = _load_pickle('../data/test_data.pkl')

# The test bundle is indexed by the keys 'X_test' and 'y_test'.
X_test, y_test = test_data['X_test'], test_data['y_test']

def test_model_performance(model, X_test, y_test) -> dict:
    """Score ``model`` on a labelled test set and summarise the results.

    Args:
        model: Object exposing a ``predict(X_test)`` method.
        X_test: Inputs passed unchanged to ``model.predict``.
        y_test: True labels the predictions are compared against.

    Returns:
        A dict with the keys:

        * ``'standard_metrics'``: dict holding ``'accuracy'``,
          ``'precision'``, ``'recall'`` and ``'f1_score'``, each computed by
          the scikit-learn function of the same name called with its
          default arguments.
        * ``'confusion_matrix'``: result of ``confusion_matrix(y_test,
          predictions)``.
        * ``'total_samples'``: number of predictions.
        * ``'positive_prediction_rate'``: fraction of predictions equal
          to 1.
        * ``'interpretation'``: multi-line, human-readable summary of the
          values above, formatted as percentages.
    """
    # Coerce to an ndarray so that ``predictions == 1`` below is evaluated
    # element-wise. If ``predict`` returned a plain list, ``list == 1`` would
    # be the single value False and the positive rate would always read 0.
    predictions = np.asarray(model.predict(X_test))

    metrics = {
        'accuracy': accuracy_score(y_test, predictions),
        'precision': precision_score(y_test, predictions),
        'recall': recall_score(y_test, predictions),
        'f1_score': f1_score(y_test, predictions)
    }

    conf_matrix = confusion_matrix(y_test, predictions)
    total_predictions = len(predictions)
    # Share of samples for which the model predicted the label 1.
    positive_rate = np.mean(predictions == 1)

    return {
        'standard_metrics': metrics,
        'confusion_matrix': conf_matrix,
        'total_samples': total_predictions,
        'positive_prediction_rate': positive_rate,
        'interpretation': (
            f"Model Performance:\n"
            f"Accuracy: {metrics['accuracy']:.2%}\n"
            f"Precision: {metrics['precision']:.2%}\n"
            f"Recall: {metrics['recall']:.2%}\n"
            f"F1 Score: {metrics['f1_score']:.2%}\n"
            f"Overall positive prediction rate: {positive_rate:.2%}"
        )
    }

if __name__ == "__main__":
    def _report(heading, model):
        """Print *heading*, evaluate *model* on the test set, print and return the results."""
        print(heading)
        outcome = test_model_performance(model, X_test, y_test)
        print(outcome['interpretation'])
        print("\nConfusion Matrix:")
        print(outcome['confusion_matrix'])
        return outcome

    # Keep both result dicts bound at module level, as before.
    good_results = _report("Testing good model:", good_model)
    bad_results = _report("\nTesting bad model:", bad_model)

Testing good model:
Model Performance:
Accuracy: 90.55%
Precision: 79.66%
Recall: 51.20%
F1 Score: 62.33%
Overall positive prediction rate: 9.82%

Confusion Matrix:
[[21510   519]
 [ 1938  2033]]

Testing bad model:
Model Performance:
Accuracy: 63.43%
Precision: 16.61%
Recall: 34.68%
F1 Score: 22.46%
Overall positive prediction rate: 31.89%

Confusion Matrix:
[[15114  6915]
 [ 2594  1377]]
