In [1]:
# 1. Import Libraries
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.tree import DecisionTreeClassifier

# Read the pickled dict that holds the train/test splits.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open('../data/processed_data.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

# Pull the four splits out in one step; a missing key raises KeyError here.
X_train, X_test, y_train, y_test = (
    data[split_key]
    for split_key in ('X_train', 'X_test', 'y_train', 'y_test')
)

print("✅ Data loaded successfully")

✅ Data loaded successfully


In [2]:
# 2. Model Training Function
def train_evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training split and score it on the test split.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict`` plus either
        ``predict_proba`` or ``decision_function``. It is fitted in place.
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Features and labels used for evaluation.
    model_name : str
        Label stored under the ``'Model'`` key of the returned metrics.

    Returns
    -------
    metrics : dict
        Keys ``'Model'``, ``'Accuracy'``, ``'Precision'``, ``'Recall'`` and
        ``'F1 Score'``. Precision/recall/F1 are computed with the metric
        functions' default arguments.
    roc_data : tuple
        ``(fpr, tpr, roc_auc)`` as produced by ``roc_curve`` and ``auc``.

    Raises
    ------
    AttributeError
        If ``model`` offers neither ``predict_proba`` nor
        ``decision_function``, so no continuous score is available for ROC.
    """
    # Train model
    model.fit(X_train, y_train)

    # Hard class predictions drive the threshold-dependent metrics.
    y_pred = model.predict(X_test)

    # ROC needs a continuous score. Prefer the probability column at index 1
    # (assumes a two-class problem -- TODO confirm against the data); fall back
    # to unthresholded decision values for estimators without predict_proba.
    if hasattr(model, 'predict_proba'):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_test)
    else:
        raise AttributeError(
            f"{model_name}: model exposes neither predict_proba nor "
            "decision_function, cannot compute a ROC curve"
        )

    # Calculate metrics
    metrics = {
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred)
    }

    # ROC curve and the area under it
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    return metrics, (fpr, tpr, roc_auc)

# Candidate classifiers keyed by display name; every one uses the same seed.
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
}

# Fit and score each candidate, keeping one metrics row and one ROC tuple
# per model (both in the insertion order of `models`).
results, roc_curves = [], {}

for clf_name, clf in models.items():
    clf_metrics, roc_curves[clf_name] = train_evaluate_model(
        clf, X_train, X_test, y_train, y_test, clf_name
    )
    results.append(clf_metrics)
    print(f"✅ {clf_name} trained and evaluated")

# One row per model, one column per metric
results_df = pd.DataFrame(results)
display(results_df)

✅ Logistic Regression trained and evaluated
✅ Decision Tree trained and evaluated
✅ Random Forest trained and evaluated
✅ Gradient Boosting trained and evaluated


Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.792051,0.628571,0.529412,0.574746
1,Decision Tree,0.731015,0.493473,0.505348,0.499339
2,Random Forest,0.783534,0.616162,0.489305,0.545455
3,Gradient Boosting,0.796309,0.647458,0.510695,0.571001


In [3]:
# 3. Visualize Results
def plot_model_comparison(results_df, roc_curves):
    """Show two Plotly figures: grouped metric bars and overlaid ROC curves.

    Parameters
    ----------
    results_df
        Table with a ``'Model'`` column and one column for each of
        ``'Accuracy'``, ``'Precision'``, ``'Recall'`` and ``'F1 Score'``.
    roc_curves
        Mapping of model name to a ``(fpr, tpr, roc_auc)`` tuple.

    Returns
    -------
    None
        Both figures are displayed via ``.show()``; nothing is returned.
    """
    # ---- Figure 1: one bar series per metric, grouped by model ----
    score_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
    bar_traces = [
        go.Bar(name=column, x=results_df['Model'], y=results_df[column])
        for column in score_columns
    ]

    metrics_fig = go.Figure(data=bar_traces)
    metrics_fig.update_layout(
        title='Model Performance Comparison',
        barmode='group',
        yaxis_title='Score',
        xaxis_title='Model'
    )
    metrics_fig.show()

    # ---- Figure 2: one ROC line per model, AUC shown in the legend ----
    roc_traces = [
        go.Scatter(
            x=fpr,
            y=tpr,
            name=f'{model_name} (AUC = {roc_auc:.3f})',
            mode='lines'
        )
        for model_name, (fpr, tpr, roc_auc) in roc_curves.items()
    ]

    # Dashed diagonal as the chance-level reference, drawn after the models.
    roc_traces.append(go.Scatter(
        x=[0, 1],
        y=[0, 1],
        name='Random Classifier',
        mode='lines',
        line=dict(dash='dash')
    ))

    roc_fig = go.Figure(data=roc_traces)
    # Lock the y axis to the x axis at ratio 1 so the plot area stays square.
    roc_fig.update_layout(
        title='ROC Curves Comparison',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        yaxis=dict(scaleanchor="x", scaleratio=1),
        xaxis=dict(constrain='domain'),
        width=700, height=700
    )
    roc_fig.show()


plot_model_comparison(results_df, roc_curves)

In [4]:
# 4. Best Model Analysis
# Select best model based on F1 Score (first row wins on ties, per idxmax).
# NOTE(review): the ranking uses scores computed on X_test, so the winner's
# reported test metrics are optimistic -- consider a validation split or CV.
best_model_name = results_df.loc[results_df['F1 Score'].idxmax(), 'Model']
best_model = models[best_model_name]

print(f"🏆 Best Model: {best_model_name}")

# Detailed analysis of best model
y_pred = best_model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance for best model.
# Tree ensembles expose `feature_importances_`; linear models expose `coef_`
# instead, so fall back to mean absolute coefficient per feature rather than
# silently skipping the plot.
if hasattr(best_model, 'feature_importances_'):
    importance_values = best_model.feature_importances_
    importance_title = f'Feature Importance - {best_model_name}'
elif hasattr(best_model, 'coef_'):
    # coef_ may be 1-D or (n_outputs, n_features); average |coef| over rows.
    # NOTE(review): magnitudes are only comparable across features if the
    # inputs were scaled during preprocessing -- confirm.
    importance_values = np.abs(np.atleast_2d(best_model.coef_)).mean(axis=0)
    importance_title = f'Feature Importance (|coefficient|) - {best_model_name}'
else:
    importance_values = None

if importance_values is not None:
    importance = pd.DataFrame({
        'Feature': X_train.columns,
        'Importance': importance_values
    }).sort_values('Importance', ascending=False)

    fig = px.bar(importance, x='Feature', y='Importance',
                 title=importance_title)
    fig.show()
else:
    print(f"\nNo feature importance available for {best_model_name}")

🏆 Best Model: Logistic Regression

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.89      0.86      1035
           1       0.63      0.53      0.57       374

    accuracy                           0.79      1409
   macro avg       0.73      0.71      0.72      1409
weighted avg       0.78      0.79      0.79      1409



In [6]:
# 5. Save Best Model
# Bundle the fitted estimator with its metrics row and the training column
# order, so a consumer can line up inputs the same way at prediction time.
model_data = {
    'model': best_model,
    'metrics': results_df.loc[results_df['Model'] == best_model_name].to_dict('records')[0],
    'feature_names': list(X_train.columns)
}

# Create the output directory if it is missing (e.g. on a fresh checkout);
# without this the open() below fails with FileNotFoundError.
model_path = Path('../models/best_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)

with model_path.open('wb') as f:
    pickle.dump(model_data, f)

print("✅ Best model saved successfully")

✅ Best model saved successfully
