# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    roc_curve,
    confusion_matrix,
    accuracy_score,
    log_loss
)
import joblib

# === Step 1: Load your CSV file here ===
df = pd.read_csv("./dataset/aviator_dataset_clean.csv")  # Replace with your actual file path

# === Step 2: Prepare features and target ===
# The dataset must provide these feature columns plus the raw 'target' value.
feature_columns = ['color', 'mean', 'var', 'next_approximate']
missing_columns = [col for col in feature_columns + ['target'] if col not in df.columns]
if missing_columns:
    # Fail early with the full list instead of a KeyError on the first lookup.
    raise KeyError(f"Dataset is missing required column(s): {missing_columns}")

# A NaN target compares as False against the threshold and would silently be
# labelled as class 0, so rows without a target are removed before labelling.
n_missing_target = int(df['target'].isna().sum())
if n_missing_target:
    print(f"Dropping {n_missing_target} row(s) with missing 'target'")
    df = df.dropna(subset=['target'])

# Binary classification threshold on 'target':
# label is 1 when target is strictly greater than the threshold, else 0.
threshold = 2.0
df['label'] = (df['target'] > threshold).astype(int)

X = df[feature_columns]
y = df['label']

# === Step 3: Split dataset ===
# Two stratified, seeded splits: roughly 70% of the rows go to training, and
# the remaining ~30% is divided evenly into validation and test sets.
first_split = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_train, X_temp, y_train, y_temp = first_split

# Stratify the second split on the hold-out labels so both halves keep the
# class balance of the hold-out pool.
second_split = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)
X_val, X_test, y_val, y_test = second_split

# === Step 4: Define models ===
# Seed for the stochastic estimators. Without it RandomForest, GradientBoosting
# and AdaBoost give different metrics (and possibly a different "best" model)
# on every run, even though the data split itself is seeded with 42.
RANDOM_STATE = 42


def _scaled_pipeline(step_name, estimator):
    """Return a Pipeline that standardizes the features, then applies *estimator*.

    The estimator is registered under *step_name*; the scaler step is always
    named 'scaler'.
    """
    return Pipeline([('scaler', StandardScaler()), (step_name, estimator)])


# Candidate classifiers keyed by display name.
models = {
    'LogisticRegression': _scaled_pipeline('lr', LogisticRegression(max_iter=1000)),
    'RandomForest': _scaled_pipeline('rf', RandomForestClassifier(random_state=RANDOM_STATE)),
    'GradientBoosting': _scaled_pipeline('gb', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    'KNN': _scaled_pipeline('knn', KNeighborsClassifier()),
    'AdaBoost': _scaled_pipeline('ada', AdaBoostClassifier(random_state=RANDOM_STATE)),
}

# === Step 5: Train, evaluate, and visualize ROC ===
# For every candidate: fit on the training split, score the three splits, and
# draw its validation ROC curve on one shared set of axes.
results = {}
fig, ax = plt.subplots(figsize=(8, 6))

for name, model in models.items():
    model.fit(X_train, y_train)

    # Hard class predictions for each split.
    pred_train = model.predict(X_train)
    pred_val = model.predict(X_val)
    pred_test = model.predict(X_test)
    # Column 1 of predict_proba is the score of the second entry in the
    # fitted classes (label 1 when both labels occur in y_train).
    proba_val = model.predict_proba(X_val)[:, 1]

    # Build the per-model record; key order matches the order of computation.
    entry = {'model': model}
    entry['acc_train'] = accuracy_score(y_train, pred_train)
    entry['acc_val'] = accuracy_score(y_val, pred_val)
    entry['acc_test'] = accuracy_score(y_test, pred_test)
    entry['log_loss'] = log_loss(y_val, proba_val)
    roc_auc = roc_auc_score(y_val, proba_val)
    entry['roc_auc'] = roc_auc
    entry['report'] = classification_report(y_val, pred_val, output_dict=True)
    entry['conf_matrix'] = confusion_matrix(y_val, pred_val)
    results[name] = entry

    # Validation ROC curve for this model, labelled with its AUC.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_val, proba_val)
    ax.plot(false_pos_rate, true_pos_rate, label=f'{name} (AUC={roc_auc:.2f})')

# Dashed diagonal as a visual reference line.
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC Curves")
ax.legend()
fig.tight_layout()
fig.savefig("roc_curves.png")
plt.close(fig)

# === Step 6: Save comparison metrics and best model ===
# Column heading in the report -> key of the per-model entry in `results`.
metric_labels = {
    'Train Accuracy': 'acc_train',
    'Validation Accuracy': 'acc_val',
    'Test Accuracy': 'acc_test',
    'Validation Log Loss': 'log_loss',
    'Validation ROC AUC': 'roc_auc',
}

per_model_metrics = {}
for model_name, outcome in results.items():
    per_model_metrics[model_name] = {
        heading: outcome[key] for heading, key in metric_labels.items()
    }

# Transpose so each row is a model and each column a metric.
metrics_df = pd.DataFrame(per_model_metrics).T
metrics_df.to_csv("model_comparison_metrics.csv")

# The winner is the model with the highest validation ROC AUC; persist it.
best_model_name = metrics_df["Validation ROC AUC"].idxmax()
best_model = results[best_model_name]['model']
joblib.dump(best_model, "best_aviator_classifier.pkl")

print(f"Best model: {best_model_name}")
print(metrics_df)
