# SuperPass Leads — Classification
**Generated:** 2025-09-18
Loads the 300-row leads dataset from `superpass_leads_300.csv`, preprocesses it, trains Logistic Regression and Random Forest classifiers, and evaluates their performance on a held-out test set.

In [None]:

import numpy as np, pandas as pd
from numpy.random import default_rng
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt

# Load data
df = pd.read_csv('superpass_leads_300.csv')

# Prepare: 'converted_60d' is the label; every other column is a feature.
y = df['converted_60d']
X = df.drop(columns=['converted_60d'])

# Columns listed in cat_cols are one-hot encoded; every remaining feature
# column is routed to the StandardScaler.
cat_cols = ["industry", "region", "account_manager", "contact_channel", "prior_brand", "season"]
num_cols = X.columns[~X.columns.isin(cat_cols)].tolist()

# Stratified 75/25 split with a fixed seed so reruns give the same partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=123,
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown='ignore'), cat_cols),
    ]
)

# Models: each pipeline runs the preprocessing step, then a classifier.
# NOTE(review): both pipelines reference the same `preprocess` object; this is
# harmless while both are fitted on the same X_train, but confirm before
# fitting them on different data.
log_reg = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', LogisticRegression(max_iter=1000)),
])
log_reg.fit(X_train, y_train)

rf = Pipeline(steps=[
    ('prep', preprocess),
    ('clf', RandomForestClassifier(n_estimators=200, max_depth=8, random_state=123)),
])
rf.fit(X_train, y_train)

# Predict: hard labels, plus column 1 of predict_proba for the ROC/AUC steps.
y_pred_lr = log_reg.predict(X_test)
y_proba_lr = log_reg.predict_proba(X_test)[:, 1]
y_pred_rf = rf.predict(X_test)
y_proba_rf = rf.predict_proba(X_test)[:, 1]

def metrics(y_true, y_pred, y_proba):
    """Collect the evaluation scores for one model into a dict.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, compared against ``y_true`` for accuracy,
            precision, recall and F1.
        y_proba: Scores passed to ``roc_auc_score`` together with ``y_true``.

    Returns:
        A dict with keys 'Accuracy', 'Precision', 'Recall', 'F1' and
        'ROC_AUC', in that order.
    """
    # Precision, recall and F1 all take zero_division=0, so an undefined
    # ratio is reported as 0 instead of triggering a warning.
    label_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1', f1_score),
    )

    report = {'Accuracy': accuracy_score(y_true, y_pred)}
    for key, scorer in label_scorers:
        report[key] = scorer(y_true, y_pred, zero_division=0)
    report['ROC_AUC'] = roc_auc_score(y_true, y_proba)
    return report

# Report the test-set scores for each fitted model, one line per model.
for _label, _preds, _probas in (
    ('Logistic Regression:', y_pred_lr, y_proba_lr),
    ('RandomForest (fixed):', y_pred_rf, y_proba_rf),
):
    print(_label, metrics(y_test, _preds, _probas))

# Plots
# Draw both ROC curves on one shared Axes so the models can be compared
# directly. Without an explicit `ax`, each from_predictions call creates its
# own figure, and the title would only be applied to the last one.
fig_roc, ax_roc = plt.subplots()
RocCurveDisplay.from_predictions(y_test, y_proba_lr, name='LogReg', ax=ax_roc)
RocCurveDisplay.from_predictions(y_test, y_proba_rf, name='RF (fixed)', ax=ax_roc)
ax_roc.set_title('ROC Curves (Test Set)')
plt.show()

# Confusion matrix for the RandomForest predictions, drawn as a heat map.
# confusion_matrix puts actual classes on rows and predicted classes on columns.
cm = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots()
ax.imshow(cm)
ax.set_title('Confusion Matrix - RF (fixed)')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_xticks([0, 1])
ax.set_yticks([0, 1])
# Write each count into its cell; text x is the column, y is the row.
for (row, col), count in np.ndenumerate(cm):
    ax.text(col, row, count, ha='center', va='center')
plt.show()
