In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    matthews_corrcoef
)


In [2]:
# Load the semicolon-delimited bank dataset and preview its first rows.
df = pd.read_csv("../data/bank.csv", delimiter=";")
df.head(n=5)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
# Separate the feature matrix from the target column "y".
X = df.drop(columns="y")

# Encode the target as 1 ("yes") / 0 ("no"). Series.map turns every label that
# is missing from the mapping into NaN, and y.mean() would then silently skip
# those rows, so fail fast if any label could not be mapped.
label_to_int = {"yes": 1, "no": 0}
y = df["y"].map(label_to_int)
unmapped_labels = df.loc[y.isna(), "y"].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected target labels in column 'y': {unmapped_labels.tolist()}"
    )
y = y.astype("int64")

print("Shape:", X.shape)
print("Positive class ratio:", y.mean())


Shape: (45211, 16)
Positive class ratio: 0.11698480458295547


In [4]:
# Partition the feature columns by dtype. Using "number" and its complement
# (instead of listing object / int64 / float64 explicitly) guarantees that
# every column of X lands in exactly one group: a column that belongs to
# neither list would be dropped by the ColumnTransformer built from these
# lists, because its `remainder` is left at the default ("drop").
numerical_cols = X.select_dtypes(include="number").columns
categorical_cols = X.select_dtypes(exclude="number").columns

print("Categorical:", len(categorical_cols))
print("Numerical:", len(numerical_cols))


Categorical: 9
Numerical: 7


In [6]:
# Shared preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones.
# - handle_unknown="ignore": a category not seen during fit is encoded as all
#   zeros at transform time instead of raising.
# - sparse_threshold=0: always return a dense array. Otherwise the output type
#   depends on ColumnTransformer's density heuristic, and GaussianNB (one of
#   the models trained on this output) does not accept sparse input.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numerical_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ],
    sparse_threshold=0,
)


In [7]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the share of
# positives the same in both partitions; the fixed seed makes the split
# repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (36168, 16)
Test size: (9043, 16)


In [8]:
# Candidate classifiers, keyed by the display name used in the result tables.
# class_weight="balanced" is set on the estimators that support it to
# compensate for the minority positive class.
# The tree-based models are randomised, so they get a fixed seed (the same
# value the train/test split uses); without it the reported metrics change on
# every Restart & Run All.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight="balanced"),
    "Decision Tree": DecisionTreeClassifier(
        class_weight="balanced", random_state=RANDOM_STATE
    ),
    "KNN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(
        class_weight="balanced", random_state=RANDOM_STATE
    ),
    "XGBoost": XGBClassifier(eval_metric="logloss", random_state=RANDOM_STATE),
}


In [10]:
# Fit the preprocessing once, on the training split only, and reuse the
# transformed matrices for every model. This is what each per-model Pipeline
# did internally (fit_transform on train, transform on test), so the metrics
# are unchanged, but the preprocessor is no longer refit once per model and
# the name-based special case for Naive Bayes is no longer needed.
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

results = []

for name, model in models.items():
    print(f"Training {name}...")

    # Fits the estimator stored in `models` in place.
    model.fit(X_train_transformed, y_train)

    y_pred = model.predict(X_test_transformed)
    # Column 1 of predict_proba is the score of the positive class (label 1).
    y_prob = model.predict_proba(X_test_transformed)[:, 1]

    # Threshold-based metrics use the hard predictions; AUC uses the scores.
    results.append({
        "Model": name,
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, y_prob),
        "MCC": matthews_corrcoef(y_test, y_pred)
    })


Training Logistic Regression...
Training Decision Tree...
Training KNN...
Training Naive Bayes...
Training Random Forest...
Training XGBoost...


In [11]:
# One row of metrics per model, displayed with the highest AUC first.
results_df = pd.DataFrame(data=results)
results_df.sort_values("AUC", ascending=False)


Unnamed: 0,Model,Accuracy,Precision,Recall,F1,AUC,MCC
5,XGBoost,0.905452,0.626717,0.47448,0.540075,0.928747,0.494383
4,Random Forest,0.903682,0.684418,0.327977,0.44345,0.926491,0.430244
0,Logistic Regression,0.845737,0.418244,0.814745,0.552741,0.907922,0.509218
2,KNN,0.896163,0.599002,0.340265,0.433996,0.827721,0.400128
3,Naive Bayes,0.854805,0.405904,0.519849,0.455864,0.810095,0.377358
1,Decision Tree,0.876368,0.46988,0.442344,0.455696,0.68811,0.386256


In [15]:
import numpy as np

# Top-k conversion rate: if only the `top_percent` highest-scored test rows
# were contacted, what fraction of them are actual positives?
top_percent = 0.10
# At least one row, so the mean below is never taken over an empty selection.
top_k = max(1, int(top_percent * len(y_test)))
# round() rather than int(): e.g. 0.29 * 100 is 28.999... in floating point
# and int() would label the column "Top 28%".
rate_col = f"Top {round(top_percent * 100)}% Conversion Rate"

# The estimators in `models` and the shared `preprocessor` were already fitted
# on the training split by the evaluation cell earlier in the notebook, so
# they are reused here rather than retrained. Retraining doubled the runtime
# and, for models without a fixed seed, ranked the rows with a different
# fitted model than the one behind the metrics table.
X_test_transformed = preprocessor.transform(X_test)

top_k_results = []

for name, model in models.items():
    # Score of the positive class for every test row.
    probs = model.predict_proba(X_test_transformed)[:, 1]

    # Rank rows by descending score. A stable sort makes tie-breaking
    # deterministic (tied rows keep their original order), which matters for
    # models that emit only a few distinct scores.
    sorted_indices = np.argsort(-probs, kind="stable")
    top_indices = sorted_indices[:top_k]

    # Share of true positives among the selected rows (positional indexing,
    # because y_test keeps the original DataFrame index).
    conversion_rate = y_test.iloc[top_indices].mean()

    top_k_results.append({
        "Model": name,
        rate_col: conversion_rate
    })

top_k_df = pd.DataFrame(top_k_results)
top_k_df.sort_values(by=rate_col, ascending=False)


Unnamed: 0,Model,Top 10% Conversion Rate
5,XGBoost,0.618363
4,Random Forest,0.600664
0,Logistic Regression,0.591814
2,KNN,0.548673
1,Decision Tree,0.480088
3,Naive Bayes,0.46792
