In [44]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Train Data Loading

In [45]:
train_data = pd.read_excel('../../Data/train_data.xlsx')
train_data = train_data.convert_dtypes()
train_data.head()

Unnamed: 0,y,age,job,marital,education,default,balance,housing,loan,contact,day_of_month,month,duration,campaign,pdays,previous,poutcome
0,yes,66,retired,married,secondary,no,2048,no,no,cellular,27,aug,212,1,-1,0,Not Specified
1,no,49,admin.,single,primary,no,181,yes,no,Not Specified,8,may,161,3,-1,0,Not Specified
2,no,41,blue-collar,divorced,primary,no,-129,yes,no,cellular,18,may,176,1,-1,0,Not Specified
3,no,42,Not Specified,single,Not Specified,no,1316,no,no,Not Specified,5,jun,285,1,-1,0,Not Specified
4,no,45,services,married,secondary,no,1621,no,no,cellular,18,aug,122,8,-1,0,Not Specified


In [46]:
categorical_cols = train_data.select_dtypes(include = 'string').columns
train_data[categorical_cols] = train_data[categorical_cols].astype('category')
train_data = train_data[categorical_cols].copy()
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36168 entries, 0 to 36167
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   y          36168 non-null  category
 1   job        36168 non-null  category
 2   marital    36168 non-null  category
 3   education  36168 non-null  category
 4   default    36168 non-null  category
 5   housing    36168 non-null  category
 6   loan       36168 non-null  category
 7   contact    36168 non-null  category
 8   month      36168 non-null  category
 9   poutcome   36168 non-null  category
dtypes: category(10)
memory usage: 355.2 KB


In [47]:
categories = 0

for col in categorical_cols:
    cat = len(train_data[col].unique())
    categories += cat
    print(f'{col}: {cat}')

print(f'Total: {categories}')

y: 2
job: 12
marital: 3
education: 4
default: 2
housing: 2
loan: 2
contact: 3
month: 12
poutcome: 4
Total: 46


# Modeling

In [48]:
cat_cols = train_data.columns[1:]
target_col = "y"

In [49]:
# Evaluation helper
def print_metrics(y_true, y_pred, y_prob):
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=3))
    print(f"ROC AUC: {roc_auc_score(y_true, y_prob):.4f}")

In [50]:
# Encode features and target
X = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1).fit_transform(train_data[cat_cols])
y = (train_data[target_col] == "yes").astype(int).values

# Train Naive Bayes
model = CategoricalNB()
model.fit(X, y)

# Predict and evaluate on training data
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]

# Evaluation helper
def print_metrics(y_true, y_pred, y_prob):
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=3))
    print(f"ROC AUC: {roc_auc_score(y_true, y_prob):.4f}")

# Run evaluation
print_metrics(y, y_pred, y_prob)

Confusion Matrix:
 [[30837  1099]
 [ 3004  1228]]

Classification Report:
               precision    recall  f1-score   support

           0      0.911     0.966     0.938     31936
           1      0.528     0.290     0.374      4232

    accuracy                          0.887     36168
   macro avg      0.719     0.628     0.656     36168
weighted avg      0.866     0.887     0.872     36168

ROC AUC: 0.7498


In [51]:
train_data['NB_prob'] = y_prob

In [52]:
# One-hot encode categorical features
X = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False).fit_transform(train_data[cat_cols])
y = (train_data['y'] == "yes").astype(int).values

# SMOTE oversampling
sm = SMOTE(random_state = 42)
X_res, y_res = sm.fit_resample(X, y)

# Train logistic regression (with balanced class weights for fairness)
model = LogisticRegression(max_iter=1000, class_weight = 'balanced')
model.fit(X_res, y_res)

# Predict and evaluate
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]
print_metrics(y, y_pred, y_prob)



Confusion Matrix:
 [[24767  7169]
 [ 1637  2595]]

Classification Report:
               precision    recall  f1-score   support

           0      0.938     0.776     0.849     31936
           1      0.266     0.613     0.371      4232

    accuracy                          0.757     36168
   macro avg      0.602     0.694     0.610     36168
weighted avg      0.859     0.757     0.793     36168

ROC AUC: 0.7588


In [53]:
train_data['Logistic_prob'] = y_prob

In [54]:
# encode categorical variables
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X = enc.fit_transform(train_data[cat_cols])
y = (train_data["y"] == "yes").astype(int).values

# shallow interpretable tree
model = DecisionTreeClassifier(
    max_depth=4,          # keep tree small for interpretability
    class_weight='balanced', 
    min_samples_leaf=100  # stabilizes probabilities
)

model.fit(X, y)

# evaluate
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]
print_metrics(y, y_pred, y_prob)

Confusion Matrix:
 [[20945 10991]
 [ 1389  2843]]

Classification Report:
               precision    recall  f1-score   support

           0      0.938     0.656     0.772     31936
           1      0.206     0.672     0.315      4232

    accuracy                          0.658     36168
   macro avg      0.572     0.664     0.543     36168
weighted avg      0.852     0.658     0.718     36168

ROC AUC: 0.7277


In [55]:
train_data['DT_prob'] = y_prob

In [56]:
from interpret.glassbox import ExplainableBoostingClassifier

# encode categoricals (EBM can also take labels, but SMOTE needs numerics)
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_enc = enc.fit_transform(train_data[cat_cols])
y = (train_data["y"] == "yes").astype(int).values

# SMOTE oversampling
sm = SMOTE(random_state = 42)
X_res, y_res = sm.fit_resample(X_enc, y)

# train EBM on oversampled data
ebm = ExplainableBoostingClassifier(
    interactions = 2,
    max_rounds = 400,
    validation_size = 0.15,
    random_state = 42,
    n_jobs = -1
)

ebm.fit(X_res, y_res)

# evaluate on original training data
y_prob = ebm.predict_proba(X_enc)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)
print_metrics(y, y_pred, y_prob)



Confusion Matrix:
 [[25667  6269]
 [ 1810  2422]]

Classification Report:
               precision    recall  f1-score   support

           0      0.934     0.804     0.864     31936
           1      0.279     0.572     0.375      4232

    accuracy                          0.777     36168
   macro avg      0.606     0.688     0.619     36168
weighted avg      0.857     0.777     0.807     36168

ROC AUC: 0.7517


In [57]:
train_data['EBM_prob'] = y_prob

In [58]:
train_data

Unnamed: 0,y,job,marital,education,default,housing,loan,contact,month,poutcome,NB_prob,Logistic_prob,DT_prob,EBM_prob
0,yes,retired,married,secondary,no,no,no,cellular,aug,Not Specified,0.251396,0.574555,0.525657,0.452966
1,no,admin.,single,primary,no,yes,no,Not Specified,may,Not Specified,0.011975,0.207168,0.286525,0.236420
2,no,blue-collar,divorced,primary,no,yes,no,cellular,may,Not Specified,0.021684,0.459405,0.403237,0.371329
3,no,Not Specified,single,Not Specified,no,no,no,Not Specified,jun,Not Specified,0.066417,0.299713,0.286525,0.223903
4,no,services,married,secondary,no,no,no,cellular,aug,Not Specified,0.101204,0.413267,0.525657,0.452537
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36163,no,entrepreneur,divorced,secondary,no,no,no,Not Specified,jun,Not Specified,0.029094,0.330930,0.211884,0.214554
36164,no,services,married,secondary,no,no,no,telephone,aug,Not Specified,0.091020,0.304307,0.525657,0.308127
36165,no,unemployed,married,secondary,no,yes,no,telephone,nov,Not Specified,0.065673,0.214639,0.403237,0.146205
36166,no,services,married,secondary,no,no,no,Not Specified,jun,Not Specified,0.026116,0.343795,0.211884,0.231184


In [60]:
train_data.to_csv('../../Data/train_data_prob_added.csv', index = False)