In [216]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

# Train Data Loading

In [217]:
# Read the training split and switch every column to pandas' inferred nullable dtypes
train_data = pd.read_excel('../../Data/train_data.xlsx').convert_dtypes()
train_data.head()

Unnamed: 0,y,age,job,marital,education,default,balance,housing,loan,contact,day_of_month,month,duration,campaign,pdays,previous,poutcome
0,yes,66,retired,married,secondary,no,2048,no,no,cellular,27,aug,212,1,-1,0,Not Specified
1,no,49,admin.,single,primary,no,181,yes,no,Not Specified,8,may,161,3,-1,0,Not Specified
2,no,41,blue-collar,divorced,primary,no,-129,yes,no,cellular,18,may,176,1,-1,0,Not Specified
3,no,42,Not Specified,single,Not Specified,no,1316,no,no,Not Specified,5,jun,285,1,-1,0,Not Specified
4,no,45,services,married,secondary,no,1621,no,no,cellular,18,aug,122,8,-1,0,Not Specified


In [218]:
# Bucket the day of the month into three segments: days up to 10, 11-20 and 21-31
day_bin_edges = [0, 10, 20, 31]
day_bin_labels = ['start', 'mid', 'end']
train_data['month_segment'] = pd.cut(
    train_data['day_of_month'],
    bins=day_bin_edges,
    labels=day_bin_labels,
    include_lowest=True,
)

# Keep only the string/category columns (target included) and store them as categoricals
categorical_cols = train_data.select_dtypes(include=['string', 'category']).columns
train_data = train_data[categorical_cols].astype('category')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36168 entries, 0 to 36167
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   y              36168 non-null  category
 1   job            36168 non-null  category
 2   marital        36168 non-null  category
 3   education      36168 non-null  category
 4   default        36168 non-null  category
 5   housing        36168 non-null  category
 6   loan           36168 non-null  category
 7   contact        36168 non-null  category
 8   month          36168 non-null  category
 9   poutcome       36168 non-null  category
 10  month_segment  36168 non-null  category
dtypes: category(11)
memory usage: 390.7 KB


In [219]:
# Number of distinct levels per categorical column, followed by the grand total
level_counts = {name: len(train_data[name].unique()) for name in categorical_cols}

for col, cat in level_counts.items():
    print(f'{col}: {cat}')

categories = sum(level_counts.values())
print(f'Total: {categories}')

y: 2
job: 12
marital: 3
education: 4
default: 2
housing: 2
loan: 2
contact: 3
month: 12
poutcome: 4
month_segment: 3
Total: 49


# Test Data Loading

In [220]:
# Read the test split and switch every column to pandas' inferred nullable dtypes
test_data = pd.read_excel('../../Data/test_data.xlsx').convert_dtypes()
test_data.head()

Unnamed: 0,y,age,job,marital,education,default,balance,housing,loan,contact,day_of_month,month,duration,campaign,pdays,previous,poutcome
0,no,42,management,single,tertiary,no,4264,yes,no,Not Specified,28,may,882,1,-1,0,Not Specified
1,no,55,management,married,tertiary,no,0,no,no,Not Specified,30,jun,111,2,-1,0,Not Specified
2,no,45,services,married,secondary,no,4005,yes,no,cellular,17,nov,141,1,-1,0,Not Specified
3,no,51,admin.,married,secondary,no,52,yes,yes,telephone,23,jul,36,15,-1,0,Not Specified
4,no,55,services,married,secondary,no,76,no,no,cellular,5,aug,356,1,-1,0,Not Specified


In [221]:
# Bucket the day of the month with the same edges and labels used for the training split
day_bin_edges = [0, 10, 20, 31]
day_bin_labels = ['start', 'mid', 'end']
test_data['month_segment'] = pd.cut(
    test_data['day_of_month'],
    bins=day_bin_edges,
    labels=day_bin_labels,
    include_lowest=True,
)

# Keep only the string/category columns (target included) and store them as categoricals
categorical_cols = test_data.select_dtypes(include=['string', 'category']).columns
test_data = test_data[categorical_cols].astype('category')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9043 entries, 0 to 9042
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   y              9043 non-null   category
 1   job            9043 non-null   category
 2   marital        9043 non-null   category
 3   education      9043 non-null   category
 4   default        9043 non-null   category
 5   housing        9043 non-null   category
 6   loan           9043 non-null   category
 7   contact        9043 non-null   category
 8   month          9043 non-null   category
 9   poutcome       9043 non-null   category
 10  month_segment  9043 non-null   category
dtypes: category(11)
memory usage: 99.3 KB


In [222]:
# Number of distinct levels per categorical column, followed by the grand total
level_counts = {name: len(test_data[name].unique()) for name in categorical_cols}

for col, cat in level_counts.items():
    print(f'{col}: {cat}')

categories = sum(level_counts.values())
print(f'Total: {categories}')

y: 2
job: 12
marital: 3
education: 4
default: 2
housing: 2
loan: 2
contact: 3
month: 12
poutcome: 4
month_segment: 3
Total: 49


# Modeling

In [223]:
# Name of the binary target column (holds "yes"/"no" in the loaded data)
target_col = "y"

# Every other column of the prepared frame is a categorical feature.
# Selecting by name avoids depending on the target sitting in position 0.
cat_cols = train_data.columns[train_data.columns != target_col]

In [224]:
# Shared reporting helper called after every model fit / scoring step
def print_metrics(y_true, y_pred, y_prob):
    """Print the confusion matrix, classification report and ROC AUC.

    Parameters
    ----------
    y_true : true class labels.
    y_pred : predicted class labels, compared against ``y_true``.
    y_prob : scores handed to ``roc_auc_score`` (callers in this notebook
        pass the predicted probability of class 1).

    Returns
    -------
    None. The three summaries are written to standard output.
    """
    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)

    report = classification_report(y_true, y_pred, digits=3)
    print("\nClassification Report:\n", report)

    auc = roc_auc_score(y_true, y_prob)
    print(f"ROC AUC: {auc:.4f}")

## Naive Bayes

In [225]:
# Map every categorical feature to integer codes and the target to 0/1 ("yes" -> 1)
nb_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X = nb_encoder.fit_transform(train_data[cat_cols])
y = (train_data[target_col] == "yes").astype(int).values

# Fit the categorical Naive Bayes classifier on the encoded features
model = CategoricalNB().fit(X, y)

# Score the same rows the model was fitted on (in-sample performance)
y_prob = model.predict_proba(X)[:, 1]
y_pred = model.predict(X)

# Report the in-sample metrics
print_metrics(y, y_pred, y_prob)

Confusion Matrix:
 [[30833  1103]
 [ 3000  1232]]

Classification Report:
               precision    recall  f1-score   support

           0      0.911     0.965     0.938     31936
           1      0.528     0.291     0.375      4232

    accuracy                          0.887     36168
   macro avg      0.719     0.628     0.656     36168
weighted avg      0.866     0.887     0.872     36168

ROC AUC: 0.7498


In [226]:
# Keep the Naive Bayes class-1 ("yes") probability for each training row
train_data = train_data.assign(NB_prob=y_prob)

In [227]:
# Encode the test features with the category-to-code mapping learned from the
# TRAINING data, so every integer code refers to the same category the model
# saw while fitting. An encoder fitted on the test split only lines up with the
# model when both splits happen to contain identical category sets.
nb_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
nb_encoder.fit(train_data[cat_cols])
X = nb_encoder.transform(test_data[cat_cols])
y = (test_data[target_col] == "yes").astype(int).values

# NOTE(review): a category that never appeared in training is encoded as -1;
# CategoricalNB is expected to reject negative codes, so such drift should
# surface as an error instead of silently shifted codes — confirm.

# Predict and evaluate on the held-out test data
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]

# Run evaluation
print_metrics(y, y_pred, y_prob)

Confusion Matrix:
 [[7766  220]
 [ 766  291]]

Classification Report:
               precision    recall  f1-score   support

           0      0.910     0.972     0.940      7986
           1      0.569     0.275     0.371      1057

    accuracy                          0.891      9043
   macro avg      0.740     0.624     0.656      9043
weighted avg      0.870     0.891     0.874      9043

ROC AUC: 0.7586


In [228]:
# Keep the Naive Bayes class-1 ("yes") probability for each test row
test_data = test_data.assign(NB_prob=y_prob)

## Logistic Regression

In [229]:
# Expand every categorical feature into 0/1 indicator columns
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X = ohe.fit_transform(train_data[cat_cols])
y = (train_data['y'] == "yes").astype(int).values

# Oversample with SMOTE before fitting
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

# Fit the logistic regression on the resampled rows.
# NOTE(review): if SMOTE's default strategy already equalises the classes,
# class_weight='balanced' adds nothing on top of it — confirm and simplify.
model = LogisticRegression(max_iter=1000, class_weight='balanced').fit(X_res, y_res)

# Evaluate on the original (non-resampled) training rows
y_prob = model.predict_proba(X)[:, 1]
y_pred = model.predict(X)
print_metrics(y, y_pred, y_prob)



Confusion Matrix:
 [[24574  7362]
 [ 1636  2596]]

Classification Report:
               precision    recall  f1-score   support

           0      0.938     0.769     0.845     31936
           1      0.261     0.613     0.366      4232

    accuracy                          0.751     36168
   macro avg      0.599     0.691     0.606     36168
weighted avg      0.858     0.751     0.789     36168

ROC AUC: 0.7578


In [230]:
# Keep the logistic-regression class-1 ("yes") probability for each training row
train_data = train_data.assign(Logistic_prob=y_prob)

In [231]:
# One-hot encode the test features with an encoder fitted on the TRAINING data,
# so the indicator columns come out in exactly the layout the model was fitted on.
# An encoder fitted on the test split would derive its columns from whatever
# categories the test rows contain, which only matches by coincidence.
# With handle_unknown='ignore', a category unseen in training becomes an
# all-zero indicator group instead of raising.
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(train_data[cat_cols])
X = ohe.transform(test_data[cat_cols])
y = (test_data['y'] == "yes").astype(int).values

# Predict and evaluate on the held-out test data
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]
print_metrics(y, y_pred, y_prob)

Confusion Matrix:
 [[6187 1799]
 [ 426  631]]

Classification Report:
               precision    recall  f1-score   support

           0      0.936     0.775     0.848      7986
           1      0.260     0.597     0.362      1057

    accuracy                          0.754      9043
   macro avg      0.598     0.686     0.605      9043
weighted avg      0.857     0.754     0.791      9043

ROC AUC: 0.7521


In [232]:
# Keep the logistic-regression class-1 ("yes") probability for each test row
test_data = test_data.assign(Logistic_prob=y_prob)

## Decision Tree

In [233]:
# Encode categorical variables as integer codes; `enc` keeps the fitted
# mapping so the identical encoding can be applied to other data later
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X = enc.fit_transform(train_data[cat_cols])
y = (train_data["y"] == "yes").astype(int).values

# Shallow interpretable tree
model = DecisionTreeClassifier(
    max_depth=4,           # keep tree small for interpretability
    class_weight='balanced',
    min_samples_leaf=100,  # stabilizes probabilities
    random_state=42        # fixed seed (same as SMOTE/EBM) so equally good splits resolve identically on every run
)

model.fit(X, y)

# Evaluate on the training rows (in-sample performance)
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]
print_metrics(y, y_pred, y_prob)

Confusion Matrix:
 [[20945 10991]
 [ 1389  2843]]

Classification Report:
               precision    recall  f1-score   support

           0      0.938     0.656     0.772     31936
           1      0.206     0.672     0.315      4232

    accuracy                          0.658     36168
   macro avg      0.572     0.664     0.543     36168
weighted avg      0.852     0.658     0.718     36168

ROC AUC: 0.7277


In [234]:
# Keep the decision-tree class-1 ("yes") probability for each training row
train_data = train_data.assign(DT_prob=y_prob)

In [235]:
# Encode the test features with `enc`, the encoder fitted on the training data
# in the decision-tree training cell, so the tree's split thresholds are applied
# to the same category codes it was fitted on. Refitting an encoder on the test
# split would only reproduce those codes if both splits share identical categories.
# Categories unseen in training are encoded as -1 (unknown_value).
X = enc.transform(test_data[cat_cols])
y = (test_data["y"] == "yes").astype(int).values

# Evaluate on the held-out test data
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]
print_metrics(y, y_pred, y_prob)

Confusion Matrix:
 [[5292 2694]
 [ 351  706]]

Classification Report:
               precision    recall  f1-score   support

           0      0.938     0.663     0.777      7986
           1      0.208     0.668     0.317      1057

    accuracy                          0.663      9043
   macro avg      0.573     0.665     0.547      9043
weighted avg      0.852     0.663     0.723      9043

ROC AUC: 0.7280


In [236]:
# Keep the decision-tree class-1 ("yes") probability for each test row
test_data = test_data.assign(DT_prob=y_prob)

## EBM

In [237]:
# Encode categoricals as integer codes (EBM can also take labels, but SMOTE needs numerics);
# `enc` keeps the fitted mapping so the identical encoding can be applied to other data later
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
X_enc = enc.fit_transform(train_data[cat_cols])
y = (train_data["y"] == "yes").astype(int).values

# SMOTE oversampling.
# NOTE(review): SMOTE builds synthetic rows by interpolating between neighbours, so on
# ordinal category codes it can emit fractional values that match no real category;
# a categorical-aware sampler (e.g. imblearn's SMOTEN) may be a better fit — confirm.
sm = SMOTE(random_state = 42)
X_res, y_res = sm.fit_resample(X_enc, y)

# Train the EBM on the oversampled data
ebm = ExplainableBoostingClassifier(
    interactions = 2,
    max_rounds = 400,
    validation_size = 0.15,
    random_state = 42,
    n_jobs = -1
)

ebm.fit(X_res, y_res)

# Evaluate on the original (non-resampled) training data, thresholding at 0.5
y_prob = ebm.predict_proba(X_enc)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)
print_metrics(y, y_pred, y_prob)



Confusion Matrix:
 [[27120  4816]
 [ 1933  2299]]

Classification Report:
               precision    recall  f1-score   support

           0      0.933     0.849     0.889     31936
           1      0.323     0.543     0.405      4232

    accuracy                          0.813     36168
   macro avg      0.628     0.696     0.647     36168
weighted avg      0.862     0.813     0.833     36168

ROC AUC: 0.7590


In [238]:
# Keep the EBM class-1 ("yes") probability for each training row
train_data = train_data.assign(EBM_prob=y_prob)

In [239]:
# Encode the test features with `enc`, the encoder fitted on the training data
# in the EBM training cell, so the codes match what the EBM was fitted on.
# Refitting an encoder on the test split would only reproduce those codes if
# both splits share identical categories.
# Categories unseen in training are encoded as -1 (unknown_value).
X_enc = enc.transform(test_data[cat_cols])
y = (test_data["y"] == "yes").astype(int).values

# Evaluate on the held-out test data, thresholding at 0.5
y_prob = ebm.predict_proba(X_enc)[:, 1]
y_pred = (y_prob >= 0.5).astype(int)
print_metrics(y, y_pred, y_prob)

Confusion Matrix:
 [[6822 1164]
 [ 490  567]]

Classification Report:
               precision    recall  f1-score   support

           0      0.933     0.854     0.892      7986
           1      0.328     0.536     0.407      1057

    accuracy                          0.817      9043
   macro avg      0.630     0.695     0.649      9043
weighted avg      0.862     0.817     0.835      9043

ROC AUC: 0.7580


In [240]:
# Keep the EBM class-1 ("yes") probability for each test row
test_data = test_data.assign(EBM_prob=y_prob)

# Saving Outputs

In [252]:
# Model probability columns to export, selected by name so the export does not
# depend on them being the last four columns of the frame
prob_cols = ['NB_prob', 'Logistic_prob', 'DT_prob', 'EBM_prob']

# Reload the raw training file to recover the non-categorical columns
data = pd.read_excel('../../Data/train_data.xlsx')

# The join aligns on the row index, so both frames must cover the same rows
assert len(data) == len(train_data), "raw and scored training frames differ in length"
data = data.join(train_data[prob_cols])

# Drop the raw categorical feature columns but keep the target; derived columns
# such as month_segment are not in the raw file, hence the membership filter
raw_feature_cols = [
    name for name in categorical_cols
    if name != target_col and name in data.columns
]
data = data.drop(columns=raw_feature_cols)
data.to_excel('../../Data/train_data_prob_added.xlsx', index=False)

In [251]:
# Model probability columns to export, selected by name so the export does not
# depend on them being the last four columns of the frame
prob_cols = ['NB_prob', 'Logistic_prob', 'DT_prob', 'EBM_prob']

# Reload the raw test file to recover the non-categorical columns
data = pd.read_excel('../../Data/test_data.xlsx')

# The join aligns on the row index, so both frames must cover the same rows
assert len(data) == len(test_data), "raw and scored test frames differ in length"
data = data.join(test_data[prob_cols])

# Drop the raw categorical feature columns but keep the target; derived columns
# such as month_segment are not in the raw file, hence the membership filter
raw_feature_cols = [
    name for name in categorical_cols
    if name != target_col and name in data.columns
]
data = data.drop(columns=raw_feature_cols)
data.to_excel('../../Data/test_data_prob_added.xlsx', index=False)