In [28]:
import pandas as pd
from sklearn.naive_bayes import CategoricalNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Train Data Loading

In [21]:
train_data = pd.read_excel('../Data/train_data.xlsx')
train_data = train_data.convert_dtypes()
train_data.head()

Unnamed: 0,y,age,job,marital,education,default,balance,housing,loan,contact,day_of_month,month,duration,campaign,pdays,previous,poutcome
0,yes,66,retired,married,secondary,no,2048,no,no,cellular,27,aug,212,1,-1,0,Not Specified
1,no,49,admin.,single,primary,no,181,yes,no,Not Specified,8,may,161,3,-1,0,Not Specified
2,no,41,blue-collar,divorced,primary,no,-129,yes,no,cellular,18,may,176,1,-1,0,Not Specified
3,no,42,Not Specified,single,Not Specified,no,1316,no,no,Not Specified,5,jun,285,1,-1,0,Not Specified
4,no,45,services,married,secondary,no,1621,no,no,cellular,18,aug,122,8,-1,0,Not Specified


In [22]:
categorical_cols = train_data.select_dtypes(include = 'string').columns
train_data[categorical_cols] = train_data[categorical_cols].astype('category')
train_data = train_data[categorical_cols].copy()
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36168 entries, 0 to 36167
Data columns (total 10 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   y          36168 non-null  category
 1   job        36168 non-null  category
 2   marital    36168 non-null  category
 3   education  36168 non-null  category
 4   default    36168 non-null  category
 5   housing    36168 non-null  category
 6   loan       36168 non-null  category
 7   contact    36168 non-null  category
 8   month      36168 non-null  category
 9   poutcome   36168 non-null  category
dtypes: category(10)
memory usage: 355.2 KB


# Modeling

In [23]:
cat_cols = train_data.columns[1:]
target_col = "y"

In [24]:
# Evaluation helper
def print_metrics(y_true, y_pred, y_prob):
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=3))
    print(f"ROC AUC: {roc_auc_score(y_true, y_prob):.4f}")

In [25]:
# Encode features and target
X = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1).fit_transform(train_data[cat_cols])
y = (train_data[target_col] == "yes").astype(int).values

# Train Naive Bayes
model = CategoricalNB()
model.fit(X, y)

# Predict and evaluate on training data
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]

# Evaluation helper
def print_metrics(y_true, y_pred, y_prob):
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nClassification Report:\n", classification_report(y_true, y_pred, digits=3))
    print(f"ROC AUC: {roc_auc_score(y_true, y_prob):.4f}")

# Run evaluation
print_metrics(y, y_pred, y_prob)

Confusion Matrix:
 [[30837  1099]
 [ 3004  1228]]

Classification Report:
               precision    recall  f1-score   support

           0      0.911     0.966     0.938     31936
           1      0.528     0.290     0.374      4232

    accuracy                          0.887     36168
   macro avg      0.719     0.628     0.656     36168
weighted avg      0.866     0.887     0.872     36168

ROC AUC: 0.7498


In [27]:
# One-hot encode categorical features
X = OneHotEncoder(handle_unknown = 'ignore', sparse_output = False).fit_transform(train_data[cat_cols])
y = (train_data['y'] == "yes").astype(int).values

# Train logistic regression (with balanced class weights for fairness)
model = LogisticRegression(max_iter=1000, class_weight='balanced')
model.fit(X, y)

# Predict and evaluate
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]
print_metrics(y, y_pred, y_prob)

Confusion Matrix:
 [[25043  6893]
 [ 1640  2592]]

Classification Report:
               precision    recall  f1-score   support

           0      0.939     0.784     0.854     31936
           1      0.273     0.612     0.378      4232

    accuracy                          0.764     36168
   macro avg      0.606     0.698     0.616     36168
weighted avg      0.861     0.764     0.799     36168

ROC AUC: 0.7635


In [34]:
# encode categorical variables
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

X = enc.fit_transform(train_data[cat_cols])
y = (train_data["y"] == "yes").astype(int).values

# shallow interpretable tree
model = DecisionTreeClassifier(
    max_depth=4,          # keep tree small for interpretability
    class_weight='balanced', 
    min_samples_leaf=100  # stabilizes probabilities
)

model.fit(X, y)

# evaluate
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:, 1]
print_metrics(y, y_pred, y_prob)

Confusion Matrix:
 [[24650  7286]
 [ 1782  2450]]

Classification Report:
               precision    recall  f1-score   support

           0      0.933     0.772     0.845     31936
           1      0.252     0.579     0.351      4232

    accuracy                          0.749     36168
   macro avg      0.592     0.675     0.598     36168
weighted avg      0.853     0.749     0.787     36168

ROC AUC: 0.7251


In [39]:
# if not installed: pip install interpret
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.model_selection import train_test_split

cats = ["job","marital","education","default","housing","loan","contact","month","poutcome"]
target = "y"

# train/test split (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    train_data[cats], (train_data[target]=="yes").astype(int),
    test_size=0.2, random_state=42, stratify=(train_data[target]=="yes")
)

# fit EBM (keeps things small & interpretable)
ebm = ExplainableBoostingClassifier(
    interactions=5,        # allow a few pairwise interactions (0 for none)
    validation_size=0.15,
    n_jobs = 1,
    random_state=42
)
ebm.fit(X_train, y_train)

# predict + eval (uses your print_metrics function)
y_pred = ebm.predict(X_train)
y_prob = ebm.predict_proba(X_train)[:,1]
print_metrics(y_train, y_pred, y_prob)


Confusion Matrix:
 [[25218   330]
 [ 2773   613]]

Classification Report:
               precision    recall  f1-score   support

           0      0.901     0.987     0.942     25548
           1      0.650     0.181     0.283      3386

    accuracy                          0.893     28934
   macro avg      0.775     0.584     0.613     28934
weighted avg      0.872     0.893     0.865     28934

ROC AUC: 0.7781
