In [4]:
# Data handling
import numpy as np
import pandas as pd

# Preprocessing
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Model
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm               # optional: for stats-style summary

# Hyperparameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, confusion_matrix, classification_report
)

# Calibration / probability utilities (optional)
from sklearn.calibration import CalibratedClassifierCV, calibration_curve

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utility
import joblib   # save/load models
import warnings
# Silence only deprecation-style chatter emitted by libraries. A blanket
# "ignore" would also hide runtime and convergence warnings raised while
# fitting models, which are needed to judge whether a fit can be trusted.
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [6]:
# Read the bank dataset from the working directory and preview the first rows.
bank_df = pd.read_csv("bank.csv")
bank_df.head()  # head() shows 5 rows by default

Unnamed: 0,age,job,marital,education,default,balance,housing-loan,personal-loan,current-campaign,previous-campaign,subscribed
0,30,unemployed,married,primary,no,1787,no,no,1,0,no
1,33,services,married,secondary,no,4789,yes,yes,1,4,no
2,35,management,single,tertiary,no,1350,yes,no,1,1,no
3,30,management,married,tertiary,no,1476,yes,yes,4,0,no
4,59,blue-collar,married,secondary,no,0,yes,no,1,0,no


In [8]:
# Column dtypes and non-null counts: a quick check for missing values and
# for which columns are categorical (object) before encoding.
bank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                4521 non-null   int64 
 1   job                4521 non-null   object
 2   marital            4521 non-null   object
 3   education          4521 non-null   object
 4   default            4521 non-null   object
 5   balance            4521 non-null   int64 
 6   housing-loan       4521 non-null   object
 7   personal-loan      4521 non-null   object
 8   current-campaign   4521 non-null   int64 
 9   previous-campaign  4521 non-null   int64 
 10  subscribed         4521 non-null   object
dtypes: int64(4), object(7)
memory usage: 388.6+ KB


In [12]:
# Predictor columns: every column of bank_df except the 'subscribed' target.
X_features = bank_df.columns.tolist()
X_features.remove('subscribed')
X_features

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing-loan',
 'personal-loan',
 'current-campaign',
 'previous-campaign']

In [14]:
# One-hot encode the predictors. drop_first=True omits one dummy level per
# categorical column, so the remaining dummies are not perfectly collinear
# with the intercept that is added when the Logit model is fitted.
encoded_bank_df = pd.get_dummies(bank_df[X_features], drop_first = True)

In [16]:
# Design matrix: the dummy-encoded predictors (same object, not a copy).
X = encoded_bank_df
# Target: 1 where 'subscribed' equals 'yes', 0 for anything else.
Y = (bank_df['subscribed'] == 'yes').astype('int64')

In [28]:
# Fit a statsmodels Logit on the dummy-encoded predictors.
# Work on a copy so the shared encoded_bank_df frame is left untouched.
X = encoded_bank_df.copy()

# 1) force numeric: anything that cannot be parsed becomes NaN
X = X.apply(pd.to_numeric, errors="coerce")

# 2) build the binary target. A dict mapping (instead of a catch-all
#    "else 0") leaves missing or unexpected labels as NaN, so the mask in
#    step 3 really drops them rather than silently counting them as class 0.
Y = bank_df["subscribed"].map({"yes": 1, "no": 0})
unmapped_rows = int(Y.isna().sum())
if unmapped_rows:
    print(f"Warning - {unmapped_rows} rows have a 'subscribed' value other than yes/no and will be dropped")
Y = pd.to_numeric(Y, errors="coerce").astype("Int64")  # nullable int

# 3) keep only rows that are complete in both X and Y (indexes stay aligned)
mask = X.notna().all(axis=1) & Y.notna()
X = X.loc[mask].astype(float)
Y = Y.loc[mask].astype(int)

# 4) check for infinite values and remove if present
finite_mask = np.isfinite(X).all(axis=1)
X = X.loc[finite_mask]
Y = Y.loc[finite_mask]

# 5) add constant (intercept column) and fit; disp=False silences the
#    optimizer's iteration log
X_const = sm.add_constant(X, has_constant='add')
logit_model = sm.Logit(Y, X_const).fit(disp=False)
print(logit_model.summary())

                           Logit Regression Results                           
Dep. Variable:             subscribed   No. Observations:                 4521
Model:                          Logit   Df Residuals:                     4497
Method:                           MLE   Df Model:                           23
Date:                Tue, 16 Sep 2025   Pseudo R-squ.:                 0.06090
Time:                        18:19:09   Log-Likelihood:                -1517.1
converged:                       True   LL-Null:                       -1615.5
Covariance Type:            nonrobust   LLR p-value:                 1.487e-29
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  -1.7573      0.380     -4.625      0.000      -2.502      -1.013
age                     0.0078      0.006      1.340      0.180      -0.004       0.019
balance         

In [34]:
def get_significant_vars(model, alpha=0.05):
    """
    List the terms of a fitted statsmodels model whose p-value is below alpha.

    model : fitted results object exposing ``pvalues`` as a pandas Series
    alpha : significance threshold; the comparison is strict (p < alpha)

    Returns the index labels of ``model.pvalues`` that pass the threshold,
    in their original order. Every term is considered, so an intercept
    label such as 'const' is returned when its p-value qualifies.
    """
    pvals = model.pvalues
    below_alpha = pvals.lt(alpha)
    return pvals.loc[below_alpha].index.tolist()

# usage: collect the terms of the fitted logit_model with p < 0.05 (the
# default alpha) and print them; the result is kept in significant_vars
significant_vars = get_significant_vars(logit_model)
print(significant_vars)

['const', 'current-campaign', 'previous-campaign', 'job_retired', 'marital_married', 'education_tertiary', 'housing-loan_yes', 'personal-loan_yes']


In [46]:
# Show the dummy-encoded column names.
# NOTE(review): the output recorded for this cell lists underscore names
# (e.g. 'current_campaign'), while the source columns are hyphenated
# ('current-campaign') and no rename step is visible in the notebook.
# This looks like state left over from a removed cell - confirm by
# running on a fresh kernel.
print(list(encoded_bank_df.columns))


['age', 'balance', 'current_campaign', 'previous_campaign', 'job_blue_collar', 'job_entrepreneur', 'job_housemaid', 'job_management', 'job_retired', 'job_self_employed', 'job_services', 'job_student', 'job_technician', 'job_unemployed', 'job_unknown', 'marital_married', 'marital_single', 'education_secondary', 'education_tertiary', 'education_unknown', 'default_yes', 'housing_loan_yes', 'personal_loan_yes']


In [48]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# Feature list for the model, written with underscore-style names.
X_features = ['age', 'balance', 'current_campaign', 'previous_campaign',
              'job_blue_collar', 'job_entrepreneur', 'job_housemaid',
              'job_management', 'job_retired', 'job_self_employed',
              'job_services', 'job_student', 'job_technician',
              'job_unemployed', 'job_unknown', 'marital_married',
              'marital_single', 'education_secondary', 'education_tertiary',
              'education_unknown', 'default_yes', 'housing_loan_yes',
              'personal_loan_yes']

# 1. select columns safely.
#    The source data uses hyphenated names ('current-campaign', 'job_blue-collar',
#    ...), so normalise '-' to '_' on a renamed view before matching. Without
#    this, a fresh kernel would silently skip every hyphenated feature.
#    rename() returns a new frame; encoded_bank_df itself is not modified,
#    and the step is a no-op if the names are already underscore-style.
encoded_normalized = encoded_bank_df.rename(columns=lambda c: str(c).replace('-', '_'))
available = [c for c in X_features if c in encoded_normalized.columns]
missing = [c for c in X_features if c not in encoded_normalized.columns]
if missing:
    print("Warning - these requested features are missing and will be skipped:", missing)

X = encoded_normalized[available].copy()

# 2. ensure numeric, coerce non-numeric to NaN
X = X.apply(pd.to_numeric, errors='coerce')

# 3. prepare target Y (binary 0/1). adjust column name if different.
#    A dict mapping leaves missing or unexpected labels as NaN so that
#    step 4 drops them, instead of silently counting them as class 0.
Y = bank_df['subscribed'].map({'yes': 1, 'no': 0})
unmapped_rows = int(Y.isna().sum())
if unmapped_rows:
    print(f"Warning - {unmapped_rows} rows have a 'subscribed' value other than yes/no and will be dropped")
Y = pd.to_numeric(Y, errors='coerce')

# 4. drop rows with any NaN in X or Y and ensure finite values
mask = X.notna().all(axis=1) & Y.notna() & np.isfinite(Y)
mask &= np.isfinite(X).all(axis=1)
X = X.loc[mask].astype(float)
Y = Y.loc[mask].astype(int)

# 5. add constant (intercept column) and fit; disp=False silences the
#    optimizer's iteration log
X_const = sm.add_constant(X, has_constant='add')
logit_model = sm.Logit(Y, X_const).fit(disp=False)

# 6. output
print(logit_model.summary())

# 7. odds ratios and 95% CI
#    Exponentiating the logit coefficients and their confidence bounds
#    expresses each effect as a multiplicative change in the odds.
params = logit_model.params
conf = logit_model.conf_int()  # column 0 = lower bound, column 1 = upper bound
odds_ratios = np.exp(params)
or_ci_lower, or_ci_upper = np.exp(conf[0]), np.exp(conf[1])

or_table = pd.DataFrame({'coef': params}).assign(
    odds_ratio=odds_ratios,
    or_ci_lower=or_ci_lower,
    or_ci_upper=or_ci_upper,
    pvalue=logit_model.pvalues,
)

# Most significant terms first.
print("\nOdds ratios (exp(coef)) with 95% CI and p-values:")
print(or_table.sort_values('pvalue'))

# 8. helper: names of the terms whose p-value is below alpha (default 0.05)
def get_significant_vars(model, alpha=0.05):
    """Return the labels of ``model.pvalues`` whose value is strictly below alpha.

    ``model`` is a fitted results object exposing ``pvalues`` as a pandas
    Series; labels are returned in their original order.
    """
    return [term for term, p_value in model.pvalues.items() if p_value < alpha]

# Collect the terms of the refitted logit_model that pass the helper's
# default alpha of 0.05 and print them. The label in the message is
# hard-coded, so it must be edited if a different alpha is passed.
significant_vars = get_significant_vars(logit_model)
print("\nSignificant variables (p < 0.05):", significant_vars)


                           Logit Regression Results                           
Dep. Variable:             subscribed   No. Observations:                 4521
Model:                          Logit   Df Residuals:                     4497
Method:                           MLE   Df Model:                           23
Date:                Tue, 16 Sep 2025   Pseudo R-squ.:                 0.06090
Time:                        18:28:25   Log-Likelihood:                -1517.1
converged:                       True   LL-Null:                       -1615.5
Covariance Type:            nonrobust   LLR p-value:                 1.487e-29
                          coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                  -1.7573      0.380     -4.625      0.000      -2.502      -1.013
age                     0.0078      0.006      1.340      0.180      -0.004       0.019
balance         