In [1]:
# import libraries
import numpy as np
import pandas as pd 

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score

In [2]:
# Read the semicolon-separated CSV into a DataFrame and preview its first rows
csv_path = "../data/bank-full.csv"
bank_df = pd.read_csv(csv_path, sep=";")
bank_df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


#### Data Preparation

In [3]:
# print the shape of the dataset
# `DataFrame.shape` is a (rows, columns) tuple, so the label names both
# dimensions instead of claiming to show only the column count.
shape = bank_df.shape
print(f"Shape of the dataset (rows, columns): {shape}")

Number of columns in the dataset: (45211, 17)


In [4]:
# Summarise the dataset: index range, per-column non-null counts and dtypes,
# and the approximate memory usage of the frame.
bank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [5]:
# Count the missing entries in every column (0 means the column is complete)
bank_df.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [4]:
# Count the rows that repeat an earlier row across all columns
duplicated = bank_df.duplicated().sum()
# NOTE(review): `duplicate` looks like an accidental second target of the
# original chained assignment; it is kept in case a later cell reads it.
duplicate = duplicated
print(f"Number of duplicated rows: {duplicated}")

Number of duplicated rows: 0


In [5]:
# Build the model-ready frame `corrected_df` from the raw `bank_df`.
# The raw frame is deliberately left untouched so this cell can be re-run:
# overwriting `bank_df` made a second run fail with a KeyError on the
# already-dropped `duration` column, and made `corrected_df` a mere alias.

# drop the duration column so it is not used as a predictor
# NOTE(review): presumably because the call duration is only known once the
# contact has taken place -- confirm against the dataset description.
corrected_df = bank_df.drop(columns=['duration'])

# Lookup tables for the hand-encoded columns, built once instead of per row.
MONTH_TO_NUMBER = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,
    'may': 5, 'jun': 6, 'jul': 7, 'aug': 8,
    'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
}
YES_NO_TO_INT = {'yes': 1, 'no': 0}

# convert the month column to its calendar number and the yes/no columns
# `loan` and `y` to 1/0
for col, mapping in (
    ('month', MONTH_TO_NUMBER),
    ('loan', YES_NO_TO_INT),
    ('y', YES_NO_TO_INT),
):
    encoded = corrected_df[col].map(mapping)
    # `.map` yields NaN for values absent from the mapping; raise KeyError
    # instead, as a plain dict lookup would, so bad data is not silently kept.
    not_mapped = encoded.isna()
    if not_mapped.any():
        unknown = corrected_df.loc[not_mapped, col].unique().tolist()
        raise KeyError(f"Unexpected value(s) in column '{col}': {unknown}")
    corrected_df[col] = encoded

# Label-encode every remaining text (object dtype) column, one fresh encoder
# per column.
# NOTE(review): integer codes impose an arbitrary order on nominal columns;
# consider one-hot encoding if that matters for the chosen model.
for col in corrected_df.columns:
    if corrected_df[col].dtype == 'O':
        le = LabelEncoder()
        corrected_df[col] = le.fit_transform(corrected_df[col])

corrected_df.head()
   

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,5,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,5,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,5,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,5,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,5,1,-1,0,3,0


In [6]:
# Separate the target column `y` from the predictor columns
y = corrected_df['y']
X = corrected_df.drop(columns=['y'])

#### Machine Learning Model Development 

In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the dimensions of each split
for label, part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"Shape of {label}: {part.shape}")

Shape of X_train: (36168, 15)
Shape of y_train: (36168,)
Shape of X_test: (9043, 15)
Shape of y_test: (9043,)


In [8]:
# Scale the X_train and X_test sets
# The scaler is fitted on the training split only; the test split is
# transformed with those same fitted statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# print shape (labels spelled "Shape", matching the train/test split cell)
print(f"Shape of X_train_scaled: {X_train_scaled.shape}")
print(f"Shape of X_test_scaled: {X_test_scaled.shape}")

Sahpe of X_train_scaled: (36168, 15)
Sahpe of X_test_scaled: (9043, 15)


In [9]:
# NOTE(review): these two imports would be better placed in the notebook's
# top import cell; they stay here so this cell remains self-contained.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

# Randomized hyper-parameter search: for every candidate model, sample
# `n_iter` configurations, evaluate each with 5-fold cross-validation on the
# scaled training data, and record the best configuration with its mean CV
# score. No `scoring` is passed, so the estimator's default scorer is used.

# cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Regularisation strengths shared by both logistic-regression sub-spaces
c_values = [0.01, 0.1, 1, 10, 100]

# define models and parameter grids
models_params = {
    "Logistic Regression": {
        # sag/saga are stochastic solvers, so the estimator is seeded too
        "model": LogisticRegression(max_iter=5000, random_state=42),
        # Of the solvers searched here only saga supports the l1 penalty;
        # lbfgs and sag accept l2 only. A single dict crossing both penalties
        # with all three solvers lets the search sample invalid pairs whose
        # fits fail, so the space is split into two valid sub-spaces.
        # The search first picks one of the dicts, then samples within it.
        "params": [
            {"penalty": ['l1'], "C": c_values, "solver": ['saga']},
            {"penalty": ['l2'], "C": c_values, "solver": ['lbfgs', 'sag', 'saga']},
        ],
        "n_iter": 2
    },

    "Decision Tree": {
        # seeded so ties between equally good splits are broken repeatably
        "model": DecisionTreeClassifier(random_state=42),
        "params": {
            "criterion": ["gini", "entropy", "log_loss"],
            "max_depth": list(np.arange(2, 31, 2)),
            "min_samples_split": np.arange(2, 21, 2),
            "min_samples_leaf": np.arange(1, 21, 2)
        },
        "n_iter": 20
    }
}

results = []

for name, mp in models_params.items():
    search = RandomizedSearchCV(
        estimator=mp["model"],
        param_distributions=mp["params"],
        n_iter=mp["n_iter"],
        cv=kf,
        # fixed seed: the same candidates are sampled on every run
        random_state=42
    )
    search.fit(X_train_scaled, y_train)

    results.append({
        "Model": name,
        "Best Params": search.best_params_,
        "Best Score": search.best_score_
    })

# convert to Data Frame
results_df = pd.DataFrame(results)
results_df


Unnamed: 0,Model,Best Params,Best Score
0,Logistic Regression,"{'solver': 'saga', 'penalty': 'l2', 'C': 1}",0.883765
1,Decision Tree,"{'min_samples_split': 6, 'min_samples_leaf': 1...",0.893331
