In [23]:
import pandas as pd
import numpy as np

In [27]:
df = pd.read_csv('bank/bank-full.csv', delimiter=';')
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [32]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

# DATA PREPARATION

In [33]:
df_selected_columns = df[['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']]

In [37]:
missing_values = df_selected_columns.isnull().sum()

missing_values

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

# QUESTION 1

In [42]:
df_selected_columns['education'].unique()

array(['tertiary', 'secondary', 'unknown', 'primary'], dtype=object)

In [40]:
education_mode = df_selected_columns['education'].mode()[0]
education_mode

'secondary'

# QUESTION 2

In [52]:
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
categorical_columns = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome', 'y']

correlation_matrix = df_selected_columns[numerical_columns].corr()

# Compute the correlations for specific pairs using corrwith
# age_balance_corr = df_selected_columns[['age', 'balance']].corrwith(df_selected_columns['balance']).loc['age']
# day_campaign_corr = df_selected_columns[['day', 'campaign']].corrwith(df_selected_columns['campaign']).loc['day']
# day_pdays_corr = df_selected_columns[['day', 'pdays']].corrwith(df_selected_columns['pdays']).loc['day']
# pdays_previous_corr = df_selected_columns[['pdays', 'previous']].corrwith(df_selected_columns['previous']).loc['pdays']

age_balance_corr = correlation_matrix.loc['age', 'balance']
day_campaign_corr = correlation_matrix.loc['day', 'campaign']
day_pdays_corr = correlation_matrix.loc['day', 'pdays']
pdays_previous_corr = correlation_matrix.loc['pdays', 'previous']

print(f"Correlation between age and balance: {age_balance_corr:.2f}")
print(f"Correlation between day and campaign: {day_campaign_corr:.2f}")
print(f"Correlation between day and pdays: {day_pdays_corr:.2f}")
print(f"Correlation between pdays and previous: {pdays_previous_corr:.2f}")

Correlation between age and balance: 0.10
Correlation between day and campaign: 0.16
Correlation between day and pdays: -0.09
Correlation between pdays and previous: 0.45


In [62]:
# TARGET ENCODING
df_selected_columns.loc[:, 'y'] = df_selected_columns['y'].replace({'yes': 1, 'no': 0})

df_selected_columns['y']

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [76]:
# SPLIT THE DATA
from sklearn.model_selection import train_test_split

df_fulltrain, df_test = train_test_split(df_selected_columns, test_size=0.2, random_state=42)
len(df_fulltrain), len(df_test)

(36168, 9043)

In [77]:
df_train, df_val = train_test_split(df_fulltrain, test_size=0.25, random_state=1)
len(df_train), len(df_val)

(27126, 9042)

In [78]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

# QUESTION 3

In [92]:
from sklearn.metrics import mutual_info_score
categorical_columns = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']

def mutual_info_y_score(series):
    score = mutual_info_score(series, y_train)
    return round(score, 2)

mi = df_train[categorical_columns].apply(mutual_info_y_score)
mi.sort_values(ascending=False)

poutcome     0.03
month        0.02
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
dtype: float64

# QUESTION 4

In [117]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)

In [118]:
train_dicts = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
dv.fit(train_dicts)

X_train = dv.fit_transform(train_dicts)
dv.get_feature_names_out()



array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [119]:
val_dicts = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
X_val = dv.transform(val_dicts)

test_dicts = df_test[categorical_columns + numerical_columns].to_dict(orient='records')
X_test = dv.transform(test_dicts)

In [121]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

In [128]:
#model.coef_ # weights

In [129]:
#model.intercept_[0] # bias

In [125]:
y_pred = model.predict_proba(X_val)[: , 1]
y_pred

array([0.17232025, 0.42919774, 0.00697734, ..., 0.03771159, 0.74154147,
       0.52315072])

In [126]:
term_depo = (y_pred >= 0.5)
term_depo

array([False, False, False, ..., False,  True,  True])

In [127]:
(y_val == term_depo.astype(int)).mean()

0.9021234240212342

In [131]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = term_depo.astype(int)  # converting boolean to int
df_pred['actual'] = y_val

df_pred

Unnamed: 0,probability,prediction,actual
0,0.172320,0,0
1,0.429198,0,1
2,0.006977,0,0
3,0.041985,0,0
4,0.147886,0,1
...,...,...,...
9037,0.012134,0,0
9038,0.037620,0,0
9039,0.037712,0,0
9040,0.741541,1,0


In [136]:
df_pred['correct'] = df_pred.prediction == df_pred.actual
accuracy = round(df_pred['correct'].mean(), 2)
accuracy

0.9

# QUESTION 5

In [172]:
def get_model_accuracy(categorical_columns, numerical_columns, df_train, df_val, y_train, y_val):
    dv = DictVectorizer(sparse=False)
    
    train_dicts = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
    dv.fit(train_dicts)
    X_train = dv.fit_transform(train_dicts)
    
    val_dicts = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
    X_val = dv.transform(val_dicts)

    test_dicts = df_test[categorical_columns + numerical_columns].to_dict(orient='records')
    X_test = dv.transform(test_dicts)
    
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_val)[: , 1]
    term_depo = (y_pred >= 0.5)
    
    df_pred = pd.DataFrame()
    df_pred['probability'] = y_pred
    df_pred['prediction'] = term_depo.astype(int)
    df_pred['actual'] = y_val
    
    df_pred['correct'] = df_pred.prediction == df_pred.actual
    #accuracy = round(df_pred['correct'].mean(), 2)
    accuracy = df_pred['correct'].mean()
    
    return accuracy

In [173]:
categorical_columns = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
numerical_columns = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
get_model_accuracy(categorical_columns, numerical_columns, df_train, df_val, y_train, y_val)

0.9021234240212342

In [174]:
def feature_importance_analysis(categorical_columns, numerical_columns, df_train, df_val, y_train, y_val):
    original_accuracy = get_model_accuracy(categorical_columns, numerical_columns, df_train, df_val, y_train, y_val)
    print(f"Original Accuracy: {original_accuracy:.4f}")

    combined_columns = categorical_columns + numerical_columns

    feature_imp = {}

    for feature in combined_columns:
        remaining_columns = [col for col in combined_columns if col != feature]

        accuracy_without_feature = get_model_accuracy(
            [col for col in categorical_columns if col in remaining_columns],
            [col for col in numerical_columns if col in remaining_columns],
            df_train, df_val, y_train, y_val
        )
        
        accuracy_difference = original_accuracy - accuracy_without_feature

        feature_imp[feature] = round(accuracy_difference, 4)
        print(f"Feature: {feature}, Accuracy Without Feature: {accuracy_difference:.4f}, Accuracy Difference: {accuracy_difference:.4f}")

    feature_importance = sorted(feature_imp.items(), key=lambda x: x[1], reverse=True)

    return feature_importance

feature_importance = feature_importance_analysis(categorical_columns, numerical_columns, df_train, df_val, y_train, y_val)

Original Accuracy: 0.9021
Feature: job, Accuracy Without Feature: 0.0000, Accuracy Difference: 0.0000
Feature: marital, Accuracy Without Feature: -0.0003, Accuracy Difference: -0.0003
Feature: education, Accuracy Without Feature: -0.0002, Accuracy Difference: -0.0002
Feature: housing, Accuracy Without Feature: 0.0001, Accuracy Difference: 0.0001
Feature: contact, Accuracy Without Feature: 0.0009, Accuracy Difference: 0.0009
Feature: month, Accuracy Without Feature: 0.0033, Accuracy Difference: 0.0033
Feature: poutcome, Accuracy Without Feature: 0.0049, Accuracy Difference: 0.0049
Feature: age, Accuracy Without Feature: 0.0004, Accuracy Difference: 0.0004
Feature: balance, Accuracy Without Feature: 0.0000, Accuracy Difference: 0.0000
Feature: day, Accuracy Without Feature: 0.0002, Accuracy Difference: 0.0002
Feature: duration, Accuracy Without Feature: 0.0103, Accuracy Difference: 0.0103
Feature: campaign, Accuracy Without Feature: 0.0006, Accuracy Difference: 0.0006
Feature: pdays, Acc

In [175]:
target_features = ['age', 'balance', 'marital', 'previous']
feature_importance_dict = dict(feature_importance)

for feature in target_features:
    print(f"{feature}: {feature_importance_dict[feature]}")

age: 0.0004
balance: 0.0
marital: -0.0003
previous: -0.0001


# QUESTION 6

In [183]:
def get_model_accuracy_regularized(categorical_columns, numerical_columns, df_train, df_val, y_train, y_val, penalty, C):

    dv = DictVectorizer(sparse=False)
    
    train_dicts = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
    dv.fit(train_dicts)
    X_train = dv.transform(train_dicts)
    
    val_dicts = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
    X_val = dv.transform(val_dicts)
    
    model = LogisticRegression(solver='liblinear', penalty=penalty, C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    term_depo = (y_pred >= 0.5)
    
    df_pred = pd.DataFrame()
    df_pred['probability'] = y_pred
    df_pred['prediction'] = term_depo.astype(int)
    df_pred['actual'] = y_val
    
    df_pred['correct'] = df_pred.prediction == df_pred.actual
    accuracy = round(df_pred['correct'].mean(), 3)
    
    return accuracy

In [184]:
C_values = [0.01, 0.1, 1, 10, 100]
print("Regularization with L1 penalty")
for C in C_values:
    accuracy = get_model_accuracy_regularized(categorical_columns, numerical_columns, df_train, df_val, y_train, y_val, penalty='l1', C=C)
    print(f"C: {C}, Accuracy: {accuracy}")
    
print("Regularization with L2 penalty")
for C in C_values:
    accuracy = get_model_accuracy_regularized(categorical_columns, numerical_columns, df_train, df_val, y_train, y_val, penalty='l2', C=C)
    print(f"C: {C}, Accuracy: {accuracy}")

Regularization with L1 penalty
C: 0.01, Accuracy: 0.898
C: 0.1, Accuracy: 0.901
C: 1, Accuracy: 0.902
C: 10, Accuracy: 0.902
C: 100, Accuracy: 0.902
Regularization with L2 penalty
C: 0.01, Accuracy: 0.898
C: 0.1, Accuracy: 0.902
C: 1, Accuracy: 0.902
C: 10, Accuracy: 0.902
C: 100, Accuracy: 0.901


In [185]:
def get_model_accuracy_regularized(categorical_columns, numerical_columns, df_train, df_val, y_train, y_val, C):

    dv = DictVectorizer(sparse=False)
    
    train_dicts = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
    dv.fit(train_dicts)
    X_train = dv.transform(train_dicts)
    
    val_dicts = df_val[categorical_columns + numerical_columns].to_dict(orient='records')
    X_val = dv.transform(val_dicts)
    
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    term_depo = (y_pred >= 0.5)
    
    df_pred = pd.DataFrame()
    df_pred['probability'] = y_pred
    df_pred['prediction'] = term_depo.astype(int)
    df_pred['actual'] = y_val
    
    df_pred['correct'] = df_pred.prediction == df_pred.actual
    accuracy = round(df_pred['correct'].mean(), 3)
    
    return accuracy

for C in C_values:
    accuracy = get_model_accuracy_regularized(categorical_columns, numerical_columns, df_train, df_val, y_train, y_val, C=C)
    print(f"C: {C}, Accuracy: {accuracy}") 

C: 0.01, Accuracy: 0.898
C: 0.1, Accuracy: 0.902
C: 1, Accuracy: 0.902
C: 10, Accuracy: 0.902
C: 100, Accuracy: 0.901
