## Import Libraries

In [2]:
import numpy as np 
import pandas as pd 

## Data work

In [3]:
data = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip"
!wget $data -O bank_marketing.zip

--2025-08-31 13:41:23--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank_marketing.zip’

bank_marketing.zip      [       <=>          ] 999.85K   704KB/s    in 1.4s    

2025-08-31 13:41:26 (704 KB/s) - ‘bank_marketing.zip’ saved [1023843]



In [3]:
# Relative path of the bank-full.csv file that is loaded into `df` in the next cell.
path = "./unzipped1/unzipped/bank-full.csv"

In [4]:
# Load the dataset; the file uses ';' (not ',') as the field separator.
df = pd.read_csv(path, sep =';')

In [5]:
# Preview the first rows to check the file was parsed into separate columns.
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [6]:
# Remove the two columns that are not used anywhere below.
# drop(..., errors='ignore') keeps this cell idempotent: the previous
# `del df[...]` form raised KeyError when the cell was run a second time.
df = df.drop(columns=['default', 'loan'], errors='ignore')

In [7]:
# Confirm which columns remain once 'default' and 'loan' are gone.
df.columns

Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
       'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
       'y'],
      dtype='object')

## Data Prep

In [8]:
# True for any column that contains a null; every column is False here.
# Placeholder strings such as 'unknown' are ordinary values and are not counted as null.
df.isnull().any()

age          False
job          False
marital      False
education    False
balance      False
housing      False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
y            False
dtype: bool

In [9]:
df['education'].mode()  # most frequent education level: 'secondary'

0    secondary
Name: education, dtype: object

In [10]:
# Number of rows per education level (the 'unknown' placeholder is counted as its own level).
df['education'].value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [11]:
# Column dtypes and non-null counts: 7 int64 columns and 8 object (string) columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   balance    45211 non-null  int64 
 5   housing    45211 non-null  object
 6   contact    45211 non-null  object
 7   day        45211 non-null  int64 
 8   month      45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  y          45211 non-null  object
dtypes: int64(7), object(8)
memory usage: 5.2+ MB


In [12]:
# Split the frame into its numeric and its categorical columns
# (the target 'y' is still a string at this point, so it goes with the categoricals).
df_num = df.loc[:, [
    'age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous',
]]
df_cat = df.loc[:, [
    'job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome', 'y',
]]

In [13]:
# Pairwise correlation of the numeric columns.
# 'pdays' and 'previous' are the most strongly correlated pair (about 0.45).
df_num.corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [14]:
## Target encoding
# Encode the target as 1 ('yes') / 0 ('no').
# The dtype guard makes the cell safe to re-run: mapping a column that is
# already 0/1 would silently turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = df['y'].map({'yes': 1, 'no': 0})

# Any label other than 'yes'/'no' becomes NaN in the mapping above; fail loudly.
assert df['y'].isin([0, 1]).all(), "unexpected values in target column 'y'"

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set, then 20% of the remainder as validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.2, random_state=42)

# Give every partition a fresh 0..n-1 index.
df_full_train, df_train, df_val, df_test = (
    part.reset_index(drop=True)
    for part in (df_full_train, df_train, df_val, df_test)
)

# Pull the target out of each partition: pop() returns the 'y' column and
# removes it from the frame, so the frames are left holding features only.
y_full_train, y_train, y_val, y_test = (
    part.pop('y').values
    for part in (df_full_train, df_train, df_val, df_test)
)

In [16]:
from sklearn.metrics import mutual_info_score


def mutual_info(series, y):
    """Return the mutual information score between `series` and the labels `y`."""
    return mutual_info_score(series, y)


# Score every categorical feature of the training set against the training target.
categorical_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
mi = pd.Series(
    {name: mutual_info(df_train[name], y=y_train) for name in categorical_features}
)
    

In [17]:
# Mutual information rounded to 2 decimals; 'poutcome' has the highest score (0.03).
mi.round(2)

job          0.01
marital      0.00
education    0.00
housing      0.01
contact      0.01
month        0.02
poutcome     0.03
dtype: float64

In [18]:
# Names of all feature columns (the target was already removed from df_full_train).
cols = list(df_full_train.columns)

In [21]:
# Show the feature list: every remaining column except the target 'y'.
cols

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

## Modelling

In [26]:
from sklearn.feature_extraction import DictVectorizer

# Encode the training rows as a dense matrix using every feature in `cols`.
# The former `for i in cols: cols.pop(i)` loop has been removed: list.pop()
# takes an integer index, so passing a column name raised TypeError, and the
# loop also mutated `cols` while iterating over it. The model below is meant
# to use all features (the encoded matrix has 47 columns).
train_dicts = df_train[cols].to_dict(orient='records')

# `dv` is fitted on the training data only; reuse it to encode any other split.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)

In [27]:
# Inspect the encoded training matrix produced by the DictVectorizer.
X_train

array([[ 3.600e+01, -4.850e+02,  1.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 5.500e+01,  2.325e+03,  7.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 4.200e+01,  3.104e+03,  2.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       ...,
       [ 5.400e+01,  0.000e+00,  1.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 2.500e+01,  2.311e+03,  2.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00],
       [ 3.000e+01,  1.500e+01,  2.000e+00, ...,  0.000e+00,  1.000e+00,
         0.000e+00]], shape=(28934, 47))

In [28]:
# (number of training rows, number of encoded feature columns)
X_train.shape

(28934, 47)

In [29]:
# 0/1 target values for the training rows, in the same row order as df_train.
y_train

array([0, 0, 0, ..., 0, 1, 0], shape=(28934,))

In [31]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier trained on all encoded features.
lr_params = {
    'solver': 'liblinear',
    'C': 1.0,
    'max_iter': 1000,
    'random_state': 42,
}
lr = LogisticRegression(**lr_params)
lr.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [32]:
# Learned weights: one coefficient per column of X_train.
lr.coef_

array([[-3.93918702e-03,  1.23170885e-05, -8.32263912e-02,
         2.05748138e-01,  8.84794003e-02, -1.08858756e+00,
         1.22588527e-03,  4.04856868e-03, -4.13942917e-01,
        -2.32630622e-01, -4.48422489e-02, -1.02944236e-01,
        -1.24587083e-02, -7.81901316e-01,  7.17324345e-02,
        -2.86708783e-01, -2.02682358e-01, -2.14286017e-01,
        -9.78592723e-02,  3.98624741e-01, -1.56826249e-01,
        -1.91886075e-01,  1.91542452e-01, -1.99086897e-01,
        -3.52397366e-02, -7.16842642e-02, -2.51438633e-01,
        -4.09615015e-01, -1.33306377e-01,  1.95402180e-01,
        -5.67540646e-01,  2.21002764e-01, -2.32019804e-01,
        -5.92005684e-01, -7.70330735e-01,  1.58195236e-01,
         8.19527234e-01, -4.59712664e-01, -7.29234501e-01,
         6.33460267e-01,  5.28896330e-01, -1.19863634e-03,
        -7.07670912e-01, -4.15299994e-01,  1.51124476e+00,
        -1.18263388e+00,  4.60821427e-03]])

In [33]:
# Encode the validation rows with the vectorizer that was fitted on the
# training data. Fitting a fresh DictVectorizer on the validation rows (as
# this cell did before) derives the column set from the validation data, so
# X_val's columns are not guaranteed to line up with what `lr` was trained on.
val_dicts = df_val[cols].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [36]:
# Predicted probability of the positive class (y == 1): column 1 of predict_proba.
y_pred = lr.predict_proba(X_val)[:,1]

In [39]:
# Hard decision: predict the positive class when its probability exceeds 0.5.
outcome = y_pred > 0.5

In [40]:
# Ground-truth 0/1 labels of the validation rows.
y_val

array([0, 0, 1, ..., 0, 0, 0], shape=(7234,))

In [41]:
# Boolean predictions for the validation rows (True = predicted positive).
outcome

array([False, False, False, ...,  True, False,  True], shape=(7234,))

In [47]:
# Put predicted (cast from bool to 0/1) and actual labels side by side.
df_pred = pd.DataFrame({
    'pred': outcome.astype(int),
    'actual': y_val,
})

In [49]:
# Side-by-side view of predicted vs. actual validation labels.
df_pred

Unnamed: 0,pred,actual
0,0,0
1,0,0
2,0,1
3,0,0
4,0,1
...,...,...
7229,0,0
7230,0,0
7231,1,0
7232,0,0


In [50]:
# Flag the rows where the predicted label equals the actual label.
df_pred['correct'] = df_pred['pred'].eq(df_pred['actual'])

In [20]:
# Validation accuracy = share of rows whose prediction matches the label.
# NOTE(review): the saved output below is a NameError (the cell was executed
# when df_pred did not exist in the kernel), so the ~0.9 accuracy mentioned in
# the original note is not actually shown — re-run the notebook top to bottom.
df_pred['correct'].mean()

NameError: name 'df_pred' is not defined

## Feature elimination

In [23]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

# Leave-one-feature-out: retrain the model once per feature with that feature
# removed, and record the validation accuracy under the removed feature's name.
accuracy_dict = {}

# NOTE(review): C_cols is not used by this loop; it looks like a list of
# regularisation strengths for a separate tuning step — confirm or remove.
C_cols = [0.01, 0.1, 1, 10, 100]

# Fixes applied to this cell:
#  * iterate over the feature names in `cols` (the loop used to iterate over
#    C_cols while its body referred to an undefined `cols_new`);
#  * encode the validation rows with the vectorizer fitted on the training
#    rows instead of fitting a second vectorizer on the validation data.
for i in cols:
    cols_new = [x for x in cols if x != i]

    train_dicts = df_train[cols_new].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)

    lr = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    lr.fit(X_train, y_train)

    val_dicts = df_val[cols_new].to_dict(orient='records')
    X_val = dv.transform(val_dicts)

    y_pred = lr.predict_proba(X_val)[:, 1]
    outcome = y_pred > 0.5

    df_pred = pd.DataFrame({'pred': outcome.astype(int), 'actual': y_val})
    df_pred['correct'] = df_pred['pred'] == df_pred['actual']

    accuracy = df_pred['correct'].mean()
    accuracy_dict[i] = accuracy
    
    
    

In [24]:
# In the saved output each key is a feature name and each value a validation accuracy.
# NOTE(review): the original note read "balance is the worst"; to rank features,
# compare each value with the accuracy of the model trained on all features,
# which is not displayed in this notebook — confirm before relying on that claim.
accuracy_dict

{'age': np.float64(0.9014376555156207),
 'job': np.float64(0.9011611833010782),
 'marital': np.float64(0.9019905999447055),
 'education': np.float64(0.9014376555156207),
 'balance': np.float64(0.9011611833010782),
 'housing': np.float64(0.9028200165883329),
 'contact': np.float64(0.9008847110865358),
 'day': np.float64(0.901022947193807),
 'month': np.float64(0.9000552944429084),
 'duration': np.float64(0.8894111141830247),
 'campaign': np.float64(0.9014376555156207),
 'pdays': np.float64(0.9015758916228919),
 'previous': np.float64(0.9012994194083495),
 'poutcome': np.float64(0.8949405584738733)}