In [1]:
import pandas as pd
import numpy as np

In [2]:
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

--2024-10-14 00:30:11--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: 'bank+marketing.zip.3'

bank+marketing.zip.     [     <=>            ] 999.85K   750KB/s    in 1.3s    

2024-10-14 00:30:13 (750 KB/s) - 'bank+marketing.zip.3' saved [1023843]



In [3]:
#original file is separated by semicolon. use comma instead by applying sep=';'. 
df = pd.read_csv('bank+marketing/bank/bank-full.csv', sep=';')

In [4]:
base = ['age','job','marital','education','balance','housing','contact','day','month','duration','campaign','pdays','previous','poutcome','y']
df = df[base]

In [5]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

### No missing values. 

In [6]:
df.columns

Index(['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
       'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
       'y'],
      dtype='object')

### Question 1

In [7]:
df['education'].mode()

0    secondary
Name: education, dtype: object

### Question 2

In [8]:
df_numeric = df.select_dtypes(include='number')

In [9]:
corrM = df_numeric.corr()
corrM

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


#### Ans: pdays and previous

In [10]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)

In [11]:
df['y'] = df['y'].replace({'yes': 1, 'no': 0})

In [12]:
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,0


### Setting up the validation framework

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [15]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [16]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [17]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values


del df_val['y']
del df_test['y']

### Question 3

In [18]:
categorical = ['job','marital','education','housing','contact','month','poutcome']

In [19]:
from sklearn.metrics import mutual_info_score

In [20]:
def mutual_info_y_score(series):
    return mutual_info_score(series, df_train.y)

In [22]:
mi = df_train[categorical].apply(mutual_info_y_score)
print(round(mi.sort_values(ascending=False),2))

poutcome     0.03
month        0.03
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64


In [22]:
del df_train['y']

### Question 4: Logistic Regression

In [97]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [87]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
# solver='lbfgs' is the default solver in newer version of sklearn
# for older versions, you need to specify it explicitly
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [88]:
y_pred = model.predict_proba(X_val)[:, 1]


In [80]:
y_decision = (y_pred >= 0.5)


In [81]:
df_pred = pd.DataFrame()
df_pred['probability'] = y_pred
df_pred['prediction'] = y_decision.astype(int)
df_pred['actual'] = y_val

In [82]:
df_pred.head()

Unnamed: 0,probability,prediction,actual
0,0.032556,0,0
1,0.029437,0,0
2,0.100995,0,1
3,0.080324,0,0
4,0.16078,0,1


In [83]:
df_pred['correct'] = df_pred.prediction == df_pred.actual

In [84]:
round(df_pred.correct.mean(),1)

0.9

### Question 5

In [37]:
from sklearn.feature_selection import RFE

In [39]:
rfe = RFE(model, n_features_to_select=10)
rfe.fit(X_train, y_train)


[False False False  True False  True False False False False False False
  True  True False False False False False False False False False False
 False False  True  True  True False False False False False False False
 False False False False False False  True False False  True False  True]


In [41]:
import copy

In [46]:
orig_accuracy = (df_pred.prediction == df_pred.actual).mean()

In [51]:
features = ['age','job','marital','education','balance','housing','contact','day','month','duration','campaign','pdays','previous']
b = copy.copy(features)

for item in features:
    b.remove(item)
    train_dict = df_train[b].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)
    
    model.fit(X_train,y_train)
    
    val_dict = df_val[features].to_dict(orient='records')
    X_val = dv.transform(val_dict)
    
    y_pred = model.predict_proba(X_val)[:, 1]
    
    y_pred >= 0.5
    y_decision = (y_pred >= 0.5)
    
    print(item)

    print(abs((y_val == y_decision).mean()-orig_accuracy))
    b=copy.copy(features)


age
0.0008847600088476293
job
0.0009953550099535136
marital
0.0006635700066356387
education
0.00011059500110588427
balance
0.0
housing
0.0014377350143773837
contact
0.00044238000442375913
day
0.00033178500331787486
month
0.0028754700287546564
duration
0.013160805131608
campaign
0.00033178500331787486
pdays
0.00033178500331787486
previous
0.0006635700066357497


In [89]:
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [96]:
for r in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', C=r, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    y_decision = (y_pred >= 0.5)
    
    df_pred = pd.DataFrame()
    df_pred['probability'] = y_pred
    df_pred['prediction'] = y_decision.astype(int)
    df_pred['actual'] = y_val
    df_pred['correct'] = df_pred.prediction == df_pred.actual
    print(r,round(df_pred.correct.mean(),3))

    
    


0.01 0.882
0.1 0.881
1 0.882
10 0.882
100 0.882
