In [1]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_csv('bank-full.csv', delimiter=';')

In [6]:
df.head().T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
default,no,no,no,no,no
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
loan,no,no,yes,no,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5


In [7]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [8]:
df.education.mode()

0    secondary
Name: education, dtype: object

In [12]:
columns = [
'age',
'job',
'marital',
'education',
'balance',
'housing',
'contact',
'day',
'month',
'duration',
'campaign',
'pdays',
'previous',
'poutcome',
'y'
]

In [13]:
df = df[columns]

In [14]:
df.head().T

Unnamed: 0,0,1,2,3,4
age,58,44,33,47,33
job,management,technician,entrepreneur,blue-collar,unknown
marital,married,single,married,married,single
education,tertiary,secondary,secondary,unknown,unknown
balance,2143,29,2,1506,1
housing,yes,yes,yes,yes,no
contact,unknown,unknown,unknown,unknown,unknown
day,5,5,5,5,5
month,may,may,may,may,may
duration,261,151,76,92,198


In [15]:
df.dtypes

age           int64
job          object
marital      object
education    object
balance       int64
housing      object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [16]:
numerical = list(df.dtypes[df.dtypes != 'object'].index)

In [17]:
numerical

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

In [18]:
df[numerical].corrwith(df.balance)

age         0.097783
balance     1.000000
day         0.004503
duration    0.021560
campaign   -0.014578
pdays       0.003435
previous    0.016674
dtype: float64

In [19]:
df[numerical].corrwith(df.campaign)

age         0.004760
balance    -0.014578
day         0.162490
duration   -0.084570
campaign    1.000000
pdays      -0.088628
previous   -0.032855
dtype: float64

In [20]:
df[numerical].corrwith(df.pdays)

age        -0.023758
balance     0.003435
day        -0.093044
duration   -0.001565
campaign   -0.088628
pdays       1.000000
previous    0.454820
dtype: float64

In [21]:
df[numerical].corrwith(df.previous)

age         0.001288
balance     0.016674
day        -0.051710
duration    0.001203
campaign   -0.032855
pdays       0.454820
previous    1.000000
dtype: float64

In [26]:
df.y = (df.y == 'yes').astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.y = (df.y == 'yes').astype(int)


In [28]:
df.y.value_counts(normalize=True)

y
0    0.883015
1    0.116985
Name: proportion, dtype: float64

In [29]:
from sklearn.model_selection import train_test_split

In [30]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state =42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state =42)
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [31]:
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values
del df_train['y']
del df_val['y']
del df_test['y']

In [33]:
df_train.nunique()

age            76
job            12
marital         3
education       4
balance      5960
housing         2
contact         3
day            31
month          12
duration     1397
campaign       45
pdays         494
previous       39
poutcome        4
dtype: int64

In [35]:
numerical = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']

In [36]:
categorical = [
    'job',
    'marital',
    'education',
    'housing',
    'contact',
    'day',
    'month',
    'poutcome'
]

In [37]:
from sklearn.metrics import mutual_info_score

In [42]:
for c in categorical:
    print(c)
    print(mutual_info_score(y_train, df_train[c]))
    print(round(mutual_info_score(y_train, df_train[c]), 2))
    print()
    print()

job
0.007316082778474635
0.01


marital
0.0020495925927810216
0.0


education
0.0026967549991295282
0.0


housing
0.010343105891750026
0.01


contact
0.013356062198247219
0.01


day
0.0063998575390331516
0.01


month
0.02509003344365025
0.03


poutcome
0.029532821290436224
0.03




In [43]:
from sklearn.feature_extraction import DictVectorizer

In [51]:
df_train.head().T

Unnamed: 0,0,1,2,3,4
age,32,38,49,37,31
job,technician,entrepreneur,blue-collar,housemaid,self-employed
marital,single,married,married,married,married
education,tertiary,secondary,secondary,primary,tertiary
balance,1100,0,3309,2410,3220
housing,yes,yes,yes,no,no
contact,cellular,cellular,cellular,cellular,cellular
day,11,17,15,4,26
month,aug,nov,may,aug,aug
duration,67,258,349,315,74


In [44]:
dicts = df_train.to_dict(orient='records')

In [50]:
dicts

[{'age': 32,
  'job': 'technician',
  'marital': 'single',
  'education': 'tertiary',
  'balance': 1100,
  'housing': 'yes',
  'contact': 'cellular',
  'day': 11,
  'month': 'aug',
  'duration': 67,
  'campaign': 1,
  'pdays': -1,
  'previous': 0,
  'poutcome': 'unknown'},
 {'age': 38,
  'job': 'entrepreneur',
  'marital': 'married',
  'education': 'secondary',
  'balance': 0,
  'housing': 'yes',
  'contact': 'cellular',
  'day': 17,
  'month': 'nov',
  'duration': 258,
  'campaign': 1,
  'pdays': -1,
  'previous': 0,
  'poutcome': 'unknown'},
 {'age': 49,
  'job': 'blue-collar',
  'marital': 'married',
  'education': 'secondary',
  'balance': 3309,
  'housing': 'yes',
  'contact': 'cellular',
  'day': 15,
  'month': 'may',
  'duration': 349,
  'campaign': 2,
  'pdays': -1,
  'previous': 0,
  'poutcome': 'unknown'},
 {'age': 37,
  'job': 'housemaid',
  'marital': 'married',
  'education': 'primary',
  'balance': 2410,
  'housing': 'no',
  'contact': 'cellular',
  'day': 4,
  'month': '

In [45]:
dv = DictVectorizer(sparse=False)

In [55]:
X_train_enc = dv.fit_transform(dicts)

In [47]:
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [48]:
dv.transform(dicts)

array([[3.200e+01, 1.100e+03, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.800e+01, 0.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [4.900e+01, 3.309e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [5.400e+01, 0.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.500e+01, 2.311e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.000e+01, 1.500e+01, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

In [53]:
X_val_dv=df_val.to_dict(orient='records')
X_val_enc=dv.transform(X_val_dv)

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_enc, y_train)
y_pred=model.predict(X_val_enc)

accuracy = np.round(accuracy_score(y_val, y_pred),2)
print(f'Accuracy = {accuracy}')

Accuracy = 0.9


In [59]:
model_accuracy = accuracy_score(y_val, y_pred)
model_accuracy

0.9007962840079629

In [57]:
all_features=df_train.columns.to_list()
all_features

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [60]:
#Feature elimination
l=[]

for feature in all_features:
    features=all_features.copy()
    features.remove(feature)
    
    dv=DictVectorizer()
    X_train_new=df_train[features]
    X_train_1=X_train_new.to_dict(orient='records')
    X_train_n=dv.fit_transform(X_train_1)
    
    X_val_new=df_val[features]
    X_val_1=X_val_new.to_dict(orient='records')
    X_val_n=dv.transform(X_val_1)
    
    
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_n, y_train)
    y_pred1=model.predict(X_val_n)
    accuracy1 = accuracy_score(y_val, y_pred1)
    l.append([feature,model_accuracy,accuracy1,model_accuracy-accuracy1])

In [61]:
model_com=pd.DataFrame(data=l,columns=["feature","model_accuracy","accuracy1","difference"])
model_com

Unnamed: 0,feature,model_accuracy,accuracy1,difference
0,age,0.900796,0.901239,-0.000442
1,job,0.900796,0.901128,-0.000332
2,marital,0.900796,0.900133,0.000664
3,education,0.900796,0.901017,-0.000221
4,balance,0.900796,0.90146,-0.000664
5,housing,0.900796,0.900796,0.0
6,contact,0.900796,0.900464,0.000332
7,day,0.900796,0.901239,-0.000442
8,month,0.900796,0.89969,0.001106
9,duration,0.900796,0.889073,0.011723


In [62]:
score=[]
C=[0.01, 0.1, 1, 10, 100]
for c in C:
    model = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model.fit(X_train_enc, y_train)
    y_pred=model.predict(X_val_enc)
    accuracy = np.round(accuracy_score(y_val, y_pred),3)
    score.append([c,accuracy])

In [63]:
C_df=pd.DataFrame(data=score,columns=["c_value","model_accuracy"])
C_df

Unnamed: 0,c_value,model_accuracy
0,0.01,0.898
1,0.1,0.901
2,1.0,0.901
3,10.0,0.901
4,100.0,0.902
