In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
# Load the raw dataset; the file uses ';' as the field separator rather than ','.
df = pd.read_csv('bank-full.csv', sep=';')

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [5]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [6]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [7]:
df.y

0         no
1         no
2         no
3         no
4         no
        ... 
45206    yes
45207    yes
45208    yes
45209     no
45210     no
Name: y, Length: 45211, dtype: object

In [8]:
df['education'].unique()

array(['tertiary', 'secondary', 'unknown', 'primary'], dtype=object)

In [9]:
modes=df['education'].mode()
modes

0    secondary
Name: education, dtype: object

In [10]:
# Encode the target as 0/1 ('yes' -> 1, anything else -> 0).
# `isin(['yes', 1])` keeps the cell idempotent: on a re-run the column already
# holds 0/1, and a plain `== 'yes'` comparison would silently turn every label into 0.
df['y'] = df['y'].isin(['yes', 1]).astype(int)

In [11]:
df.y

0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int32

In [12]:
# Hold out 20% of the rows as the test set; random_state makes the shuffle reproducible.
df_full_train,df_test=train_test_split(df,test_size=0.2,random_state=1)

In [13]:
# Split the remaining 80% again: 25% of it goes to validation, i.e. 20% of the
# full dataset, which leaves a 60/20/20 train/validation/test partition.
df_train,df_val=train_test_split(df_full_train,test_size=0.25,random_state=1)

In [14]:
len(df),len(df_full_train),len(df_test),len(df_train),len(df_val)

(45211, 36168, 9043, 27126, 9042)

In [15]:
print(len(df_train)+len(df_test)+len(df_val))

45211


In [16]:
# Pull the target out of each split as a plain NumPy array
# (done before the `y` column is deleted from the feature frames).
y_train=df_train.y.values
y_test=df_test.y.values
y_val=df_val.y.values

In [17]:
# Remove the target column from the feature frames so it cannot be used as a feature.
# NOTE: this mutates the frames in place; re-running this cell raises KeyError
# because `y` is already gone. `df_full_train` intentionally keeps its `y` column.
del df_train['y']
del df_test['y']
del df_val['y']


In [18]:
df_full_train=df_full_train.reset_index(drop=True)

In [19]:
df_full_train.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [20]:
df_full_train.y

0        0
1        0
2        0
3        0
4        0
        ..
36163    0
36164    0
36165    0
36166    0
36167    0
Name: y, Length: 36168, dtype: int32

In [21]:
df_full_train.y.value_counts(normalize=True)

0    0.882797
1    0.117203
Name: y, dtype: float64

In [22]:
# Column groups used throughout the notebook: object-typed (string) columns
# are treated as categorical, int64 columns as numerical. The target `y` is in neither list.
categorical=['job', 'marital', 'education', 'default','housing',
       'loan', 'contact','month','poutcome']
numerical=['age','balance','day','duration','campaign','pdays','previous']

In [23]:
# Pairwise Pearson correlation of the numeric columns (including the 0/1 target),
# computed on the full `df`.
# Restrict to numeric columns explicitly: `df.corr()` on a frame that still holds
# string columns raises on pandas >= 2.0, while older versions dropped them silently.
corel = df.select_dtypes(include='number').corr()

In [24]:
corel

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,y
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288,0.025155
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674,0.052838
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171,-0.028348
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203,0.394521
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855,-0.073172
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482,0.103621
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0,0.093236
y,0.025155,0.052838,-0.028348,0.394521,-0.073172,0.103621,0.093236,1.0


In [25]:
df_full_train[numerical].corrwith(df_full_train.y)

age         0.024931
balance     0.055252
day        -0.030624
duration    0.398321
campaign   -0.072134
pdays       0.101045
previous    0.087392
dtype: float64

In [26]:
df.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y             int32
dtype: object

In [27]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'y'],
      dtype='object')

In [28]:
y_train

array([0, 0, 0, ..., 0, 0, 0])

In [29]:
round(mutual_info_score(y_train,df_train.contact),2)

0.01

In [30]:
round(mutual_info_score(y_train,df_train.education),2)

0.0

In [31]:
round(mutual_info_score(y_train,df_train.housing),2)

0.01

In [32]:
round(mutual_info_score(y_train,df_train.poutcome),2)

0.03

In [33]:
def mutual_info(series):
    """Return the mutual information between ``series`` and the training target.

    Written to be passed to ``DataFrame.apply`` so that every categorical
    column of the training frame gets scored in one call. The target is read
    from the module-level ``y_train`` array, so ``series`` must be aligned
    row-for-row with it.
    """
    score = mutual_info_score(series, y_train)
    return score

In [34]:
round(df_train[categorical].apply(mutual_info),2).sort_values(ascending=False)

poutcome     0.03
month        0.02
job          0.01
housing      0.01
contact      0.01
marital      0.00
education    0.00
default      0.00
loan         0.00
dtype: float64

In [35]:
# Convert the training rows to a list of {column: value} dicts,
# the input format DictVectorizer expects.
dicts=df_train[categorical + numerical].to_dict(orient='records')

In [36]:
dicts[0]

{'job': 'entrepreneur',
 'marital': 'married',
 'education': 'tertiary',
 'default': 'no',
 'housing': 'no',
 'loan': 'yes',
 'contact': 'cellular',
 'month': 'jul',
 'poutcome': 'unknown',
 'age': 56,
 'balance': 40,
 'day': 11,
 'duration': 123,
 'campaign': 2,
 'pdays': -1,
 'previous': 0}

In [37]:
dv=DictVectorizer(sparse=False)

In [38]:
dv.fit(dicts)

DictVectorizer(sparse=False)

In [39]:
# Names of the encoded columns, in the order used by the transformed matrix.
# `get_feature_names()` was removed in scikit-learn 1.2; the fitted
# `feature_names_` attribute is available on every version and is still a list.
dv.feature_names_



['age',
 'balance',
 'campaign',
 'contact=cellular',
 'contact=telephone',
 'contact=unknown',
 'day',
 'default=no',
 'default=yes',
 'duration',
 'education=primary',
 'education=secondary',
 'education=tertiary',
 'education=unknown',
 'housing=no',
 'housing=yes',
 'job=admin.',
 'job=blue-collar',
 'job=entrepreneur',
 'job=housemaid',
 'job=management',
 'job=retired',
 'job=self-employed',
 'job=services',
 'job=student',
 'job=technician',
 'job=unemployed',
 'job=unknown',
 'loan=no',
 'loan=yes',
 'marital=divorced',
 'marital=married',
 'marital=single',
 'month=apr',
 'month=aug',
 'month=dec',
 'month=feb',
 'month=jan',
 'month=jul',
 'month=jun',
 'month=mar',
 'month=may',
 'month=nov',
 'month=oct',
 'month=sep',
 'pdays',
 'poutcome=failure',
 'poutcome=other',
 'poutcome=success',
 'poutcome=unknown',
 'previous']

In [40]:
list(dv.transform(dicts[:5])[0])

[56.0,
 40.0,
 2.0,
 1.0,
 0.0,
 0.0,
 11.0,
 1.0,
 0.0,
 123.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -1.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0]

In [41]:
# Fit the vectorizer on the training dicts and encode them in one step
# (the vectorizer was already fitted on the same data above, so the refit is a no-op in effect).
X_train=dv.fit_transform(dicts)

In [42]:
# Same record-dict conversion as for the training set, applied to the validation rows.
val_dicts=df_val[categorical+ numerical].to_dict(orient='records')

In [43]:
# Only transform (never fit) on validation data so the encoding learned
# from the training set is reused unchanged.
X_Val=dv.transform(val_dicts)

In [44]:
# Logistic regression on all features; random_state is fixed for reproducibility.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [45]:
model.fit(X_train,y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [46]:
model.intercept_[0]

-0.6793730833462145

In [47]:
model.coef_[0].round(3)

array([-6.000e-03,  0.000e+00, -8.400e-02,  3.260e-01,  1.890e-01,
       -1.195e+00,  3.000e-03, -3.430e-01, -3.360e-01,  4.000e-03,
       -3.590e-01, -1.690e-01, -9.000e-03, -1.420e-01, -2.300e-02,
       -6.570e-01,  1.200e-02, -2.730e-01, -2.170e-01, -2.770e-01,
       -7.500e-02,  4.790e-01, -1.760e-01, -1.880e-01,  4.690e-01,
       -1.980e-01, -1.510e-01, -8.300e-02, -1.540e-01, -5.250e-01,
       -2.120e-01, -3.830e-01, -8.500e-02, -4.100e-02, -7.050e-01,
        2.000e-01, -3.840e-01, -9.290e-01, -7.890e-01,  4.400e-01,
        1.295e+00, -4.670e-01, -9.650e-01,  1.018e+00,  6.470e-01,
       -0.000e+00, -6.690e-01, -6.480e-01,  1.546e+00, -9.080e-01,
       -1.000e-03])

In [48]:
# predict_proba returns one column per class; keep column 1,
# the predicted probability of the positive class (y == 1).
y_pred=model.predict_proba(X_Val)[:,1]

In [49]:
y_pred

array([0.02908934, 0.0223903 , 0.02487471, ..., 0.22224983, 0.00156561,
       0.03842559])

In [50]:
# Turn the predicted probabilities into hard True/False predictions with a 0.5 cut-off.
decision=y_pred >= 0.5
decision

array([False, False, False, ..., False, False, False])

In [51]:
# Validation accuracy: share of rows where the thresholded prediction
# equals the true label, rounded to 2 decimals.
round((y_val == decision).mean(),2)

0.9

In [52]:
decision.astype(int)

array([0, 0, 0, ..., 0, 0, 0])

In [53]:
# Pair every encoded feature name with its fitted coefficient (rounded to 3 decimals).
# `feature_names_` replaces `get_feature_names()`, which was removed in scikit-learn 1.2.
dict(zip(dv.feature_names_, model.coef_[0].round(3)))



{'age': -0.006,
 'balance': 0.0,
 'campaign': -0.084,
 'contact=cellular': 0.326,
 'contact=telephone': 0.189,
 'contact=unknown': -1.195,
 'day': 0.003,
 'default=no': -0.343,
 'default=yes': -0.336,
 'duration': 0.004,
 'education=primary': -0.359,
 'education=secondary': -0.169,
 'education=tertiary': -0.009,
 'education=unknown': -0.142,
 'housing=no': -0.023,
 'housing=yes': -0.657,
 'job=admin.': 0.012,
 'job=blue-collar': -0.273,
 'job=entrepreneur': -0.217,
 'job=housemaid': -0.277,
 'job=management': -0.075,
 'job=retired': 0.479,
 'job=self-employed': -0.176,
 'job=services': -0.188,
 'job=student': 0.469,
 'job=technician': -0.198,
 'job=unemployed': -0.151,
 'job=unknown': -0.083,
 'loan=no': -0.154,
 'loan=yes': -0.525,
 'marital=divorced': -0.212,
 'marital=married': -0.383,
 'marital=single': -0.085,
 'month=apr': -0.041,
 'month=aug': -0.705,
 'month=dec': 0.2,
 'month=feb': -0.384,
 'month=jan': -0.929,
 'month=jul': -0.789,
 'month=jun': 0.44,
 'month=mar': 1.295,
 'm

In [54]:
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

def _validation_accuracy(features):
    """Train a logistic regression on `features` only and return its validation accuracy.

    The columns listed in `features` are taken from `df_train` / `df_val`,
    one-hot encoded with a fresh DictVectorizer fitted on the training rows,
    and used to fit a liblinear LogisticRegression against `y_train`.
    The returned value is `accuracy_score` of the hard predictions on `y_val`.
    """
    vectorizer = DictVectorizer(sparse=False)
    X_fit = vectorizer.fit_transform(df_train[features].to_dict(orient='records'))
    X_eval = vectorizer.transform(df_val[features].to_dict(orient='records'))

    clf = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    clf.fit(X_fit, y_train)
    return accuracy_score(y_val, clf.predict(X_eval))


# Features under study
subset = ['age','balance','marital','previous']

# Baseline: accuracy of the model trained on the whole subset
accuracy_base = _validation_accuracy(subset)

# Leave-one-out elimination: retrain without each feature in turn and record
# how much accuracy is lost (positive) or gained (negative) relative to the baseline.
accuracy_diff = {
    feature: accuracy_base - _validation_accuracy([f for f in subset if f != feature])
    for feature in subset
}

# The feature with the smallest (signed) difference is reported as least useful:
# a negative value means the model did better without it.
least_useful_feature = min(accuracy_diff, key=accuracy_diff.get)
accuracy_diff, least_useful_feature


({'age': 0.00022119000221187957,
  'balance': -0.00022119000221187957,
  'marital': 0.0,
  'previous': -0.0013271400132713884},
 'previous')

In [55]:
# Inverse regularization strengths to compare (smaller C = stronger regularization)
C_values = [0.01, 0.1, 1, 10, 100]
# Validation accuracy for each C, rounded to 3 decimals
accuracy_results = {}

for C in C_values:
    # Same model configuration as before, only C varies; trained on all features
    model_regularized = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model_regularized.fit(X_train, y_train)

    # Use a dedicated name for the hard predictions: `y_pred` already holds the
    # validation probabilities from an earlier cell and must not be overwritten,
    # otherwise re-running the thresholding cell would operate on class labels.
    y_pred_regularized = model_regularized.predict(X_Val)

    accuracy_results[C] = round(accuracy_score(y_val, y_pred_regularized), 3)

# Output the accuracy results
accuracy_results


{0.01: 0.895, 0.1: 0.898, 1: 0.898, 10: 0.898, 100: 0.898}