In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [2]:
raw_dataframe = pd.read_csv('bank-full.csv', delimiter=';')

In [3]:
raw_dataframe.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
raw_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
age          45211 non-null int64
job          45211 non-null object
marital      45211 non-null object
education    45211 non-null object
default      45211 non-null object
balance      45211 non-null int64
housing      45211 non-null object
loan         45211 non-null object
contact      45211 non-null object
day          45211 non-null int64
month        45211 non-null object
duration     45211 non-null int64
campaign     45211 non-null int64
pdays        45211 non-null int64
previous     45211 non-null int64
poutcome     45211 non-null object
y            45211 non-null object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [5]:
df = raw_dataframe.copy()

In [6]:
# creation of addos dataframe for numerical columns to represent categorical data
addons = pd.DataFrame()

In [7]:
le = LabelEncoder()

In [8]:
le.fit(df['marital'])
addons['marital'] = le.transform(df['marital'])

In [9]:
le.fit(df['education'])
addons['education'] = le.transform(df['education'])

In [10]:
le.fit(df['poutcome'])
addons['poutcome'] = le.transform(df['poutcome'])

In [11]:
le.fit(df['job'])
addons['job'] = le.transform(df['job'])

In [12]:
addons['default'] = np.where(df['default'] == 'no', 0, 1)

In [13]:
addons['housing'] = np.where(df['housing'] == 'no', 0, 1)

In [14]:
addons['loan'] = np.where(df['loan'] == 'no', 0, 1)

In [15]:
addons['y'] = np.where(df['y'] == 'no', 0, 1)

In [16]:
look_up = {'jan': 1,  
           'feb': 2, 
           'mar': 3, 
           'apr': 4, 
           'may': 5,
            'jun': 6, 
           'jul': 7, 
           'aug': 8, 
           'sep': 9, 
           'oct': 10, 
           'nov': 11, 
           'dec': 12}

addons['month'] = df['month'].apply(lambda x: look_up[x])

In [17]:
df.drop(['marital', 'education', 'poutcome', 'default', 'housing', 'loan', 'contact', 'month', 'y', 'job'], axis=1, inplace=True)

In [18]:
addons.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 9 columns):
marital      45211 non-null int64
education    45211 non-null int64
poutcome     45211 non-null int64
job          45211 non-null int64
default      45211 non-null int32
housing      45211 non-null int32
loan         45211 non-null int32
y            45211 non-null int32
month        45211 non-null int64
dtypes: int32(4), int64(5)
memory usage: 2.4 MB


In [19]:
train_data = pd.concat([addons, df], axis=1)

In [20]:
processed_data = pd.DataFrame(preprocessing.scale(train_data), columns=train_data.columns)

In [66]:
# look for potential non-linear relationshipss
processed_data['balance_sqrt'] = (processed_data['balance'] + 100) ** .5
processed_data['balance2'] = (processed_data['balance'] + 100) ** 2
processed_data['balance3'] = (processed_data['balance'] + 100) ** 3

In [22]:
processed_data.head()

Unnamed: 0,marital,education,poutcome,job,default,housing,loan,y,month,age,balance,day,duration,campaign,pdays,previous,balance_sqrt,balance2,balance3
0,-0.275762,1.036362,0.444898,-0.10382,-0.13549,0.893915,-0.436803,-0.363983,-0.475354,1.606965,0.256419,-1.298476,0.011016,-0.569351,-0.411453,-0.25194,10.012813,10051.349601,1007712.0
1,1.368372,-0.300556,0.444898,1.424008,-0.13549,0.893915,-0.436803,-0.363983,-0.475354,0.288529,-0.437895,-1.298476,-0.416127,-0.569351,-0.411453,-0.25194,9.978081,9912.612813,986920.6
2,-0.275762,-0.300556,0.444898,-0.714951,-0.13549,0.893915,2.289359,-0.363983,-0.475354,-0.747384,-0.446762,-1.298476,-0.707361,-0.569351,-0.411453,-0.25194,9.977637,9910.847103,986656.9
3,-0.275762,2.37328,0.444898,-1.020516,-0.13549,0.893915,-0.436803,-0.363983,-0.475354,0.571051,0.047205,-1.298476,-0.645231,-0.569351,-0.411453,-0.25194,10.00236,10009.443318,1001417.0
4,1.368372,2.37328,0.444898,2.035139,-0.13549,-1.118674,-0.436803,-0.363983,-0.475354,-0.747384,-0.447091,-1.298476,-0.23362,-0.569351,-0.411453,-0.25194,9.97762,9910.781709,986647.2


## Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression

In [25]:
X = processed_data.drop('y', axis=1)
y = train_data['y']

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [28]:
logmodel = LogisticRegression()

In [29]:
logmodel.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
log_predictions = logmodel.predict(X_test)

In [31]:
from sklearn.metrics import classification_report, confusion_matrix

In [32]:
print(classification_report(y_test, log_predictions))

             precision    recall  f1-score   support

          0       0.88      1.00      0.94     11967
          1       0.00      0.00      0.00      1597

avg / total       0.78      0.88      0.83     13564



  'precision', 'predicted', average, warn_for)


In [34]:
print(confusion_matrix(y_test, log_predictions))

[[11967     0]
 [ 1597     0]]


In [35]:
from sklearn.model_selection import cross_val_score

In [37]:
scores = cross_val_score(logmodel, X_train, y_train, cv=5)
print(scores)

[ 0.88325434  0.88325434  0.8833939   0.8833939   0.8833939 ]


The vanilla Logistic Model seems to havea lot of trouble predicting positive outcomes.

## Ridge Regression

In [38]:
from sklearn import linear_model

In [39]:
ridgemodel = linear_model.Ridge(alpha=10, fit_intercept=False)

In [40]:
ridgemodel.fit(X_train, y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=False, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [42]:
print(ridgemodel.score(X_train, y_train))

0.196920609086


In [43]:
scores = cross_val_score(ridgemodel, X_train, y_train, cv=5)
print(scores)

[ 0.19418947  0.17703792  0.20891446  0.20601394  0.1750266 ]


The ridge regression R2 value are very low.

## Lasso Regression

In [59]:
lassomodel = linear_model.Lasso(alpha=.05)

In [60]:
lassomodel.fit(X_train, y_train)



Lasso(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [61]:
print(lassomodel.score(X_train, y_train))

0.127322061563


In [62]:
lassomodel.coef_

array([  0.00000000e+00,   0.00000000e+00,  -0.00000000e+00,
         0.00000000e+00,  -0.00000000e+00,  -0.00000000e+00,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,  -0.00000000e+00,   7.34651225e-02,
        -0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   7.75491498e-04,  -4.51849110e-06])

In [63]:
lassomodel.intercept_

-3.1191532921340217

In [65]:
scores = cross_val_score(lassomodel, X_train, y_train, cv=5)
print(scores)



[ 0.13104628  0.12677186  0.1320127   0.13017244  0.1159514 ]




The R2 score for the lasso model is even smaller than the score reported by the Ridge regression model. This model also only keeps about 3 features from the original set, dropping 15 features in the process.

## Ensemble Regression

### L1 Penalty

In [68]:
ensemble_model_l1 = LogisticRegression(penalty='l1')

In [69]:
ensemble_model_l1.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [70]:
print(ensemble_model_l1.score(X_train, y_train))

0.889404998894


In [72]:
scores = cross_val_score(ensemble_model_l1, X_train, y_train, cv=5)
print(scores, np.mean(scores))

[ 0.88878357  0.89383886  0.89050403  0.88734397  0.88686996] 0.889468079557


### L2 Penalty

In [74]:
ensemble_model_l2 = LogisticRegression(penalty='l2')

In [75]:
ensemble_model_l2.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [76]:
print(ensemble_model_l2.score(X_train, y_train))

0.883338073119


In [77]:
scores = cross_val_score(ensemble_model_l2, X_train, y_train, cv=5)
print(scores, np.mean(scores))

[ 0.88325434  0.88325434  0.8833939   0.8833939   0.8833939 ] 0.883338078411


Using the penalty parameter on the Logistic Regression classifier boosts the score of the model significantly.

Penalized Regression seems very powerful in the context of this particular case study.

The L1 regression penalty, responsible for penalizing larger coefficients seems to performs slightly better than the logistic regression with the L2 penalty. The L2 penalty is responsible for reducing the coefficients of features that lack influence on the model at a certain threshhold. This feature reduction works to combat overfitting based on the features that might be providing similar information about the model.