In [1]:
#path for dataset
path = "/data/banking.csv"

In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [None]:
data = pd.read_csv(path)
data.fillna(data.mean(), inplace = True)
print(data.shape)
print(list(data.columns))

In [None]:
data.info

In [None]:
#grouping basic.9y, basic.6y and basic.4y
data['education']=np.where(data['education'] =='basic.9y', 'Basic', data['education'])
data['education']=np.where(data['education'] =='basic.6y', 'Basic', data['education'])
data['education']=np.where(data['education'] =='basic.4y', 'Basic', data['education'])
data['education'].unique()

In [None]:
count_no_sub = len(data[data['y']==0])
count_sub = len(data[data['y']==1])
pct_of_no_sub = count_no_sub/(count_no_sub+count_sub)
print("percentage of no subscription is", pct_of_no_sub*100)
pct_of_sub = count_sub/(count_no_sub+count_sub)
print("percentage of subscription", pct_of_sub*100)

In [None]:
print(data.groupby('job').mean())
print(data.groupby('marital').mean())
print(data.groupby('education').mean())

In [None]:
print(data['education'].value_counts())

In [None]:
#convert educational data into categorical equivalent
replace_edu = {'education': {'Basic': 1, 'university.degree': 2, 'high.school': 3, 'professional.course': 4, 'unknown': 5, 'illiterate': 6}}
dataEdu = data.copy()
data = dataEdu.replace(replace_edu)
data

In [None]:
print(data['job'].value_counts())

In [None]:
#convert job data into categorical equivalent
replace_job = {'job': {'admin.': 1, 'blue-collar': 2, 'technician': 3, 'services': 4, 'management': 5, 'retired': 6, 'entrepreneur': 7, 'self-employed': 8, 'housemaid': 9, 'unemployed': 10, 'student': 11, 'unknown': 12}}
dataJob = data.copy()
data = dataJob.replace(replace_job)

In [None]:
print(data['month'].value_counts())

In [None]:
#convert month data into categorical equivalent
replace_mon = {'month': {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6, 'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}}
dataMon = data.copy()
data = dataMon.replace(replace_mon)

In [None]:
print(data['poutcome'].value_counts())

In [None]:
#convert poutcome data into categorical equivalent
replace_Pout = {'poutcome': {'failure': 0, 'success': 1, 'nonexistent':2}}
dataPout = data.copy()
data = dataPout.replace(replace_Pout)

In [None]:
print(data['marital'].value_counts())

In [None]:
#convert Marital data into categorical equivalent
replace_Mar = {'marital': {'married': 1, 'single': 2, 'divorced':3, 'unknown': 4}}
dataMar = data.copy()
data = dataMar.replace(replace_Mar)

In [None]:
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_idx,cons_conf_idx,euribor3m,nr_employed,y
0,44,2,1,1,unknown,yes,no,cellular,8,thu,210,1,999,0,2,1.4,93.444,-36.1,4.963,5228.1,0
1,53,3,1,5,no,no,no,cellular,11,fri,138,1,999,0,2,-0.1,93.2,-42.0,4.021,5195.8,0
2,28,5,2,2,no,yes,no,cellular,6,thu,339,3,6,2,1,-1.7,94.055,-39.8,0.729,4991.6,1
3,39,4,1,3,no,no,no,cellular,4,fri,185,2,999,0,2,-1.8,93.075,-47.1,1.405,5099.1,0
4,55,6,1,1,no,yes,no,cellular,8,fri,137,1,3,1,1,-2.9,92.201,-31.4,0.869,5076.2,1


**Filtering Row by Value**

In [None]:
#row filtering by value
cat_vars=['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']
for var in cat_vars:
    cat_list='var'+'_'+var
    cat_list = pd.get_dummies(data[var], prefix=var)
    data1=data.join(cat_list)
    data=data1

cat_vars=['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']
data_vars=data.columns.values.tolist()
to_keep=[i for i in data_vars if i not in cat_vars]

In [None]:
data_final=data[to_keep]
data_final.columns.values

array(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate',
       'cons_price_idx', 'cons_conf_idx', 'euribor3m', 'nr_employed', 'y',
       'job_1', 'job_2', 'job_3', 'job_4', 'job_5', 'job_6', 'job_7',
       'job_8', 'job_9', 'job_10', 'job_11', 'job_12', 'marital_1',
       'marital_2', 'marital_3', 'marital_4', 'education_1',
       'education_2', 'education_3', 'education_4', 'education_5',
       'education_6', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular',
       'contact_telephone', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_9', 'month_10', 'month_11',
       'month_12', 'day_of_week_fri', 'day_of_week_mon',
       'day_of_week_thu', 'day_of_week_tue', 'day_of_week_wed',
       'poutcome_0', 'poutcome_1', 'poutcome_2'], dtype=object)

**Over-Sampling using SMOTE**

In [None]:
X = data_final.loc[:, data_final.columns != 'y']
y = data_final.loc[:, data_final.columns == 'y']
from imblearn.over_sampling import SMOTE
os = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns
os_data_X,os_data_y=os.fit_sample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X,columns=columns )
os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])
# we can Check the numbers of our data
print("length of oversampled data is ",len(os_data_X))
print("Number of no subscription in oversampled data",len(os_data_y[os_data_y['y']==0]))
print("Number of subscription",len(os_data_y[os_data_y['y']==1]))
print("Proportion of no subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==0])/len(os_data_X))
print("Proportion of subscription data in oversampled data is ",len(os_data_y[os_data_y['y']==1])/len(os_data_X))

  y = column_or_1d(y, warn=True)


length of oversampled data is  51134
Number of no subscription in oversampled data 25567
Number of subscription 25567
Proportion of no subscription data in oversampled data is  0.5
Proportion of subscription data in oversampled data is  0.5


**SET DATA**

In [None]:
#cloumn filtering
cols = ['euribor3m', 'job_2', 'job_9',  'marital_4', 'education_6', 'month_3', 'month_4', 'month_5', 'month_6',
       'month_7', 'month_8', 'month_10', 'month_11', 'poutcome_0', 'poutcome_1']
X = os_data_X[cols]
y = os_data_y['y']      

**IMPLEMENTING THE MODEL**

In [None]:
#generating model summary
import statsmodels.api as sm
logit_model=sm.Logit(y,X)
result=logit_model.fit()
print(result.summary2())

  import pandas.util.testing as tm


Optimization terminated successfully.
         Current function value: 0.557850
         Iterations 7
                         Results: Logit
Model:              Logit            Pseudo R-squared: 0.195     
Dependent Variable: y                AIC:              57080.2468
Date:               2021-05-06 15:05 BIC:              57212.8799
No. Observations:   51134            Log-Likelihood:   -28525.   
Df Model:           14               LL-Null:          -35443.   
Df Residuals:       51119            LLR p-value:      0.0000    
Converged:          1.0000           Scale:            1.0000    
No. Iterations:     7.0000                                       
------------------------------------------------------------------
              Coef.   Std.Err.     z      P>|z|    [0.025   0.975]
------------------------------------------------------------------
euribor3m    -0.4409    0.0074  -59.9752  0.0000  -0.4554  -0.4265
job_2        -0.2043    0.0278   -7.3497  0.0000  -0.2588  -0.

**LISTING AND IMPLEMENTING MODELS**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import Perceptron
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNet
from sklearn import metrics
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
clf = []
clf.append(('NB ', GaussianNB()))
clf.append(('LogReg ', LogisticRegression(random_state=9)))
clf.append(('DT ', DecisionTreeClassifier(random_state = 9)))
clf.append(('KNN ', KNeighborsClassifier()))
clf.append(('SVM', SVC(random_state = 9)))
clf.append(('RFC', RandomForestClassifier(n_estimators=100, random_state=9)))
clf.append(('LDA', LinearDiscriminantAnalysis()))
clf.append(('LinearReg', LinearRegression()))
clf.append(('LogReg', LogisticRegression()))
clf.append(('Ridge', RidgeClassifier()))
clf.append(('Lasso', Lasso()))
clf.append(('BR', BayesianRidge()))
clf.append(('PAC', PassiveAggressiveClassifier()))
clf.append(('PAR', PassiveAggressiveRegressor()))
clf.append(('Perceptron', Perceptron()))
clf.append(('HuberReg', HuberRegressor()))
clf.append(('LassoCV', LassoCV()))
clf.append(('ElasticNet', LassoCV()))

In [None]:
for i, j in clf:
  j.fit(X_train, y_train)
  y_pred = j.predict(X_test)
  print(y_pred, ' Accuracy for ',i, ' = ', j.score(X_test, y_test))

[0 0 0 ... 0 1 0]  Accuracy for  NB   =  0.6652760576233623
[0 1 1 ... 0 1 0]  Accuracy for  LogReg   =  0.7361319340329835
[1 1 0 ... 1 1 1]  Accuracy for  DT   =  0.9200182517436933
[1 1 0 ... 1 1 1]  Accuracy for  KNN   =  0.9097190535167199
[0 1 1 ... 1 1 0]  Accuracy for  SVM  =  0.8233491949677335
[1 1 0 ... 1 1 1]  Accuracy for  RFC  =  0.9219737957108403
[0 1 1 ... 0 1 0]  Accuracy for  LDA  =  0.7305260413271625
[0.36633115 0.75868804 0.62996477 ... 0.36056057 0.66485734 0.1501571 ]  Accuracy for  LinearReg  =  0.25842978764532043
[0 1 1 ... 0 1 0]  Accuracy for  LogReg  =  0.7361319340329835
[0 1 1 ... 0 1 0]  Accuracy for  Ridge  =  0.7305260413271625
[0.49987428 0.49987428 0.49987428 ... 0.49987428 0.49987428 0.49987428]  Accuracy for  Lasso  =  -7.0242694372169e-07
[0.36689035 0.75767044 0.63080013 ... 0.36052996 0.66557849 0.15012185]  Accuracy for  BR  =  0.25842930297207634
[0 0 0 ... 0 1 0]  Accuracy for  PAC  =  0.6378984420833061
[0.39943725 1.15386612 0.54136436 ...

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


[0.10752829 0.93224601 0.84867115 ... 0.06351589 0.93368524 0.01240707]  Accuracy for  HuberReg  =  0.027774371959203625
[0.37035193 0.7535515  0.6392784  ... 0.35769021 0.67006588 0.14854741]  Accuracy for  LassoCV  =  0.2581421244396719
[0.37035193 0.7535515  0.6392784  ... 0.35769021 0.67006588 0.14854741]  Accuracy for  ElasticNet  =  0.2581421244396719


Thus we observe that the Random Forest Classifier gives us maximum accuracy out of all the present models

This is mainly due to low variace of decision trees and low bias from random feature and row sampling in the random forest model