## Data load and preprocessing

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV
from sklearn import linear_model,metrics
from sklearn.metrics import f1_score
from sklearn import naive_bayes
from sklearn.metrics import roc_curve, roc_auc_score, auc, accuracy_score, confusion_matrix, classification_report
from sklearn import preprocessing
from sklearn import svm
from sklearn.preprocessing import StandardScaler,MinMaxScaler


dataset=pd.read_csv('bank-additional-full.csv',delimiter=';')
dataset=pd.DataFrame(dataset)
#remove rows with value 'unknown'
dataset.drop(['duration'],axis=1,inplace=True)
dataset['y']=dataset['y'].map({'yes':1,'no':0})

Counting unknowns for each column

In [2]:
col_name=['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']
for item in col_name:
    col=dataset[item].as_matrix()
    count_unkown=0
    for i in range(col.shape[0]):
        if col[i]=='unknown':
            count_unkown+=1    
    print('column',item,'has',count_unkown,'of unknown values.')
    

column job has 330 of unknown values.
column marital has 80 of unknown values.
column education has 1731 of unknown values.
column default has 8597 of unknown values.
column housing has 990 of unknown values.
column loan has 990 of unknown values.
column contact has 0 of unknown values.
column month has 0 of unknown values.
column day_of_week has 0 of unknown values.
column poutcome has 0 of unknown values.


  This is separate from the ipykernel package so we can avoid doing imports until


Remove the unknown values 

In [3]:
dataset=dataset[(dataset['job']!='unknown')&(dataset['marital']!='unknown')&(dataset['education']!='unknown')&(dataset['default']!='unknown')&(dataset['housing']!='unknown')&(dataset['loan']!='unknown')]
print(dataset.shape)

(30488, 20)


Convert categorical values to encoded values

In [4]:
categorical_col=['job','marital','education','default','housing','loan','contact','month','day_of_week','poutcome']
for item in categorical_col:
    dataset[item]=dataset[item].astype('category')
cat_columns = dataset.select_dtypes(['category']).columns
dataset[cat_columns] = dataset[cat_columns].apply(lambda x: x.cat.codes)
print(dataset.shape)


(30488, 20)
       age  job  marital  education  default  housing  loan  contact  month  \
0       56    3        1          0        0        0     0        1      6   
2       37    7        1          3        0        1     0        1      6   
3       40    0        1          1        0        0     0        1      6   
4       56    7        1          3        0        0     1        1      6   
6       59    0        1          5        0        0     0        1      6   
8       24    9        2          5        0        1     0        1      6   
9       25    7        2          3        0        1     0        1      6   
11      25    7        2          3        0        1     0        1      6   
12      29    1        2          3        0        0     1        1      6   
13      57    3        0          0        0        1     0        1      6   
14      35    1        1          1        0        1     0        1      6   
16      35    1        1          1     

Downsampling the majority class to balance the dataset

In [5]:
np.random.seed(0)
dataset_classyes=dataset[dataset['y']==1].as_matrix()
dataset_classno=dataset[dataset['y']==0].as_matrix()

sample=np.random.choice([True, False], len(dataset_classno), replace=True, p=[0.25, 0.75])
#downsample dataset with class value 'no' 
dataset_classno_downsampled=dataset_classno[sample]
print(dataset_classyes.shape)
print(dataset_classno_downsampled.shape)

final=np.concatenate((dataset_classno_downsampled,dataset_classyes),axis=0)
print(final.shape)

(3859, 20)
(6785, 20)
(10644, 20)


  
  This is separate from the ipykernel package so we can avoid doing imports until


## Whole dataset

In [6]:
x=final[:,:-1]
y=final[:,-1]
fit_rf=RandomForestClassifier(random_state=0)

Fine-tuning hyper parameters

In [7]:
np.random.seed(0)
param_dist = {'max_depth':[2,3,4],
              #'n_estimators':[10,50,100],
             'bootstrap':[True, False],
             'max_features':['auto','sqrt','log2',None],
             'criterion':['gini','entropy']}

cv_rf = GridSearchCV(fit_rf,cv=10,param_grid=param_dist,n_jobs=3)

cv_rf.fit(x,y)

print('Best parameters using grid search: \n',
     cv_rf.best_params_)

Best parameters using grid search: 
 {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 2, 'max_features': 'auto'}


Run classifier for feature selection

In [8]:
fit_rf.set_params(criterion='entropy',n_estimators = 100,max_features='auto',max_depth=2,bootstrap=False)
fit_rf.fit(x,y)

def variable_importance(fit):
    importances = fit.feature_importances_
    indices = np.argsort(importances)[::-1]
    return {'importance':importances,'index':indices}

var_imp_rf=variable_importance(fit_rf)
importances_rf=var_imp_rf['importance']
indices_rf=var_imp_rf['index']

print('Feature ranking:')
feature_name=['age','job','marital','education','default','housing','loan','contact','month','day_of_week',
              'campaign','pdays','previous','poutcome','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed']


for i in range(0,indices_rf.shape[0]):
    f = i
    print('{0}. The feature {1} with index {3} has a mean decrease in impurity of {2:.5f}'.format(f+1,feature_name[indices_rf[i]],importances_rf[indices_rf[f]],indices_rf[i]))

Feature ranking:
1. The feature nr.employed with index 18 has a mean decrease in impurity of 0.30857
2. The feature euribor3m with index 17 has a mean decrease in impurity of 0.17999
3. The feature emp.var.rate with index 14 has a mean decrease in impurity of 0.12911
4. The feature pdays with index 11 has a mean decrease in impurity of 0.12903
5. The feature poutcome with index 13 has a mean decrease in impurity of 0.08117
6. The feature cons.conf.idx with index 16 has a mean decrease in impurity of 0.07517
7. The feature previous with index 12 has a mean decrease in impurity of 0.02714
8. The feature contact with index 7 has a mean decrease in impurity of 0.02144
9. The feature month with index 8 has a mean decrease in impurity of 0.01771
10. The feature cons.price.idx with index 15 has a mean decrease in impurity of 0.01720
11. The feature age with index 0 has a mean decrease in impurity of 0.01117
12. The feature campaign with index 10 has a mean decrease in impurity of 0.00191
13. 

In [9]:
print(final.shape)
names = dataset.columns.values
finaldf = pd.DataFrame(data=final,columns=names)

np.random.seed(2018)
train = np.random.choice([True, False], finaldf.shape[0], replace=True, p=[0.6, 0.4])


bank_train = finaldf.iloc[train,:].as_matrix()
bank_test = finaldf.iloc[~train,:].as_matrix()

print(bank_train.shape)
print(bank_test.shape)

(10644, 20)
(6376, 20)
(4268, 20)


  if __name__ == '__main__':
  # Remove the CWD from sys.path while we load stuff.


Select top 3 features for Logistic regression and Naive Bayes classifier:<br>
<ol>
    <li>nr.employed (index 18)</li>
    <li>euribor3m (index 17)</li>
    <li>emp.var.rate (index 14)</li>
</ol>

In [10]:
top = [18,17,14]

logit = linear_model.LogisticRegression()
gnb = naive_bayes.GaussianNB()

#2 features:
for i in range(3):
    for j in range(i+1,3):
        x_train = bank_train[:,[top[i],top[j]]].astype(float)
        y_train = bank_train[:,19].astype(float)
        
        x_test = bank_test[:,[top[i],top[j]]].astype(float)
        y_test = bank_test[:,19].astype(float)
       
        
        logit.fit(x_train, y_train)
        y_test_pred = logit.predict(x_test)
        
        gnb.fit(x_train, y_train)
        y_pred1 = gnb.predict(x_test)
            
        print('accuracy for test data for logistic regression',top[i],top[j],'is' , metrics.accuracy_score(y_test, y_test_pred))
        print('F1 for logistic regression is ' , f1_score(y_test, y_test_pred))
        
        print('precision and recall for Naive Bayes classifier',top[i], top[j],':', metrics.precision_score(y_test, y_pred1), metrics.recall_score(y_test, y_pred1))
        print('f1 score for Naive Bayes classifiern is ', f1_score(y_test,y_pred1))

#3 features:
for i in range(3):
    for j in range(i+1,3):
        for k in range(j+1,3):
            x_train = bank_train[:,[top[i],top[j],top[k]]].astype(float)
            y_train = bank_train[:,19].astype(float)
            
            x_test = bank_test[:,[top[i],top[j],top[k]]].astype(float)
            y_test = bank_test[:,19].astype(float)
            
            
            logit.fit(x_train, y_train)
            y_test_pred = logit.predict(x_test)

            gnb.fit(x_train, y_train)
            y_pred1 = gnb.predict(x_test)
        
            print('accuracy for test data for logistic regression',top[i],top[j],top[k],'is' , metrics.accuracy_score(y_test, y_test_pred))
            print('F1 is ' , f1_score(y_test, y_test_pred))
            
            print('precision and recall for Naive Bayes classifier',top[i], top[j],':', metrics.precision_score(y_test, y_pred1), metrics.recall_score(y_test, y_pred1))
            print('f1 score for Naive Bayes classifiern is ', f1_score(y_test,y_pred1))

accuracy for test data for logistic regression 18 17 is 0.7188378631677601
F1 for logistic regression is  0.6583143507972665
precision and recall for Naive Bayes classifier 18 17 : 0.5892678034102307 0.7619974059662775
f1 score for Naive Bayes classifiern is  0.6645927601809954
accuracy for test data for logistic regression 18 14 is 0.700796626054358
F1 for logistic regression is  0.6245221993531314
precision and recall for Naive Bayes classifier 18 14 : 0.5892678034102307 0.7619974059662775
f1 score for Naive Bayes classifiern is  0.6645927601809954
accuracy for test data for logistic regression 17 14 is 0.7207122774133083
F1 for logistic regression is  0.6623229461756374
precision and recall for Naive Bayes classifier 17 14 : 0.5892678034102307 0.7619974059662775
f1 score for Naive Bayes classifiern is  0.6645927601809954
accuracy for test data for logistic regression 18 17 14 is 0.7221180880974696
F1 is  0.6645927601809954
precision and recall for Naive Bayes classifier 18 17 : 0.58

### SVM on whole dataset (Only the best score with different C number and Linear/RBF model is selected for SVM

In [11]:
top = [18,17,14]

#Linear Kernel
    
#2 features:
for i in range(3):
    for j in range(i+1,3):
        x_train = bank_train[:,[top[i],top[j]]].astype(float)
        y_train = bank_train[:,19].astype(float)

        x_test = bank_test[:,[top[i],top[j]]].astype(float)
        y_test = bank_test[:,19].astype(float)

        clf=[]
        for C in [1,10]:
            
            clf = svm.SVC(kernel='linear', C=C)
            clf.fit(x_train, y_train)
            predicted = clf.predict(x_test)
            print ("Accuracy score for Linear SVM with 2 features ,", top[i],top[j],"and with C number ",C," is " ,metrics.accuracy_score(y_test, predicted))
            print ("F1 score for Linear SVM with 2 features,", top[i],top[j],"and with C number ",C," is " ,metrics.f1_score(y_test, predicted))
            print("Classification report for Linear SVM with 2 features,", top[i],top[j],"and with C number ",C,"is ",classification_report(y_test, predicted))
            print("\n")



Accuracy score for Linear SVM with 2 features , 18 17 and with C number  1  is  0.7221180880974696
F1 score for Linear SVM with 2 features, 18 17 and with C number  1  is  0.6645927601809954
Classification report for Linear SVM with 2 features, 18 17 and with C number  1 is               precision    recall  f1-score   support

        0.0       0.84      0.70      0.76      2726
        1.0       0.59      0.76      0.66      1542

avg / total       0.75      0.72      0.73      4268



Accuracy score for Linear SVM with 2 features , 18 17 and with C number  10  is  0.7221180880974696
F1 score for Linear SVM with 2 features, 18 17 and with C number  10  is  0.6645927601809954
Classification report for Linear SVM with 2 features, 18 17 and with C number  10 is               precision    recall  f1-score   support

        0.0       0.84      0.70      0.76      2726
        1.0       0.59      0.76      0.66      1542

avg / total       0.75      0.72      0.73      4268



Accuracy sc

In [12]:
top = [18,17,14]

#Linear Kernel

#3 features:

x_train = bank_train[:,[18,17,14]].astype(float)
y_train = bank_train[:,19].astype(float)

x_test = bank_test[:,[18,17,14]].astype(float)
y_test = bank_test[:,19].astype(float)

clf=[]
for C in [1,10]:
    clf = svm.SVC(kernel='linear', C=C)
    clf.fit(x_train, y_train)
    predicted = clf.predict(x_test)
    
    print ("Accuracy score for Linear SVM with 3 features with C number ",C," is " ,metrics.accuracy_score(y_test, predicted))
    print ("F1 score for Linear SVM with 3 features with C number ",C,"  is " ,metrics.f1_score(y_test, predicted))
    print("Classification report for Linear SVM with 3 features with C number ",C," is: ",classification_report(y_test, predicted))
    print("\n")



Accuracy score for Linear SVM with 3 features with C number  1  is  0.7626522961574508
F1 score for Linear SVM with 3 features with C number  1   is  0.6041422430636968
Classification report for Linear SVM with 3 features with C number  1  is:               precision    recall  f1-score   support

        0.0       0.76      0.91      0.83      2726
        1.0       0.76      0.50      0.60      1542

avg / total       0.76      0.76      0.75      4268



Accuracy score for Linear SVM with 3 features with C number  10  is  0.7626522961574508
F1 score for Linear SVM with 3 features with C number  10   is  0.6041422430636968
Classification report for Linear SVM with 3 features with C number  10  is:               precision    recall  f1-score   support

        0.0       0.76      0.91      0.83      2726
        1.0       0.76      0.50      0.60      1542

avg / total       0.76      0.76      0.75      4268





#### SVM with RBF Kernel

In [13]:
top = [18,17,14]

#RBF Kernel
   
#2 features:
for i in range(3):
    for j in range(i+1,3):
        x_train = bank_train[:,[top[i],top[j]]].astype(float)
        y_train = bank_train[:,19].astype(float)

        x_test = bank_test[:,[top[i],top[j]]].astype(float)
        y_test = bank_test[:,19].astype(float)

        clf=[]
        for C in [1,10]:
            
            clf = svm.SVC(kernel='rbf', C=C)
            clf.fit(x_train, y_train)
            predicted = clf.predict(x_test)
           
            print ("Accuracy score for RBF SVM with 2 features ,", top[i],top[j],"and with C number ",C,"  is " ,metrics.accuracy_score(y_test, predicted))
            print ("F1 score for RBF SVM with 2 features ,", top[i],top[j],"and with C number ",C,"  is " ,metrics.f1_score(y_test, predicted))
            print("Classification report for RBF SVM with 2 features ,", top[i],top[j],"and with C number ",C," : ",classification_report(y_test, predicted))
            print("\n")



Accuracy score for RBF SVM with 2 features , 18 17 and with C number  1   is  0.7703842549203374
F1 score for RBF SVM with 2 features , 18 17 and with C number  1   is  0.6242331288343559
Classification report for RBF SVM with 2 features , 18 17 and with C number  1  :               precision    recall  f1-score   support

        0.0       0.77      0.91      0.83      2726
        1.0       0.76      0.53      0.62      1542

avg / total       0.77      0.77      0.76      4268



Accuracy score for RBF SVM with 2 features , 18 17 and with C number  10   is  0.7696813495782568
F1 score for RBF SVM with 2 features , 18 17 and with C number  10   is  0.6235158942933742
Classification report for RBF SVM with 2 features , 18 17 and with C number  10  :               precision    recall  f1-score   support

        0.0       0.77      0.91      0.83      2726
        1.0       0.76      0.53      0.62      1542

avg / total       0.77      0.77      0.76      4268



Accuracy score for RB

In [14]:
top = [18,17,14]

#rbf Kernel
    
#3 features:

x_train = bank_train[:,[18,17,14]].astype(float)
y_train = bank_train[:,19].astype(float)

x_test = bank_test[:,[18,17,14]].astype(float)
y_test = bank_test[:,19].astype(float)

clf=[]
for C in [1,10,100]:
    clf = svm.SVC(kernel='rbf', C=C)
    clf.fit(x_train, y_train)
    predicted = clf.predict(x_test)
    
    print ("Accuracy score for RBF SVM with 3 features with C number ",C," is " ,metrics.accuracy_score(y_test, predicted))
    print ("F1 score for RBF SVM with 3 features with C number ",C,"  is " ,metrics.f1_score(y_test, predicted))
    print("Classification report for RBF SVM with 3 features with C number ",C," : ",classification_report(y_test, predicted))
    print("\n")


Accuracy score for RBF SVM with 3 features with C number  1  is  0.7678069353327085
F1 score for RBF SVM with 3 features with C number  1   is  0.6166344294003869
Classification report for RBF SVM with 3 features with C number  1  :               precision    recall  f1-score   support

        0.0       0.77      0.91      0.83      2726
        1.0       0.76      0.52      0.62      1542

avg / total       0.77      0.77      0.76      4268



Accuracy score for RBF SVM with 3 features with C number  10  is  0.7703842549203374
F1 score for RBF SVM with 3 features with C number  10   is  0.6242331288343559
Classification report for RBF SVM with 3 features with C number  10  :               precision    recall  f1-score   support

        0.0       0.77      0.91      0.83      2726
        1.0       0.76      0.53      0.62      1542

avg / total       0.77      0.77      0.76      4268



Accuracy score for RBF SVM with 3 features with C number  100  is  0.770618556701031
F1 score f

## Personal Dataset

In [15]:
personal = finaldf.copy()
personal.drop(['contact','month','day_of_week','campaign','pdays','previous','poutcome','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed',],axis=1,inplace=True)

In [16]:
x=personal.iloc[:,:-1]
y=personal.iloc[:,-1]
fit_rf=RandomForestClassifier(random_state=0)

np.random.seed(0)
param_dist = {'max_depth':[2,3,4],
             'bootstrap':[True, False],
             'max_features':['auto','sqrt','log2',None],
             'criterion':['gini','entropy']}

cv_rf = GridSearchCV(fit_rf,cv=10,param_grid=param_dist,n_jobs=3)

cv_rf.fit(x,y)

print('Best parameters using grid search: \n',
     cv_rf.best_params_)

Best parameters using grid search: 
 {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 4, 'max_features': None}


In [17]:
fit_rf.set_params(criterion='entropy',n_estimators = 100,max_features='auto',max_depth=4,bootstrap=False)
fit_rf.fit(x,y)

def variable_importance(fit):
    importances = fit.feature_importances_
    indices = np.argsort(importances)[::-1]
    return {'importance':importances,'index':indices}

var_imp_rf=variable_importance(fit_rf)
importances_rf=var_imp_rf['importance']
indices_rf=var_imp_rf['index']

print('Feature ranking:')
feature_name=['age','job','marital','education','default','housing','loan']


for i in range(0,indices_rf.shape[0]):
    f = i
    print('{0}. The feature {1} with index {3} has a mean decrease in impurity of {2:.5f}'.format(f+1,feature_name[indices_rf[i]],importances_rf[indices_rf[f]],indices_rf[i]))

Feature ranking:
1. The feature age with index 0 has a mean decrease in impurity of 0.62202
2. The feature education with index 3 has a mean decrease in impurity of 0.15322
3. The feature job with index 1 has a mean decrease in impurity of 0.14183
4. The feature marital with index 2 has a mean decrease in impurity of 0.07376
5. The feature housing with index 5 has a mean decrease in impurity of 0.00580
6. The feature loan with index 6 has a mean decrease in impurity of 0.00337
7. The feature default with index 4 has a mean decrease in impurity of 0.00000


In [18]:
names = dataset.columns.values
finaldf = pd.DataFrame(data=final,columns=names)

np.random.seed(2018)
train = np.random.choice([True, False], finaldf.shape[0], replace=True, p=[0.6, 0.4])


personal_train = personal.iloc[train,:].as_matrix()
personal_test = personal.iloc[~train,:].as_matrix()


top = [0,3,1]
#2 features:
for i in range(3):
    for j in range(i+1,3):
        x_train = personal_train[:,[top[i],top[j]]].astype(float)
        y_train = personal_train[:,7].astype(float)
        
        x_test = personal_test[:,[top[i],top[j]]].astype(float)
        y_test = personal_test[:,7].astype(float)
        
        logit.fit(x_train, y_train)
        y_test_pred = logit.predict(x_test)
        
        gnb.fit(x_train, y_train)
        y_pred1 = gnb.predict(x_test)
            
        print('accuracy for test data for logistic regression',top[i],top[j],'is' , metrics.accuracy_score(y_test, y_test_pred))
        print('F1 for logistic regression is ' , f1_score(y_test, y_test_pred))
        
        print('precision and recall for Naive Bayes classifier',top[i], top[j],':', metrics.precision_score(y_test, y_pred1), metrics.recall_score(y_test, y_pred1))
        print('f1 score for Naive Bayes classifiern is ', f1_score(y_test,y_pred1))
                
#3 features
x_train = personal_train[:,[0,3,1]].astype(float)
y_train = personal_train[:,7].astype(float)

x_test = personal_test[:,[0,3,2]].astype(float)
y_test = personal_test[:,7].astype(float)

logit.fit(x_train, y_train)
y_test_pred = logit.predict(x_test)

gnb.fit(x_train, y_train)
y_pred1 = gnb.predict(x_test)

print('accuracy for test data for logistic regression is' , metrics.accuracy_score(y_test, y_test_pred))
print('F1 for logistic regression is ' , f1_score(y_test, y_test_pred))
        
print('precision and recall for Naive Bayes classifier',top[i], top[j],':', metrics.precision_score(y_test, y_pred1), metrics.recall_score(y_test, y_pred1))
print('f1 score for Naive Bayes classifiern is ', f1_score(y_test,y_pred1))

accuracy for test data for logistic regression 0 3 is 0.6408153701968134
F1 for logistic regression is  0.017937219730941707
precision and recall for Naive Bayes classifier 0 3 : 0.5748663101604278 0.13942931258106356
f1 score for Naive Bayes classifiern is  0.22442588726513568
accuracy for test data for logistic regression 0 1 is 0.6398781630740393
F1 for logistic regression is  0.009026434558349453
precision and recall for Naive Bayes classifier 0 1 : 0.5732647814910026 0.14461738002594035
f1 score for Naive Bayes classifiern is  0.23096841015018127
accuracy for test data for logistic regression 3 1 is 0.6387066541705717
F1 for logistic regression is  0.0
precision and recall for Naive Bayes classifier 3 1 : 0.0 0.0
f1 score for Naive Bayes classifiern is  0.0
accuracy for test data for logistic regression is 0.6412839737582006
F1 for logistic regression is  0.01796023091725465
precision and recall for Naive Bayes classifier 1 1 : 0.5924855491329479 0.13294422827496757
f1 score for N

  
  if __name__ == '__main__':
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


### RBF Kernel

In [19]:
top = [0,3,1]

#RBF Kernel
    
#2 features:
for i in range(3):
    for j in range(i+1,3):
        x_train = personal_train[:,[top[i],top[j]]].astype(float)
        y_train = personal_train[:,6].astype(float)

        x_test = personal_test[:,[top[i],top[j]]].astype(float)
        y_test = personal_test[:,6].astype(float)

        clf=[]
        for C in [1,10,100]:
            
            clf = svm.SVC(kernel='rbf', C=C)
            clf.fit(x_train, y_train)
            predicted = clf.predict(x_test)
            print("Prediction for RBF SVM - C value {1} for features {2} and {3}", C,top[i],top[j])
            
            print ("Accuracy score for RBF SVM with 2 features is " ,metrics.accuracy_score(y_test, predicted))
            print ("F1 score for RBF SVM with 2 features is " ,metrics.f1_score(y_test, predicted))
            print("Classification report for RBF SVM with 2 features: ",classification_report(y_test, predicted))
            print("\n")



Prediction for RBF SVM - C value {1} for features {2} and {3} 1 0 3
Accuracy score for RBF SVM with 2 features is  0.8420805998125586
F1 score for RBF SVM with 2 features is  0.0
Classification report for RBF SVM with 2 features:               precision    recall  f1-score   support

        0.0       0.84      1.00      0.91      3594
        1.0       0.00      0.00      0.00       674

avg / total       0.71      0.84      0.77      4268





  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Prediction for RBF SVM - C value {1} for features {2} and {3} 10 0 3
Accuracy score for RBF SVM with 2 features is  0.837863167760075
F1 score for RBF SVM with 2 features is  0.011428571428571429
Classification report for RBF SVM with 2 features:               precision    recall  f1-score   support

        0.0       0.84      0.99      0.91      3594
        1.0       0.15      0.01      0.01       674

avg / total       0.73      0.84      0.77      4268



Prediction for RBF SVM - C value {1} for features {2} and {3} 100 0 3
Accuracy score for RBF SVM with 2 features is  0.837863167760075
F1 score for RBF SVM with 2 features is  0.011428571428571429
Classification report for RBF SVM with 2 features:               precision    recall  f1-score   support

        0.0       0.84      0.99      0.91      3594
        1.0       0.15      0.01      0.01       674

avg / total       0.73      0.84      0.77      4268



Prediction for RBF SVM - C value {1} for features {2} and {3} 1 0 1
A

In [20]:
top = [0,3,1]

#RBF Kernel
    
#3 features:

x_train = personal_train[:,[0,3,1]].astype(float)
y_train = personal_train[:,6].astype(float)

x_test = personal_test[:,[0,3,1]].astype(float)
y_test = personal_test[:,6].astype(float)

clf=[]
for C in [1,10,100]:

    clf = svm.SVC(kernel='rbf', C=C)
    clf.fit(x_train, y_train)
    predicted = clf.predict(x_test)
   
    print ("Accuracy score for RBF SVM with 3 features is " ,metrics.accuracy_score(y_test, predicted))
    print ("F1 score for RBF SVM with 3 features is " ,metrics.f1_score(y_test, predicted))
    print("Classification report for RBF SVM with 3 features: ",classification_report(y_test, predicted))
    print("\n")



Accuracy score for RBF SVM with 3 features is  0.8409090909090909
F1 score for RBF SVM with 3 features is  0.0
Classification report for RBF SVM with 3 features:               precision    recall  f1-score   support

        0.0       0.84      1.00      0.91      3594
        1.0       0.00      0.00      0.00       674

avg / total       0.71      0.84      0.77      4268



Accuracy score for RBF SVM with 3 features is  0.8256794751640113
F1 score for RBF SVM with 3 features is  0.05583756345177665
Classification report for RBF SVM with 3 features:               precision    recall  f1-score   support

        0.0       0.84      0.97      0.90      3594
        1.0       0.19      0.03      0.06       674

avg / total       0.74      0.83      0.77      4268



Accuracy score for RBF SVM with 3 features is  0.8149015932521088
F1 score for RBF SVM with 3 features is  0.08986175115207373
Classification report for RBF SVM with 3 features:               precision    recall  f1-score   

## Contact Dataset

In [21]:
contact = finaldf.copy()
contact.drop(['age','job','marital','education','default','housing','loan','emp.var.rate','cons.price.idx','cons.conf.idx','euribor3m','nr.employed'],axis=1,inplace=True)
#print(contact)

In [22]:
x=contact.iloc[:,:-1]
y=contact.iloc[:,-1]
fit_rf=RandomForestClassifier(random_state=0)

np.random.seed(0)
param_dist = {'max_depth':[2,3,4],
             'bootstrap':[True, False],
             'max_features':['auto','sqrt','log2',None],
             'criterion':['gini','entropy']}

cv_rf = GridSearchCV(fit_rf,cv=10,param_grid=param_dist,n_jobs=3)

cv_rf.fit(x,y)

print('Best parameters using grid search: \n',
     cv_rf.best_params_)

Best parameters using grid search: 
 {'bootstrap': True, 'criterion': 'entropy', 'max_depth': 2, 'max_features': 'auto'}


In [23]:
fit_rf.set_params(criterion='entropy',n_estimators = 100,max_features='auto',max_depth=2,bootstrap=False)
fit_rf.fit(x,y)

def variable_importance(fit):
    importances = fit.feature_importances_
    indices = np.argsort(importances)[::-1]
    return {'importance':importances,'index':indices}

var_imp_rf=variable_importance(fit_rf)
importances_rf=var_imp_rf['importance']
indices_rf=var_imp_rf['index']

print('Feature ranking:')
feature_name=['contact','month','day_of_week','campaign','pdays','previous','poutcome']


for i in range(0,indices_rf.shape[0]):
    f = i
    print('{0}. The feature {1} with index {3} has a mean decrease in impurity of {2:.5f}'.format(f+1,feature_name[indices_rf[i]],importances_rf[indices_rf[f]],indices_rf[i]))

Feature ranking:
1. The feature pdays with index 4 has a mean decrease in impurity of 0.33236
2. The feature poutcome with index 6 has a mean decrease in impurity of 0.27579
3. The feature contact with index 0 has a mean decrease in impurity of 0.13295
4. The feature previous with index 5 has a mean decrease in impurity of 0.12380
5. The feature month with index 1 has a mean decrease in impurity of 0.10379
6. The feature campaign with index 3 has a mean decrease in impurity of 0.02848
7. The feature day_of_week with index 2 has a mean decrease in impurity of 0.00283


In [24]:
names = dataset.columns.values
finaldf = pd.DataFrame(data=final,columns=names)

np.random.seed(2018)
train = np.random.choice([True, False], finaldf.shape[0], replace=True, p=[0.6, 0.4])


contact_train = contact.iloc[train,:].as_matrix()
contact_test = contact.iloc[~train,:].as_matrix()


top = [4,6,0]
#2 features:
for i in range(3):
    for j in range(i+1,3):
        x_train = contact_train[:,[top[i],top[j]]].astype(float)
        y_train = contact_train[:,7].astype(float)
 
        x_test = contact_test[:,[top[i],top[j]]].astype(float)
        y_test = contact_test[:,7].astype(float)
        
        logit.fit(x_train, y_train)
        y_test_pred = logit.predict(x_test)
        
        gnb.fit(x_train, y_train)
        y_pred1 = gnb.predict(x_test)

        print('accuracy for test data for logistic regression',top[i],top[j],'is' , metrics.accuracy_score(y_test, y_test_pred))
        print('F1 for logistic regression is ' , f1_score(y_test, y_test_pred))

        print('precision and recall for Naive Bayes classifier',top[i], top[j],':', metrics.precision_score(y_test, y_pred1), metrics.recall_score(y_test, y_pred1))
        print('f1 score for Naive Bayes classifiern is ', f1_score(y_test,y_pred1))
        
#3 features
x_train = contact_train[:,[4,6,0]].astype(float)
y_train = contact_train[:,7].astype(float)

x_test = contact_test[:,[4,6,0]].astype(float)
y_test = contact_test[:,7].astype(float)

logit.fit(x_train, y_train)
y_test_pred = logit.predict(x_test)

gnb.fit(x_train, y_train)
y_pred1 = gnb.predict(x_test)

print('accuracy for test data for logistic regression is' , metrics.accuracy_score(y_test, y_test_pred))
print('F1 for logistic regression is ' , f1_score(y_test, y_test_pred))

print('precision and recall for Naive Bayes classifier',top[i], top[j],':', metrics.precision_score(y_test, y_pred1), metrics.recall_score(y_test, y_pred1))
print('f1 score for Naive Bayes classifiern is ', f1_score(y_test,y_pred1))

accuracy for test data for logistic regression 4 6 is 0.7087628865979382
F1 for logistic regression is  0.3481908757210278
precision and recall for Naive Bayes classifier 4 6 : 0.9095890410958904 0.21530479896238652
f1 score for Naive Bayes classifiern is  0.3481908757210278
accuracy for test data for logistic regression 4 0 is 0.7087628865979382
F1 for logistic regression is  0.3481908757210278
precision and recall for Naive Bayes classifier 4 0 : 0.9095890410958904 0.21530479896238652
f1 score for Naive Bayes classifiern is  0.3481908757210278
accuracy for test data for logistic regression 6 0 is 0.6986879100281163
F1 for logistic regression is  0.30561555075593955
precision and recall for Naive Bayes classifier 6 0 : 0.625 0.321011673151751
f1 score for Naive Bayes classifiern is  0.42416452442159386
accuracy for test data for logistic regression is 0.7087628865979382
F1 for logistic regression is  0.3481908757210278
precision and recall for Naive Bayes classifier 0 0 : 0.9095890410

  
  if __name__ == '__main__':


### SVM on contact dataset

#### RBF Kernel

In [25]:
top = [4,6,0]

#RBF Kernel

    
#2 features:
for i in range(3):
    for j in range(i+1,3):
        x_train = contact_train[:,[top[i],top[j]]].astype(float)
        y_train = contact_train[:,7].astype(float)

        x_test = contact_test[:,[top[i],top[j]]].astype(float)
        y_test = contact_test[:,7].astype(float)

        clf=[]
        for C in [1,10,100]:
            
            clf = svm.SVC(kernel='rbf', C=C)
            clf.fit(x_train, y_train)
            predicted = clf.predict(x_test)
            print("Prediction for RBF SVM - C value {1} for features {2} and {3}", C,top[i],top[j])
            
            print ("Accuracy score for RBF SVM with 2 features is " ,metrics.accuracy_score(y_test, predicted))
            print ("F1 score for RBF SVM with 2 features is " ,metrics.f1_score(y_test, predicted))
            print("Classification report for RBF SVM with 2 features: ",classification_report(y_test, predicted))
            print("\n")



Prediction for RBF SVM - C value {1} for features {2} and {3} 1 4 6
Accuracy score for RBF SVM with 2 features is  0.7082942830365511
F1 score for RBF SVM with 2 features is  0.34645669291338577
Classification report for RBF SVM with 2 features:               precision    recall  f1-score   support

        0.0       0.69      0.99      0.81      2726
        1.0       0.91      0.21      0.35      1542

avg / total       0.77      0.71      0.64      4268



Prediction for RBF SVM - C value {1} for features {2} and {3} 10 4 6
Accuracy score for RBF SVM with 2 features is  0.7082942830365511
F1 score for RBF SVM with 2 features is  0.34645669291338577
Classification report for RBF SVM with 2 features:               precision    recall  f1-score   support

        0.0       0.69      0.99      0.81      2726
        1.0       0.91      0.21      0.35      1542

avg / total       0.77      0.71      0.64      4268



Prediction for RBF SVM - C value {1} for features {2} and {3} 100 4 6
A

In [26]:
top = [4,6,0]

#rbf Kernel
    
#3 features:

x_train = bank_train[:,[4,6,0]].astype(float)
y_train = bank_train[:,7].astype(float)

x_test = bank_test[:,[4,6,0]].astype(float)
y_test = bank_test[:,7].astype(float)

clf=[]
for C in [1,10,100]:
    clf = svm.SVC(kernel='rbf', C=C)
    clf.fit(x_train, y_train)
    predicted = clf.predict(x_test)
    
    print ("Accuracy score for RBF SVM with 3 features is " ,metrics.accuracy_score(y_test, predicted))
    print ("F1 score for RBF SVM with 3 features is " ,metrics.f1_score(y_test, predicted))
    print("Classification report for RBF SVM with 3 features: ",classification_report(y_test, predicted))
    print("\n")



Accuracy score for RBF SVM with 3 features is  0.7162605435801312
F1 score for RBF SVM with 3 features is  0.0
Classification report for RBF SVM with 3 features:               precision    recall  f1-score   support

        0.0       0.72      1.00      0.83      3057
        1.0       0.00      0.00      0.00      1211

avg / total       0.51      0.72      0.60      4268





  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Accuracy score for RBF SVM with 3 features is  0.7157919400187441
F1 score for RBF SVM with 3 features is  0.0
Classification report for RBF SVM with 3 features:               precision    recall  f1-score   support

        0.0       0.72      1.00      0.83      3057
        1.0       0.00      0.00      0.00      1211

avg / total       0.51      0.72      0.60      4268



Accuracy score for RBF SVM with 3 features is  0.7155576382380506
F1 score for RBF SVM with 3 features is  0.0
Classification report for RBF SVM with 3 features:               precision    recall  f1-score   support

        0.0       0.72      1.00      0.83      3057
        1.0       0.00      0.00      0.00      1211

avg / total       0.51      0.72      0.60      4268





## Miscellaneous code

In [27]:
#convert nominal values using pd.get_dummies
def dummify_dataset(df, column):       
    df = pd.concat([df, pd.get_dummies(df[column], prefix=column)],axis=1)
    df = df.drop([column], axis=1)
    return df

columns_to_dummify = ['poutcome']
for column in columns_to_dummify:
    contact = dummify_dataset(contact, column)
    
contact.head()

Unnamed: 0,contact,month,day_of_week,campaign,pdays,previous,y,poutcome_0.0,poutcome_1.0,poutcome_2.0
0,1.0,6.0,1.0,1.0,999.0,0.0,0.0,0,1,0
1,1.0,6.0,1.0,1.0,999.0,0.0,0.0,0,1,0
2,1.0,6.0,1.0,1.0,999.0,0.0,0.0,0,1,0
3,1.0,6.0,1.0,1.0,999.0,0.0,0.0,0,1,0
4,1.0,6.0,1.0,1.0,999.0,0.0,0.0,0,1,0


In [28]:
names = dataset.columns.values
finaldf = pd.DataFrame(data=final,    # values
                      columns=names)

np.random.seed(2018)
train = np.random.choice([True, False], finaldf.shape[0], replace=True, p=[0.6, 0.4])


contact_train = contact.iloc[train,:]
contact_test = contact.iloc[~train,:]

In [31]:
x_train = contact_train.iloc[:,[0,1,2,3,4,5,7,8,9]].astype(float)
y_train = contact_train.iloc[:,6].astype(float)

x_test = contact_test.iloc[:,[0,1,2,3,4,5,7,8,9]].astype(float)
y_test = contact_test.iloc[:,6].astype(float)

### SVM Classifier

In [32]:
import numpy as np
from sklearn import datasets, svm, metrics
import matplotlib.pyplot as plt

clf = []
#for C in [1, 10, 100, 1000]:
for C in [100]:
    clf.append(svm.LinearSVC(C=C))

In [33]:
for i in range(1):
    clf[i].fit(x_train, y_train)
    predicted = clf[i].predict(x_test)
    print(predicted)
    print (metrics.accuracy_score(y_test, predicted))

[0. 0. 0. ... 1. 0. 0.]
0.7085285848172446


##### Kernel SVM

In [34]:
clf=[]
for C in [10]:
    clf.append(svm.SVC(kernel='linear', C=C))

In [35]:
for i in range(1):
    clf[i].fit(x_train, y_train)
    predicted = clf[i].predict(x_test)
    print(predicted)
    print (metrics.accuracy_score(y_test, predicted))

[0. 0. 0. ... 1. 0. 0.]
0.7087628865979382


In [36]:
clf = []
for kern in ['rbf']:
    clf.append(svm.SVC(kernel=kern, gamma=1))

for i in range(1):
    clf[i].fit(x_train, y_train)
    predicted = clf[i].predict(x_test)
    print(predicted)
    print (metrics.accuracy_score(y_test, predicted))

[0. 0. 0. ... 1. 0. 0.]
0.7521087160262419


### SVM on top 2 features - previous outcome and contact

In [37]:
x_train = contact_train.iloc[:,[0,1,2,3]].astype(float)
y_train = contact_train.iloc[:,6].astype(float)

x_test = contact_test.iloc[:,[0,1,2,3]].astype(float)
y_test = contact_test.iloc[:,6].astype(float)

In [38]:
#Linear Kernel

clf=[]
for C in [10]:
    clf.append(svm.SVC(kernel='linear', C=C))

for i in range(1):
    clf[i].fit(x_train, y_train)
    predicted = clf[i].predict(x_test)
    print(predicted)
    print (metrics.accuracy_score(y_test, predicted))
    

[0. 0. 0. ... 0. 0. 0.]
0.6387066541705717


In [39]:
#rbf kernel

clf = []
for kern in ['rbf']:
    clf.append(svm.SVC(kernel=kern, gamma=1))

for i in range(1):
    clf[i].fit(x_train, y_train)
    predicted = clf[i].predict(x_test)
    print(predicted)
    print (metrics.accuracy_score(y_test, predicted))

[0. 0. 0. ... 0. 0. 0.]
0.7216494845360825


### SVM on  2 features - pdays and contact¶

In [40]:
x_train = contact_train.iloc[:,[0,4]].astype(float)
y_train = contact_train.iloc[:,6].astype(float)

x_test = contact_test.iloc[:,[0,4]].astype(float)
y_test = contact_test.iloc[:,6].astype(float)

In [41]:
#Linear Kernel

clf=[]
for C in [10]:
    clf.append(svm.SVC(kernel='linear', C=C))

for i in range(1):
    clf[i].fit(x_train, y_train)
    predicted = clf[i].predict(x_test)
    print(predicted)
    print (metrics.accuracy_score(y_test, predicted))
    

[0. 0. 0. ... 1. 0. 0.]
0.7087628865979382


In [42]:
#rbf kernel

clf = []
for kern in ['rbf']:
    clf.append(svm.SVC(kernel=kern, gamma=1))

for i in range(1):
    clf[i].fit(x_train, y_train)
    predicted = clf[i].predict(x_test)
    print(predicted)
    print (metrics.accuracy_score(y_test, predicted))

[0. 0. 0. ... 1. 0. 0.]
0.7087628865979382
