In [1]:
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split
from sklearn import metrics

import numpy as np
from sklearn.metrics import precision_recall_fscore_support

import imblearn
from imblearn.over_sampling import SMOTE
from collections import Counter
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score

import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

Data Loading and Preprocessing

In [2]:
X_train = pd.read_csv('data_train.csv')
X_test = pd.read_csv('data_test.csv')

In [3]:
Y_train = X_train['y']
Y_test = X_test['y']

In [4]:
X_train = X_train.drop('y',axis=1)
X_train = X_train.drop('Unnamed: 0',axis=1)

X_test = X_test.drop('y',axis=1)
X_test = X_test.drop('Unnamed: 0',axis=1)

In [5]:
X_test.head(1)

Unnamed: 0,duration,pdays,euribor3m,job,marital,education,default,housing,contact,month,day_of_week,poutcome
0,226,999,4.857,1,0,1,0,1,0,0,0,0


# With SMOTE

In [6]:
Counter(Y_train)

Counter({0: 28023, 1: 3540})

In [7]:
#SMOTE for balancing the dataset

oversample = SMOTE()
X, y = oversample.fit_resample(X_train, Y_train)

In [8]:
Counter(y)

Counter({0: 28023, 1: 28023})

In [11]:
clf = make_pipeline(StandardScaler(), SVC(C=1.0, kernel = 'poly', gamma='scale',
                                                  random_state=2))
clf.fit(X, y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(kernel='poly', random_state=2))])

In [12]:
y_pred = clf.predict(X_test)
print(f1_score(Y_test,y_pred))
print(precision_score(Y_test,y_pred))
print(recall_score(Y_test,y_pred))
print(accuracy_score(Y_test,y_pred))

0.5349567949725059
0.4010600706713781
0.8030660377358491
0.8498985801217038


In [14]:
mean(cross_val_score(clf, X, y, cv=10,scoring='f1'))

0.8694547740568328

**SVM**

Iterating over different combinations of hyperparameters to identify the optimal parameters

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

clf = make_pipeline(StandardScaler(), SVC(C=1.0, kernel = 'poly', gamma='scale',
                                                  random_state=2))
param_grid = { 
    'kernel': [ 'poly','rbf','linear','sigmoid', 'precomputed'],
    'gamma' : ['scale', 'auto'],
    'degree': range(1,10)
}

CV_rfc = GridSearchCV(estimator=clf, param_grid=param_grid, cv= 10,scoring="f1")
CV_rfc.fit(X, y)

In [None]:
CV_rfc.best_estimator_

Iterating over all combinations of hyperparameters using for loops helps us evaluate the performance of each model on the test data as well. (For sake of curiosity😅)

In [10]:
Training = {}
Testing = {}

for kernel in [ 'poly','rbf','linear','sigmoid', 'precomputed']:
  print(kernel)
  for gamma in ['scale', 'auto']:
    print(gamma)
    if kernel == 'poly':
      for degree in range(1,10):
        print("Start")
        clf = make_pipeline(StandardScaler(), SVC(C=1.0, kernel = kernel, degree=degree, gamma=gamma,
                                                  random_state=2))
        clf.fit(X, y)
        print("end")
        scores = cross_val_score(clf, X, y, cv=10,scoring='f1')
        s = str(kernel)+" "+str(gamma)+" "+str(degree)
        print(s)
        Training[s]= mean(scores)
        y_pred = clf.predict(X_test)
        Testing[s]= f1_score(Y_test, y_pred)
        print(Training,Testing)
      else:
        clf = make_pipeline(StandardScaler(), SVC( C=1.0, kernel = kernel, degree=3, gamma=gamma, coef0=0.0, shrinking=True, probability=False, 
                  random_state=2))
        clf.fit(X, y)
        scores = cross_val_score(clf, X, y, cv=10,scoring='f1')
        s = str(kernel)+" "+str(gamma)
        Training[s]= mean(scores)

        y_pred = clf.predict(X_test)
        Testing[s]=f1_score(Y_test, y_pred)
        print(s)

      
      
          
          


poly
scale
Start
end
poly scale 1
{'poly scale 1': 0.8691311482960756} {'poly scale 1': 0.5146067415730338}
Start
end
poly scale 2
{'poly scale 1': 0.8691311482960756, 'poly scale 2': 0.8405925188174802} {'poly scale 1': 0.5146067415730338, 'poly scale 2': 0.4693053311793215}
Start
end
poly scale 3
{'poly scale 1': 0.8691311482960756, 'poly scale 2': 0.8405925188174802, 'poly scale 3': 0.8718457124919385} {'poly scale 1': 0.5146067415730338, 'poly scale 2': 0.4693053311793215, 'poly scale 3': 0.5335952848722987}
Start
end
poly scale 4
{'poly scale 1': 0.8691311482960756, 'poly scale 2': 0.8405925188174802, 'poly scale 3': 0.8718457124919385, 'poly scale 4': 0.8649412225687649} {'poly scale 1': 0.5146067415730338, 'poly scale 2': 0.4693053311793215, 'poly scale 3': 0.5335952848722987, 'poly scale 4': 0.5181636726546905}
Start
end
poly scale 5
{'poly scale 1': 0.8691311482960756, 'poly scale 2': 0.8405925188174802, 'poly scale 3': 0.8718457124919385, 'poly scale 4': 0.8649412225687649, '

end
poly auto 6
{'poly scale 1': 0.8691311482960756, 'poly scale 2': 0.8405925188174802, 'poly scale 3': 0.8718457124919385, 'poly scale 4': 0.8649412225687649, 'poly scale 5': 0.8702431514324911, 'poly scale 6': 0.8591738380293581, 'poly scale 7': 0.8596228390910211, 'poly scale 8': 0.8389312431011167, 'poly scale 9': 0.8342865711000957, 'poly scale': 0.8718457124919385, 'poly auto 1': 0.8691311482960756, 'poly auto 2': 0.8405925188174802, 'poly auto 3': 0.8718457124919385, 'poly auto 4': 0.8649412225687649, 'poly auto 5': 0.8702431514324911, 'poly auto 6': 0.8591738380293581} {'poly scale 1': 0.5146067415730338, 'poly scale 2': 0.4693053311793215, 'poly scale 3': 0.5335952848722987, 'poly scale 4': 0.5181636726546905, 'poly scale 5': 0.5185477505919496, 'poly scale 6': 0.473305566073457, 'poly scale 7': 0.4691718350966801, 'poly scale 8': 0.3971902937420179, 'poly scale 9': 0.3881932021466905, 'poly scale': 0.5335952848722987, 'poly auto 1': 0.5146067415730338, 'poly auto 2': 0.46930

Storing all the results in dataframes and then saving them in a csv file

In [11]:
Training_List=[]
Training_List_names = []
for i in Training.items():
  Training_List_names.append(i[0])
  Training_List.append(i[1])

d_all_training = {"Model Config":Training_List_names,'Training Average CV F1':Training_List}  

Testing_List=[]
Testing_List_names = []
for i in Testing.items():
  Testing_List_names.append(i[0])
  Testing_List.append(i[1])

d_all_testing= {"Model Config":Testing_List_names,'Testing F1':Testing_List}

In [12]:
df1= pd.DataFrame.from_dict(d_all_training)
df2 = pd.DataFrame.from_dict(d_all_testing)



In [13]:
result = pd.concat([df1, df2], axis=1)

In [17]:
compression_opts = dict(method='zip',
                        archive_name='out.csv') 
df1.to_csv('SVM_train.csv', index=False) 
df2.to_csv('SVM_test.csv', index=False) 
result.to_csv('SVM.csv', index=False) 

In [16]:
result

Unnamed: 0,Model Config,Training Average CV F1,Model Config.1,Testing F1
0,poly scale 1,0.869131,poly scale 1,0.514607
1,poly scale 2,0.840593,poly scale 2,0.469305
2,poly scale 3,0.871846,poly scale 3,0.533595
3,poly scale 4,0.864941,poly scale 4,0.518164
4,poly scale 5,0.870243,poly scale 5,0.518548
5,poly scale 6,0.859174,poly scale 6,0.473306
6,poly scale 7,0.859623,poly scale 7,0.469172
7,poly scale 8,0.838931,poly scale 8,0.39719
8,poly scale 9,0.834287,poly scale 9,0.388193
9,poly scale,0.871846,poly scale,0.533595
