### Import Library

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, accuracy_score, make_scorer, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb
from sklearn.naive_bayes import MultinomialNB

from keras.layers import Dense
from tensorflow.keras.models import Sequential
from keras.wrappers.scikit_learn import KerasRegressor
from tensorflow import keras
from tensorflow.python.keras import backend as k
import math
from keras.utils import to_categorical   

import tensorflow as tf
import time

Using TensorFlow backend.


### Data Import

In [2]:
cols = [    'word_freq_make',      'word_freq_address',   'word_freq_all',       'word_freq_3d',        'word_freq_our',       'word_freq_over',  'word_freq_remove',    
            'word_freq_internet',  'word_freq_order',     'word_freq_mail',      'word_freq_receive',   'word_freq_will',      'word_freq_people',    
            'word_freq_report',    'word_freq_addresses', 'word_freq_free',      'word_freq_business',  'word_freq_email',     'word_freq_you', 
            'word_freq_credit',    'word_freq_your',      'word_freq_font',      'word_freq_000',       'word_freq_money',     'word_freq_hp',        
            'word_freq_hpl',       'word_freq_george',    'word_freq_650',       'word_freq_lab',       'word_freq_labs',      'word_freq_telnet',    
            'word_freq_857',       'word_freq_data',      'word_freq_415',       'word_freq_85',        'word_freq_technology','word_freq_1999',      
            'word_freq_parts',     'word_freq_pm',        'word_freq_direct',    'word_freq_cs',        'word_freq_meeting',   'word_freq_original',  
            'word_freq_project',   'word_freq_re',        'word_freq_edu',       'word_freq_table',     'word_freq_conference','char_freq_1',            
            'char_freq_2',         'char_freq_3',         'char_freq_4',         'char_freq_5',         'char_freq_6',         'capital_run_length_average',
            'capital_run_length_longest',                 'capital_run_length_total' ,                  'flag']

data = pd.read_csv('spambase.data', names = cols)
print(data.columns)
data.head()

Index(['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
       'word_freq_our', 'word_freq_over', 'word_freq_remove',
       'word_freq_internet', 'word_freq_order', 'word_freq_mail',
       'word_freq_receive', 'word_freq_will', 'word_freq_people',
       'word_freq_report', 'word_freq_addresses', 'word_freq_free',
       'word_freq_business', 'word_freq_email', 'word_freq_you',
       'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000',
       'word_freq_money', 'word_freq_hp', 'word_freq_hpl', 'word_freq_george',
       'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet',
       'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85',
       'word_freq_technology', 'word_freq_1999', 'word_freq_parts',
       'word_freq_pm', 'word_freq_direct', 'word_freq_cs', 'word_freq_meeting',
       'word_freq_original', 'word_freq_project', 'word_freq_re',
       'word_freq_edu', 'word_freq_table', 'word_freq_conference',


Unnamed: 0,word_freq_make,word_freq_address,word_freq_all,word_freq_3d,word_freq_our,word_freq_over,word_freq_remove,word_freq_internet,word_freq_order,word_freq_mail,...,char_freq_1,char_freq_2,char_freq_3,char_freq_4,char_freq_5,char_freq_6,capital_run_length_average,capital_run_length_longest,capital_run_length_total,flag
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61,278,1
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1


Checking the Data Balance

In [3]:
data.groupby('flag').size()

flag
0    2788
1    1813
dtype: int64

In [4]:
class_wt = {0:2788, 1:1813}

In [5]:
X = data.iloc[:,:57]
Y = data.iloc[:,57:]

### Feature Selection

We Check for correlations between variables to find and eliminate variables which are highly correlated with each other.<br>


In [9]:
# Create correlation matrix
corr_matrix = X.corr().abs()

# Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(np.bool))

In [10]:
# Find features with correlation greater than 0.65
to_drop = [column for column in upper.columns if any(upper[column] > 0.65)]
to_drop

['word_freq_857', 'word_freq_415', 'word_freq_technology', 'word_freq_direct']

### Data Normalization and Train test Split

In [11]:
# Drop features 
new_features = X.drop(to_drop, axis=1)

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.20,random_state=45)

## Data Normalization
x_train = MinMaxScaler().fit_transform(x_train)
x_test  = MinMaxScaler().fit_transform(x_test)

As Standard Scaler provides negative values, It cannot be used as Naive Bayes cannot hav

We are going to Classify the variables using the following models:
1. Logistic Regression  
2. SVM  
3. Decision Trees 
4. K Nearest Neighbors  
5. Naiive Bayes Calssifier  
6. Neural networks  
7. Random Forest  
8. XgBoost  

We First Run A Nested GridSearch Between 5 Traditional models. We need to manually tune Random Forest, Neural network and XGBoost to get he best accuracy. 

## Best Model Accuracy

### Nested GridSearch To Select the best Possible Classifier

In [71]:
a = time.time()
non_nested_scores = np.zeros(7)
nested_scores = np.zeros(7)

# Setting up Classifiers and parameter dictionaries
svm = SVC(class_weight=class_wt, random_state= 42)
svm_params = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4],'C': np.arange(0.01,3,0.5)},
              {'kernel': ['linear'], 'C': np.arange(0.01,3,0.5)}
              {'kernel': ['poly'], 'C': np.arange(0.01,3,0.5), 'degree': np.arange(1, 6, 1), 'coef0':np.arange(0,3,0.5)},
              {'kernel':['sigmoid'], 'gamma': [1e-2, 1e-3, 1e-4], 'C': np.arange(0.01,3,0.5)}
             ]


dt = DecisionTreeClassifier(random_state = 10, class_weight=class_wt)
dt_grid = {'max_depth':list(range(1,15)), 
           'criterion':['gini', 'entropy'],
           'splitter':['best', 'random'],
           'min_weight_fraction_leaf':[i/10 for i in range(6)],
           'max_features':list(range(1, 57)),
           'min_impurity_decrease':np.arange(0.0001, 0.005, 0.0005),
           'min_samples_split':list(range(2,15)), 
           'min_samples_leaf':list(range(2,15)),
          }

knn = KNeighborsClassifier()
knn_grid = {'n_neighbors':list(range(1,20)), 
            'weights':['uniform', 'distance'], 
            'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
            'p':[1,2]}

logit = LogisticRegression(random_state=10, multi_class= 'multinomial', solver='saga', class_weight = class_wt)
lt_grid = {'penalty':['l1','l2'], 'C':list(np.arange(1, 10, 1))}

nb = MultinomialNB()
nb_grid = [{'alpha': np.arange(1,10,0.5)}]

classifiers = [nb, svm, dt,logit, knn]
classifiers_p = ['nb','svm', 'dt','logit', 'knn']
dicts = [nb_grid, svm_params, dt_grid, lt_grid, knn_grid]


# Nested cross-validation
for i in range(5):
    print('Training:' , classifiers_p[i])
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=42)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=classifiers[i], param_grid=dicts[i], cv=inner_cv, iid=False, n_jobs = -1, scoring= 'accuracy', verbose = 0)
    clf.fit(x_train, y_train)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=x_train, y=y_train, cv=outer_cv)
    nested_scores[i] = nested_score.mean()
    
print(nested_scores)
print(time.time() - a)

Training: nb
Training: svm
Training: dt
Training: logit
Training: knn
[0.88994565 0.92961957 0.91603261 0.88858696 0.91222826 0.
 0.        ]
204.78466153144836


From nested Gridsearch, SVM Provides the best accuracy for the model.  
We perform further Hyperparameter Tunig to get the best parameters for SVM.

### Support Vector Machines

In [None]:
svm = SVC(class_weight=class_wt, random_state= 42)
svm_params = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4],'C': np.arange(0.01,3,0.5)},
              {'kernel': ['linear'], 'C': np.arange(0.01,3,0.5)},
#               {'kernel': ['poly'], 'C': np.arange(0.01,3,0.5), 'degree': np.arange(1, 3, 1), 'coef0':np.arange(0,3,0.5)},
              {'kernel':['sigmoid'], 'gamma': [1e-2, 1e-3, 1e-4], 'C': np.arange(0.01,3,0.5)}
             ]

classifier = GridSearchCV(svm, param_grid= svm_params, cv = 5, scoring = "accuracy", n_jobs = -1, verbose = 1)
classifier.fit(x_train, y_train)
print("Best Score:",classifier.best_score_)
print("Best Parameters:",classifier.best_params_)

Fitting 5 folds for each of 42 candidates, totalling 210 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   17.8s


In [92]:
svm = SVC(class_weight=class_wt, C = 2.51, gamma = 0.01, kernel = 'rbf')
svm.fit(x_train, y_train)

print('SVM Accuracy:',accuracy(y_test, svm.predict(x_test)))

SVM Accuracy: 0.923


### Neural network

Converting Target Variable to Categorical

In [102]:
y_train1 = to_categorical(y_train, num_classes=2)
y_test1 = to_categorical(y_test, num_classes=2)

A Basic Neural Network is used to Train the model. We use 3 hidden layers and an output layer to predict if an observation will be classified as a 1 or a zero.  
Further, We use sigmoid as output function and categorical crossentropy to calculate the loss

In [124]:
model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(units = 57, activation = tf.nn.leaky_relu , input_shape=(57,)),
        tf.keras.layers.Dense(units = 16, activation = tf.nn.leaky_relu),
        tf.keras.layers.Dense(units = 16, activation = tf.nn.leaky_relu),
        tf.keras.layers.Dense(units = 16, activation = tf.nn.leaky_relu),
        tf.keras.layers.Dense(2, activation="sigmoid")])

model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

clf = model.fit(pd.DataFrame(x_train),
                pd.DataFrame(y_train1),
                batch_size= 10, epochs = 50, verbose = 0, validation_split= 0.2)

loss_train, accuracy_train = model.evaluate(x_train, y_train1, verbose = False)
loss, accuracy = model.evaluate(x_test, y_test1, verbose = False)
print('loss_train:', loss_train, 'loss_test:', loss)
print('accuracy_train:', accuracy_train, 'accuracy_test:', accuracy)

loss_train: 0.20105220643074617 loss_test: 0.22966289782368787
accuracy_train: 0.92744565 accuracy_test: 0.91856676


We observe an accuracy of 92% with Neural Network.

### Random Forest

In [134]:
rf = RandomForestClassifier(bootstrap=True, random_state= False, class_weight=class_wt)
rf_grid = {
    'n_estimators':range(5,200,10),
    'criterion':['gini', 'entropy'],
    'max_depth':range(3,20),
    'min_samples_split':list(range(2,15)), 
    'min_samples_leaf':list(range(2,15))
    }

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=rf_grid, n_iter=200, scoring='accuracy', 
                              cv=KFold(n_splits=5, shuffle=True, random_state=45), verbose=1, n_jobs=-1, random_state=45)
rf_random.fit(x_train, y_train)
print(rf_random.best_params_)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   21.6s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   38.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   50.5s finished


{'n_estimators': 195, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': 18, 'criterion': 'entropy'}


In [135]:
rf = RandomForestClassifier(n_estimators =195, min_samples_split=10, min_samples_leaf = 2, max_depth= 18, criterion ='entropy')
print('Accuracy:',accuracy(y_test, rf.predict(x_test)))

Accuracy: 0.9294462293607918


### XGBoost

#### Setting Baseline Accuracy

In [136]:
xg=xgb.XGBClassifier(random_state = 42)
xg_grid = {
    'learning_rate':[0.1,0.05,0.01],
    'n_estimators':range(5,200,10),
    'max_depth':range(3,20),
    'min_child_weight':range(1,6),
    'gamma':[i/10.0 for i in range(0,5)],
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)],
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
    }

xg_random = RandomizedSearchCV(estimator=xg, param_distributions=xg_grid,  n_iter=500, scoring='accuracy', 
                              cv=KFold(n_splits=4, shuffle=True, random_state=45), verbose=0, n_jobs=-1, random_state=45)

xg_random.fit(x_train, y_train)
print(xg_random.best_params_)

{'subsample': 0.8, 'reg_alpha': 0.1, 'n_estimators': 175, 'min_child_weight': 1, 'max_depth': 13, 'learning_rate': 0.1, 'gamma': 0.4, 'colsample_bytree': 0.8}


In [137]:
xg = xgb.XGBClassifier(subsample = 0.8, 
                       reg_alpha= 0.1, 
                       n_estimators= 175, 
                       min_child_weight=1, 
                       max_depth = 13, 
                       learning_rate = 0.1, 
                       gamma =0.4, 
                       colsample_bytree =0.8)
print('SVM Accuracy:',accuracy(y_test, xg.predict(x_test)))

Accuracy: 0.931610731525294


Using the Above Parameters, We first Train the Model for Max Depth and Child Weight

In [138]:
# Grid Search for best parameters
param_test = {'max_depth':list(range(10,25,2)), 
              'min_child_weight':list(range(1,8,2))}

gsearch=GridSearchCV(estimator=xgb.XGBClassifier(subsample = 0.8, 
                                               reg_alpha= 0.1, 
                                               n_estimators= 175,
                                               learning_rate = 0.1, 
                                               gamma =0.4, 
                                               colsample_bytree =0.8,
                                                njobs=-1,
                                                scale_pos_weight=1,
                                                seed=123),
                      param_grid=param_test,
                      scoring='accuracy',
                      n_jobs=-1,
                      iid=True,
                      cv=5, verbose=0)

gsearch.fit(x_train, y_train)
print("Best Score:",gsearch.best_score_)
print("Best parameters:",gsearch.best_params_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  1.1min finished


Best Score: 0.95625
Best parameters: {'max_depth': 24, 'min_child_weight': 1}


### We Then Use this to Set Learning Rate and number of estimators

In [139]:
# Grid Search for best parameters
param_test = {'learning_rate':list(np.arange(0.05,1,0.05)), 
              'n_estimators':list(range(100,225,10))}

gsearch=GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.15,
                                                n_estimators=100,
                                                max_depth=24,
                                                min_child_weight=1,
                                                gamma=0,
                                                subsample=0.9,
                                                colsample_bytree=0.8,
                                                njobs=-1,
                                                scale_pos_weight=1,
                                                seed=123),
                      param_grid=param_test,
                      scoring='accuracy',
                      n_jobs=-1,
                      iid=True,
                      cv=5, verbose=0)

gsearch.fit(x_train, y_train)
print("Best Score:",gsearch.best_score_)
print("Best parameters:",gsearch.best_params_)

Best Score: 0.9567934782608696
Best parameters: {'learning_rate': 0.15000000000000002, 'n_estimators': 190}


### We use above obtained Results for Regularization Coefficient

In [141]:
# Grid Search for best parameters
param_test = {'reg_lambda':[0.1, 0.5,1,2]}

gsearch=GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.15,
                                                n_estimators=190,
                                                max_depth=24,
                                                min_child_weight=1,
                                                gamma=0,
                                                subsample=0.9,
                                                colsample_bytree=0.8,
                                                njobs=-1,
                                                scale_pos_weight=1,
                                                seed=123),
                      param_grid=param_test,
                      scoring='accuracy',
                      n_jobs=-1,
                      iid=True,
                      cv=5, verbose=1)

gsearch.fit(x_train, y_train)
print("Best Score:",gsearch.best_score_)
print("Best parameters:",gsearch.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:    9.8s remaining:    1.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   10.0s finished


Best Score: 0.9567934782608696
Best parameters: {'reg_lambda': 1}


### Then We use the Learning Rate and estimatiors to again get best max_depth and child weight

In [142]:
# Grid Search for best parameters
param_test = {'max_depth':list(range(10,25,2)), 
              'min_child_weight':list(range(1,8,2))}

gsearch=GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.15,
                                                n_estimators=190,
                                                gamma=0,
                                                subsample=0.9,
                                                colsample_bytree=0.8,
                                                njobs=-1,
                                                scale_pos_weight=1,
                                                seed=123,
                                                reg_lambda = 1),
                      param_grid=param_test,
                      scoring='accuracy',
                      n_jobs=-1,
                      iid=True,
                      cv=5, verbose=1)

gsearch.fit(x_train, y_train)
print("Best Score:",gsearch.best_score_)
print("Best parameters:",gsearch.best_params_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  1.1min finished


Best Score: 0.9567934782608696
Best parameters: {'max_depth': 24, 'min_child_weight': 1}


### Final Model

In [156]:
xg = xgb.XGBClassifier(learning_rate=0.15,
                        n_estimators=150,
                        max_depth=24,
                        min_child_weight=1,
                        gamma=0,
                        subsample=0.9,
                        colsample_bytree=0.8,
                        njobs=-1,
                        scale_pos_weight=1,
                        seed=123,
                        reg_lambda = 1)
xg.fit(x_train, y_train)
print("accuracy_score:", accuracy_score(y_test, xg.predict(x_test)))

accuracy_score: 0.9348534201954397


| Classifier | Accuracy |
|--|--|
|SVM|92.3|
|Neural Network|91.8|
|Random Forest|92.9|
|XGBoost|93.5|

## Defining Cost Function

We Define the cost function as Follows:


In [25]:
def custom_cost_function(y_true, y_pred):
    fp = confusion_matrix(y_pred, y_true) [1,0]
    fl = confusion_matrix(y_pred, y_true) [0,1]
    
    c = (-10 *fp) - fl
    return c
scr = make_scorer(custom_cost_function, greater_is_better = True)

We Assume that in a classification problem, Classifying a correct email as a spam is much more dangerous than classifying incorrect email as not spam.

If a correct mail is classified as SPAM, it may lead to a great deal of loss for the user as an important piece of imformation is lost. On the contrary, If a spam ends up in the inbox of a user, it may not be such a big deal. It is inconvenient, but the cost of this misclassification is much lower.

because of this We Define the cost as follows:  
False Positive: 10  
False Negative: 1

We First Run A Nested GridSearch Between 5 Traditional models. We need to manually tune Random Forest, Neural network and XGBoost to get he best accuracy. 

### Nested GridSearch To Select the best Possible Classifier

In [12]:
non_nested_scores = np.zeros(4)
nested_scores = np.zeros(4)

# Setting up Classifiers and parameter dictionaries
svm = SVC(class_weight=class_wt, random_state= 42)
svm_params = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4],'C': np.arange(0.01,3,0.5)},
              {'kernel': ['linear'], 'C': np.arange(0.01,3,0.5)}
              {'kernel': ['poly'], 'C': np.arange(0.01,3,0.5), 'degree': np.arange(1, 6, 1), 'coef0':np.arange(0,3,0.5)},
              {'kernel':['sigmoid'], 'gamma': [1e-2, 1e-3, 1e-4], 'C': np.arange(0.01,3,0.5)}
             ]


dt = DecisionTreeClassifier(random_state = 10, class_weight=class_wt)
dt_grid = {'max_depth':list(range(1,15)), 
           'criterion':['gini', 'entropy'],
           'splitter':['best', 'random'],
           'min_weight_fraction_leaf':[i/10 for i in range(6)],
           'max_features':list(range(1, 57)),
           'min_impurity_decrease':np.arange(0.0001, 0.005, 0.0005),
           'min_samples_split':list(range(2,15)), 
           'min_samples_leaf':list(range(2,15)),
          }

knn = KNeighborsClassifier()
knn_grid = {'n_neighbors':list(range(1,10)), 
            'weights':['uniform', 'distance'], 
            'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
            'p':[1,2]
           }

logit = LogisticRegression(random_state=10, multi_class= 'multinomial', solver='saga', class_weight = class_wt)
lt_grid = {'penalty':['l1','l2'], 'C':list(np.arange(1, 10, 1))}

nb = MultinomialNB()
nb_grid = [{'alpha': np.arange(1,10,0.5)}]

classifiers = [svm, dt,logit, knn]
classifiers_p = ['svm', 'dt','logit', 'knn']
dicts = [ svm_params, dt_grid, lt_grid, knn_grid]


# Nested cross-validation
for i in range(4):
    print('Training:' , classifiers_p[i])
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=42)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=42)

    # Non_nested parameter search and scoring
    clf = GridSearchCV(estimator=classifiers[i], param_grid=dicts[i], cv=inner_cv, iid=False, n_jobs = -1, scoring= scr, verbose = 0)
    clf.fit(x_train, y_train)
    non_nested_scores[i] = clf.best_score_

    # Nested CV with parameter optimization
    nested_score = cross_val_score(clf, X=x_train, y=y_train, cv=outer_cv)
    nested_scores[i] = nested_score.mean()
    
print(nested_scores)

Training: svm
Training: dt
Training: logit
Training: knn
[-238.   -263.5  -429.   -303.75]


From nested Gridsearch, SVM Provides the best accuracy for the model.  
We perform further Hyperparameter Tunig to get the best parameters for SVM.

### Support Vector Machines

In [13]:
svm = SVC(class_weight=class_wt, random_state= 42)
svm_params = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4],'C': np.arange(0.01,3,0.5)},
#               {'kernel': ['linear'], 'C': np.arange(0.01,3,0.5)},
#               {'kernel': ['poly'], 'C': np.arange(0.01,3,0.5), 'degree': np.arange(1, 3, 1), 'coef0':np.arange(0,3,0.5)},
              {'kernel':['sigmoid'], 'gamma': [1e-2, 1e-3, 1e-4], 'C': np.arange(0.01,3,0.5)}
             ]

classifier = GridSearchCV(svm, param_grid= svm_params, cv = 5, scoring = scr, n_jobs = -1, verbose = 0)
classifier.fit(x_train, y_train)
print("Best Score:",classifier.best_score_)
print("Best Parameters:",classifier.best_params_)

Best Score: -187.41195652173914
Best Parameters: {'C': 0.01, 'gamma': 0.001, 'kernel': 'rbf'}


In [14]:
svm = SVC(class_weight=class_wt, C = 0.01, gamma = 0.001, kernel = 'rbf')
svm.fit(x_train, y_train)

print('SVM Score:',scr(y_test, xg.predict(x_test)))

SVM Score: -57.0


### Neural network

Converting Target Variable to Categorical

In [16]:
y_train1 = to_categorical(y_train, num_classes=2)
y_test1 = to_categorical(y_test, num_classes=2)

A Basic Neural Network is used to Train the model. We use 3 hidden layers and an output layer to predict if an observation will be classified as a 1 or a zero.  
Further, We use sigmoid as output function and categorical crossentropy to calculate the loss

In [None]:
model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(units = 10, activation = tf.nn.leaky_relu , input_shape=(57,)),
        tf.keras.layers.Dense(units = 16, activation = tf.nn.leaky_relu),
        tf.keras.layers.Dense(units = 16, activation = tf.nn.leaky_relu),
        tf.keras.layers.Dense(units = 16, activation = tf.nn.leaky_relu),
        tf.keras.layers.Dense(2, activation="sigmoid")])

model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics = ['acc'])

clf = model.fit(pd.DataFrame(x_train),
                pd.DataFrame(y_train1),
                batch_size= 10, epochs = 50, verbose = 0, validation_split= 0.2)

loss_train, accuracy_train = model.evaluate(x_train, y_train1, verbose = False)
loss, accuracy = model.evaluate(x_test, y_test1, verbose = False)
print('loss_train:', loss_train, 'loss_test:', loss)
print('accuracy_train:', accuracy_train, 'accuracy_test:', accuracy)

We observe ascore of -53 with Neural Network.

### Random Forest

In [32]:
rf = RandomForestClassifier(bootstrap=True, random_state= False, class_weight=class_wt)
rf_grid = {
    'n_estimators':range(5,200,10),
    'criterion':['gini', 'entropy'],
    'max_depth':range(3,20),
    'min_samples_split':list(range(2,15)), 
    'min_samples_leaf':list(range(2,15))
    }

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=rf_grid, n_iter=200, scoring=scr, 
                              cv=KFold(n_splits=5, shuffle=True, random_state=45), verbose=1, n_jobs=-1, random_state=45)
rf_random.fit(x_train, y_train)
print(rf_random.best_params_)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    8.0s
[Parallel(n_jobs=-1)]: Done 426 tasks      | elapsed:   21.2s
[Parallel(n_jobs=-1)]: Done 776 tasks      | elapsed:   39.1s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:   51.6s finished


{'n_estimators': 135, 'min_samples_split': 8, 'min_samples_leaf': 8, 'max_depth': 5, 'criterion': 'entropy'}


In [34]:
rf = RandomForestClassifier(n_estimators =135, min_samples_split=8, min_samples_leaf = 8, max_depth= 5, criterion ='entropy')
rf.fit(x_train, y_train)
print('Score:',scr(y_test, rf.predict(x_test)))

Accuracy: -62.0


### XGBoost

#### Setting Baseline Accuracy

In [None]:
xg=xgb.XGBClassifier(random_state = 42)
xg_grid = {
    'learning_rate':[0.1,0.05,0.01],
    'n_estimators':range(5,200,10),
    'max_depth':range(3,20),
    'min_child_weight':range(1,6),
    'gamma':[i/10.0 for i in range(0,5)],
    'subsample':[i/10.0 for i in range(6,10)],
    'colsample_bytree':[i/10.0 for i in range(6,10)],
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
    }

xg_random = RandomizedSearchCV(estimator=xg, param_distributions=xg_grid, n_iter=100, scoring='accuracy', 
                              cv=KFold(n_splits=4, shuffle=True, random_state=45), verbose=0, n_jobs=-1, random_state=45)

xg_random.fit(x_train, y_train)
print(xg_random.best_params_)

In [17]:
xg = xgb.XGBClassifier(subsample = 0.8, 
                       reg_alpha= 0.1, 
                       n_estimators= 175, 
                       min_child_weight=1, 
                       max_depth = 13, 
                       learning_rate = 0.1, 
                       gamma =0.4, 
                       colsample_bytree =0.8)
print('Score:',np.mean(cross_val_score(xg, x_test, y_test, scoring = scr, cv = 4)))

Score: -74.5


Using the Above Parameters, We first Train the Model for Max Depth and Child Weight

In [16]:
# Grid Search for best parameters
param_test = {'max_depth':list(range(10,20,2)), 
              'min_child_weight':list(range(1,8,2))}

gsearch=GridSearchCV(estimator=xgb.XGBClassifier(subsample = 0.8, 
                                               reg_alpha= 0.1, 
                                               n_estimators= 175,
                                               learning_rate = 0.1, 
                                               gamma =0.4, 
                                               colsample_bytree =0.8,
                                                njobs=-1,
                                                scale_pos_weight=1,
                                                seed=123),
                      param_grid=param_test,
                      scoring=scr,
                      n_jobs=-1,
                      iid=True,
                      cv=5, verbose=0)

gsearch.fit(x_train, y_train)
print("Best Score:",gsearch.best_score_)
print("Best parameters:",gsearch.best_params_)

Best Score: -166.40190217391304
Best parameters: {'max_depth': 18, 'min_child_weight': 1}


#### We Then Use this to Set Learning Rate and number of estimators

In [19]:
# Grid Search for best parameters
param_test = {'learning_rate':list(np.arange(0.05,1,0.05)), 
              'n_estimators':list(range(180,225,10))}

gsearch=GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.15,
                                                n_estimators=100,
                                                max_depth=18,
                                                min_child_weight=1,
                                                gamma=0,
                                                subsample=0.9,
                                                colsample_bytree=0.8,
                                                njobs=-1,
                                                scale_pos_weight=1,
                                                seed=123),
                      param_grid=param_test,
                      scoring=scr,
                      n_jobs=-1,
                      iid=True,
                      cv=5, verbose=0)

gsearch.fit(x_train, y_train)
print("Best Score:",gsearch.best_score_)
print("Best parameters:",gsearch.best_params_)

Best Score: -158.80597826086955
Best parameters: {'learning_rate': 0.05, 'n_estimators': 210}


#### We use above obtained Results for Regularization Coefficient

In [21]:
# Grid Search for best parameters
param_test = {'reg_lambda':[0.1, 0.5,1,2]}

gsearch=GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.05,
                                                n_estimators=210,
                                                max_depth=18,
                                                min_child_weight=1,
                                                gamma=0,
                                                subsample=0.9,
                                                colsample_bytree=0.8,
                                                njobs=-1,
                                                scale_pos_weight=1,
                                                seed=123),
                      param_grid=param_test,
                      scoring=scr,
                      n_jobs=-1,
                      iid=True,
                      cv=5, verbose=1)

gsearch.fit(x_train, y_train)
print("Best Score:",gsearch.best_score_)
print("Best parameters:",gsearch.best_params_)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  20 | elapsed:   13.5s remaining:    1.4s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   13.6s finished


Best Score: -154.60434782608695
Best parameters: {'reg_lambda': 2}


#### Then We use the Learning Rate and estimatiors to again get best max_depth and child weight

In [23]:
# Grid Search for best parameters
param_test = {'max_depth':list(range(10,25,2)), 
              'min_child_weight':list(range(1,8,2))}

gsearch=GridSearchCV(estimator=xgb.XGBClassifier(learning_rate=0.05,
                                                n_estimators=210,
                                                gamma=0,
                                                subsample=0.9,
                                                colsample_bytree=0.8,
                                                njobs=-1,
                                                scale_pos_weight=1,
                                                seed=123,
                                                reg_lambda = 2),
                      param_grid=param_test,
                      scoring=scr,
                      n_jobs=-1,
                      iid=True,
                      cv=5, verbose=1)

gsearch.fit(x_train, y_train)
print("Best Score:",gsearch.best_score_)
print("Best parameters:",gsearch.best_params_)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done 160 out of 160 | elapsed:  1.6min finished


Best Score: -152.39864130434782
Best parameters: {'max_depth': 16, 'min_child_weight': 1}


### Final Model

In [24]:
xg = xgb.XGBClassifier(subsample = 0.8, 
                       reg_alpha= 0.1, 
                       n_estimators= 175, 
                       min_child_weight=1, 
                       max_depth = 13, 
                       learning_rate = 0.1, 
                       gamma =0.4, 
                       colsample_bytree =0.8)
xg.fit(x_train, y_train)
print('Score:',scr(y_test, xg.predict(x_test)))

Score: -74.5


| Classifier | Accuracy |
|--|--|
|SVM|57|
|Neural Network|53|
|Random Forest|62|
|XGBoost|74.7|

The Final model obtained has a cost of SVM is able to give the best cost of 57 in this regard.