In [1]:
# Importing Required Python Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides scikit-learn's data-conversion and
# deprecation warnings raised by later cells — confirm that is intended.
warnings.filterwarnings('ignore')
# Show all DataFrame columns when displaying wide frames (no column truncation)
pd.set_option('display.max_columns',None)

In [2]:
# Importing Cross_val_score & Stratified K fold 
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

In [3]:
# Importing the Sklearn's roc_auc_score module
from sklearn.metrics import roc_auc_score

In [4]:
# Loading Reduced feature Training set
# Both paths are relative to the notebook's working directory.
X_train_red = pd.read_csv('X_train_final.csv')
# NOTE(review): the target file is named 'y_train.final.csv' (dot) while the feature
# file is 'X_train_final.csv' (underscore) — confirm the name is intentional.
# pd.read_csv returns a DataFrame, so y_train is 2-D here rather than a 1-D label vector.
y_train = pd.read_csv('y_train.final.csv')

In [5]:
# Loading Reduced feature Test set
# Both paths are relative to the notebook's working directory.
X_test_red = pd.read_csv('X_test_final.csv')
# NOTE(review): the target file is named 'y_test.final.csv' (dot) while the feature
# file is 'X_test_final.csv' (underscore) — confirm the name is intentional.
# pd.read_csv returns a DataFrame, so y_test is 2-D here rather than a 1-D label vector.
y_test = pd.read_csv('y_test.final.csv')

## Model_8: SVM Classifier with Tuned Hyperparameters using Random Search CV.

In [6]:
# Importing Random Search CV objects
from sklearn.model_selection import RandomizedSearchCV

In [7]:
# Initializing the K Fold object
# random_state only has an effect when shuffle=True. With the default shuffle=False it is
# ignored by older scikit-learn releases and rejected with a ValueError by newer ones
# (>= 0.24), so it is dropped here. The resulting 5-fold stratified split is the same
# deterministic, unshuffled split as before.
cv_strat = StratifiedKFold(n_splits=5)

In [8]:
# Hyperparameter search space for the SVM Classifier (sampled by Random Search CV)
params_svm = dict(
    # Candidate values for the SVC regularisation parameter C
    C=[0.001, 0.01, 0.1, 0.5, 1, 2, 4, 6, 8, 10, 20, 40, 60, 80, 100],
    # Candidate kernel coefficients: numeric values plus the two string options
    gamma=[0.0001, 0.001, 0.01, 0.1, 0.5, 1, 2, 4, 6, 8, 10, 'scale', 'auto'],
    # 'balanced' mode, or an explicit weight on class 1 with class 0 fixed at 1.0
    class_weight=['balanced'] + [{0: 1.0, 1: weight} for weight in (9.0, 10, 11, 12)],
)
              

In [9]:
# Importing the SVM Classifier
from sklearn.svm import SVC

In [10]:
# Instantiating the SVM Classifier: RBF kernel with probability estimates enabled,
# so that predict_proba is available on the fitted model
svm_s = SVC(
    kernel='rbf',
    probability=True,
    random_state=42,
    cache_size=500,
)

In [11]:
# Initializing Random_search CV object: 50 sampled parameter combinations,
# each scored by roc_auc over the stratified folds in cv_strat
randcv_svm = RandomizedSearchCV(
    estimator=svm_s,
    param_distributions=params_svm,
    n_iter=50,
    scoring='roc_auc',
    cv=cv_strat,
    n_jobs=5,
    random_state=42,
)

In [12]:
# Running Random Search CV on Training set
# y_train was read with pd.read_csv, so it is a DataFrame (a column vector) rather than
# the 1-D label array scikit-learn expects. Flatten it explicitly instead of relying on
# the implicit conversion, whose DataConversionWarning is hidden by the global
# warnings filter.
randcv_svm.fit(X_train_red, np.ravel(y_train))

RandomizedSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=False),
                   estimator=SVC(cache_size=500, probability=True,
                                 random_state=42),
                   n_iter=50, n_jobs=5,
                   param_distributions={'C': [0.001, 0.01, 0.1, 0.5, 1, 2, 4, 6,
                                              8, 10, 20, 40, 60, 80, 100],
                                        'class_weight': ['balanced',
                                                         {0: 1.0, 1: 9.0},
                                                         {0: 1.0, 1: 10},
                                                         {0: 1.0, 1: 11},
                                                         {0: 1.0, 1: 12}],
                                        'gamma': [0.0001, 0.001, 0.01, 0.1, 0.5,
                                                  1, 2, 4, 6, 8, 10, 'scale',
                                                  'auto']},
                 

In [13]:
# Getting the parameters of the best SVM Model
randcv_svm.best_params_

{'gamma': 0.0001, 'class_weight': 'balanced', 'C': 80}

In [14]:
# Getting the best roc_auc 5 Fold Score 
randcv_svm.best_score_

0.781478287729693

In [17]:
# Getting the best SVM estimator from Random search CV, which has already been trained on the entire Training set
svm_red = randcv_svm.best_estimator_

In [18]:
# Helper that reports the roc_auc score of a fitted classifier on a given data set
def cal_roc_auc(X, y, cls, f_set, t_set, model_name):
    ''' Prints the roc auc score of a fitted classifier on the given data.
        X : features passed to cls.predict_proba
        y : true labels passed to roc_auc_score
        cls : fitted classifier exposing predict_proba
        f_set : String: specifies 'full feature', 'Reduced feature'
        t_set: String: specifies 'training', 'test'
        model_name: String: specifies Name of the model
        Returns nothing; the score is only printed. '''

    # Second column of predict_proba is used as the score for the roc_auc computation
    class_one_proba = cls.predict_proba(X)[:, 1]
    headline = 'The roc_auc_score for the {} {} set using the best {} classifier is '.format(f_set, t_set, model_name)
    auc_value = roc_auc_score(y, class_one_proba)
    print(headline, auc_value)

In [19]:
# Calculating the Reduced feature training set roc_auc score using the best SVM model
cal_roc_auc(X_train_red, y_train, svm_red, 'Reduced feature', 'training', 'SVM')

The roc_auc_score for the Reduced feature training set using the best SVM classifier is  0.7853852343047899


In [20]:
# Calculating the Reduced feature test set roc_auc score using the best SVM model
cal_roc_auc(X_test_red, y_test, svm_red, 'Reduced feature', 'test', 'SVM')

The roc_auc_score for the Reduced feature test set using the best SVM classifier is  0.7934770655455445


In [21]:
# Saving the Reduced feature set best SVM Classifier 
# The fitted estimator is written to 'SVM_Reduced.joblib' in the working directory,
# so it can be reloaded with joblib.load without re-running the search.
import joblib
joblib.dump(svm_red,'SVM_Reduced.joblib')

['SVM_Reduced.joblib']

### Calculating R_R ratio for best SVM Classifier.

In [8]:
# Loading the best SVM  Classifier model
# NOTE(review): execution counts restart at In [8] here, so this section appears to come
# from a separate kernel session; X_train_red, y_train and np must already be defined
# before the cells below are run.
import joblib
# joblib files are pickle-based: only load files that you created or otherwise trust.
svm_red = joblib.load('SVM_Reduced.joblib')

In [9]:
# Importing required Libraries
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [10]:
# Instantiating the Stratified K fold object
# random_state only has an effect when shuffle=True. With the default shuffle=False it is
# ignored by older scikit-learn releases and rejected with a ValueError by newer ones
# (>= 0.24), so it is dropped here. The resulting 10-fold stratified split is the same
# deterministic, unshuffled split as before.
cv_strat = StratifiedKFold(n_splits=10)

In [11]:
# Computing the CV scores using sklearn's cross_val_score
# y_train is a DataFrame (a column vector), so it is flattened to the 1-D label array
# scikit-learn expects. This avoids relying on the implicit conversion, whose
# DataConversionWarning is hidden by the global warnings filter.
# The result holds one roc_auc value per fold of cv_strat.
score_svm = cross_val_score(svm_red, X_train_red, np.ravel(y_train), cv=cv_strat, n_jobs=5, scoring='roc_auc')

In [12]:
print('The reward associated with the best SVM Classifier using roc_auc metric is: ',np.mean(score_svm))

The reward associated with the best SVM Classifier using roc_auc metric is:  0.782037372445053


In [13]:
print('The risk associated with the best SVM Classifier using roc_auc metric is: ',np.std(score_svm))

The risk associated with the best SVM Classifier using roc_auc metric is:  0.016582480391873475


In [14]:
R_R_Ratio_svm = np.mean(score_svm)/np.std(score_svm)

In [15]:
print('The reward risk ratio for the best SVM Classifier using roc_auc metric is: ',R_R_Ratio_svm)

The reward risk ratio for the best SVM Classifier using roc_auc metric is:  47.16045814402432


#### R_R Ratio for the best SVM classifier using reduced feature set is: 47.16045814402432

## Observations:
### 1) The SVM classifier shows no sign of overfitting, as the training and test roc_auc scores are almost equal. However, both the training and test roc_auc scores of the best SVM model are lower than the corresponding scores of the best Logistic Regression model (which belongs to the same classifier family as SVM).
### 2) SVM fits very slowly and is only a feasible option for small or medium-sized datasets.
### 3) The R_R ratio of the best SVM model is lower than that of the best Logistic Regression model, which also has much lower computational complexity. Thus, for this dataset, Logistic Regression outperforms SVM on all fronts.