In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

## Type 1 dataset

In [44]:
# Load the processed S1 feature table and separate the target column from the features.
df = pd.read_csv('../dataset/processed_dataset/final_data_S1.csv')
y = df['class']
X = df.drop(columns=['class'])

# Hold out 20% of the rows for testing; stratify on the label so both splits keep
# the same class proportions, and fix the seed so the split is reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [45]:
# Preview the first rows of the S1 training features produced by the split above.
xtrain.head()

Unnamed: 0,locDbShimmer,locPctJitter,DFA,meanNoiseToHarmHarmonicity,numPeriodsPulses,PPE,maxIntensity,f1,f2,f3,...,mean_MFCC_2nd_coef,mean_MFCC_3rd_coef,mean_MFCC_4th_coef,mean_MFCC_5th_coef,mean_MFCC_6th_coef,mean_MFCC_7th_coef,mean_MFCC_8th_coef,mean_MFCC_9th_coef,mean_MFCC_10th_coef,mean_MFCC_12th_coef
40,0.218765,0.037707,0.827942,0.024048,0.451791,0.959555,0.706979,0.226865,0.395574,0.172818,...,0.550667,0.296805,0.513906,0.676077,0.654307,0.21682,0.445642,0.444748,0.135134,0.561431
47,0.820613,0.14731,0.776283,0.356674,0.433884,0.426359,0.515664,0.751612,0.303098,0.843062,...,0.714284,0.626373,0.594501,0.490417,0.539186,0.576182,0.843049,0.568743,0.499411,0.505783
116,0.113758,0.008044,0.193537,0.003579,0.679063,0.936383,0.804886,0.929958,0.650572,0.544976,...,0.109548,0.154698,0.427971,0.297728,0.690704,0.424521,0.462322,0.296469,0.244151,0.343756
131,0.452115,0.422826,0.680889,0.349384,0.137741,0.480461,0.692021,0.373978,0.312325,0.355587,...,0.762243,0.459204,0.745548,0.453004,0.613895,0.534103,0.468956,0.440034,0.65089,0.527681
87,0.164317,0.1272,0.727276,0.063317,0.212121,0.937214,0.596907,0.385927,0.334516,0.578855,...,0.679746,0.454103,0.697229,0.584356,0.552085,0.39211,0.729282,0.76404,0.574777,0.677339


In [46]:
from sklearn.naive_bayes import GaussianNB

# Baseline: Gaussian Naive Bayes with default hyperparameters, trained on the S1
# training split (fit returns the estimator itself, so the call can be chained).
gnb = GaussianNB().fit(xtrain, ytrain)

# Hard class predictions for the held-out S1 test split.
y_pred = gnb.predict(xtest)

In [58]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, confusion_matrix

# Score the baseline predictions with each metric in turn; every scorer takes
# (true labels, predicted labels) in that order.
acc, f1, recall, precision = (
    scorer(ytest, y_pred)
    for scorer in (accuracy_score, f1_score, recall_score, precision_score)
)
print(acc, f1, recall, precision)

0.7720588235294118 0.841025641025641 0.803921568627451 0.8817204301075269


### Hyperparameter optimization

In [53]:
# Candidate var_smoothing values: e^-5, e^-10, ..., e^-495 (99 values, decreasing).
params = {
    'var_smoothing': [np.exp(-exponent) for exponent in range(5, 500, 5)],
}

In [54]:
from sklearn.model_selection import ParameterGrid
# Expand the search space into an iterable grid of parameter dicts and report
# how many candidate settings it contains.
param_dict = ParameterGrid(params)
print(len(param_dict))

99


In [55]:
def run_model(param, X_train, y_train, X_test, y_test):
    """Fit a GaussianNB with one hyperparameter setting and score it on a test set.

    Parameters
    ----------
    param : dict
        Keyword arguments forwarded to ``GaussianNB`` (for example
        ``{'var_smoothing': 1e-9}``). The dict is left unmodified.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test, y_test
        Features and labels the fitted model is scored on.

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index ``0``) containing the hyperparameters followed
        by ``accuracy_score``, ``f1_score``, ``precision_score``, ``recall_score``,
        ``roc_auc_score`` and the confusion matrix rendered as a string.
    """
    gnb = GaussianNB(**param)
    gnb.fit(X_train, y_train)

    # Predict once and reuse the result for every metric instead of re-running
    # inference for each score.
    y_pred = gnb.predict(X_test)

    # Copy the hyperparameters into a new record so the caller's dict is not
    # polluted with metric values.
    record = dict(param)
    record.update({
        "accuracy_score": accuracy_score(y_test, y_pred),
        "f1_score": f1_score(y_test, y_pred),
        "precision_score": precision_score(y_test, y_pred),
        "recall_score": recall_score(y_test, y_pred),
        # NOTE(review): ROC AUC is computed from hard class predictions here, not
        # from predicted probabilities; kept as-is so reported numbers do not change.
        "roc_auc_score": roc_auc_score(y_test, y_pred),
        # Stored as a string so the matrix fits in a single DataFrame cell.
        "confusion_matrix": str(confusion_matrix(y_test, y_pred)),
    })

    return pd.DataFrame(record, index=[0])


In [56]:
# Evaluate every grid point on the S1 split, collecting one single-row result
# frame per hyperparameter setting.
lst = [
    run_model(param, xtrain, ytrain, xtest, ytest)
    for param in param_dict
]

In [57]:
# Stack the per-setting result rows into one table. Each input frame carries the
# index label 0, so renumber the rows to give every setting a unique index.
pd.concat(lst, ignore_index=True)

Unnamed: 0,var_smoothing,accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,confusion_matrix
0,6.737947e-03,0.764706,0.836735,0.87234,0.803922,0.725490,[[22 12]\n [20 82]]
0,4.539993e-05,0.772059,0.841026,0.88172,0.803922,0.740196,[[23 11]\n [20 82]]
0,3.059023e-07,0.772059,0.841026,0.88172,0.803922,0.740196,[[23 11]\n [20 82]]
0,2.061154e-09,0.772059,0.841026,0.88172,0.803922,0.740196,[[23 11]\n [20 82]]
0,1.388794e-11,0.772059,0.841026,0.88172,0.803922,0.740196,[[23 11]\n [20 82]]
...,...,...,...,...,...,...,...
0,5.130044e-207,0.772059,0.841026,0.88172,0.803922,0.740196,[[23 11]\n [20 82]]
0,3.456597e-209,0.772059,0.841026,0.88172,0.803922,0.740196,[[23 11]\n [20 82]]
0,2.329036e-211,0.772059,0.841026,0.88172,0.803922,0.740196,[[23 11]\n [20 82]]
0,1.569292e-213,0.772059,0.841026,0.88172,0.803922,0.740196,[[23 11]\n [20 82]]


On observing the above dataframe, the optimal value of var_smoothing is found to be `4.539993e-05`

In [63]:
# Final S1 model: GaussianNB with the var_smoothing value chosen from the grid
# results, fitted on the S1 training split.
gaussian_model = GaussianNB(var_smoothing=4.539993e-05)
gaussian_model.fit(X=xtrain, y=ytrain)

GaussianNB(var_smoothing=4.539993e-05)

## Type 2 dataset

In [66]:
# Load the processed S2 feature table and separate the target column from the features.
data = pd.read_csv('../dataset/processed_dataset/final_data_S2.csv')
y = data['class']
X = data.drop(columns=['class'])

# Same protocol as for S1: 20% stratified hold-out with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [67]:
# Preview the first rows of the S2 training features produced by the split above.
x_train.head()

Unnamed: 0,PCA1,PCA2,PCA3,PCA4,PCA5,PCA6,PCA7,PCA9,PCA10,PCA11,...,PCA22,PCA23,PCA24,PCA25,PCA26,PCA27,PCA28,PCA29,PCA30,PCA31
40,-0.021541,0.449023,0.230178,0.227837,-0.191375,0.445557,0.187521,-0.18934,-0.123635,-0.044382,...,-0.051699,0.004013,0.06265,-0.035867,-0.07326,-0.022004,-0.035672,0.080169,-0.028131,0.015708
47,1.375255,-0.431577,0.088344,0.078963,0.225477,-0.146534,-0.293941,0.078695,-0.10172,-0.266613,...,0.055102,-0.048686,-0.118993,-0.003545,0.016227,-0.02147,-0.065941,-0.067461,0.022318,-0.064948
116,-0.694219,-0.430284,0.462707,0.193823,-0.170851,0.052937,-0.190918,0.130157,0.008796,0.152191,...,0.046799,-0.048531,0.011695,-0.015748,-0.020432,0.030973,-0.048812,-0.054583,-0.018546,-0.001089
131,0.980027,0.327126,0.211552,-0.079559,-0.39146,-0.49738,0.165832,0.143384,0.037632,0.226283,...,0.318909,0.166554,0.090675,0.173251,0.254848,0.073386,-0.038142,0.194657,0.006749,0.092184
87,0.175981,0.463141,-0.152082,-0.089578,0.077347,-0.00778,-0.009131,0.036945,0.269576,0.11965,...,0.056305,0.022637,0.031979,0.025242,-0.067887,-0.056225,0.02139,0.0128,-0.046182,0.011772


### Hyperparameter optimization using the above-defined function `run_model`

In [70]:
# Evaluate every grid point on the S2 split, collecting one single-row result
# frame per hyperparameter setting.
param_lst = [
    run_model(param, x_train, y_train, x_test, y_test)
    for param in param_dict
]

In [71]:
# Stack the per-setting S2 result rows into one table. Each input frame carries
# the index label 0, so renumber the rows to give every setting a unique index.
pd.concat(param_lst, ignore_index=True)

Unnamed: 0,var_smoothing,accuracy_score,f1_score,precision_score,recall_score,roc_auc_score,confusion_matrix
0,6.737947e-03,0.808824,0.884956,0.806452,0.980392,0.637255,[[ 10 24]\n [ 2 100]]
0,4.539993e-05,0.786765,0.869955,0.801653,0.950980,0.622549,[[10 24]\n [ 5 97]]
0,3.059023e-07,0.794118,0.873874,0.808333,0.950980,0.637255,[[11 23]\n [ 5 97]]
0,2.061154e-09,0.794118,0.873874,0.808333,0.950980,0.637255,[[11 23]\n [ 5 97]]
0,1.388794e-11,0.794118,0.873874,0.808333,0.950980,0.637255,[[11 23]\n [ 5 97]]
...,...,...,...,...,...,...,...
0,5.130044e-207,0.794118,0.873874,0.808333,0.950980,0.637255,[[11 23]\n [ 5 97]]
0,3.456597e-209,0.794118,0.873874,0.808333,0.950980,0.637255,[[11 23]\n [ 5 97]]
0,2.329036e-211,0.794118,0.873874,0.808333,0.950980,0.637255,[[11 23]\n [ 5 97]]
0,1.569292e-213,0.794118,0.873874,0.808333,0.950980,0.637255,[[11 23]\n [ 5 97]]


On observing the above dataframe, the optimal value of var_smoothing is found to be `3.059023e-07`

In [72]:
# Final S2 model: GaussianNB with the var_smoothing value chosen from the S2 grid
# results. It must be fitted on the S2 training split (x_train / y_train); the
# similarly named xtrain / ytrain hold the S1 features and would train this
# model on the wrong dataset.
gaussian_clf = GaussianNB(var_smoothing=3.059023e-07)
gaussian_clf.fit(x_train, y_train)

GaussianNB(var_smoothing=3.059023e-07)