In [3]:
import numpy as np
import pandas as pd

#
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [5]:
# load numpy array
data = np.load('./data/data_pca_50_target.npz')
data.files

['arr_0', 'arr_1']

In [6]:
data.allow_pickle = True

In [14]:
X = data['arr_0'] # pca data with 50 components
Y = data['arr_1'] # target or dependent variable

In [15]:
X.shape,Y.shape

((4321, 50), (4321,))

In [12]:
X

array([[ 0.87500172,  0.26735315, -0.23778001, ...,  0.77766206,
         0.4386648 ,  1.06235175],
       [ 1.17092594, -0.22183947, -0.34321241, ..., -1.03002293,
         0.47537964, -1.79689269],
       [-0.7894864 , -0.30915247, -0.37195776, ..., -1.27151125,
         0.3914073 , -1.93833213],
       ...,
       [ 1.33149975,  0.57080384,  1.26953343, ...,  0.68991025,
         1.44484117, -1.34391011],
       [-1.24438955, -0.45981387,  0.40834252, ..., -0.72285477,
         0.4021218 ,  0.543946  ],
       [ 1.23973254,  0.94343787, -0.05882303, ..., -0.05265067,
        -0.37440415, -0.32812991]])

In [16]:
Y

array(['female', 'female', 'female', ..., 'male', 'male', 'male'],
      dtype=object)

In [17]:
## split the data into training and testing

In [18]:
x_train,x_test, y_train, y_test = train_test_split(X,Y,test_size=0.2,stratify=Y)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(3456, 50) (865, 50) (3456,) (865,)


In [19]:
### train machine learning model

In [25]:
model_svc = SVC(probability=True)

param_grid = {'C':[0.5, 1,10,20,30,50],
             'kernel':['rbf','poly'],
             'gamma' :[0.1,0.05,0.01,0.001,0.002,0.005],
             'coef0':[0,1]}

In [26]:
model_grid = GridSearchCV(model_svc,
                          param_grid=param_grid,
                          scoring='accuracy',cv=3,verbose=2)

In [27]:
model_grid.fit(x_train,y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   0.9s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   0.8s
[CV] END ..............C=0.5, coef0=0, gamma=0.1, kernel=rbf; total time=   0.8s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   0.5s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   0.5s
[CV] END .............C=0.5, coef0=0, gamma=0.1, kernel=poly; total time=   0.5s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   0.7s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   0.7s
[CV] END .............C=0.5, coef0=0, gamma=0.05, kernel=rbf; total time=   0.7s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   0.5s
[CV] END ............C=0.5, coef0=0, gamma=0.05, kernel=poly; total time=   0.5s
[CV] END ............C=0.5, coef0=0, gamma=0.0

In [28]:
model_grid.best_params_

{'C': 1, 'coef0': 0, 'gamma': 0.01, 'kernel': 'rbf'}

In [29]:
model_final = model_grid.best_estimator_

In [31]:
### Model Evaluation
# - Classification Report
#    - Precision, Recall, F1-Score
# - Kappa Score
#    - -ve (worst model)
#    - 0 to 0.5 (bad model)
#    - 0.5 to 0.7 (good model)
#    - 0.7 to 0.9 (excellent model)
#    - 0.9 to 1 (perfect model)
# - AUC
#    - Less than 0.5 (worst model)
#    - 0.5 to 0.6 (bad model)
#    - 0.6 to 0.8 (good model)
#    - 0.8 to 0.9 (excellent model)
#    - 0.9 to 1.0 (perfect model)

In [32]:
y_pred = model_final.predict(x_test) # predicted values

In [33]:
y_pred

array(['male', 'female', 'female', 'male', 'male', 'female', 'female',
       'female', 'male', 'female', 'female', 'male', 'female', 'female',
       'female', 'male', 'female', 'female', 'female', 'male', 'female',
       'female', 'male', 'female', 'male', 'male', 'female', 'female',
       'male', 'female', 'female', 'female', 'male', 'female', 'male',
       'female', 'female', 'male', 'female', 'female', 'female', 'male',
       'male', 'female', 'female', 'female', 'female', 'female', 'female',
       'male', 'female', 'male', 'male', 'female', 'male', 'male', 'male',
       'female', 'female', 'female', 'female', 'female', 'male', 'female',
       'female', 'female', 'male', 'female', 'female', 'female', 'female',
       'male', 'female', 'female', 'female', 'female', 'male', 'female',
       'female', 'female', 'female', 'male', 'female', 'female', 'female',
       'male', 'male', 'female', 'male', 'male', 'male', 'female',
       'female', 'male', 'male', 'female', 'female', 

In [34]:
### Classification Report

In [38]:
cr = metrics.classification_report(y_test,y_pred,output_dict=True)
pd.DataFrame(cr).T

Unnamed: 0,precision,recall,f1-score,support
female,0.805668,0.834382,0.819773,477.0
male,0.787062,0.752577,0.769433,388.0
accuracy,0.797688,0.797688,0.797688,0.797688
macro avg,0.796365,0.793479,0.794603,865.0
weighted avg,0.797322,0.797688,0.797193,865.0


In [39]:
### Kappa Score

In [40]:
metrics.cohen_kappa_score(y_test,y_pred)

np.float64(0.5893679182072433)

In [41]:
### Area Under Curve (AUC)

In [42]:
metrics.roc_auc_score(np.where(y_test=='male',1,0),
                      np.where(y_pred=='male',1,0))

np.float64(0.7934794354751561)