# Imports

In [1]:
#imports
import numpy as np
import pandas as pd 
import math as mt
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import OneClassSVM
from sklearn.metrics import classification_report

import warnings
warnings.filterwarnings('ignore')

# Reading and preprocessing data

In [2]:
#load the CXR feature matrix and its label vector from the data folder
#NOTE(review): absolute, machine-specific path - adjust before running elsewhere
data_path = "/home/romulo/projects/tuberculosis_project/data/"

array_cxr, array_labels = (
    np.load(data_path + npy_name)
    for npy_name in ('features_cxr.npy', 'labels_cxr.npy')
)

In [3]:
#peek at the first three feature rows
array_cxr[:3]

array([[0.11764706, 0.02745098, 0.03529412, ..., 0.37647059, 0.36862745,
        0.37647059],
       [0.19215686, 0.10196078, 0.11372549, ..., 0.51372549, 0.52156863,
        0.53333333],
       [0.35686275, 0.28627451, 0.30588235, ..., 0.8745098 , 0.89019608,
        0.90980392]])

In [4]:
#peek at the first three labels
array_labels[:3]

array([False, False, False])

In [5]:
#casting the boolean labels (False / True) to integers (0 / 1)
array_labels = array_labels.astype(int)

In [6]:
#relabel class 0 as -1 (in place) with a boolean mask instead of an element-wise Python loop;
#the labels then match the '1' / '-1' report keys read further down in this notebook
array_labels[array_labels == 0] = -1

# Training 

## Some definitions 

In [7]:
def split_folds(data, label, n_splits, seed, shuffle):

    '''
    Split the data into stratified train/test folds.

    Parameters
    ----------
    data : array
        Samples; must support indexing with an array of row indices.
    label : array
        Class label of every sample, used for the stratification.
    n_splits : int
        Number of folds.
    seed : int or None
        Random state of the splitter. Only used when ``shuffle`` is True.
    shuffle : bool
        Whether the samples are shuffled before being split.

    Returns
    -------
    X_train, X_test, y_train, y_test : list
        Four lists with one entry per fold; position k of each list holds
        the corresponding piece of fold k.
    '''
    #recent scikit-learn versions raise a ValueError when a random_state is
    #given together with shuffle=False, so the seed is only forwarded when
    #it actually has an effect
    random_state = seed if shuffle else None
    kf = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

    X_train, X_test, y_train, y_test = [], [], [], []

    for train_index, test_index in kf.split(data, label):
        X_train.append(data[train_index])
        X_test.append(data[test_index])
        y_train.append(label[train_index])
        y_test.append(label[test_index])

    return X_train, X_test, y_train, y_test

In [8]:
def sp(classification):

    '''
    sp metric of a classification report.

    classification: mapping with the keys '1' and '-1', each holding a
    'recall' entry.

    Returns the square root of (arithmetic mean of the two recalls) times
    (geometric mean of the two recalls).
    '''
    recall_pos = classification['1']['recall']
    recall_neg = classification['-1']['recall']

    arithmetic_mean = (recall_pos + recall_neg) / 2
    geometric_mean = mt.sqrt(recall_pos * recall_neg)

    return mt.sqrt(arithmetic_mean * geometric_mean)

## OCSVM params for grid search 

In [9]:
#candidate values explored by the grid search, one list per OneClassSVM argument
params_grid = dict(
    nu=[0.1, 0.5, 0.9],
    gamma=[0.01, 'auto'],
    kernel=['rbf'],
)

#commented-out smaller grid (presumably for quicker trial runs):
# params_grid = dict(
#     nu=[0.1, 0.9],
#     gamma=[0.01],
#     kernel=['rbf'],
# )

## Training itself 

In [10]:
#settings of the outer cross-validation and creation of all outer train/test folds
n_splits = 5
seed = 0
shuffle = True

#the configured shuffle flag is passed (instead of a hard-coded True) so that
#changing the setting above really changes the split
X_train, X_test, y_train, y_test = split_folds(array_cxr, array_labels, n_splits, seed, shuffle)

In [11]:
class fold():
    '''
    Fold class, to be easier to perform the calculations.

    Holds the train/test split of one outer fold, runs a grid search over
    OneClassSVM parameters on stratified subfolds of the training split and
    evaluates the selected parameters on the test split.

    Parameters
    ----------
    data_train, data_test : array
        Training / test samples of the fold (NumPy-style indexing is used).
    label_train, label_test : array
        Matching labels; the classes read by the code are 1 and -1.
    seed : int or None
        Random state of the subfold splitter (only used when shuffling).
    params_grid : dict
        Lists of candidate values under the keys 'nu', 'gamma' and 'kernel'.
    shuffle : bool
        Whether the subfold splitter shuffles the training samples.
    n_splits : int
        Number of subfolds.
    i : int
        Identifier of the fold; stored as a string in ``name``.

    The settings given to the constructor are stored on the instance and are
    the ones used by main(), so a fold does not depend on module-level
    variables of the same names.
    '''

    def __init__(self,
                 data_train,
                 data_test,
                 label_train,
                 label_test,
                 seed,
                 params_grid,
                 shuffle,
                 n_splits,
                 i):

        self.X_train = data_train
        self.X_test = data_test
        self.y_train = label_train
        self.y_test = label_test
        self.name = str(i)

        #settings of the inner cross-validation / grid search
        self.seed = seed
        self.params_grid = params_grid
        self.shuffle = shuffle
        self.n_splits = n_splits

        self.sensitivity_mean = 0
        self.specificity_mean = 0
        self.sp_mean = 0
        self.sensitivity_std = 0
        self.specificity_std = 0
        self.sp_std = 0

        #0 means "no parameters selected yet"; main() replaces it
        self.best_params = 0

        self.X_train_folds = []
        self.X_cv_folds = []
        self.y_train_folds = []
        self.y_cv_folds = []

    def split_folds(self, n_splits, shuffle):
        '''
        Split the training data of the fold into stratified train/cv subfolds.

        The four ``*_folds`` lists are rebuilt from scratch on every call, so
        calling this method (or main()) again does not pile up duplicates.
        '''
        self.X_train_folds = []
        self.X_cv_folds = []
        self.y_train_folds = []
        self.y_cv_folds = []

        #recent scikit-learn versions raise a ValueError when a random_state
        #is given together with shuffle=False
        random_state = self.seed if shuffle else None
        kf = StratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=shuffle)

        for train_index, cv_index in kf.split(self.X_train, self.y_train):
            self.X_train_folds.append(self.X_train[train_index])
            self.X_cv_folds.append(self.X_train[cv_index])
            self.y_train_folds.append(self.y_train[train_index])
            self.y_cv_folds.append(self.y_train[cv_index])

    def filter_samples(self, X_train, y_train, label):
        '''
        Return the rows of X_train whose entry in y_train equals label.
        '''
        return X_train[y_train == label]

    def gridsearch(self, params_grid, X_train, X_cv, y_cv, name, list_params):
        '''
        Fit one OneClassSVM per parameter combination and record its scores.

        params_grid: dict with the keys 'nu', 'gamma' and 'kernel'
        X_train: samples the models are fitted on
        X_cv, y_cv: samples / labels the fitted models are scored on
        name: label of the fold and subfold; kept for interface
              compatibility, it is not used by the computation
        list_params: objects with a ``name`` attribute and the lists
              ``list_sensitivity``, ``list_specificity`` and ``list_sp``;
              the scores are appended to every object whose name matches
              the combination
        '''
        for nu in params_grid['nu']:
            for gamma in params_grid['gamma']:
                for kernel in params_grid['kernel']:
                    #finding the params objects that collect this combination
                    config_name = 'nu_' + str(nu) + '_gamma_' + str(gamma) + '_kernel_' + str(kernel)
                    targets = [params for params in list_params if params.name == config_name]
                    if not targets:
                        #nothing would record the scores, so skip the fit
                        continue

                    #fitting the model
                    model = OneClassSVM(nu=nu, kernel=kernel, gamma=gamma)
                    model.fit(X_train)

                    #original vs predicted, scored once per combination
                    classification = classification_report(y_cv, model.predict(X_cv), output_dict=True)
                    sensitivity = classification['1']['recall']
                    specificity = classification['-1']['recall']
                    sp_value = sp(classification)

                    for params in targets:
                        params.list_sensitivity.append(sensitivity)
                        params.list_specificity.append(specificity)
                        params.list_sp.append(sp_value)

    def main_subfold(self, j, list_params):
        '''
        main for each subfold
        '''
        #name of the subfold
        name = 'fold_' + self.name + '_subfold_' + str(j)

        #filtering: the models are fitted on the samples labelled 1 only.
        #The result is kept in a local variable so the stored subfold stays
        #aligned with its labels and the method can be called again.
        X_target = self.filter_samples(self.X_train_folds[j], self.y_train_folds[j], 1)

        #doing the gridsearch for each subfold
        self.gridsearch(self.params_grid, X_target, self.X_cv_folds[j], self.y_cv_folds[j], name, list_params)

    def return_best_sp(self, list_params):
        '''
        Store in ``best_params`` the first object of list_params with the
        highest ``sp_mean``.

        The search starts below every possible score, so a parameter set is
        selected even when all mean sp values are 0. ``best_params`` is left
        untouched when list_params is empty or every ``sp_mean`` is NaN.
        '''
        best_score = float('-inf')
        for params in list_params:
            if params.sp_mean > best_score:
                best_score = params.sp_mean
                self.best_params = params

    def main(self):
        '''
        main for each fold

        Runs the grid search on every subfold, prints the name of each
        parameter set that improves on the running best mean sp and returns
        the selected parameter set.
        '''
        #split into n folds
        self.split_folds(self.n_splits, self.shuffle)

        #list with one params object per combination of the grid
        list_params = [ParamsOCSVM(nu, gamma, kernel)
                       for nu in self.params_grid['nu']
                       for gamma in self.params_grid['gamma']
                       for kernel in self.params_grid['kernel']]

        #gridsearch in each subfold
        for j in range(len(self.X_train_folds)):
            self.main_subfold(j, list_params)

        #mean and std for each param
        for param in list_params:
            param.calculate_mean_std()

        #progress trace: print every param that beats the running best sp
        running_best = 0
        for param in list_params:
            if param.sp_mean > running_best:
                running_best = param.sp_mean
                print(param.name)

        #find the best params when it comes to sp
        self.return_best_sp(list_params)

        return self.best_params

    def fit_and_testing_best_params(self):
        '''
        fitting and testing with the best params

        Fits a OneClassSVM with ``best_params`` on the training samples
        labelled 1, predicts the test samples and prints the number of
        training samples and support vectors followed by sensitivity
        (recall of class '1'), specificity (recall of class '-1') and sp.

        Raises RuntimeError when no parameters have been selected yet.
        '''
        if self.best_params == 0:
            raise RuntimeError('no best parameters selected yet: call main() first')

        model = OneClassSVM(nu=self.best_params.nu, gamma=self.best_params.gamma, kernel=self.best_params.kernel)
        X_train = self.filter_samples(self.X_train, self.y_train, 1)
        model.fit(X_train)
        support_vectors = model.support_vectors_

        print('Results best training')
        print('Best parameters are: ' + str(self.best_params.name))
        print('Number of elements in training: ' + str(len(X_train)))
        print('Number of support vectors: ' + str(len(support_vectors)))

        #predicting test data
        y_true, y_pred = self.y_test, model.predict(self.X_test)
        classification = classification_report(y_true, y_pred, output_dict=True)
        sensitivity = classification['1']['recall']
        specificity = classification['-1']['recall']
        sp_ = sp(classification)

        print('Results from testing')
        print('Sensitivity: ' + str(sensitivity))
        print('Specificity: ' + str(specificity))
        print('Sp: ' + str(sp_))

class ParamsOCSVM():
    '''
    One combination of ocsvm params together with the scores collected for it.

    nu, gamma, kernel: the values of the combination
    name: identifier built from the three values,
          e.g. 'nu_0.5_gamma_auto_kernel_rbf'
    list_sp, list_sensitivity, list_specificity: collected scores
    *_mean, *_std: summary of each list, 0 until calculate_mean_std() is called
    '''

    def __init__(self, nu, gamma, kernel):

        self.nu, self.gamma, self.kernel = nu, gamma, kernel

        #identifier of the combination
        self.name = ''.join(['nu_', str(nu), '_gamma_', str(gamma), '_kernel_', str(kernel)])

        #collected scores
        self.list_sp, self.list_sensitivity, self.list_specificity = [], [], []

        #summary statistics, filled in by calculate_mean_std()
        self.sp_mean = self.sp_std = 0
        self.sensitivity_mean = self.sensitivity_std = 0
        self.specificity_mean = self.specificity_std = 0

    def calculate_mean_std(self):
        '''
        Store the mean and the standard deviation of every score list.
        '''
        for metric in ('sp', 'sensitivity', 'specificity'):
            scores = getattr(self, 'list_' + metric)
            setattr(self, metric + '_mean', np.mean(scores))
            setattr(self, metric + '_std', np.std(scores))

In [12]:
#one fold object per outer split, all built with the same settings
list_folds = [
    fold(X_train[i], X_test[i], y_train[i], y_test[i], seed, params_grid, shuffle, n_splits, i)
    for i in range(n_splits)
]

In [13]:
#grid search on the first fold; main() prints the name of each parameter set that improves on the running best mean sp
best_params = list_folds[0].main()

nu_0.1_gamma_auto_kernel_rbf
nu_0.5_gamma_auto_kernel_rbf


In [14]:
#refit on the first fold's training samples labelled 1 with the selected parameters and print the test metrics
list_folds[0].fit_and_testing_best_params()

Results best training
Best parameters are: nu_0.5_gamma_auto_kernel_rbf
Number of elements in training: 269
Number of support vectors: 137
Results from testing
Sensitivity: 0.40298507462686567
Specificity: 0.7424242424242424
Sp: 0.5596939782827451
