## Carregar alguns pacotes necessários

In [3]:
import imblearn

In [4]:
print(imblearn.__version__)

0.7.0


In [5]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [6]:
import pandas as pd

In [7]:
import numpy as np

## Carregar os arquivos

In [8]:
# Load each spreadsheet (paths are relative to the working directory) into its own DataFrame.
dataGAL, dataGLO, dataGPS, dataSBA = map(
    pd.read_excel,
    ('dataGAL.xlsx', 'dataGLO.xlsx', 'dataGPS.xlsx', 'dataSBA.xlsx'),
)

Adicionar as labels em cada amostra: se ' s4_corrected_vert' > 0,1, a label correspondente é 1; do contrário, a label é 0.

In [9]:
# Label is 1 where ' s4_corrected_vert' exceeds 0.1 and 0 otherwise. The comparison
# runs on the whole column at once instead of calling a Python lambda per row.
dataGAL['label'] = (dataGAL[' s4_corrected_vert'] > 0.1).astype('int64')

In [10]:
# Label is 1 where ' s4_corrected_vert' exceeds 0.1 and 0 otherwise. The comparison
# runs on the whole column at once instead of calling a Python lambda per row.
dataGLO['label'] = (dataGLO[' s4_corrected_vert'] > 0.1).astype('int64')

In [11]:
# Label is 1 where ' s4_corrected_vert' exceeds 0.1 and 0 otherwise. The comparison
# runs on the whole column at once instead of calling a Python lambda per row.
dataGPS['label'] = (dataGPS[' s4_corrected_vert'] > 0.1).astype('int64')

In [12]:
# Label is 1 where ' s4_corrected_vert' exceeds 0.1 and 0 otherwise. The comparison
# runs on the whole column at once instead of calling a Python lambda per row.
dataSBA['label'] = (dataSBA[' s4_corrected_vert'] > 0.1).astype('int64')

## Informações das bases de dados

Verificação da quantidade de amostras pertencentes à classe NC (' s4_corrected_vert' <= 0,1), à classe C (' s4_corrected_vert' > 0,1) e da proporção de amostras da classe C, respectivamente.

In [13]:
# (rows labelled 0, rows labelled 1, fraction of rows labelled 1)
(dataGAL['label'] == 0).sum(), (dataGAL['label'] == 1).sum(), (dataGAL['label'] == 1).mean()

(76838, 7959, 0.09385945257497318)

In [14]:
# (rows labelled 0, rows labelled 1, fraction of rows labelled 1)
(dataGLO['label'] == 0).sum(), (dataGLO['label'] == 1).sum(), (dataGLO['label'] == 1).mean()

(236171, 22130, 0.08567523935253832)

In [15]:
# (rows labelled 0, rows labelled 1, fraction of rows labelled 1)
(dataGPS['label'] == 0).sum(), (dataGPS['label'] == 1).sum(), (dataGPS['label'] == 1).mean()

(237864, 20434, 0.07911017506910623)

In [16]:
# (rows labelled 0, rows labelled 1, fraction of rows labelled 1)
(dataSBA['label'] == 0).sum(), (dataSBA['label'] == 1).sum(), (dataSBA['label'] == 1).mean()

(141880, 11956, 0.07771912946254453)

## Definindo as variáveis independentes e dependentes, X e Y, respectivamente

In [17]:
# Features: the first three columns by position. Target: the 'label' column.
# NOTE(review): 'label' is derived from ' s4_corrected_vert'; confirm that column is
# not one of columns 0-2, otherwise the target is a direct function of a feature.
X_GAL = dataGAL.iloc[:, [0, 1, 2]].copy()
Y_GAL = dataGAL['label'].copy()

In [18]:
# Features: the first three columns by position. Target: the 'label' column.
X_GLO = dataGLO.iloc[:, [0, 1, 2]].copy()
Y_GLO = dataGLO['label'].copy()

In [19]:
# Features: the first three columns by position. Target: the 'label' column.
X_GPS = dataGPS.iloc[:, [0, 1, 2]].copy()
Y_GPS = dataGPS['label'].copy()

In [20]:
# Features: the first three columns by position. Target: the 'label' column.
X_SBA = dataSBA.iloc[:, [0, 1, 2]].copy()
Y_SBA = dataSBA['label'].copy()

In [21]:
# Swap the pandas objects for their underlying NumPy arrays (same names are reused).
X_GAL = X_GAL.to_numpy()
Y_GAL = Y_GAL.to_numpy()

In [22]:
# Swap the pandas objects for their underlying NumPy arrays (same names are reused).
X_GLO = X_GLO.to_numpy()
Y_GLO = Y_GLO.to_numpy()

In [23]:
# Swap the pandas objects for their underlying NumPy arrays (same names are reused).
X_GPS = X_GPS.to_numpy()
Y_GPS = Y_GPS.to_numpy()

In [24]:
# Swap the pandas objects for their underlying NumPy arrays (same names are reused).
X_SBA = X_SBA.to_numpy()
Y_SBA = Y_SBA.to_numpy()

## Processo usado para tratar o desbalanceamento de classes observado

In [25]:
# Both samplers draw random samples, so a fixed seed is passed to make the
# resampled datasets (and every metric computed from them) repeatable across runs.
# SMOTE raises the minority class to 0.3x the majority size.
over = SMOTE(sampling_strategy=0.3, random_state=42)
# The majority class is then cut down until the minority/majority ratio is 0.5.
under = RandomUnderSampler(sampling_strategy=0.5, random_state=42)

In [26]:
# Resampling steps run in list order: oversampling ('o') first, undersampling ('u') second.
steps = [
    ('o', over),
    ('u', under),
]
pipeline = Pipeline(steps)

In [27]:
# Run the oversample-then-undersample pipeline on the GAL arrays; *_RS holds the resampled data.
X_GAL_RS, Y_GAL_RS = pipeline.fit_resample(X_GAL, Y_GAL)

In [28]:
# Run the oversample-then-undersample pipeline on the GLO arrays; *_RS holds the resampled data.
X_GLO_RS, Y_GLO_RS = pipeline.fit_resample(X_GLO, Y_GLO)

In [29]:
# Run the oversample-then-undersample pipeline on the GPS arrays; *_RS holds the resampled data.
X_GPS_RS, Y_GPS_RS = pipeline.fit_resample(X_GPS, Y_GPS)

In [30]:
# Run the oversample-then-undersample pipeline on the SBA arrays; *_RS holds the resampled data.
X_SBA_RS, Y_SBA_RS = pipeline.fit_resample(X_SBA, Y_SBA)

Quantidade de amostras nas classes C e NC após o processo de geração de amostras artificiais da classe minoritária e posterior subamostragem das amostras da classe majoritária.

In [32]:
# (number of label-1 samples, number of label-0 samples) after resampling
tuple(np.count_nonzero(Y_GAL_RS == cls) for cls in (1, 0))

(23051, 46102)

In [34]:
# (number of label-1 samples, number of label-0 samples) after resampling
tuple(np.count_nonzero(Y_GLO_RS == cls) for cls in (1, 0))

(70851, 141702)

In [35]:
# (number of label-1 samples, number of label-0 samples) after resampling
tuple(np.count_nonzero(Y_GPS_RS == cls) for cls in (1, 0))

(71359, 142718)

In [36]:
# (number of label-1 samples, number of label-0 samples) after resampling
tuple(np.count_nonzero(Y_SBA_RS == cls) for cls in (1, 0))

(42564, 85128)

## Construção do modelo SVM

### Pré-processamento dos dados

In [37]:
from sklearn.preprocessing import StandardScaler

# One scaler instance is created here; the following cells refit it on each resampled dataset in turn.
scaler = StandardScaler()

In [38]:
# Refit the scaler on the resampled GAL features, then standardize those same features.
X_GAL_SVM = scaler.fit(X_GAL_RS).transform(X_GAL_RS)

In [39]:
# Refit the scaler on the resampled GLO features, then standardize those same features.
X_GLO_SVM = scaler.fit(X_GLO_RS).transform(X_GLO_RS)

In [40]:
# Refit the scaler on the resampled GPS features, then standardize those same features.
X_GPS_SVM = scaler.fit(X_GPS_RS).transform(X_GPS_RS)

In [41]:
# Refit the scaler on the resampled SBA features, then standardize those same features.
X_SBA_SVM = scaler.fit(X_SBA_RS).transform(X_SBA_RS)

### Treinamento e avaliação dos modelos

In [42]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

In [43]:
from sklearn.metrics import confusion_matrix

def specificity(Y_true, Y_pred):
    '''
    Return the specificity (true-negative rate), TN / (TN + FP).

    The counts are read from scikit-learn's confusion matrix, which must be
    2x2 (binary labels) for the four-way unpacking below to succeed.
    '''
    # Flattened 2x2 confusion matrix is ordered TN, FP, FN, TP.
    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(Y_true, Y_pred).ravel()
    return true_neg / (true_neg + false_pos)

In [44]:
def apply_KFold_SVM(X, Y, k):
    '''
    Evaluate a default-parameter SVC with stratified k-fold cross-validation.

    For every fold a fresh SVC is fit on the training split and scored on the
    held-out split. The per-fold accuracy, F1-score, precision, recall and
    specificity are averaged over the folds and printed.

    NOTE(review): in this notebook the arrays passed in were resampled
    (SMOTE + undersampling) and standardized on the full dataset before the
    split, so held-out folds are not independent of the training folds and the
    printed metrics are probably optimistic -- confirm this is intended.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix; rows are selected by integer position.
    Y : np.ndarray
        Binary labels aligned with the rows of X (class 1 is the positive
        class for the scikit-learn metrics used here).
    k : int
        Number of folds.

    Returns
    -------
    None
        The averaged metrics are printed, not returned.
    '''
    kf = StratifiedKFold(n_splits=k)

    # Per-fold scores, keyed by the label printed for each metric. Insertion
    # order is the order in which the averages are printed.
    fold_scores = {
        'Accuracy': [],
        'F1-score': [],
        'Precision': [],
        'Recall': [],
        'Specificity': [],
    }

    for train_index, test_index in kf.split(X, Y):
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]

        # A new model per fold so nothing learned on one split carries over.
        svm = SVC()
        svm.fit(X_train, Y_train)
        Y_pred = svm.predict(X_test)

        fold_scores['Accuracy'].append(accuracy_score(Y_test, Y_pred))
        fold_scores['F1-score'].append(f1_score(Y_test, Y_pred))
        fold_scores['Precision'].append(precision_score(Y_test, Y_pred))
        fold_scores['Recall'].append(recall_score(Y_test, Y_pred))
        fold_scores['Specificity'].append(specificity(Y_test, Y_pred))

    # Report the mean of each metric across the folds.
    for metric_name, values in fold_scores.items():
        print(metric_name + ': ' + str(sum(values) / len(values)))

In [51]:
# 10-fold evaluation on the resampled, standardized GAL data.
apply_KFold_SVM(X=X_GAL_SVM, Y=Y_GAL_RS, k=10)

Accuracy: 0.9971801625706181
F1-score: 0.9957880317816639
Precision: 0.9917404027036907
Recall: 0.9998698481561823
Specificity: 0.9958353244693086


In [52]:
# 10-fold evaluation on the resampled, standardized GLO data.
apply_KFold_SVM(X=X_GLO_SVM, Y=Y_GLO_RS, k=10)

Accuracy: 0.9977182156081497
F1-score: 0.9965890993085956
Precision: 0.9932015946772547
Recall: 1.0
Specificity: 0.9965773252371092


In [53]:
# 10-fold evaluation on the resampled, standardized GPS data.
apply_KFold_SVM(X=X_GPS_SVM, Y=Y_GPS_RS, k=10)

Accuracy: 0.9978605838335156
F1-score: 0.9968013084349687
Precision: 0.9936372807004998
Recall: 0.9999859865470853
Specificity: 0.9967978813301018


In [54]:
# 10-fold evaluation on the resampled, standardized SBA data.
apply_KFold_SVM(X=X_SBA_SVM, Y=Y_SBA_RS, k=10)

Accuracy: 0.9976192751716806
F1-score: 0.9964418726799869
Precision: 0.9929094834591993
Recall: 1.0
Specificity: 0.9964289150538452
