In [None]:
!pip install "dask[dataframe]"

In [None]:
!pip install tsfresh

# Setup

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA

# models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
import xgboost as xgb
import lightgbm as lgb

In [None]:
from tsfresh.feature_extraction import feature_calculators

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Dataset location on Pedro's Drive. The notebook holds one such cell per
# collaborator, so only change directory when this location actually exists;
# an unconditional chdir would stop "Run All" on every other machine.
path = '/content/drive/MyDrive/PADS/pads-parkinsons-disease-smartwatch-dataset-1.0.0/pads-parkinsons-disease-smartwatch-dataset-1.0.0'
if os.path.isdir(path):
    os.chdir(path)
else:
    print(f'Dataset directory not found, keeping current directory: {path}')

print(f'Current working directory: {os.getcwd()}')

In [None]:
import os

# Dataset location on Rafael's Drive. The notebook holds one such cell per
# collaborator, so only change directory when this location actually exists;
# an unconditional chdir would stop "Run All" on every other machine.
path = '/content/drive/MyDrive/pads'
if os.path.isdir(path):
    os.chdir(path)
else:
    print(f'Dataset directory not found, keeping current directory: {path}')

print(f'Current working directory: {os.getcwd()}')

Current working directory: /content/drive/MyDrive/pads


# Database

## Generating database


In [None]:
# File list of the stratified subset; later cells read its `id` and `label` columns.
strat = pd.read_csv('preprocessed/stratified_subset_file_list.csv')
# Two-class subsets: HC_PD removes label 2, DD_PD removes label 0
# (later cells annotate 0 = control, 1 = Parkinson, 2 = DD).
HC_PD = strat[strat['label'] != 2]
DD_PD = strat[strat['label'] != 0]

In [None]:
import numpy as np
from scipy.fft import fft, fftfreq

def calculate_harmonic_features(signal, sampling_rate=100):
    """Extract harmonic-distortion features from the spectrum of one signal.

    The fundamental is the strongest non-DC bin of the one-sided FFT magnitude
    spectrum; harmonics are read at integer multiples of that bin index.

    Args:
        signal: 1-D sequence of samples.
        sampling_rate: Sampling frequency used to convert bin indices into
            frequencies; the returned frequency is in the same unit.

    Returns:
        list: ``[Hamp1, Hamp2, Hamp3, fundamental_freq, HD2, HD3, THD]`` where
        ``HampK`` is the magnitude at the K-th multiple of the fundamental bin
        (0 when that bin lies beyond the spectrum), ``HDK = HampK / Hamp1`` and
        ``THD`` is the root of the summed squared ratios for harmonics 2 to 7.
        Ratios are 0 when ``Hamp1`` is 0. All seven values are 0 when the
        signal has fewer than four samples, because the one-sided spectrum
        then contains no non-DC bin to pick a fundamental from.
    """
    n = len(signal)  # number of samples
    n_bins = n // 2
    if n_bins < 2:
        # Nothing but (at most) the DC bin: no fundamental can be identified.
        return [0, 0, 0, 0, 0, 0, 0]

    fft_magnitudes = np.abs(fft(signal)[:n_bins])
    freqs = fftfreq(n, d=1 / sampling_rate)[:n_bins]

    # Fundamental = strongest bin, ignoring the DC component at index 0.
    fundamental_idx = np.argmax(fft_magnitudes[1:]) + 1
    fundamental_freq = freqs[fundamental_idx]
    Hamp1 = fft_magnitudes[fundamental_idx]

    def harmonic_amplitude(order):
        # Magnitude at the given multiple of the fundamental bin, 0 if out of range.
        idx = order * fundamental_idx
        return fft_magnitudes[idx] if idx < n_bins else 0

    Hamp2 = harmonic_amplitude(2)
    Hamp3 = harmonic_amplitude(3)

    if Hamp1 == 0:
        # Flat spectrum: avoid dividing by zero, report no distortion.
        HD2 = HD3 = THD = 0
    else:
        HD2 = Hamp2 / Hamp1
        HD3 = Hamp3 / Hamp1
        # Total harmonic distortion over harmonics 2..7 that fit in the spectrum.
        THD = np.sqrt(sum(
            (fft_magnitudes[k * fundamental_idx] / Hamp1) ** 2
            for k in range(2, 8)
            if k * fundamental_idx < n_bins
        ))

    return [Hamp1, Hamp2, Hamp3, fundamental_freq, HD2, HD3, THD]

def getFeaturesSelect(subjects, sampling_rate=100):
    """Build one harmonic feature vector per subject.

    For each subject id the file ``./preprocessed/movement/<id>_ml.bin`` is
    read as float32 and reshaped to rows of 976 samples; the seven values from
    ``calculate_harmonic_features`` are concatenated row after row.

    Args:
        subjects: Iterable of integer subject ids.
        sampling_rate: Forwarded to ``calculate_harmonic_features``.

    Returns:
        list: One flat feature list per subject whose file was found. Subjects
        with a missing file are reported on stdout and left out, so the result
        can be shorter than ``subjects``.
    """
    all_features = []
    for subject_id in subjects:
        try:
            signals = np.fromfile(
                f"./preprocessed/movement/{subject_id:03d}_ml.bin", dtype=np.float32
            ).reshape((-1, 976))
        except FileNotFoundError:
            print(f"File for subject {subject_id} not found.")
            continue

        # Flatten the per-signal harmonic features into a single vector.
        all_features.append([
            value
            for signal in signals
            for value in calculate_harmonic_features(signal, sampling_rate)
        ])
    return all_features


In [None]:
# Harmonic features for every subject listed in `strat`
# (when run top to bottom this uses the getFeaturesSelect defined just above).
X = getFeaturesSelect(strat['id'].tolist())
y = strat['label'].tolist()

# One row per subject with positional integer column names, plus the label.
# NOTE(review): getFeaturesSelect leaves out subjects whose file is missing, so X
# can be shorter than y; the label assignment is then expected to fail on the
# length mismatch.
df = pd.DataFrame(X)
df['label'] = y
df

In [None]:
import numpy as np
from scipy.fft import rfft, rfftfreq

def getFeaturesSelect(subjects, sampling_rate=1.0):
    """Build one harmonic feature vector per subject from the real FFT.

    For each subject id the file ``./preprocessed/movement/<id>_ml.bin`` is
    read as float32 and reshaped to rows of 976 samples. For every row the
    fundamental is the strongest non-DC bin of the ``rfft`` magnitude
    spectrum, and seven values are appended:
    ``[hamp1, hamp2, hamp3, fundamental_freq, hd2, hd3, thd]``.

    Args:
        subjects: Iterable of integer subject ids.
        sampling_rate: Sampling frequency used to express ``fundamental_freq``.
            The default of 1.0 keeps the frequency in cycles per sample.
            NOTE(review): ``calculate_harmonic_features`` in this notebook
            defaults to 100 instead - confirm which unit is wanted.

    Returns:
        list: One flat feature list per subject whose file was found. Subjects
        with a missing file are reported on stdout and left out.
    """
    features = []

    for subject_id in subjects:
        try:
            x = np.fromfile(
                f"./preprocessed/movement/{subject_id:03d}_ml.bin", dtype=np.float32
            ).reshape((-1, 976))
        except FileNotFoundError:
            print(f"File for subject {subject_id} not found.")
            continue

        subject_features = []
        for row in x:
            fft_magnitudes = np.abs(rfft(row))  # amplitude spectrum
            fft_freqs = rfftfreq(len(row), d=1.0 / sampling_rate)
            n_bins = len(fft_magnitudes)

            # Strongest bin excluding DC (index 0). With DC included, any signal
            # with an offset yields index 0, and every "harmonic" k * 0 then
            # points back at the DC bin, making hd2/hd3/thd meaningless.
            fundamental_idx = int(np.argmax(fft_magnitudes[1:])) + 1
            fundamental_freq = fft_freqs[fundamental_idx]

            hamp1 = fft_magnitudes[fundamental_idx]
            hamp2 = fft_magnitudes[2 * fundamental_idx] if 2 * fundamental_idx < n_bins else 0
            hamp3 = fft_magnitudes[3 * fundamental_idx] if 3 * fundamental_idx < n_bins else 0

            # Harmonic distortion ratios; 0 when the fundamental has no energy.
            hd2 = hamp2 / hamp1 if hamp1 != 0 else 0
            hd3 = hamp3 / hamp1 if hamp1 != 0 else 0

            # Total harmonic distortion up to the 7th harmonic.
            thd = np.sqrt(
                sum(
                    (fft_magnitudes[k * fundamental_idx] / hamp1) ** 2
                    for k in range(2, 8)
                    if k * fundamental_idx < n_bins
                )
            ) if hamp1 != 0 else 0

            subject_features.extend([hamp1, hamp2, hamp3, fundamental_freq, hd2, hd3, thd])

        features.append(subject_features)

    return features


In [None]:
def getFeatures(subjects):
    """Build the 18-statistic tsfresh feature vector for each subject.

    For each subject id the file ``./preprocessed/movement/<id>_ml.bin`` is
    read as float32 and reshaped to rows of 976 samples. Every row contributes
    18 values, one per calculator listed below, in that order.

    Args:
        subjects: Iterable of integer subject ids.

    Returns:
        list: One flat feature list per subject whose file was found. Subjects
        with a missing file are reported on stdout and left out, so the result
        can be shorter than ``subjects``.
    """
    # Parameter-free tsfresh calculators, in output order.
    calculator_names = (
        "abs_energy", "absolute_maximum", "absolute_sum_of_changes",
        "count_above_mean", "count_below_mean", "kurtosis", "length",
        "maximum", "mean", "mean_abs_change", "mean_change", "median",
        "minimum", "root_mean_square", "skewness", "standard_deviation",
        "sum_values", "variance",
    )
    # Calculators that need parameters (agg_autocorrelation, agg_linear_trend,
    # approximate_entropy, ar_coefficient, partial_autocorrelation,
    # binned_entropy, fft_coefficient, cwt_coefficients,
    # energy_ratio_by_chunks, spkt_welch_density, permutation_entropy)
    # are not extracted here.

    all_features = []
    for subject_id in subjects:
        try:
            signals = np.fromfile(
                f"./preprocessed/movement/{subject_id:03d}_ml.bin", dtype=np.float32
            ).reshape((-1, 976))
        except FileNotFoundError:
            print(f"File for subject {subject_id} not found.")
            continue

        subject_features = []
        for signal in signals:
            for calc_name in calculator_names:
                subject_features.append(getattr(feature_calculators, calc_name)(signal))

        all_features.append(subject_features)
    return all_features


In [None]:
def getFeaturesSelect(subjects):
    """Build the reduced 7-statistic tsfresh feature vector for each subject.

    For each subject id the file ``./preprocessed/movement/<id>_ml.bin`` is
    read as float32 and reshaped to rows of 976 samples. Every row contributes
    seven values, one per calculator listed below, in that order.

    Args:
        subjects: Iterable of integer subject ids.

    Returns:
        list: One flat feature list per subject whose file was found. Subjects
        with a missing file are reported on stdout and left out, so the result
        can be shorter than ``subjects``.
    """
    # Selected tsfresh calculators, in output order.
    calculator_names = (
        "abs_energy", "absolute_sum_of_changes", "kurtosis", "median",
        "root_mean_square", "skewness", "standard_deviation",
    )

    all_features = []
    for subject_id in subjects:
        try:
            signals = np.fromfile(
                f"./preprocessed/movement/{subject_id:03d}_ml.bin", dtype=np.float32
            ).reshape((-1, 976))
        except FileNotFoundError:
            print(f"File for subject {subject_id} not found.")
            continue

        subject_features = []
        for signal in signals:
            for calc_name in calculator_names:
                subject_features.append(getattr(feature_calculators, calc_name)(signal))

        all_features.append(subject_features)
    return all_features

In [None]:
# getFeatures statistics and labels for the HC_PD subset (rows of `strat` with label != 2).
HC_PD_X = getFeatures(HC_PD['id'].tolist())
HC_PD_y = HC_PD['label'].tolist()

In [None]:
# One row per HC_PD subject: positional feature columns plus the class label.
HC_PD_df = pd.DataFrame(HC_PD_X)
HC_PD_df['label'] = HC_PD_y

In [None]:
# getFeatures statistics for every subject listed in `strat`.
X = getFeatures(strat['id'].tolist())
y = strat['label'].tolist()

# One row per subject; feature columns keep positional integer names until renamed.
# NOTE(review): getFeatures leaves out subjects whose file is missing, so X can be
# shorter than y; the label assignment is then expected to fail on the length mismatch.
df = pd.DataFrame(X)
df['label'] = y
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2367,2368,2369,2370,2371,2372,2373,2374,2375,label
0,0.005346,0.009385,1.063007,500,476,0.686966,976,0.007030,-0.000233,0.001090,...,0.011756,-0.000054,-0.004338,-0.171969,0.056804,0.030953,0.056509,-5.643614,0.003193,0
1,13.380221,0.556001,28.544725,487,489,3.655442,976,0.556001,-0.000005,0.029277,...,0.012463,-0.000020,-0.014459,-0.397742,0.112564,0.010076,0.112119,-9.756145,0.012571,2
2,0.003560,0.008610,1.135437,531,445,0.573104,976,0.004668,-0.000100,0.001165,...,0.017372,-0.000042,0.003519,-0.277881,0.085805,-0.043397,0.085727,3.557633,0.007349,0
3,0.005594,0.008917,1.366292,490,486,0.843515,976,0.008917,-0.000217,0.001401,...,0.010305,-0.000164,0.001337,-0.127640,0.051773,0.026157,0.051762,1.057014,0.002679,0
4,0.019405,0.015874,2.542418,472,504,0.321554,976,0.015874,0.000018,0.002608,...,0.009476,-0.000107,0.002329,-0.150653,0.060765,0.042647,0.060694,2.872063,0.003684,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,0.013193,0.012904,1.575305,481,495,0.131550,976,0.011063,0.000090,0.001616,...,0.080380,-0.000780,-0.009489,-1.123851,0.426334,0.257998,0.426311,-4.365879,0.181741,1
230,0.029831,0.020558,2.028810,490,486,-0.115299,976,0.020558,0.000038,0.002081,...,0.031484,-0.000022,-0.004945,-0.352693,0.112591,-0.168042,0.111965,-11.572714,0.012536,2
231,0.013714,0.019414,1.206776,478,498,2.624293,976,0.019414,0.000154,0.001238,...,0.048100,0.000511,-0.014655,-0.731003,0.314677,0.217025,0.314488,-10.658627,0.098902,2
232,0.013222,0.015302,1.624582,509,467,1.505466,976,0.015302,-0.000036,0.001666,...,0.013198,0.000031,-0.014160,-0.212136,0.101939,0.800520,0.101938,-0.405702,0.010391,0


In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Explore how many principal components retain 95% of the feature variance.
# The class label is the prediction target, not a feature, so it is removed
# before scaling: otherwise it would be standardised and mixed into the
# components, and the reported dimensionality would not describe the features.
df_copy = df.drop(columns=['label'])

# StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_copy)

# PCA keeping enough components to explain 95% of the variance
pca = PCA(0.95)
df_pca = pca.fit_transform(df_scaled)

df_pca = pd.DataFrame(df_pca)
df_pca.shape


(234, 135)

In [None]:
# Building blocks for the feature column names. A later cell combines them in
# task > wrist > sensor > axis > feature order: 11 * 2 * 2 * 3 = 132 signals,
# each with one column per feature name.
# NOTE(review): this assumes the rows of each subject's .bin file follow the same
# task/wrist/sensor/axis order - confirm against the preprocessing step.
tasks = ["Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2", "StretchHold", "HoldWeight",
         "DrinkGlas", "CrossArms", "TouchNose", "Entrainment1", "Entrainment2"]

wrists = ["Left", "Right"]

sensors = ["Accelerometer", "Gyroscope"]

axes = ["X", "Y", "Z"]

# Same names and order as the calculators appended per signal in getFeatures.
features = ["abs_energy", "absolute_maximum", "absolute_sum_of_changes", "count_above_mean",
            "count_below_mean", "kurtosis", "length", "maximum", "mean",
            "mean_abs_change", "mean_change", "median", "minimum", "root_mean_square",
            "skewness", "standard_deviation", "sum_values", "variance"]

In [None]:
# One column name per (task, wrist, sensor, axis, feature) combination, with the
# feature varying fastest, followed by the trailing "label" column.
column_names = [
    f"{task}_{wrist}_{sensor}_{axis}_{feature}"
    for task in tasks
    for wrist in wrists
    for sensor in sensors
    for axis in axes
    for feature in features
] + ["label"]

In [None]:
df.columns = column_names
df

Unnamed: 0,Relaxed1_Left_Accelerometer_X_abs_energy,Relaxed1_Left_Accelerometer_X_absolute_maximum,Relaxed1_Left_Accelerometer_X_absolute_sum_of_changes,Relaxed1_Left_Accelerometer_X_count_above_mean,Relaxed1_Left_Accelerometer_X_count_below_mean,Relaxed1_Left_Accelerometer_X_kurtosis,Relaxed1_Left_Accelerometer_X_length,Relaxed1_Left_Accelerometer_X_maximum,Relaxed1_Left_Accelerometer_X_mean,Relaxed1_Left_Accelerometer_X_mean_abs_change,...,Entrainment2_Right_Gyroscope_Z_mean_abs_change,Entrainment2_Right_Gyroscope_Z_mean_change,Entrainment2_Right_Gyroscope_Z_median,Entrainment2_Right_Gyroscope_Z_minimum,Entrainment2_Right_Gyroscope_Z_root_mean_square,Entrainment2_Right_Gyroscope_Z_skewness,Entrainment2_Right_Gyroscope_Z_standard_deviation,Entrainment2_Right_Gyroscope_Z_sum_values,Entrainment2_Right_Gyroscope_Z_variance,label
0,0.005346,0.009385,1.063007,500,476,0.686966,976,0.007030,-0.000233,0.001090,...,0.011756,-0.000054,-0.004338,-0.171969,0.056804,0.030953,0.056509,-5.643614,0.003193,0
1,13.380221,0.556001,28.544725,487,489,3.655442,976,0.556001,-0.000005,0.029277,...,0.012463,-0.000020,-0.014459,-0.397742,0.112564,0.010076,0.112119,-9.756145,0.012571,2
2,0.003560,0.008610,1.135437,531,445,0.573104,976,0.004668,-0.000100,0.001165,...,0.017372,-0.000042,0.003519,-0.277881,0.085805,-0.043397,0.085727,3.557633,0.007349,0
3,0.005594,0.008917,1.366292,490,486,0.843515,976,0.008917,-0.000217,0.001401,...,0.010305,-0.000164,0.001337,-0.127640,0.051773,0.026157,0.051762,1.057014,0.002679,0
4,0.019405,0.015874,2.542418,472,504,0.321554,976,0.015874,0.000018,0.002608,...,0.009476,-0.000107,0.002329,-0.150653,0.060765,0.042647,0.060694,2.872063,0.003684,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,0.013193,0.012904,1.575305,481,495,0.131550,976,0.011063,0.000090,0.001616,...,0.080380,-0.000780,-0.009489,-1.123851,0.426334,0.257998,0.426311,-4.365879,0.181741,1
230,0.029831,0.020558,2.028810,490,486,-0.115299,976,0.020558,0.000038,0.002081,...,0.031484,-0.000022,-0.004945,-0.352693,0.112591,-0.168042,0.111965,-11.572714,0.012536,2
231,0.013714,0.019414,1.206776,478,498,2.624293,976,0.019414,0.000154,0.001238,...,0.048100,0.000511,-0.014655,-0.731003,0.314677,0.217025,0.314488,-10.658627,0.098902,2
232,0.013222,0.015302,1.624582,509,467,1.505466,976,0.015302,-0.000036,0.001666,...,0.013198,0.000031,-0.014160,-0.212136,0.101939,0.800520,0.101938,-0.405702,0.010391,0


In [None]:
df.to_csv('statFeatures.csv')

## Import ready-made database

In [None]:
# Reload the cached feature table written by the previous section.
df = pd.read_csv('statFeatures.csv')
# 'Unnamed: 0' is presumably the row index that to_csv stored as the first
# column; it is not a feature.
df = df.drop(columns=['Unnamed: 0'])

# The string below is disabled code kept for reference; it is not executed.
'''
HC_PD = df[df['label'] != 2]
DD_PD = df[df['label'] != 0]
'''

"\nHC_PD = df[df['label'] != 2]\nDD_PD = df[df['label'] != 0]\n"

In [None]:
df

# 18 Features

## Número de instâncias de cada classe

In [None]:
df.shape

(234, 2377)

In [None]:
classes = df['label'].value_counts()
classes

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,82
1,82
0,70


## Classificação

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [None]:
def evaluate_classifier_scaled(X, y, clf, kf):
    """Cross-validate ``clf`` with a StandardScaler fitted inside each fold.

    Args:
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Labels aligned with ``X``; selected positionally with ``.iloc``.
        clf: Estimator exposing ``fit`` and ``predict``. The same object is
            refitted on every fold.
        kf: Splitter exposing ``split(X, y)``.

    Returns:
        tuple: ``(mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1,
        std_f1, confusion_matrix_all)`` - mean and standard deviation over
        folds of accuracy, balanced accuracy and weighted F1, plus the
        element-wise sum of the per-fold confusion matrices (rows and columns
        ordered by ``np.unique(y)``).
    """
    class_labels = np.unique(y)
    fold_acc, fold_bal_acc, fold_f1, fold_cm = [], [], [], []

    for train_idx, test_idx in kf.split(X, y):
        expected = y.iloc[test_idx]

        # Scaling statistics come from the training part only, so nothing
        # from the test part leaks into preprocessing.
        scaler = StandardScaler()
        train_features = scaler.fit_transform(X.iloc[train_idx])
        test_features = scaler.transform(X.iloc[test_idx])

        clf.fit(train_features, y.iloc[train_idx])
        predicted = clf.predict(test_features)

        fold_acc.append(accuracy_score(expected, predicted))
        fold_bal_acc.append(balanced_accuracy_score(expected, predicted))
        fold_f1.append(f1_score(expected, predicted, average='weighted'))
        fold_cm.append(confusion_matrix(expected, predicted, labels=class_labels))

    return (
        np.mean(fold_acc), np.std(fold_acc),
        np.mean(fold_bal_acc), np.std(fold_bal_acc),
        np.mean(fold_f1), np.std(fold_f1),
        np.sum(fold_cm, axis=0),
    )


In [None]:
def evaluate_classifier_scaled_transformed(X, y, clf, kf):
    """Cross-validate ``clf`` with per-fold StandardScaler followed by PCA.

    Args:
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Labels aligned with ``X``; selected positionally with ``.iloc``.
        clf: Estimator exposing ``fit`` and ``predict``. The same object is
            refitted on every fold.
        kf: Splitter exposing ``split(X, y)``.

    Returns:
        tuple: ``(mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1,
        std_f1, confusion_matrix_all)`` - mean and standard deviation over
        folds of accuracy, balanced accuracy and weighted F1, plus the
        element-wise sum of the per-fold confusion matrices (rows and columns
        ordered by ``np.unique(y)``).
    """
    class_labels = np.unique(y)
    fold_acc, fold_bal_acc, fold_f1, fold_cm = [], [], [], []

    for train_idx, test_idx in kf.split(X, y):
        expected = y.iloc[test_idx]

        # Scaler and PCA are both fitted on the training part only and then
        # applied unchanged to the test part.
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(X.iloc[train_idx])
        test_scaled = scaler.transform(X.iloc[test_idx])

        # PCA(0.95): keep enough components to explain 95% of the variance.
        pca = PCA(0.95)
        train_features = pca.fit_transform(train_scaled)
        test_features = pca.transform(test_scaled)

        clf.fit(train_features, y.iloc[train_idx])
        predicted = clf.predict(test_features)

        fold_acc.append(accuracy_score(expected, predicted))
        fold_bal_acc.append(balanced_accuracy_score(expected, predicted))
        fold_f1.append(f1_score(expected, predicted, average='weighted'))
        fold_cm.append(confusion_matrix(expected, predicted, labels=class_labels))

    return (
        np.mean(fold_acc), np.std(fold_acc),
        np.mean(fold_bal_acc), np.std(fold_bal_acc),
        np.mean(fold_f1), np.std(fold_f1),
        np.sum(fold_cm, axis=0),
    )


## Multiclassificação

In [None]:
# Three-class problem: all feature columns against the label.
X = df.drop(columns=['label'])
y = df['label']

# Learners with internal randomness get a fixed seed so the reported scores
# are the same on every run, matching the already-seeded splitter below.
classifiers = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(verbose=-1, random_state=42)
}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Normalisation with StandardScaler only

# Cross-validate every classifier and print its mean/std scores and the
# confusion matrix summed over the folds.
for name, clf in classifiers.items():
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = evaluate_classifier_scaled(X, y, clf, kf)
    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 51.22%, Desvio padrão: 7.67%
Acurácia balanceada média: 50.30%, Desvio padrão: 7.66%
F-score médio: 49.53%, Desvio padrão: 7.95%

Matriz de Confusão:
[[22 29 19]
 [ 6 61 15]
 [ 9 36 37]]

KNN:

Acurácia média: 43.53%, Desvio padrão: 9.58%
Acurácia balanceada média: 42.84%, Desvio padrão: 9.73%
F-score médio: 41.38%, Desvio padrão: 10.29%

Matriz de Confusão:
[[23 31 16]
 [10 53 19]
 [14 42 26]]

LDA:

Acurácia média: 50.81%, Desvio padrão: 10.81%
Acurácia balanceada média: 50.90%, Desvio padrão: 11.17%
F-score médio: 50.50%, Desvio padrão: 10.53%

Matriz de Confusão:
[[37 19 14]
 [16 45 21]
 [14 31 37]]

Decision Tree:

Acurácia média: 45.31%, Desvio padrão: 2.65%
Acurácia balanceada média: 45.36%, Desvio padrão: 2.84%
F-score médio: 45.29%, Desvio padrão: 2.43%

Matriz de Confusão:
[[32 25 13]
 [16 30 36]
 [18 20 44]]

Naive Bayes:

Acurácia média: 42.71%, Desvio padrão: 5.54%
Acurácia balanceada média: 44.67%, Desvio padrão: 5.79%
F-score médio: 39.16%, Desvio p



LightGBM:

Acurácia média: 60.26%, Desvio padrão: 4.35%
Acurácia balanceada média: 60.30%, Desvio padrão: 4.33%
F-score médio: 60.19%, Desvio padrão: 4.55%

Matriz de Confusão:
[[43 18  9]
 [ 9 53 20]
 [17 20 45]]





In [None]:
# Normalisation with StandardScaler followed by a PCA transformation

# Cross-validate every classifier and print its mean/std scores and the
# confusion matrix summed over the folds.
for name, clf in classifiers.items():
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = evaluate_classifier_scaled_transformed(X, y, clf, kf)
    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 54.23%, Desvio padrão: 6.90%
Acurácia balanceada média: 53.81%, Desvio padrão: 7.34%
F-score médio: 52.32%, Desvio padrão: 7.80%

Matriz de Confusão:
[[31 32  7]
 [ 6 64 12]
 [10 40 32]]

KNN:

Acurácia média: 45.69%, Desvio padrão: 4.86%
Acurácia balanceada média: 44.97%, Desvio padrão: 4.86%
F-score médio: 43.63%, Desvio padrão: 5.21%

Matriz de Confusão:
[[23 29 18]
 [ 6 58 18]
 [15 41 26]]

LDA:

Acurácia média: 54.63%, Desvio padrão: 13.02%
Acurácia balanceada média: 54.64%, Desvio padrão: 13.28%
F-score médio: 54.44%, Desvio padrão: 12.83%

Matriz de Confusão:
[[38 21 11]
 [10 52 20]
 [13 31 38]]

Decision Tree:

Acurácia média: 42.28%, Desvio padrão: 5.78%
Acurácia balanceada média: 42.30%, Desvio padrão: 6.17%
F-score médio: 42.22%, Desvio padrão: 5.54%

Matriz de Confusão:
[[30 22 18]
 [22 32 28]
 [27 18 37]]

Naive Bayes:

Acurácia média: 44.83%, Desvio padrão: 5.43%
Acurácia balanceada média: 45.51%, Desvio padrão: 5.73%
F-score médio: 41.13%, Desvio pa



LightGBM:

Acurácia média: 48.71%, Desvio padrão: 11.82%
Acurácia balanceada média: 48.58%, Desvio padrão: 11.88%
F-score médio: 48.31%, Desvio padrão: 11.60%

Matriz de Confusão:
[[33 29  8]
 [13 46 23]
 [23 24 35]]





## PD vs CO

In [None]:
# Control (0) and Parkinson (1): rows with label 2 are removed
df1 = df[df['label'] != 2]

In [None]:
# Two-class problem built from df1 (label 2 removed).
X = df1.drop(columns=['label'])
y = df1['label']

# The decision tree breaks ties between equally good splits at random, so it
# gets a fixed seed to keep the reported scores identical across runs.
classifiers = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB()
}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Normalisation with StandardScaler only

# Cross-validate every classifier and print its mean/std scores and the
# confusion matrix summed over the folds.
for name, clf in classifiers.items():
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = evaluate_classifier_scaled(X, y, clf, kf)
    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

In [None]:
# Normalisation with StandardScaler followed by a PCA transformation

# Cross-validate every classifier and print its mean/std scores and the
# confusion matrix summed over the folds.
for name, clf in classifiers.items():
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = evaluate_classifier_scaled_transformed(X, y, clf, kf)
    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

## PD vs DD

In [None]:
# Parkinson (1) and DD (2): rows with label 0 are removed
df2 = df[df['label'] != 0]

In [None]:
# Two-class problem built from df2 (label 0 removed).
X = df2.drop(columns=['label'])
y = df2['label']

# The decision tree breaks ties between equally good splits at random, so it
# gets a fixed seed to keep the reported scores identical across runs.
classifiers = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB()
}

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Normalisation with StandardScaler only

# Cross-validate every classifier and print its mean/std scores and the
# confusion matrix summed over the folds.
for name, clf in classifiers.items():
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = evaluate_classifier_scaled(X, y, clf, kf)
    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 61.61%, Desvio padrão: 5.45%
Acurácia balanceada média: 61.51%, Desvio padrão: 5.89%
F-score médio: 60.18%, Desvio padrão: 6.36%

Matriz de Confusão:
[[64 18]
 [45 37]]

KNN:

Acurácia média: 62.82%, Desvio padrão: 8.79%
Acurácia balanceada média: 62.90%, Desvio padrão: 9.27%
F-score médio: 61.34%, Desvio padrão: 8.75%

Matriz de Confusão:
[[65 17]
 [44 38]]

LDA:

Acurácia média: 58.50%, Desvio padrão: 5.54%
Acurácia balanceada média: 58.49%, Desvio padrão: 5.65%
F-score médio: 58.15%, Desvio padrão: 5.71%

Matriz de Confusão:
[[53 29]
 [39 43]]

Decision Tree:

Acurácia média: 59.13%, Desvio padrão: 4.26%
Acurácia balanceada média: 59.19%, Desvio padrão: 4.45%
F-score médio: 58.76%, Desvio padrão: 4.21%

Matriz de Confusão:
[[50 32]
 [35 47]]

Naive Bayes:

Acurácia média: 52.42%, Desvio padrão: 2.64%
Acurácia balanceada média: 52.10%, Desvio padrão: 2.19%
F-score médio: 49.01%, Desvio padrão: 4.08%

Matriz de Confusão:
[[43 39]
 [39 43]]



In [None]:
# Normalisation with StandardScaler followed by a PCA transformation

# Cross-validate every classifier and print its mean/std scores and the
# confusion matrix summed over the folds.
for name, clf in classifiers.items():
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = evaluate_classifier_scaled_transformed(X, y, clf, kf)
    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 61.61%, Desvio padrão: 6.38%
Acurácia balanceada média: 61.58%, Desvio padrão: 7.03%
F-score médio: 58.96%, Desvio padrão: 7.35%

Matriz de Confusão:
[[70 12]
 [51 31]]

KNN:

Acurácia média: 58.54%, Desvio padrão: 9.47%
Acurácia balanceada média: 58.71%, Desvio padrão: 9.79%
F-score médio: 57.96%, Desvio padrão: 9.09%

Matriz de Confusão:
[[57 25]
 [43 39]]

LDA:

Acurácia média: 61.57%, Desvio padrão: 8.75%
Acurácia balanceada média: 61.62%, Desvio padrão: 8.91%
F-score médio: 61.27%, Desvio padrão: 8.74%

Matriz de Confusão:
[[52 30]
 [33 49]]

Decision Tree:

Acurácia média: 60.40%, Desvio padrão: 5.20%
Acurácia balanceada média: 60.44%, Desvio padrão: 5.10%
F-score médio: 59.63%, Desvio padrão: 5.26%

Matriz de Confusão:
[[58 24]
 [41 41]]

Naive Bayes:

Acurácia média: 53.75%, Desvio padrão: 8.74%
Acurácia balanceada média: 53.75%, Desvio padrão: 9.01%
F-score médio: 46.43%, Desvio padrão: 11.67%

Matriz de Confusão:
[[73  9]
 [67 15]]



# 7 Features

In [None]:
# Smaller feature set
# NOTE(review): getFeaturesSelect is defined three times in this notebook; this
# call uses whichever definition was executed last (the seven-statistic tsfresh
# version when the cells run top to bottom).

X = getFeaturesSelect(strat['id'].tolist())
y = strat['label'].tolist()

# One row per subject with positional integer column names, plus the label.
df = pd.DataFrame(X)
df['label'] = y
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,915,916,917,918,919,920,921,922,923,label
0,0.005346,1.063007,0.686966,-0.000123,0.002340,-0.348814,0.002329,0.006907,1.271359,0.363277,...,-0.164565,0.105856,3.149282,11.462404,-0.358709,-0.004338,0.056804,0.030953,0.056509,0
1,13.380221,28.544725,3.655442,-0.000732,0.117086,-0.164743,0.117086,19.874962,53.032730,10.301358,...,-0.170144,0.224062,12.366482,12.151266,0.531585,-0.014459,0.112564,0.010076,0.112119,2
2,0.003560,1.135437,0.573104,0.000153,0.001910,-0.510957,0.001907,0.003744,1.242591,0.195343,...,0.143543,0.099760,7.185780,16.937801,0.290071,0.003519,0.085805,-0.043397,0.085727,0
3,0.005594,1.366292,0.843515,-0.000202,0.002394,-0.004547,0.002384,0.012699,2.066915,0.271209,...,-0.333765,0.080068,2.616145,10.047055,-0.505359,0.001337,0.051773,0.026157,0.051762,0
4,0.019405,2.542418,0.321554,-0.000152,0.004459,0.126854,0.004459,4.888727,29.976044,-0.682492,...,-0.138209,0.058596,3.603762,9.239023,-0.489067,0.002329,0.060765,0.042647,0.060694,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,0.013193,1.575305,0.131550,0.000041,0.003677,0.032810,0.003676,0.019646,1.855868,0.471217,...,0.411668,0.282305,177.398590,78.370926,1.035932,-0.009489,0.426334,0.257998,0.426311,1
230,0.029831,2.028810,-0.115299,0.000065,0.005528,0.036339,0.005528,0.033001,2.660729,-0.498142,...,0.292891,0.118495,12.372566,30.697189,-0.229536,-0.004945,0.112591,-0.168042,0.111965,2
231,0.013714,1.206776,2.624293,0.000087,0.003749,0.258454,0.003745,0.026533,1.324726,2.408379,...,-0.028794,0.394127,96.645210,46.897354,-0.336487,-0.014655,0.314677,0.217025,0.314488,2
232,0.013222,1.624582,1.505466,0.000149,0.003681,-0.083127,0.003680,0.033925,2.598997,0.662536,...,1.216531,0.182885,10.142200,12.868165,0.778473,-0.014160,0.101939,0.800520,0.101938,0


In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Exploratory check: how many principal components are needed to keep 95% of
# the variance of the standardized features.
# Only the feature columns are used: the target column 'label' must not be
# standardized nor take part in the PCA, otherwise the class information leaks
# into the components (and the component count is computed on 1 extra column).
df_copy = df.drop(columns=['label'])

# StandardScaler
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_copy)

# PCA (a float in (0, 1) asks for the number of components that explains
# that fraction of the variance)
pca = PCA(0.95)
df_pca = pca.fit_transform(df_scaled)

df_pca = pd.DataFrame(df_pca)
df_pca.shape


(234, 118)

In [None]:
# Column naming scheme: <task>_<wrist>_<sensor>_<axis>_<feature>.
# The nesting order (task outermost, feature innermost) fixes the column order.
column_names = [
    f"{task}_{wrist}_{sensor}_{axis}_{feature}"
    for task in tasks
    for wrist in wrists
    for sensor in sensors
    for axis in axes
    for feature in featuresSelect
]

# The target column comes last
column_names += ["label"]

In [None]:
# Relabel the columns of df in place with the names held in column_names
# (the number of names must equal the number of columns, 'label' included).
df.columns = column_names
df

Unnamed: 0,Relaxed1_Left_Accelerometer_X_abs_energy,Relaxed1_Left_Accelerometer_X_absolute_sum_of_changes,Relaxed1_Left_Accelerometer_X_kurtosis,Relaxed1_Left_Accelerometer_X_median,Relaxed1_Left_Accelerometer_X_root_mean_square,Relaxed1_Left_Accelerometer_X_skewness,Relaxed1_Left_Accelerometer_X_standard_deviation,Relaxed1_Left_Accelerometer_Y_abs_energy,Relaxed1_Left_Accelerometer_Y_absolute_sum_of_changes,Relaxed1_Left_Accelerometer_Y_kurtosis,...,Entrainment2_Right_Gyroscope_Y_skewness,Entrainment2_Right_Gyroscope_Y_standard_deviation,Entrainment2_Right_Gyroscope_Z_abs_energy,Entrainment2_Right_Gyroscope_Z_absolute_sum_of_changes,Entrainment2_Right_Gyroscope_Z_kurtosis,Entrainment2_Right_Gyroscope_Z_median,Entrainment2_Right_Gyroscope_Z_root_mean_square,Entrainment2_Right_Gyroscope_Z_skewness,Entrainment2_Right_Gyroscope_Z_standard_deviation,label
0,0.005346,1.063007,0.686966,-0.000123,0.002340,-0.348814,0.002329,0.006907,1.271359,0.363277,...,-0.164565,0.105856,3.149282,11.462404,-0.358709,-0.004338,0.056804,0.030953,0.056509,0
1,13.380221,28.544725,3.655442,-0.000732,0.117086,-0.164743,0.117086,19.874962,53.032730,10.301358,...,-0.170144,0.224062,12.366482,12.151266,0.531585,-0.014459,0.112564,0.010076,0.112119,2
2,0.003560,1.135437,0.573104,0.000153,0.001910,-0.510957,0.001907,0.003744,1.242591,0.195343,...,0.143543,0.099760,7.185780,16.937801,0.290071,0.003519,0.085805,-0.043397,0.085727,0
3,0.005594,1.366292,0.843515,-0.000202,0.002394,-0.004547,0.002384,0.012699,2.066915,0.271209,...,-0.333765,0.080068,2.616145,10.047055,-0.505359,0.001337,0.051773,0.026157,0.051762,0
4,0.019405,2.542418,0.321554,-0.000152,0.004459,0.126854,0.004459,4.888727,29.976044,-0.682492,...,-0.138209,0.058596,3.603762,9.239023,-0.489067,0.002329,0.060765,0.042647,0.060694,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,0.013193,1.575305,0.131550,0.000041,0.003677,0.032810,0.003676,0.019646,1.855868,0.471217,...,0.411668,0.282305,177.398590,78.370926,1.035932,-0.009489,0.426334,0.257998,0.426311,1
230,0.029831,2.028810,-0.115299,0.000065,0.005528,0.036339,0.005528,0.033001,2.660729,-0.498142,...,0.292891,0.118495,12.372566,30.697189,-0.229536,-0.004945,0.112591,-0.168042,0.111965,2
231,0.013714,1.206776,2.624293,0.000087,0.003749,0.258454,0.003745,0.026533,1.324726,2.408379,...,-0.028794,0.394127,96.645210,46.897354,-0.336487,-0.014655,0.314677,0.217025,0.314488,2
232,0.013222,1.624582,1.505466,0.000149,0.003681,-0.083127,0.003680,0.033925,2.598997,0.662536,...,1.216531,0.182885,10.142200,12.868165,0.778473,-0.014160,0.101939,0.800520,0.101938,0


## Multiclassificação

In [None]:
# Multiclass problem: all rows of df, target in 'label'
X = df.drop(columns=['label'])
y = df['label']

# Models under comparison. Every estimator with a random component gets a
# fixed seed so that re-running the notebook reproduces the reported scores
# (same convention as the exp_models dictionary used in the feature
# engineering section).
classifiers = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1)
}

# 5-fold stratified cross-validation with a fixed shuffle
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Evaluation with StandardScaler normalization

for name in classifiers:
    clf = classifiers[name]
    scores = evaluate_classifier_scaled(X, y, clf, kf)
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = scores

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 52.52%, Desvio padrão: 7.17%
Acurácia balanceada média: 51.77%, Desvio padrão: 6.88%
F-score médio: 51.31%, Desvio padrão: 7.55%

Matriz de Confusão:
[[26 28 16]
 [ 5 61 16]
 [13 33 36]]

KNN:

Acurácia média: 46.13%, Desvio padrão: 5.96%
Acurácia balanceada média: 45.79%, Desvio padrão: 5.98%
F-score médio: 44.71%, Desvio padrão: 5.95%

Matriz de Confusão:
[[29 26 15]
 [ 7 54 21]
 [16 41 25]]

LDA:

Acurácia média: 43.53%, Desvio padrão: 8.15%
Acurácia balanceada média: 43.79%, Desvio padrão: 8.28%
F-score médio: 43.37%, Desvio padrão: 8.02%

Matriz de Confusão:
[[34 16 20]
 [13 37 32]
 [28 23 31]]

Decision Tree:

Acurácia média: 44.40%, Desvio padrão: 11.24%
Acurácia balanceada média: 43.94%, Desvio padrão: 11.10%
F-score médio: 44.12%, Desvio padrão: 11.15%

Matriz de Confusão:
[[27 23 20]
 [22 36 24]
 [15 26 41]]

Naive Bayes:

Acurácia média: 41.86%, Desvio padrão: 5.50%
Acurácia balanceada média: 43.92%, Desvio padrão: 5.72%
F-score médio: 37.67%, Desvio pa

In [None]:
# Evaluation with StandardScaler normalization followed by a PCA transformation

for name in classifiers:
    clf = classifiers[name]
    scores = evaluate_classifier_scaled_transformed(X, y, clf, kf)
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = scores

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 54.22%, Desvio padrão: 8.08%
Acurácia balanceada média: 53.90%, Desvio padrão: 8.38%
F-score médio: 52.75%, Desvio padrão: 8.48%

Matriz de Confusão:
[[33 29  8]
 [ 6 62 14]
 [16 34 32]]

KNN:

Acurácia média: 50.84%, Desvio padrão: 5.15%
Acurácia balanceada média: 50.25%, Desvio padrão: 5.18%
F-score médio: 48.94%, Desvio padrão: 6.34%

Matriz de Confusão:
[[28 31 11]
 [ 7 63 12]
 [15 39 28]]

LDA:

Acurácia média: 47.81%, Desvio padrão: 10.11%
Acurácia balanceada média: 48.13%, Desvio padrão: 10.73%
F-score médio: 47.15%, Desvio padrão: 9.77%

Matriz de Confusão:
[[37 17 16]
 [10 44 28]
 [20 31 31]]

Decision Tree:

Acurácia média: 43.06%, Desvio padrão: 11.90%
Acurácia balanceada média: 43.14%, Desvio padrão: 12.21%
F-score médio: 42.92%, Desvio padrão: 11.87%

Matriz de Confusão:
[[31 23 16]
 [17 35 30]
 [20 27 35]]

Naive Bayes:

Acurácia média: 47.84%, Desvio padrão: 4.78%
Acurácia balanceada média: 48.88%, Desvio padrão: 5.04%
F-score médio: 45.28%, Desvio 

## PD vs CO

In [None]:
# Keep control (0) and Parkinson's (1) rows only, i.e. discard label 2
keep_pd_co = df['label'] != 2
df1 = df[keep_pd_co]

In [None]:
# Binary problem on df1 (rows with label 2 already removed)
X = df1.drop(columns=['label'])
y = df1['label']

# The decision tree breaks ties between equally good splits at random, so it
# is seeded to keep the reported scores reproducible across runs.
classifiers = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB()
}

# 5-fold stratified cross-validation with a fixed shuffle
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Evaluation with StandardScaler normalization

for name in classifiers:
    clf = classifiers[name]
    scores = evaluate_classifier_scaled(X, y, clf, kf)
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = scores

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 73.01%, Desvio padrão: 7.14%
Acurácia balanceada média: 71.68%, Desvio padrão: 7.50%
F-score médio: 71.73%, Desvio padrão: 7.75%

Matriz de Confusão:
[[39 31]
 [10 72]]

KNN:

Acurácia média: 67.74%, Desvio padrão: 8.23%
Acurácia balanceada média: 65.74%, Desvio padrão: 8.65%
F-score médio: 64.77%, Desvio padrão: 9.99%

Matriz de Confusão:
[[28 42]
 [ 7 75]]

LDA:

Acurácia média: 70.34%, Desvio padrão: 7.74%
Acurácia balanceada média: 69.77%, Desvio padrão: 7.94%
F-score médio: 70.03%, Desvio padrão: 7.87%

Matriz de Confusão:
[[44 26]
 [19 63]]

Decision Tree:

Acurácia média: 62.56%, Desvio padrão: 9.97%
Acurácia balanceada média: 61.71%, Desvio padrão: 10.13%
F-score médio: 61.88%, Desvio padrão: 10.18%

Matriz de Confusão:
[[36 34]
 [23 59]]

Naive Bayes:

Acurácia média: 57.96%, Desvio padrão: 6.64%
Acurácia balanceada média: 60.20%, Desvio padrão: 6.35%
F-score médio: 54.12%, Desvio padrão: 8.00%

Matriz de Confusão:
[[62  8]
 [56 26]]



In [None]:
# Evaluation with StandardScaler normalization followed by a PCA transformation

for name in classifiers:
    clf = classifiers[name]
    scores = evaluate_classifier_scaled_transformed(X, y, clf, kf)
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = scores

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 75.66%, Desvio padrão: 6.85%
Acurácia balanceada média: 74.36%, Desvio padrão: 7.31%
F-score médio: 74.52%, Desvio padrão: 7.79%

Matriz de Confusão:
[[41 29]
 [ 8 74]]

KNN:

Acurácia média: 67.72%, Desvio padrão: 5.52%
Acurácia balanceada média: 65.70%, Desvio padrão: 5.79%
F-score médio: 64.75%, Desvio padrão: 7.27%

Matriz de Confusão:
[[28 42]
 [ 7 75]]

LDA:

Acurácia média: 76.97%, Desvio padrão: 7.61%
Acurácia balanceada média: 76.52%, Desvio padrão: 7.92%
F-score médio: 76.68%, Desvio padrão: 7.86%

Matriz de Confusão:
[[50 20]
 [15 67]]

Decision Tree:

Acurácia média: 64.49%, Desvio padrão: 7.25%
Acurácia balanceada média: 63.94%, Desvio padrão: 6.74%
F-score médio: 64.00%, Desvio padrão: 6.96%

Matriz de Confusão:
[[40 30]
 [24 58]]

Naive Bayes:

Acurácia média: 63.83%, Desvio padrão: 7.40%
Acurácia balanceada média: 65.07%, Desvio padrão: 7.58%
F-score médio: 62.65%, Desvio padrão: 7.76%

Matriz de Confusão:
[[56 14]
 [41 41]]



## PD vs DD

In [None]:
# Keep Parkinson's (1) and DD (2) rows only, i.e. discard label 0
keep_pd_dd = df['label'] != 0
df2 = df[keep_pd_dd]

In [None]:
# Binary problem on df2 (rows with label 0 already removed)
X = df2.drop(columns=['label'])
y = df2['label']

# The decision tree breaks ties between equally good splits at random, so it
# is seeded to keep the reported scores reproducible across runs.
classifiers = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB()
}

# 5-fold stratified cross-validation with a fixed shuffle
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Evaluation with StandardScaler normalization

for name in classifiers:
    clf = classifiers[name]
    scores = evaluate_classifier_scaled(X, y, clf, kf)
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = scores

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 62.84%, Desvio padrão: 6.84%
Acurácia balanceada média: 62.83%, Desvio padrão: 7.23%
F-score médio: 61.82%, Desvio padrão: 7.33%

Matriz de Confusão:
[[63 19]
 [42 40]]

KNN:

Acurácia média: 58.56%, Desvio padrão: 6.37%
Acurácia balanceada média: 58.86%, Desvio padrão: 6.56%
F-score médio: 57.67%, Desvio padrão: 6.01%

Matriz de Confusão:
[[51 31]
 [37 45]]

LDA:

Acurácia média: 60.93%, Desvio padrão: 6.22%
Acurácia balanceada média: 60.85%, Desvio padrão: 6.34%
F-score médio: 60.70%, Desvio padrão: 6.33%

Matriz de Confusão:
[[54 28]
 [36 46]]

Decision Tree:

Acurácia média: 56.70%, Desvio padrão: 3.53%
Acurácia balanceada média: 56.62%, Desvio padrão: 3.41%
F-score médio: 56.25%, Desvio padrão: 3.54%

Matriz de Confusão:
[[41 41]
 [30 52]]

Naive Bayes:

Acurácia média: 52.46%, Desvio padrão: 6.63%
Acurácia balanceada média: 52.06%, Desvio padrão: 6.47%
F-score médio: 50.52%, Desvio padrão: 5.62%

Matriz de Confusão:
[[42 40]
 [38 44]]



In [None]:
# Evaluation with StandardScaler normalization followed by a PCA transformation

for name in classifiers:
    clf = classifiers[name]
    scores = evaluate_classifier_scaled_transformed(X, y, clf, kf)
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = scores

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 62.23%, Desvio padrão: 7.64%
Acurácia balanceada média: 62.21%, Desvio padrão: 8.13%
F-score médio: 60.16%, Desvio padrão: 8.41%

Matriz de Confusão:
[[68 14]
 [48 34]]

KNN:

Acurácia média: 61.02%, Desvio padrão: 5.93%
Acurácia balanceada média: 61.29%, Desvio padrão: 6.20%
F-score médio: 59.78%, Desvio padrão: 5.90%

Matriz de Confusão:
[[60 22]
 [42 40]]

LDA:

Acurácia média: 60.97%, Desvio padrão: 6.51%
Acurácia balanceada média: 60.88%, Desvio padrão: 6.79%
F-score médio: 60.41%, Desvio padrão: 7.09%

Matriz de Confusão:
[[55 27]
 [37 45]]

Decision Tree:

Acurácia média: 59.79%, Desvio padrão: 7.59%
Acurácia balanceada média: 59.67%, Desvio padrão: 7.74%
F-score médio: 59.50%, Desvio padrão: 7.71%

Matriz de Confusão:
[[52 30]
 [36 46]]

Naive Bayes:

Acurácia média: 52.52%, Desvio padrão: 8.80%
Acurácia balanceada média: 52.50%, Desvio padrão: 9.27%
F-score médio: 46.37%, Desvio padrão: 10.71%

Matriz de Confusão:
[[70 12]
 [66 16]]



# Feature engineering

In [None]:

def filter_dataframe(df, wrists, tasks, sensors, axes, features, exclude_labels=(3,)):
    """Select the feature columns of one experiment and drop unwanted classes.

    A column is kept when its name is ``"{task}_{wrist}_{sensor}_{axis}_{feature}"``
    for some combination of the given values, or when it is ``'label'``.
    Requested combinations that do not exist in ``df`` are silently ignored and
    the original column order of ``df`` is preserved.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; must contain a ``'label'`` column.
    wrists, tasks, sensors, axes, features : iterable of str
        Name fragments whose cartesian product defines the columns to keep.
    exclude_labels : iterable, default ``(3,)``
        Rows whose ``'label'`` is one of these values are removed. The default
        reproduces the historical behavior of dropping label 3 only.

    Returns
    -------
    pandas.DataFrame
        An independent copy, so the caller can modify it (e.g. recode the
        labels) without a SettingWithCopyWarning and without touching ``df``.

    Raises
    ------
    KeyError
        If ``df`` has no ``'label'`` column.
    """
    valid_columns = {
        f"{task}_{wrist}_{sensor}_{axis}_{feature}"
        for task in tasks
        for wrist in wrists
        for sensor in sensors
        for axis in axes
        for feature in features
    }
    valid_columns.add('label')

    filtered_columns = [col for col in df.columns if col in valid_columns]
    filtered_df = df[filtered_columns]

    # Rows with a missing label are kept, exactly as with the former `!= 3` test
    keep_rows = ~filtered_df['label'].isin(list(exclude_labels))

    return filtered_df[keep_rows].copy()


## Multiclassificação

In [None]:
# Name fragments used to compose feature column names
# (<task>_<wrist>_<sensor>_<axis>_<feature>).
tasks = [
    "Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2",
    "StretchHold", "HoldWeight", "DrinkGlas", "CrossArms",
    "TouchNose", "Entrainment1", "Entrainment2",
]
wrists = ["Left", "Right"]
sensors = ["Accelerometer", "Gyroscope"]
axes = ["X", "Y", "Z"]

# Candidate per-signal features
features = [
    "abs_energy", "absolute_maximum", "absolute_sum_of_changes",
    "count_above_mean", "count_below_mean", "kurtosis",
    "length", "maximum", "mean",
    "mean_abs_change", "mean_change", "median",
    "minimum", "root_mean_square", "skewness",
    "standard_deviation", "sum_values", "variance",
]

In [None]:
# Experiment configuration (multiclass): which wrists, tasks, sensors, axes
# and features are allowed into the feature table.
exp_wrists = ["Left", "Right"]
exp_tasks = [
    "Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2",
    "StretchHold", "HoldWeight", "DrinkGlas", "CrossArms",
    "TouchNose", "Entrainment1", "Entrainment2",
]
exp_sensors = ["Accelerometer", "Gyroscope"]
exp_axes = ["X", "Y", "Z"]
exp_features = [
    "abs_energy", "absolute_maximum", "absolute_sum_of_changes",
    "count_above_mean", "count_below_mean", "kurtosis",
    "length", "maximum", "mean",
    "mean_abs_change", "mean_change", "median",
    "minimum", "root_mean_square", "skewness",
    "standard_deviation", "sum_values", "variance",
]

# Seeded where the estimator is stochastic.
# XGBoost and LightGBM are intentionally left out of this experiment.
exp_models = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
}

exp_df = filter_dataframe(df, exp_wrists, exp_tasks, exp_sensors, exp_axes, exp_features)

y = exp_df['label']
X = exp_df.drop(columns=['label'])

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
# Evaluation with StandardScaler normalization

for name in exp_models:
    clf = exp_models[name]
    scores = evaluate_classifier_scaled(X, y, clf, kf)
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = scores

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

## PD vs CO

In [None]:
# Experiment configuration (PD vs CO): gyroscope Y/Z axes on a subset of tasks
exp_wrists = ['Left', 'Right']
exp_tasks = ['RelaxedTask2', "StretchHold", "HoldWeight",
         "DrinkGlas", "CrossArms", "TouchNose", "Entrainment1", "Entrainment2"]
exp_sensors = ['Gyroscope']
exp_axes = ['Y', 'Z']
exp_features = ["abs_energy", "absolute_maximum", "absolute_sum_of_changes", "count_above_mean",
            "count_below_mean", "kurtosis", "length", "maximum", "mean",
            "mean_abs_change", "mean_change", "median", "minimum", "root_mean_square",
            "skewness", "standard_deviation", "sum_values", "variance"]

# Stochastic estimators are seeded so the scores are reproducible across runs
exp_models = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1)
}

exp_df = filter_dataframe(df, exp_wrists, exp_tasks, exp_sensors, exp_axes, exp_features)

# PD vs CO is a binary problem between control (0) and Parkinson's (1).
# filter_dataframe does not remove class 2 (DD), so it is dropped explicitly
# here; otherwise this cell would silently run a 3-class evaluation.
exp_df = exp_df[exp_df['label'] != 2]

X = exp_df.drop(columns=['label'])
y = exp_df['label']

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
# Evaluation with StandardScaler normalization

for name in exp_models:
    clf = exp_models[name]
    scores = evaluate_classifier_scaled(X, y, clf, kf)
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = scores

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 74.34%, Desvio padrão: 3.22%
Acurácia balanceada média: 72.72%, Desvio padrão: 3.25%
F-score médio: 72.82%, Desvio padrão: 3.43%

Matriz de Confusão:
[[37 33]
 [ 6 76]]

KNN:

Acurácia média: 68.41%, Desvio padrão: 6.17%
Acurácia balanceada média: 66.68%, Desvio padrão: 5.71%
F-score médio: 66.55%, Desvio padrão: 5.73%

Matriz de Confusão:
[[32 38]
 [10 72]]

LDA:

Acurácia média: 70.30%, Desvio padrão: 7.80%
Acurácia balanceada média: 70.35%, Desvio padrão: 7.54%
F-score médio: 70.31%, Desvio padrão: 7.77%

Matriz de Confusão:
[[50 20]
 [25 57]]

Decision Tree:

Acurácia média: 63.78%, Desvio padrão: 7.00%
Acurácia balanceada média: 63.72%, Desvio padrão: 6.72%
F-score médio: 63.60%, Desvio padrão: 7.02%

Matriz de Confusão:
[[45 25]
 [30 52]]

Naive Bayes:

Acurácia média: 57.91%, Desvio padrão: 5.53%
Acurácia balanceada média: 60.01%, Desvio padrão: 5.83%
F-score médio: 54.47%, Desvio padrão: 5.63%

Matriz de Confusão:
[[61  9]
 [55 27]]

Random Forest:

Acurác



LightGBM:

Acurácia média: 74.97%, Desvio padrão: 6.78%
Acurácia balanceada média: 74.79%, Desvio padrão: 7.13%
F-score médio: 74.39%, Desvio padrão: 7.27%

Matriz de Confusão:
[[52 18]
 [20 62]]





In [None]:
# Evaluation with StandardScaler normalization followed by a PCA transformation

for name in exp_models:
    clf = exp_models[name]
    scores = evaluate_classifier_scaled_transformed(X, y, clf, kf)
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = scores

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 74.32%, Desvio padrão: 5.80%
Acurácia balanceada média: 72.89%, Desvio padrão: 5.50%
F-score médio: 72.98%, Desvio padrão: 5.92%

Matriz de Confusão:
[[39 31]
 [ 8 74]]

KNN:

Acurácia média: 67.76%, Desvio padrão: 4.84%
Acurácia balanceada média: 66.14%, Desvio padrão: 4.76%
F-score médio: 65.96%, Desvio padrão: 5.14%

Matriz de Confusão:
[[33 37]
 [12 70]]

LDA:

Acurácia média: 80.30%, Desvio padrão: 3.99%
Acurácia balanceada média: 79.70%, Desvio padrão: 4.27%
F-score médio: 80.09%, Desvio padrão: 4.10%

Matriz de Confusão:
[[51 19]
 [11 71]]

Decision Tree:

Acurácia média: 68.43%, Desvio padrão: 5.71%
Acurácia balanceada média: 67.81%, Desvio padrão: 5.34%
F-score médio: 67.82%, Desvio padrão: 5.67%

Matriz de Confusão:
[[43 27]
 [21 61]]

Naive Bayes:

Acurácia média: 71.10%, Desvio padrão: 10.42%
Acurácia balanceada média: 72.11%, Desvio padrão: 9.60%
F-score médio: 70.11%, Desvio padrão: 11.63%

Matriz de Confusão:
[[59 11]
 [33 49]]

Random Forest:

Acur



LightGBM:

Acurácia média: 67.74%, Desvio padrão: 7.78%
Acurácia balanceada média: 67.31%, Desvio padrão: 7.27%
F-score médio: 67.40%, Desvio padrão: 7.80%

Matriz de Confusão:
[[44 26]
 [23 59]]





## PD vs DD

In [None]:
# Experiment configuration (PD vs DD): a small hand-picked subset of
# gyroscope features
exp_wrists = ["Left", 'Right']

exp_tasks = ['HoldWeight', 'DrinkGlas', 'CrossArms']

exp_sensors = ['Gyroscope']

exp_axes = ['Y', 'Z']

exp_features = ['abs_energy', 'absolute_maximum', 'absolute_sum_of_changes']

# Stochastic estimators are seeded so the scores are reproducible across runs
exp_models = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1)
}

exp_df = filter_dataframe(df, exp_wrists, exp_tasks, exp_sensors, exp_axes, exp_features)

# PD vs DD: the controls (label 0) have to be removed BEFORE DD is recoded,
# otherwise replace(2, 0) merges the DD rows into the control class.
# .copy() makes the recoding below act on an independent frame
# (no SettingWithCopyWarning, df itself stays untouched).
exp_df = exp_df[exp_df['label'] != 0].copy()

# Recode DD (2) as 0 so the two remaining classes are {0, 1}
exp_df['label'] = exp_df['label'].replace(2, 0)

X = exp_df.drop(columns=['label'])
y = exp_df['label']

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [None]:
# Evaluation with StandardScaler normalization
# PD vs DD

for name in exp_models:
    clf = exp_models[name]
    scores = evaluate_classifier_scaled(X, y, clf, kf)
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = scores

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 70.21%, Desvio padrão: 10.78%
Acurácia balanceada média: 70.07%, Desvio padrão: 10.81%
F-score médio: 69.77%, Desvio padrão: 11.14%

Matriz de Confusão:
[[51 31]
 [18 64]]

KNN:

Acurácia média: 69.56%, Desvio padrão: 5.54%
Acurácia balanceada média: 69.41%, Desvio padrão: 5.73%
F-score médio: 68.92%, Desvio padrão: 5.81%

Matriz de Confusão:
[[49 33]
 [17 65]]

LDA:

Acurácia média: 51.80%, Desvio padrão: 5.47%
Acurácia balanceada média: 51.80%, Desvio padrão: 5.84%
F-score médio: 50.93%, Desvio padrão: 5.53%

Matriz de Confusão:
[[33 49]
 [30 52]]

Decision Tree:

Acurácia média: 57.35%, Desvio padrão: 3.98%
Acurácia balanceada média: 57.50%, Desvio padrão: 3.78%
F-score médio: 56.67%, Desvio padrão: 4.75%

Matriz de Confusão:
[[44 38]
 [32 50]]

Naive Bayes:

Acurácia média: 61.63%, Desvio padrão: 8.95%
Acurácia balanceada média: 61.69%, Desvio padrão: 9.03%
F-score médio: 57.47%, Desvio padrão: 10.96%

Matriz de Confusão:
[[26 56]
 [ 7 75]]

Random Forest:

Ac



In [None]:
# Evaluation with StandardScaler normalization followed by a PCA transformation
# PD vs DD

for name in exp_models:
    clf = exp_models[name]
    scores = evaluate_classifier_scaled_transformed(X, y, clf, kf)
    mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1, std_f1, confusion_matrix_all = scores

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")

SVM:

Acurácia média: 71.42%, Desvio padrão: 10.31%
Acurácia balanceada média: 71.32%, Desvio padrão: 10.36%
F-score médio: 70.93%, Desvio padrão: 10.73%

Matriz de Confusão:
[[51 31]
 [16 66]]

KNN:

Acurácia média: 67.71%, Desvio padrão: 3.92%
Acurácia balanceada média: 67.61%, Desvio padrão: 4.02%
F-score médio: 67.17%, Desvio padrão: 3.81%

Matriz de Confusão:
[[50 32]
 [21 61]]

LDA:

Acurácia média: 60.45%, Desvio padrão: 12.53%
Acurácia balanceada média: 60.44%, Desvio padrão: 12.62%
F-score médio: 59.19%, Desvio padrão: 13.75%

Matriz de Confusão:
[[38 44]
 [21 61]]

Decision Tree:

Acurácia média: 67.71%, Desvio padrão: 7.22%
Acurácia balanceada média: 67.72%, Desvio padrão: 7.07%
F-score médio: 67.55%, Desvio padrão: 7.31%

Matriz de Confusão:
[[56 26]
 [27 55]]

Naive Bayes:

Acurácia média: 60.45%, Desvio padrão: 10.10%
Acurácia balanceada média: 60.33%, Desvio padrão: 10.14%
F-score médio: 58.16%, Desvio padrão: 11.92%

Matriz de Confusão:
[[33 49]
 [16 66]]

Random Forest



LightGBM:

Acurácia média: 69.56%, Desvio padrão: 8.65%
Acurácia balanceada média: 69.49%, Desvio padrão: 8.67%
F-score médio: 69.04%, Desvio padrão: 9.00%

Matriz de Confusão:
[[54 28]
 [22 60]]





# RFE

RFE sem PCA

In [None]:
from sklearn.feature_selection import RFECV

In [None]:
def evaluate_classifiers_with_rfecv(X, y, classifiers, kf):
    """Cross-validate several classifiers on features selected by RFECV.

    For every outer fold produced by ``kf``:

    1. a ``StandardScaler`` is fitted on the training part and applied to
       both parts;
    2. an ``RFECV`` driven by a linear ``SVC`` (3-fold stratified inner CV,
       accuracy scoring, one feature removed per step) selects a feature
       subset using the training part only;
    3. each classifier is trained on the selected training features and
       scored on the selected test features.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table (rows are selected positionally with ``.iloc``).
    y : pandas.Series
        Target labels aligned with ``X``.
    classifiers : dict
        Maps a display name to an estimator. Each estimator is cloned
        before being fitted, so the objects passed in are left unfitted and
        unmodified.
    kf : cross-validation splitter
        Object exposing ``split(X, y)``.

    Returns
    -------
    results : dict
        ``results[name]`` holds the per-fold lists ``"acc_list"``,
        ``"bal_acc_list"``, ``"f1_list"`` and ``"confusion_matrices"``, plus
        ``"metrics"`` = (mean_acc, std_acc, mean_bal_acc, std_bal_acc,
        mean_f1, std_f1) and ``"confusion_matrix"`` (sum over the folds).
    selected_features_per_fold : list
        One boolean support mask (``selector.support_``) per outer fold.
    """
    results = {}
    selected_features_per_fold = []

    # Same label order for every fold's confusion matrix, so they can be summed
    all_labels = np.unique(y)

    for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
        print(f"\n--- Fold {fold + 1} ---")

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Scaling statistics come from the training part only
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # The splitter object itself is handed to RFECV (instead of a one-shot
        # generator of splits); with a fixed random_state the splits are the
        # same and the selector stays re-fittable.
        inner_kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        selector = RFECV(
            estimator=SVC(kernel='linear'),
            step=1,
            cv=inner_kf,
            scoring='accuracy'
        )
        selector.fit(X_train_scaled, y_train)
        X_train_selected = selector.transform(X_train_scaled)
        X_test_selected = selector.transform(X_test_scaled)

        selected_features_per_fold.append(selector.support_)

        for name, clf in classifiers.items():
            fold_scores = results.setdefault(name, {
                "acc_list": [],
                "bal_acc_list": [],
                "f1_list": [],
                "confusion_matrices": []
            })

            # Fresh unfitted copy: the caller's estimator is never mutated
            model = clone(clf)
            model.fit(X_train_selected, y_train)
            y_pred = model.predict(X_test_selected)

            fold_scores["acc_list"].append(accuracy_score(y_test, y_pred))
            fold_scores["bal_acc_list"].append(balanced_accuracy_score(y_test, y_pred))
            fold_scores["f1_list"].append(f1_score(y_test, y_pred, average='weighted'))
            fold_scores["confusion_matrices"].append(
                confusion_matrix(y_test, y_pred, labels=all_labels)
            )

    # Aggregate the per-fold scores of each classifier
    for name, scores in results.items():
        scores["metrics"] = (
            np.mean(scores["acc_list"]), np.std(scores["acc_list"]),
            np.mean(scores["bal_acc_list"]), np.std(scores["bal_acc_list"]),
            np.mean(scores["f1_list"]), np.std(scores["f1_list"]),
        )
        scores["confusion_matrix"] = np.sum(scores["confusion_matrices"], axis=0)

    return results, selected_features_per_fold


RFE com PCA

In [None]:
def evaluate_classifiers_with_rfecv_PCA(X, y, classifiers, kf):
    """Cross-validate classifiers on RFECV-selected, PCA-projected features.

    For each outer fold yielded by ``kf.split(X, y)`` every preprocessing step is
    fitted on the training part only: standardisation, RFECV driven by a linear
    SVC (3-fold stratified inner CV, accuracy scoring), then PCA keeping 95% of
    the variance. Each classifier is trained on the projected training data and
    scored on the projected test data.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y : pandas.Series
        Labels aligned with ``X``.
    classifiers : dict
        Maps a display name to an estimator. Each estimator is cloned before
        fitting, so the objects passed in are never modified.
    kf : cross-validation splitter
        Object exposing ``split(X, y)`` for the outer loop.

    Returns
    -------
    results : dict
        ``results[name]`` holds the per-fold lists ``acc_list``,
        ``bal_acc_list``, ``f1_list`` and ``confusion_matrices``, plus
        ``metrics`` = (mean_acc, std_acc, mean_bal_acc, std_bal_acc, mean_f1,
        std_f1) and ``confusion_matrix`` (sum of the per-fold matrices).
    selected_features_per_fold : list
        One RFECV ``support_`` mask per outer fold.
    """
    results = {}
    selected_features_per_fold = []

    # The label set is the same for every fold; compute it once so all
    # confusion matrices share one shape and can be summed.
    class_labels = np.unique(y)

    for fold, (train_index, test_index) in enumerate(kf.split(X, y)):
        print(f"\n--- Fold {fold + 1} ---")

        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Pass the splitter itself rather than a one-shot generator; the fixed
        # random_state keeps the inner splits identical.
        inner_kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        selector = RFECV(estimator=SVC(kernel='linear'), step=1, cv=inner_kf, scoring='accuracy')
        selector.fit(X_train_scaled, y_train)
        X_train_selected = selector.transform(X_train_scaled)
        X_test_selected = selector.transform(X_test_scaled)

        selected_features_per_fold.append(selector.support_)

        pca = PCA(0.95)
        X_train_pca = pca.fit_transform(X_train_selected)
        X_test_pca = pca.transform(X_test_selected)

        # The component count was previously collected and then discarded;
        # report it here since the return signature must stay a 2-tuple.
        print(f"PCA components kept: {pca.n_components_}")

        for name, clf in classifiers.items():
            fold_scores = results.setdefault(name, {
                "acc_list": [],
                "bal_acc_list": [],
                "f1_list": [],
                "confusion_matrices": []
            })

            # Fit a fresh clone so the caller's estimator is left untouched.
            model = clone(clf)
            model.fit(X_train_pca, y_train)
            y_pred = model.predict(X_test_pca)

            fold_scores["acc_list"].append(accuracy_score(y_test, y_pred))
            fold_scores["bal_acc_list"].append(balanced_accuracy_score(y_test, y_pred))
            fold_scores["f1_list"].append(f1_score(y_test, y_pred, average='weighted'))
            fold_scores["confusion_matrices"].append(
                confusion_matrix(y_test, y_pred, labels=class_labels)
            )

    # Collapse the per-fold lists into mean/std summaries and one pooled matrix.
    for name, fold_scores in results.items():
        acc_list = fold_scores["acc_list"]
        bal_acc_list = fold_scores["bal_acc_list"]
        f1_list = fold_scores["f1_list"]

        fold_scores["metrics"] = (
            np.mean(acc_list), np.std(acc_list),
            np.mean(bal_acc_list), np.std(bal_acc_list),
            np.mean(f1_list), np.std(f1_list),
        )
        fold_scores["confusion_matrix"] = np.sum(fold_scores["confusion_matrices"], axis=0)

    return results, selected_features_per_fold


## PD vs CO

In [None]:
# Drop every row labelled 2, leaving the two classes compared in this section.
df1 = df.loc[df['label'] != 2]

In [None]:
# Feature matrix and target for the two-class subset built above.
X = df1.drop(columns=['label'])
y = df1['label']

# Models to compare. The stochastic ones get a fixed seed so that re-running
# the notebook reproduces the reported metrics.
classifiers = {
    'SVM rbf': SVC(),
    'SVM linear': SVC(kernel='linear'),
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1)
}


In [None]:
# Without PCA: 5-fold stratified outer CV over the current X / y.
kf = StratifiedKFold(5, shuffle=True, random_state=42)
results, selected_features_per_fold = evaluate_classifiers_with_rfecv(X, y, classifiers, kf)

# Print the cross-validated summary of every classifier.
for name in results:
    result = results[name]
    confusion_matrix_all = result["confusion_matrix"]
    (mean_acc, std_acc,
     mean_bal_acc, std_bal_acc,
     mean_f1, std_f1) = result["metrics"]

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")



--- Fold 1 ---





--- Fold 2 ---





--- Fold 3 ---





--- Fold 4 ---





--- Fold 5 ---
SVM rbf:

Acurácia média: 75.61%, Desvio padrão: 6.32%
Acurácia balanceada média: 75.45%, Desvio padrão: 6.48%
F-score médio: 75.43%, Desvio padrão: 6.39%

Matriz de Confusão:
[[52 18]
 [19 63]]

SVM linear:

Acurácia média: 78.26%, Desvio padrão: 6.25%
Acurácia balanceada média: 77.70%, Desvio padrão: 6.54%
F-score médio: 77.93%, Desvio padrão: 6.55%

Matriz de Confusão:
[[50 20]
 [13 69]]

KNN:

Acurácia média: 66.99%, Desvio padrão: 7.95%
Acurácia balanceada média: 64.95%, Desvio padrão: 7.91%
F-score médio: 63.77%, Desvio padrão: 9.98%

Matriz de Confusão:
[[27 43]
 [ 7 75]]

LDA:

Acurácia média: 65.20%, Desvio padrão: 6.33%
Acurácia balanceada média: 65.03%, Desvio padrão: 6.52%
F-score médio: 64.79%, Desvio padrão: 6.34%

Matriz de Confusão:
[[44 26]
 [27 55]]

Decision Tree:

Acurácia média: 57.94%, Desvio padrão: 4.44%
Acurácia balanceada média: 57.39%, Desvio padrão: 4.44%
F-score médio: 57.48%, Desvio padrão: 4.26%

Matriz de Confusão:
[[35 35]
 [29 53]]

Nai



In [None]:
# Write the feature names selected in each fold to <path>/RFE.
# X must still be the feature frame used by the run that produced the masks.
rfe_path = os.path.join(path, "RFE")
os.makedirs(rfe_path, exist_ok=True)
output_path = os.path.join(rfe_path, "selected_features_PDvsCO.txt")

with open(output_path, "w") as f:
    for i, selected_features in enumerate(selected_features_per_fold):
        fold_header = f"Fold {i + 1}: Features selecionadas:\n"
        # Index the column labels with this fold's selection mask.
        fold_features = ", ".join(X.columns[selected_features]) + "\n\n"
        f.write(fold_header + fold_features)

In [None]:
# With PCA: same outer CV, using the RFECV + PCA evaluation.
kf = StratifiedKFold(5, shuffle=True, random_state=42)
results, selected_features_per_fold = evaluate_classifiers_with_rfecv_PCA(X, y, classifiers, kf)

# Print the cross-validated summary of every classifier.
for name in results:
    result = results[name]
    confusion_matrix_all = result["confusion_matrix"]
    (mean_acc, std_acc,
     mean_bal_acc, std_bal_acc,
     mean_f1, std_f1) = result["metrics"]

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")



--- Fold 1 ---





--- Fold 2 ---





--- Fold 3 ---





--- Fold 4 ---





--- Fold 5 ---
SVM rbf:

Acurácia média: 74.32%, Desvio padrão: 3.98%
Acurácia balanceada média: 73.72%, Desvio padrão: 4.13%
F-score médio: 73.98%, Desvio padrão: 4.03%

Matriz de Confusão:
[[47 23]
 [16 66]]

SVM linear:

Acurácia média: 79.57%, Desvio padrão: 7.46%
Acurácia balanceada média: 79.13%, Desvio padrão: 7.84%
F-score médio: 79.27%, Desvio padrão: 7.78%

Matriz de Confusão:
[[52 18]
 [13 69]]

KNN:

Acurácia média: 65.68%, Desvio padrão: 9.62%
Acurácia balanceada média: 63.64%, Desvio padrão: 9.93%
F-score médio: 61.46%, Desvio padrão: 14.18%

Matriz de Confusão:
[[26 44]
 [ 8 74]]

LDA:

Acurácia média: 75.63%, Desvio padrão: 5.50%
Acurácia balanceada média: 75.09%, Desvio padrão: 5.65%
F-score médio: 75.33%, Desvio padrão: 5.76%

Matriz de Confusão:
[[48 22]
 [15 67]]

Decision Tree:

Acurácia média: 65.12%, Desvio padrão: 5.41%
Acurácia balanceada média: 65.05%, Desvio padrão: 6.13%
F-score médio: 64.80%, Desvio padrão: 5.58%

Matriz de Confusão:
[[45 25]
 [28 54]]

Na



In [None]:
# Write the feature names selected in each fold (PCA run) to <path>/RFE.
# X must still be the feature frame used by the run that produced the masks.
rfe_path = os.path.join(path, "RFE")
os.makedirs(rfe_path, exist_ok=True)
output_path = os.path.join(rfe_path, "selected_features_PDvsCO_PCA.txt")

with open(output_path, "w") as f:
    for i, selected_features in enumerate(selected_features_per_fold):
        fold_header = f"Fold {i + 1}: Features selecionadas:\n"
        # Index the column labels with this fold's selection mask.
        fold_features = ", ".join(X.columns[selected_features]) + "\n\n"
        f.write(fold_header + fold_features)

### Aceleração

In [None]:
def filter_dataframe_r(df, wrists, tasks, sensors, axes, features, remove):
    """Select a sensor/feature subset of ``df`` and drop one label class.

    Column names are expected to follow the pattern
    ``{task}_{wrist}_{sensor}_{axis}_{feature}``. Every combination of the
    given values is generated, and only the columns of ``df`` that match one of
    them (plus ``'label'``) are kept, in their original order.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; must contain a ``'label'`` column.
    wrists, tasks, sensors, axes, features : iterable of str
        Allowed values for each component of the column name.
    remove : scalar
        Rows whose ``'label'`` equals this value are dropped.

    Returns
    -------
    pandas.DataFrame
        An independent copy, so callers can assign new columns (for example a
        relabelled ``'label'``) without a SettingWithCopyWarning and without
        affecting ``df``.
    """
    valid_columns = {
        f"{task}_{wrist}_{sensor}_{axis}_{feature}"
        for task in tasks
        for wrist in wrists
        for sensor in sensors
        for axis in axes
        for feature in features
    }
    valid_columns.add('label')

    # Iterate over df.columns (not the set) to preserve the column order.
    filtered_columns = [col for col in df.columns if col in valid_columns]

    # Column and row selection in one step, then detach from the source frame.
    return df.loc[df['label'] != remove, filtered_columns].copy()


In [None]:
# Column-name components for the accelerometer-only experiment.
exp_wrists = ["Left", "Right"]
exp_tasks = [
    "Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2",
    "StretchHold", "HoldWeight", "DrinkGlas", "CrossArms",
    "TouchNose", "Entrainment1", "Entrainment2",
]
exp_sensors = ["Accelerometer"]
exp_axes = ["X", "Y", "Z"]
exp_features = [
    "abs_energy", "absolute_maximum", "absolute_sum_of_changes",
    "count_above_mean", "count_below_mean", "kurtosis",
    "length", "maximum", "mean",
    "mean_abs_change", "mean_change", "median",
    "minimum", "root_mean_square", "skewness",
    "standard_deviation", "sum_values", "variance",
]

# Keep only the matching columns and drop the rows labelled 2.
df1_ac = filter_dataframe_r(
    df,
    wrists=exp_wrists,
    tasks=exp_tasks,
    sensors=exp_sensors,
    axes=exp_axes,
    features=exp_features,
    remove=2,
)


In [None]:
# Without PCA, accelerometer features only.
y = df1_ac['label']
X = df1_ac.drop(columns=['label'])
kf = StratifiedKFold(5, shuffle=True, random_state=42)

results, selected_features_per_fold = evaluate_classifiers_with_rfecv(X, y, classifiers, kf)

# Print the cross-validated summary of every classifier.
for name in results:
    result = results[name]
    confusion_matrix_all = result["confusion_matrix"]
    (mean_acc, std_acc,
     mean_bal_acc, std_bal_acc,
     mean_f1, std_f1) = result["metrics"]

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")



--- Fold 1 ---





--- Fold 2 ---





--- Fold 3 ---





--- Fold 4 ---





--- Fold 5 ---
SVM rbf:

Acurácia média: 67.10%, Desvio padrão: 9.43%
Acurácia balanceada média: 66.31%, Desvio padrão: 9.37%
F-score médio: 66.14%, Desvio padrão: 9.19%

Matriz de Confusão:
[[41 29]
 [21 61]]

SVM linear:

Acurácia média: 67.76%, Desvio padrão: 12.31%
Acurácia balanceada média: 67.13%, Desvio padrão: 12.42%
F-score médio: 67.45%, Desvio padrão: 12.40%

Matriz de Confusão:
[[42 28]
 [21 61]]

KNN:

Acurácia média: 66.49%, Desvio padrão: 9.13%
Acurácia balanceada média: 65.03%, Desvio padrão: 9.09%
F-score médio: 64.51%, Desvio padrão: 9.82%

Matriz de Confusão:
[[32 38]
 [13 69]]

LDA:

Acurácia média: 63.87%, Desvio padrão: 8.63%
Acurácia balanceada média: 63.91%, Desvio padrão: 8.79%
F-score médio: 63.65%, Desvio padrão: 8.67%

Matriz de Confusão:
[[45 25]
 [30 52]]

Decision Tree:

Acurácia média: 61.18%, Desvio padrão: 11.79%
Acurácia balanceada média: 61.01%, Desvio padrão: 12.48%
F-score médio: 60.54%, Desvio padrão: 12.35%

Matriz de Confusão:
[[41 29]
 [30 52]



In [None]:
# Write the feature names selected in each fold (accelerometer run) to <path>/RFE.
# X must still be the feature frame used by the run that produced the masks.
rfe_path = os.path.join(path, "RFE")
os.makedirs(rfe_path, exist_ok=True)
output_path = os.path.join(rfe_path, "selected_features_PDvsCO_ac.txt")

with open(output_path, "w") as f:
    for i, selected_features in enumerate(selected_features_per_fold):
        fold_header = f"Fold {i + 1}: Features selecionadas:\n"
        # Index the column labels with this fold's selection mask.
        fold_features = ", ".join(X.columns[selected_features]) + "\n\n"
        f.write(fold_header + fold_features)

In [None]:
# With PCA, accelerometer features only.
y = df1_ac['label']
X = df1_ac.drop(columns=['label'])
kf = StratifiedKFold(5, shuffle=True, random_state=42)

results, selected_features_per_fold = evaluate_classifiers_with_rfecv_PCA(X, y, classifiers, kf)

# Print the cross-validated summary of every classifier.
for name in results:
    result = results[name]
    confusion_matrix_all = result["confusion_matrix"]
    (mean_acc, std_acc,
     mean_bal_acc, std_bal_acc,
     mean_f1, std_f1) = result["metrics"]

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")



--- Fold 1 ---





--- Fold 2 ---





--- Fold 3 ---





--- Fold 4 ---





--- Fold 5 ---
SVM rbf:

Acurácia média: 68.41%, Desvio padrão: 12.71%
Acurácia balanceada média: 67.68%, Desvio padrão: 12.35%
F-score médio: 67.93%, Desvio padrão: 12.48%

Matriz de Confusão:
[[42 28]
 [20 62]]

SVM linear:

Acurácia média: 65.78%, Desvio padrão: 12.40%
Acurácia balanceada média: 64.99%, Desvio padrão: 12.45%
F-score médio: 65.27%, Desvio padrão: 12.40%

Matriz de Confusão:
[[39 31]
 [21 61]]

KNN:

Acurácia média: 63.89%, Desvio padrão: 9.51%
Acurácia balanceada média: 62.04%, Desvio padrão: 10.16%
F-score médio: 60.83%, Desvio padrão: 11.38%

Matriz de Confusão:
[[27 43]
 [12 70]]

LDA:

Acurácia média: 67.10%, Desvio padrão: 16.97%
Acurácia balanceada média: 67.06%, Desvio padrão: 17.18%
F-score médio: 67.04%, Desvio padrão: 16.98%

Matriz de Confusão:
[[47 23]
 [27 55]]

Decision Tree:

Acurácia média: 67.03%, Desvio padrão: 7.81%
Acurácia balanceada média: 66.63%, Desvio padrão: 8.38%
F-score médio: 66.44%, Desvio padrão: 8.77%

Matriz de Confusão:
[[43 27]
 [2



In [None]:
# Write the feature names selected in each fold (accelerometer + PCA run) to <path>/RFE.
# X must still be the feature frame used by the run that produced the masks.
rfe_path = os.path.join(path, "RFE")
os.makedirs(rfe_path, exist_ok=True)
output_path = os.path.join(rfe_path, "selected_features_PDvsCO_PCA_ac.txt")

with open(output_path, "w") as f:
    for i, selected_features in enumerate(selected_features_per_fold):
        fold_header = f"Fold {i + 1}: Features selecionadas:\n"
        # Index the column labels with this fold's selection mask.
        fold_features = ", ".join(X.columns[selected_features]) + "\n\n"
        f.write(fold_header + fold_features)

### Rotação

In [None]:
# Column-name components for the gyroscope-only experiment.
exp_wrists = ["Left", "Right"]
exp_tasks = [
    "Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2",
    "StretchHold", "HoldWeight", "DrinkGlas", "CrossArms",
    "TouchNose", "Entrainment1", "Entrainment2",
]
exp_sensors = ["Gyroscope"]
exp_axes = ["X", "Y", "Z"]
exp_features = [
    "abs_energy", "absolute_maximum", "absolute_sum_of_changes",
    "count_above_mean", "count_below_mean", "kurtosis",
    "length", "maximum", "mean",
    "mean_abs_change", "mean_change", "median",
    "minimum", "root_mean_square", "skewness",
    "standard_deviation", "sum_values", "variance",
]

# Keep only the matching columns and drop the rows labelled 2.
df1_ro = filter_dataframe_r(
    df,
    wrists=exp_wrists,
    tasks=exp_tasks,
    sensors=exp_sensors,
    axes=exp_axes,
    features=exp_features,
    remove=2,
)


In [None]:
# Without PCA, gyroscope features only.
y = df1_ro['label']
X = df1_ro.drop(columns=['label'])
kf = StratifiedKFold(5, shuffle=True, random_state=42)

results, selected_features_per_fold = evaluate_classifiers_with_rfecv(X, y, classifiers, kf)

# Print the cross-validated summary of every classifier.
for name in results:
    result = results[name]
    confusion_matrix_all = result["confusion_matrix"]
    (mean_acc, std_acc,
     mean_bal_acc, std_bal_acc,
     mean_f1, std_f1) = result["metrics"]

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")



--- Fold 1 ---





--- Fold 2 ---





--- Fold 3 ---





--- Fold 4 ---





--- Fold 5 ---
SVM rbf:

Acurácia média: 74.95%, Desvio padrão: 7.32%
Acurácia balanceada média: 74.88%, Desvio padrão: 6.86%
F-score médio: 74.48%, Desvio padrão: 7.70%

Matriz de Confusão:
[[53 17]
 [21 61]]

SVM linear:

Acurácia média: 74.99%, Desvio padrão: 1.72%
Acurácia balanceada média: 74.22%, Desvio padrão: 1.17%
F-score médio: 74.39%, Desvio padrão: 1.39%

Matriz de Confusão:
[[46 24]
 [14 68]]

KNN:

Acurácia média: 69.01%, Desvio padrão: 6.36%
Acurácia balanceada média: 67.14%, Desvio padrão: 6.16%
F-score médio: 66.33%, Desvio padrão: 7.16%

Matriz de Confusão:
[[31 39]
 [ 8 74]]

LDA:

Acurácia média: 72.32%, Desvio padrão: 4.74%
Acurácia balanceada média: 72.56%, Desvio padrão: 4.56%
F-score médio: 72.00%, Desvio padrão: 4.94%

Matriz de Confusão:
[[53 17]
 [25 57]]

Decision Tree:

Acurácia média: 59.27%, Desvio padrão: 6.42%
Acurácia balanceada média: 59.05%, Desvio padrão: 6.63%
F-score médio: 59.19%, Desvio padrão: 6.51%

Matriz de Confusão:
[[40 30]
 [32 50]]

Nai



In [None]:
# Write the feature names selected in each fold (gyroscope run) to <path>/RFE.
# X must still be the feature frame used by the run that produced the masks.
rfe_path = os.path.join(path, "RFE")
os.makedirs(rfe_path, exist_ok=True)
output_path = os.path.join(rfe_path, "selected_features_PDvsCO_ro.txt")

with open(output_path, "w") as f:
    for i, selected_features in enumerate(selected_features_per_fold):
        fold_header = f"Fold {i + 1}: Features selecionadas:\n"
        # Index the column labels with this fold's selection mask.
        fold_features = ", ".join(X.columns[selected_features]) + "\n\n"
        f.write(fold_header + fold_features)

In [None]:
# With PCA, gyroscope features only.
y = df1_ro['label']
X = df1_ro.drop(columns=['label'])
kf = StratifiedKFold(5, shuffle=True, random_state=42)

results, selected_features_per_fold = evaluate_classifiers_with_rfecv_PCA(X, y, classifiers, kf)

# Print the cross-validated summary of every classifier.
for name in results:
    result = results[name]
    confusion_matrix_all = result["confusion_matrix"]
    (mean_acc, std_acc,
     mean_bal_acc, std_bal_acc,
     mean_f1, std_f1) = result["metrics"]

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")



--- Fold 1 ---





--- Fold 2 ---





--- Fold 3 ---





--- Fold 4 ---





--- Fold 5 ---
SVM rbf:

Acurácia média: 74.99%, Desvio padrão: 5.02%
Acurácia balanceada média: 74.61%, Desvio padrão: 5.18%
F-score médio: 74.23%, Desvio padrão: 5.40%

Matriz de Confusão:
[[50 20]
 [18 64]]

SVM linear:

Acurácia média: 73.66%, Desvio padrão: 3.81%
Acurácia balanceada média: 72.88%, Desvio padrão: 3.67%
F-score médio: 73.13%, Desvio padrão: 3.60%

Matriz de Confusão:
[[45 25]
 [15 67]]

KNN:

Acurácia média: 66.39%, Desvio padrão: 7.48%
Acurácia balanceada média: 64.41%, Desvio padrão: 7.67%
F-score médio: 63.20%, Desvio padrão: 9.23%

Matriz de Confusão:
[[28 42]
 [ 9 73]]

LDA:

Acurácia média: 76.32%, Desvio padrão: 2.38%
Acurácia balanceada média: 75.56%, Desvio padrão: 2.25%
F-score médio: 75.86%, Desvio padrão: 2.29%

Matriz de Confusão:
[[47 23]
 [13 69]]

Decision Tree:

Acurácia média: 66.41%, Desvio padrão: 7.63%
Acurácia balanceada média: 66.19%, Desvio padrão: 7.16%
F-score médio: 66.23%, Desvio padrão: 7.40%

Matriz de Confusão:
[[45 25]
 [26 56]]

Nai



In [None]:
# Write the feature names selected in each fold (gyroscope + PCA run) to <path>/RFE.
# X must still be the feature frame used by the run that produced the masks.
rfe_path = os.path.join(path, "RFE")
os.makedirs(rfe_path, exist_ok=True)
output_path = os.path.join(rfe_path, "selected_features_PDvsCO_PCA_ro.txt")

with open(output_path, "w") as f:
    for i, selected_features in enumerate(selected_features_per_fold):
        fold_header = f"Fold {i + 1}: Features selecionadas:\n"
        # Index the column labels with this fold's selection mask.
        fold_features = ", ".join(X.columns[selected_features]) + "\n\n"
        f.write(fold_header + fold_features)

## PD vs DD

In [None]:
# Drop the rows labelled 0 and take an explicit copy, so the relabelling below
# writes to an independent frame instead of a slice of df (the original form
# raised SettingWithCopyWarning).
df2 = df[df['label'] != 0].copy()
# Recode label 2 as 0 so this comparison is again a 0/1 problem.
df2['label'] = df2['label'].replace(2, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['label'] = df2['label'].replace(2, 0)


In [None]:
# Feature matrix and target for the relabelled subset built above.
X = df2.drop(columns=['label'])
y = df2['label']

# Models to compare. The stochastic ones get a fixed seed so that re-running
# the notebook reproduces the reported metrics.
classifiers = {
    'SVM rbf': SVC(),
    'SVM linear': SVC(kernel='linear'),
    'KNN': KNeighborsClassifier(),
    'LDA': LinearDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Extra Trees': ExtraTreesClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42, verbose=-1)
}


In [None]:
# Without PCA: 5-fold stratified outer CV over the current X / y.
kf = StratifiedKFold(5, shuffle=True, random_state=42)
results, selected_features_per_fold = evaluate_classifiers_with_rfecv(X, y, classifiers, kf)

# Print the cross-validated summary of every classifier.
for name in results:
    result = results[name]
    confusion_matrix_all = result["confusion_matrix"]
    (mean_acc, std_acc,
     mean_bal_acc, std_bal_acc,
     mean_f1, std_f1) = result["metrics"]

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")



--- Fold 1 ---





--- Fold 2 ---





--- Fold 3 ---





--- Fold 4 ---





--- Fold 5 ---




SVM rbf:

Acurácia média: 59.75%, Desvio padrão: 5.87%
Acurácia balanceada média: 59.74%, Desvio padrão: 5.82%
F-score médio: 59.65%, Desvio padrão: 5.90%

Matriz de Confusão:
[[49 33]
 [33 49]]

SVM linear:

Acurácia média: 61.55%, Desvio padrão: 6.99%
Acurácia balanceada média: 61.51%, Desvio padrão: 7.23%
F-score médio: 60.95%, Desvio padrão: 7.17%

Matriz de Confusão:
[[46 36]
 [27 55]]

KNN:

Acurácia média: 62.23%, Desvio padrão: 4.65%
Acurácia balanceada média: 62.13%, Desvio padrão: 5.11%
F-score médio: 60.42%, Desvio padrão: 5.73%

Matriz de Confusão:
[[37 45]
 [17 65]]

LDA:

Acurácia média: 46.93%, Desvio padrão: 5.44%
Acurácia balanceada média: 46.80%, Desvio padrão: 5.52%
F-score médio: 46.62%, Desvio padrão: 5.56%

Matriz de Confusão:
[[35 47]
 [40 42]]

Decision Tree:

Acurácia média: 59.70%, Desvio padrão: 6.88%
Acurácia balanceada média: 59.56%, Desvio padrão: 7.08%
F-score médio: 58.79%, Desvio padrão: 7.68%

Matriz de Confusão:
[[44 38]
 [28 54]]

Naive Bayes:

Acurá



In [None]:
# Write the feature names selected in each fold (PD vs DD run) to <path>/RFE.
# X must still be the feature frame used by the run that produced the masks.
rfe_path = os.path.join(path, "RFE")
os.makedirs(rfe_path, exist_ok=True)
output_path = os.path.join(rfe_path, "selected_features_PDvsDD.txt")

with open(output_path, "w") as f:
    for i, selected_features in enumerate(selected_features_per_fold):
        fold_header = f"Fold {i + 1}: Features selecionadas:\n"
        # Index the column labels with this fold's selection mask.
        fold_features = ", ".join(X.columns[selected_features]) + "\n\n"
        f.write(fold_header + fold_features)

In [None]:
# With PCA: same outer CV, using the RFECV + PCA evaluation.
kf = StratifiedKFold(5, shuffle=True, random_state=42)
results, selected_features_per_fold = evaluate_classifiers_with_rfecv_PCA(X, y, classifiers, kf)

# Print the cross-validated summary of every classifier.
for name in results:
    result = results[name]
    confusion_matrix_all = result["confusion_matrix"]
    (mean_acc, std_acc,
     mean_bal_acc, std_bal_acc,
     mean_f1, std_f1) = result["metrics"]

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")



--- Fold 1 ---





--- Fold 2 ---





--- Fold 3 ---





--- Fold 4 ---





--- Fold 5 ---
SVM rbf:

Acurácia média: 62.16%, Desvio padrão: 7.01%
Acurácia balanceada média: 62.13%, Desvio padrão: 7.11%
F-score médio: 61.73%, Desvio padrão: 7.12%

Matriz de Confusão:
[[44 38]
 [24 58]]

SVM linear:

Acurácia média: 60.91%, Desvio padrão: 11.67%
Acurácia balanceada média: 60.92%, Desvio padrão: 11.89%
F-score médio: 60.48%, Desvio padrão: 11.76%

Matriz de Confusão:
[[46 36]
 [28 54]]

KNN:

Acurácia média: 62.82%, Desvio padrão: 5.76%
Acurácia balanceada média: 62.76%, Desvio padrão: 6.30%
F-score médio: 60.82%, Desvio padrão: 7.18%

Matriz de Confusão:
[[38 44]
 [17 65]]

LDA:

Acurácia média: 59.09%, Desvio padrão: 9.58%
Acurácia balanceada média: 59.08%, Desvio padrão: 9.82%
F-score médio: 58.54%, Desvio padrão: 9.63%

Matriz de Confusão:
[[45 37]
 [30 52]]

Decision Tree:

Acurácia média: 50.55%, Desvio padrão: 9.23%
Acurácia balanceada média: 50.59%, Desvio padrão: 9.38%
F-score médio: 50.30%, Desvio padrão: 9.12%

Matriz de Confusão:
[[38 44]
 [37 45]]





In [None]:
# Write the feature names selected in each fold (PD vs DD, PCA run) to <path>/RFE.
# X must still be the feature frame used by the run that produced the masks.
rfe_path = os.path.join(path, "RFE")
os.makedirs(rfe_path, exist_ok=True)
output_path = os.path.join(rfe_path, "selected_features_PDvsDD_PCA.txt")

with open(output_path, "w") as f:
    for i, selected_features in enumerate(selected_features_per_fold):
        fold_header = f"Fold {i + 1}: Features selecionadas:\n"
        # Index the column labels with this fold's selection mask.
        fold_features = ", ".join(X.columns[selected_features]) + "\n\n"
        f.write(fold_header + fold_features)

### Aceleração

In [None]:
# Column-name components for the accelerometer-only experiment.
exp_wrists = ["Left", "Right"]

exp_tasks = ["Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2", "StretchHold", "HoldWeight",
             "DrinkGlas", "CrossArms", "TouchNose", "Entrainment1", "Entrainment2"]

exp_sensors = ["Accelerometer"]

exp_axes = ["X", "Y", "Z"]

exp_features = ["abs_energy", "absolute_maximum", "absolute_sum_of_changes", "count_above_mean",
                "count_below_mean", "kurtosis", "length", "maximum", "mean",
                "mean_abs_change", "mean_change", "median", "minimum", "root_mean_square",
                "skewness", "standard_deviation", "sum_values", "variance"]

# Keep only the matching columns and drop the rows labelled 0.
df2_ac = filter_dataframe_r(df, exp_wrists, exp_tasks, exp_sensors, exp_axes, exp_features, remove=0)
# Recode label 2 as 0. assign() builds a new frame, so nothing is written into
# a filtered slice of df (which would raise SettingWithCopyWarning).
df2_ac = df2_ac.assign(label=df2_ac['label'].replace(2, 0))


In [None]:
# Without PCA, accelerometer features only.
y = df2_ac['label']
X = df2_ac.drop(columns=['label'])
kf = StratifiedKFold(5, shuffle=True, random_state=42)

results, selected_features_per_fold = evaluate_classifiers_with_rfecv(X, y, classifiers, kf)

# Print the cross-validated summary of every classifier.
for name in results:
    result = results[name]
    confusion_matrix_all = result["confusion_matrix"]
    (mean_acc, std_acc,
     mean_bal_acc, std_bal_acc,
     mean_f1, std_f1) = result["metrics"]

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")



--- Fold 1 ---





--- Fold 2 ---





--- Fold 3 ---





--- Fold 4 ---





--- Fold 5 ---
SVM rbf:

Acurácia média: 59.77%, Desvio padrão: 6.36%
Acurácia balanceada média: 59.78%, Desvio padrão: 6.60%
F-score médio: 59.43%, Desvio padrão: 6.49%

Matriz de Confusão:
[[44 38]
 [28 54]]

SVM linear:

Acurácia média: 54.85%, Desvio padrão: 6.09%
Acurácia balanceada média: 54.93%, Desvio padrão: 6.38%
F-score médio: 54.21%, Desvio padrão: 5.85%

Matriz de Confusão:
[[36 46]
 [28 54]]

KNN:

Acurácia média: 57.94%, Desvio padrão: 2.05%
Acurácia balanceada média: 58.09%, Desvio padrão: 2.05%
F-score médio: 57.22%, Desvio padrão: 2.32%

Matriz de Confusão:
[[41 41]
 [28 54]]

LDA:

Acurácia média: 55.45%, Desvio padrão: 6.18%
Acurácia balanceada média: 55.55%, Desvio padrão: 6.43%
F-score médio: 54.91%, Desvio padrão: 6.01%

Matriz de Confusão:
[[38 44]
 [29 53]]

Decision Tree:

Acurácia média: 58.56%, Desvio padrão: 5.44%
Acurácia balanceada média: 58.68%, Desvio padrão: 5.30%
F-score médio: 58.19%, Desvio padrão: 5.55%

Matriz de Confusão:
[[47 35]
 [33 49]]

Nai



In [None]:
# Write the feature names selected in each fold (PD vs DD, accelerometer run) to <path>/RFE.
# X must still be the feature frame used by the run that produced the masks.
rfe_path = os.path.join(path, "RFE")
os.makedirs(rfe_path, exist_ok=True)
output_path = os.path.join(rfe_path, "selected_features_PDvsDD_ac.txt")

with open(output_path, "w") as f:
    for i, selected_features in enumerate(selected_features_per_fold):
        fold_header = f"Fold {i + 1}: Features selecionadas:\n"
        # Index the column labels with this fold's selection mask.
        fold_features = ", ".join(X.columns[selected_features]) + "\n\n"
        f.write(fold_header + fold_features)

In [None]:
# With PCA, accelerometer features only.
y = df2_ac['label']
X = df2_ac.drop(columns=['label'])
kf = StratifiedKFold(5, shuffle=True, random_state=42)

results, selected_features_per_fold = evaluate_classifiers_with_rfecv_PCA(X, y, classifiers, kf)

# Print the cross-validated summary of every classifier.
for name in results:
    result = results[name]
    confusion_matrix_all = result["confusion_matrix"]
    (mean_acc, std_acc,
     mean_bal_acc, std_bal_acc,
     mean_f1, std_f1) = result["metrics"]

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")



--- Fold 1 ---





--- Fold 2 ---





--- Fold 3 ---





--- Fold 4 ---





--- Fold 5 ---
SVM rbf:

Acurácia média: 60.98%, Desvio padrão: 5.80%
Acurácia balanceada média: 60.96%, Desvio padrão: 6.04%
F-score médio: 60.57%, Desvio padrão: 6.00%

Matriz de Confusão:
[[44 38]
 [26 56]]

SVM linear:

Acurácia média: 59.15%, Desvio padrão: 4.07%
Acurácia balanceada média: 59.12%, Desvio padrão: 4.31%
F-score médio: 58.13%, Desvio padrão: 4.12%

Matriz de Confusão:
[[37 45]
 [22 60]]

KNN:

Acurácia média: 58.58%, Desvio padrão: 4.18%
Acurácia balanceada média: 58.71%, Desvio padrão: 4.24%
F-score médio: 57.71%, Desvio padrão: 4.53%

Matriz de Confusão:
[[40 42]
 [26 56]]

LDA:

Acurácia média: 60.36%, Desvio padrão: 5.77%
Acurácia balanceada média: 60.33%, Desvio padrão: 5.81%
F-score médio: 59.44%, Desvio padrão: 5.66%

Matriz de Confusão:
[[39 43]
 [22 60]]

Decision Tree:

Acurácia média: 57.90%, Desvio padrão: 7.44%
Acurácia balanceada média: 58.05%, Desvio padrão: 7.49%
F-score médio: 57.63%, Desvio padrão: 7.42%

Matriz de Confusão:
[[45 37]
 [32 50]]

Nai



In [None]:
# Write the feature names selected in each fold (PD vs DD, accelerometer + PCA run) to <path>/RFE.
# X must still be the feature frame used by the run that produced the masks.
rfe_path = os.path.join(path, "RFE")
os.makedirs(rfe_path, exist_ok=True)
output_path = os.path.join(rfe_path, "selected_features_PDvsDD_PCA_ac.txt")

with open(output_path, "w") as f:
    for i, selected_features in enumerate(selected_features_per_fold):
        fold_header = f"Fold {i + 1}: Features selecionadas:\n"
        # Index the column labels with this fold's selection mask.
        fold_features = ", ".join(X.columns[selected_features]) + "\n\n"
        f.write(fold_header + fold_features)

### Rotação

In [None]:
# Column-name components for the gyroscope-only experiment.
exp_wrists = ["Left", "Right"]

exp_tasks = ["Relaxed1", "Relaxed2", "RelaxedTask1", "RelaxedTask2", "StretchHold", "HoldWeight",
             "DrinkGlas", "CrossArms", "TouchNose", "Entrainment1", "Entrainment2"]

exp_sensors = ["Gyroscope"]

exp_axes = ["X", "Y", "Z"]

exp_features = ["abs_energy", "absolute_maximum", "absolute_sum_of_changes", "count_above_mean",
                "count_below_mean", "kurtosis", "length", "maximum", "mean",
                "mean_abs_change", "mean_change", "median", "minimum", "root_mean_square",
                "skewness", "standard_deviation", "sum_values", "variance"]

# Keep only the matching columns and drop the rows labelled 0.
df2_ro = filter_dataframe_r(df, exp_wrists, exp_tasks, exp_sensors, exp_axes, exp_features, remove=0)
# Recode label 2 as 0 using this frame's own labels. The previous version read
# them from df2_ac, which only worked via index alignment and required the
# accelerometer cell to have been run first. assign() also avoids writing into
# a filtered slice of df.
df2_ro = df2_ro.assign(label=df2_ro['label'].replace(2, 0))


In [None]:
# Without PCA, gyroscope features only.
y = df2_ro['label']
X = df2_ro.drop(columns=['label'])
kf = StratifiedKFold(5, shuffle=True, random_state=42)

results, selected_features_per_fold = evaluate_classifiers_with_rfecv(X, y, classifiers, kf)

# Print the cross-validated summary of every classifier.
for name in results:
    result = results[name]
    confusion_matrix_all = result["confusion_matrix"]
    (mean_acc, std_acc,
     mean_bal_acc, std_bal_acc,
     mean_f1, std_f1) = result["metrics"]

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")



--- Fold 1 ---





--- Fold 2 ---





--- Fold 3 ---





--- Fold 4 ---





--- Fold 5 ---




SVM rbf:

Acurácia média: 57.97%, Desvio padrão: 5.90%
Acurácia balanceada média: 57.87%, Desvio padrão: 5.89%
F-score médio: 57.28%, Desvio padrão: 5.57%

Matriz de Confusão:
[[51 31]
 [38 44]]

SVM linear:

Acurácia média: 57.33%, Desvio padrão: 3.13%
Acurácia balanceada média: 57.06%, Desvio padrão: 2.86%
F-score médio: 56.22%, Desvio padrão: 2.85%

Matriz de Confusão:
[[55 27]
 [43 39]]

KNN:

Acurácia média: 58.58%, Desvio padrão: 4.98%
Acurácia balanceada média: 58.53%, Desvio padrão: 4.78%
F-score médio: 57.58%, Desvio padrão: 5.26%

Matriz de Confusão:
[[41 41]
 [27 55]]

LDA:

Acurácia média: 53.05%, Desvio padrão: 7.04%
Acurácia balanceada média: 52.79%, Desvio padrão: 6.72%
F-score médio: 51.84%, Desvio padrão: 6.73%

Matriz de Confusão:
[[47 35]
 [42 40]]

Decision Tree:

Acurácia média: 56.72%, Desvio padrão: 2.70%
Acurácia balanceada média: 56.58%, Desvio padrão: 2.68%
F-score médio: 56.30%, Desvio padrão: 2.75%

Matriz de Confusão:
[[46 36]
 [35 47]]

Naive Bayes:

Acurá



In [None]:
# Write the feature names selected in each fold (PD vs DD, gyroscope run) to <path>/RFE.
# X must still be the feature frame used by the run that produced the masks.
rfe_path = os.path.join(path, "RFE")
os.makedirs(rfe_path, exist_ok=True)
output_path = os.path.join(rfe_path, "selected_features_PDvsDD_ro.txt")

with open(output_path, "w") as f:
    for i, selected_features in enumerate(selected_features_per_fold):
        fold_header = f"Fold {i + 1}: Features selecionadas:\n"
        # Index the column labels with this fold's selection mask.
        fold_features = ", ".join(X.columns[selected_features]) + "\n\n"
        f.write(fold_header + fold_features)

In [None]:
# With PCA, gyroscope features only.
y = df2_ro['label']
X = df2_ro.drop(columns=['label'])
kf = StratifiedKFold(5, shuffle=True, random_state=42)

results, selected_features_per_fold = evaluate_classifiers_with_rfecv_PCA(X, y, classifiers, kf)

# Print the cross-validated summary of every classifier.
for name in results:
    result = results[name]
    confusion_matrix_all = result["confusion_matrix"]
    (mean_acc, std_acc,
     mean_bal_acc, std_bal_acc,
     mean_f1, std_f1) = result["metrics"]

    print(f"{name}:\n")
    print(f"Acurácia média: {mean_acc:.2%}, Desvio padrão: {std_acc:.2%}")
    print(f"Acurácia balanceada média: {mean_bal_acc:.2%}, Desvio padrão: {std_bal_acc:.2%}")
    print(f"F-score médio: {mean_f1:.2%}, Desvio padrão: {std_f1:.2%}\n")
    print(f"Matriz de Confusão:\n{confusion_matrix_all}\n")


In [None]:
# Write the feature names selected in each fold (PD vs DD, gyroscope + PCA run) to <path>/RFE.
# X must still be the feature frame used by the run that produced the masks.
rfe_path = os.path.join(path, "RFE")
os.makedirs(rfe_path, exist_ok=True)
output_path = os.path.join(rfe_path, "selected_features_PDvsDD_PCA_ro.txt")

with open(output_path, "w") as f:
    for i, selected_features in enumerate(selected_features_per_fold):
        fold_header = f"Fold {i + 1}: Features selecionadas:\n"
        # Index the column labels with this fold's selection mask.
        fold_features = ", ".join(X.columns[selected_features]) + "\n\n"
        f.write(fold_header + fold_features)

# Nested CV

In [None]:
# Imported here so this cell does not rely on imports made by other cells.
from sklearn.feature_selection import RFECV
from sklearn.metrics import balanced_accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV

def nestedCV(df, label_column, param_grid, model_for_grid, outer_splits=5, inner_splits=2, scoring='accuracy'):
    """Run nested cross-validation with per-fold scaling, RFECV and grid search.

    For every outer fold the training part is standardised, a linear-SVM RFECV
    selects a feature subset, ``model_for_grid`` is tuned with ``GridSearchCV``
    on the selected features, and the tuned model is evaluated on the held-out
    part. Per-fold metrics and the outer-loop averages are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the label column.
    label_column : str
        Name of the column in ``df`` holding the class labels.
    param_grid : dict or list of dict
        Hyper-parameter grid forwarded to ``GridSearchCV``.
    model_for_grid : estimator
        Classifier tuned by the grid search and scored on the outer folds.
    outer_splits : int, default 5
        Number of stratified outer folds.
    inner_splits : int, default 2
        Number of stratified inner folds used by both RFECV and the grid search.
    scoring : str, default 'accuracy'
        Scoring used inside RFECV and the grid search. The held-out metrics
        reported per fold are always accuracy, balanced accuracy and
        weighted F1.

    Returns
    -------
    dict
        Mean and standard deviation over the outer folds, under the keys
        ``accuracy_mean``, ``accuracy_std``, ``balanced_accuracy_mean``,
        ``balanced_accuracy_std``, ``f1_score_mean`` and ``f1_score_std``.
    """
    X = df.drop(columns=[label_column])
    y = df[label_column]

    # Fixed seeds: each splitter yields the same partition every time it is asked
    # for splits, so it can be handed to RFECV and GridSearchCV directly.
    outer_loop = StratifiedKFold(n_splits=outer_splits, shuffle=True, random_state=42)
    inner_loop = StratifiedKFold(n_splits=inner_splits, shuffle=True, random_state=42)

    # Feature ranking always uses a linear SVM, independently of model_for_grid.
    model = SVC(kernel='linear')
    outer_scores = {'accuracy': [], 'balanced_accuracy': [], 'f1_score': []}

    for train_idx, test_idx in outer_loop.split(X, y):

        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Scaling statistics come from the outer-training part only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Recursive feature elimination, one feature per step, scored on the inner folds.
        rfecv = RFECV(
            estimator=model,
            step=1,
            cv=inner_loop,
            scoring=scoring,
            n_jobs=-1
        )
        rfecv.fit(X_train_scaled, y_train)

        X_train_selected = X_train_scaled[:, rfecv.support_]
        X_test_selected = X_test_scaled[:, rfecv.support_]

        grid_search = GridSearchCV(
            estimator=model_for_grid,
            param_grid=param_grid,
            cv=inner_loop,
            scoring=scoring,
            n_jobs=-1
        )

        # With the default refit=True, best_estimator_ is already refitted on the
        # whole outer-training part, so no extra fit is needed afterwards.
        grid_search.fit(X_train_selected, y_train)
        best_model = grid_search.best_estimator_

        y_pred = best_model.predict(X_test_selected)

        accuracy = best_model.score(X_test_selected, y_test)
        balanced_acc = balanced_accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        outer_scores['accuracy'].append(accuracy)
        outer_scores['balanced_accuracy'].append(balanced_acc)
        outer_scores['f1_score'].append(f1)

        print(f"Acurácia do fold: {accuracy:.2%}")
        print(f"Acurácia balanceada do fold: {balanced_acc:.2%}")
        print(f"F1-score do fold: {f1:.2%}")
        print(f"Número de features selecionadas: {rfecv.n_features_}")
        print(f"Melhores parâmetros do GridSearch: {grid_search.best_params_}")

    # Aggregate once; the same numbers feed both the printed report and the return value.
    summary = {}
    for metric, values in outer_scores.items():
        summary[f'{metric}_mean'] = np.mean(values)
        summary[f'{metric}_std'] = np.std(values)

    print(f"\nDesempenho médio do outer loop:")
    print(f"Acurácia: {summary['accuracy_mean']:.2%} (±{summary['accuracy_std']:.2%})")
    print(f"Acurácia balanceada: {summary['balanced_accuracy_mean']:.2%} (±{summary['balanced_accuracy_std']:.2%})")
    print(f"F1-score: {summary['f1_score_mean']:.2%} (±{summary['f1_score_std']:.2%})")

    return summary


In [None]:
# Two-class subset: every row of df except those labelled 2.
df1 = df.loc[df['label'].ne(2)]

In [None]:
# Hand-rolled nested CV on df1: the outer 5-fold loop estimates accuracy, and
# inside each fold a 2-fold RFECV picks the features for a linear SVM.
X = df1.drop('label', axis=1)
y = df1['label']

outer_loop = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_loop = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

model = SVC(kernel='linear')
outer_scores = []

for train_idx, test_idx in outer_loop.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # Standardise with statistics learned from the training part only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Recursive elimination, one feature per step, scored by accuracy on the inner folds.
    rfecv = RFECV(
        model,
        step=1,
        scoring='accuracy',
        n_jobs=-1,
        cv=inner_loop.split(X_train_scaled, y_train),
    )
    rfecv.fit(X_train_scaled, y_train)

    keep = rfecv.support_
    selected_features = X.columns[keep]
    X_train_selected, X_test_selected = X_train_scaled[:, keep], X_test_scaled[:, keep]

    # Refit the linear SVM on the kept features and score it on the held-out fold.
    model.fit(X_train_selected, y_train)
    score = model.score(X_test_selected, y_test)
    outer_scores.append(score)

    print(f"Acurácia do fold: {score:.2%}")
    print(f"Número de features selecionadas: {rfecv.n_features_}")

print(f"\nDesempenho médio do outer loop: {np.mean(outer_scores):.2%}")
print(f"Desvio padrão: {np.std(outer_scores):.2%}")


## PD vs CO

In [None]:
# SVM
# Grid over regularisation strength, kernel type and the kernel-specific settings.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2, 3, 4],
    'shrinking': [True, False]
}
model_for_grid = SVC()

# This section is the two-class comparison, so it must run on df1 (label 2
# removed) like the other cells here; passing the full three-class df would
# silently turn it into the multiclass experiment.
nestedCV(df1, label_column='label', param_grid=param_grid, model_for_grid=model_for_grid)

Acurácia do fold: 87.10%
Acurácia balanceada do fold: 86.34%
F1-score do fold: 86.96%
Número de features selecionadas: 1021
Melhores parâmetros do GridSearch: {'C': 0.01, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear', 'shrinking': True}
Acurácia do fold: 80.65%
Acurácia balanceada do fold: 79.20%
F1-score do fold: 80.05%
Número de features selecionadas: 1286
Melhores parâmetros do GridSearch: {'C': 10, 'degree': 2, 'gamma': 'scale', 'kernel': 'rbf', 'shrinking': True}
Acurácia do fold: 80.00%
Acurácia balanceada do fold: 80.36%
F1-score do fold: 80.00%
Número de features selecionadas: 207
Melhores parâmetros do GridSearch: {'C': 0.01, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear', 'shrinking': True}
Acurácia do fold: 83.33%
Acurácia balanceada do fold: 83.48%
F1-score do fold: 83.35%
Número de features selecionadas: 220
Melhores parâmetros do GridSearch: {'C': 0.01, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear', 'shrinking': True}
Acurácia do fold: 70.00%
Acurácia balancead

{'accuracy_mean': 0.8021505376344086,
 'accuracy_std': 0.05686128868571409,
 'balanced_accuracy_mean': 0.7962710084033613,
 'balanced_accuracy_std': 0.059834868566440866,
 'f1_score_mean': 0.7982557658314857,
 'f1_score_std': 0.06094445017835579}

In [None]:
# k-NN grid: neighbourhood size 1-30, leaf size, Minkowski power and vote weighting.
k_range = [*range(1, 31)]
param_grid = dict(
    n_neighbors=k_range,
    leaf_size=[1, 20, 40],
    p=[1, 2],
    weights=['uniform', 'distance'],
)
model_for_grid = KNeighborsClassifier()

# The fold counts and scoring are left at nestedCV's defaults (5 outer, 2 inner, accuracy).
nestedCV(df1, 'label', param_grid, model_for_grid)

Acurácia do fold: 80.65%
Acurácia balanceada do fold: 78.57%
F1-score do fold: 79.46%
Número de features selecionadas: 1021
Melhores parâmetros do GridSearch: {'leaf_size': 1, 'n_neighbors': 6, 'p': 2, 'weights': 'uniform'}
Acurácia do fold: 74.19%
Acurácia balanceada do fold: 71.43%
F1-score do fold: 71.49%
Número de features selecionadas: 1286
Melhores parâmetros do GridSearch: {'leaf_size': 1, 'n_neighbors': 4, 'p': 1, 'weights': 'uniform'}
Acurácia do fold: 80.00%
Acurácia balanceada do fold: 79.91%
F1-score do fold: 80.00%
Número de features selecionadas: 207
Melhores parâmetros do GridSearch: {'leaf_size': 1, 'n_neighbors': 4, 'p': 2, 'weights': 'uniform'}
Acurácia do fold: 66.67%
Acurácia balanceada do fold: 66.96%
F1-score do fold: 66.67%
Número de features selecionadas: 220
Melhores parâmetros do GridSearch: {'leaf_size': 1, 'n_neighbors': 2, 'p': 2, 'weights': 'uniform'}
Acurácia do fold: 70.00%
Acurácia balanceada do fold: 69.20%
F1-score do fold: 69.49%
Número de features s

{'accuracy_mean': 0.743010752688172,
 'accuracy_std': 0.054684582057991545,
 'balanced_accuracy_mean': 0.7321428571428571,
 'balanced_accuracy_std': 0.051368392370151846,
 'f1_score_mean': 0.7341997486384583,
 'f1_score_std': 0.05376960043885962}

In [None]:
# LDA: least-squares solver, with and without automatic shrinkage.
param_grid = [dict(solver=['lsqr'], shrinkage=[None, 'auto'])]
model_for_grid = LinearDiscriminantAnalysis()

# The fold counts and scoring are left at nestedCV's defaults (5 outer, 2 inner, accuracy).
nestedCV(df1, 'label', param_grid, model_for_grid)

Acurácia do fold: 77.42%
Acurácia balanceada do fold: 76.26%
F1-score do fold: 76.98%
Número de features selecionadas: 1021
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}
Acurácia do fold: 80.65%
Acurácia balanceada do fold: 79.83%
F1-score do fold: 80.44%
Número de features selecionadas: 1286
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}
Acurácia do fold: 80.00%
Acurácia balanceada do fold: 80.36%
F1-score do fold: 80.00%
Número de features selecionadas: 207
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}
Acurácia do fold: 76.67%
Acurácia balanceada do fold: 76.79%
F1-score do fold: 76.69%
Número de features selecionadas: 220
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}
Acurácia do fold: 73.33%
Acurácia balanceada do fold: 71.88%
F1-score do fold: 71.80%
Número de features selecionadas: 286
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}

Desempe

{'accuracy_mean': 0.7761290322580645,
 'accuracy_std': 0.026130580953262872,
 'balanced_accuracy_mean': 0.7702205882352942,
 'balanced_accuracy_std': 0.030379669464718324,
 'f1_score_mean': 0.7718263616023701,
 'f1_score_std': 0.030908683597686882}

In [None]:
# Decision tree grid: depth limit, minimum split size, cost-complexity pruning and split criterion.
param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'ccp_alpha': [0.0, 0.001, 0.01],
    'criterion': ['gini', 'entropy']
}

# Seeded like the random-forest cells so that re-running the cell reproduces
# the same trees (tie-breaking between equally good splits is random otherwise).
model_for_grid = DecisionTreeClassifier(random_state=42)

nestedCV(df1, label_column='label', param_grid=param_grid, model_for_grid=model_for_grid, outer_splits=5, inner_splits=2, scoring='accuracy')

Acurácia do fold: 58.06%
Acurácia balanceada do fold: 56.72%
F1-score do fold: 57.25%
Número de features selecionadas: 1021
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 10}
Acurácia do fold: 64.52%
Acurácia balanceada do fold: 63.24%
F1-score do fold: 63.83%
Número de features selecionadas: 1286
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 5}
Acurácia do fold: 53.33%
Acurácia balanceada do fold: 53.12%
F1-score do fold: 53.33%
Número de features selecionadas: 207
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 2}
Acurácia do fold: 70.00%
Acurácia balanceada do fold: 70.09%
F1-score do fold: 70.03%
Número de features selecionadas: 220
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.0, 'criterion': 'gini', 'max_depth': None, 'min_samples_split': 2}
Acurácia do fold: 50.00

{'accuracy_mean': 0.5918279569892473,
 'accuracy_std': 0.07283499836572321,
 'balanced_accuracy_mean': 0.5845588235294118,
 'balanced_accuracy_std': 0.0744311824898767,
 'f1_score_mean': 0.5871673345043756,
 'f1_score_std': 0.07442620596511149}

In [None]:
# Gaussian NB: the 1e-9 value first, followed by ten log-spaced values from 1e-8 down to 1e-10.
# NOTE(review): every recorded fold picks 1e-09, so this narrow range may not
# discriminate between candidates — consider whether a wider grid is intended.
smoothing_candidates = [1e-09, *np.logspace(-8, -10, num=10).tolist()]
param_grid = dict(var_smoothing=smoothing_candidates)

model_for_grid = GaussianNB()

# The fold counts and scoring are left at nestedCV's defaults (5 outer, 2 inner, accuracy).
nestedCV(df1, 'label', param_grid, model_for_grid)

Acurácia do fold: 58.06%
Acurácia balanceada do fold: 57.98%
F1-score do fold: 58.15%
Número de features selecionadas: 1021
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}
Acurácia do fold: 58.06%
Acurácia balanceada do fold: 61.76%
F1-score do fold: 51.73%
Número de features selecionadas: 1286
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}
Acurácia do fold: 70.00%
Acurácia balanceada do fold: 70.98%
F1-score do fold: 69.49%
Número de features selecionadas: 207
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}
Acurácia do fold: 70.00%
Acurácia balanceada do fold: 70.98%
F1-score do fold: 69.49%
Número de features selecionadas: 220
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}
Acurácia do fold: 60.00%
Acurácia balanceada do fold: 59.38%
F1-score do fold: 59.64%
Número de features selecionadas: 286
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}

Desempenho médio do outer loop:
Acurácia: 63.23% (±5.58%)
Acurácia balanceada: 64.

{'accuracy_mean': 0.6322580645161291,
 'accuracy_std': 0.05576074951311455,
 'balanced_accuracy_mean': 0.6421743697478991,
 'balanced_accuracy_std': 0.056542616051197564,
 'f1_score_mean': 0.6170257149181284,
 'f1_score_std': 0.06895022084472471}

In [None]:
# Random forest grid: ensemble size, features tried per split, depth limit and split criterion.
param_grid = dict(
    n_estimators=[10, 100, 200, 500],
    max_features=['sqrt', 'log2', None],
    max_depth=[None, 4, 5, 6, 7, 8],
    criterion=['gini', 'entropy', 'log_loss'],
)

model_for_grid = RandomForestClassifier(random_state=42)

# The fold counts and scoring are left at nestedCV's defaults (5 outer, 2 inner, accuracy).
nestedCV(df1, 'label', param_grid, model_for_grid)

Acurácia do fold: 83.87%
Acurácia balanceada do fold: 82.14%
F1-score do fold: 83.15%
Número de features selecionadas: 1021
Melhores parâmetros do GridSearch: {'criterion': 'entropy', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 100}
Acurácia do fold: 83.87%
Acurácia balanceada do fold: 82.77%
F1-score do fold: 83.56%
Número de features selecionadas: 1286
Melhores parâmetros do GridSearch: {'criterion': 'entropy', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 100}
Acurácia do fold: 76.67%
Acurácia balanceada do fold: 77.23%
F1-score do fold: 76.59%
Número de features selecionadas: 207
Melhores parâmetros do GridSearch: {'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'n_estimators': 500}
Acurácia do fold: 86.67%
Acurácia balanceada do fold: 87.05%
F1-score do fold: 86.67%
Número de features selecionadas: 220
Melhores parâmetros do GridSearch: {'criterion': 'entropy', 'max_depth': None, 'max_features': 'log2', 'n_estimators': 200}
Acurácia do fold: 7

{'accuracy_mean': 0.8021505376344086,
 'accuracy_std': 0.06087579116001952,
 'balanced_accuracy_mean': 0.7959033613445378,
 'balanced_accuracy_std': 0.06251707802828398,
 'f1_score_mean': 0.7974609127600375,
 'f1_score_std': 0.06395997963687465}

In [None]:
# Extra-trees grid: ensemble size, depth limit, minimum split size, pruning,
# split criterion and number of features tried per split.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'ccp_alpha': [0.0, 0.001, 0.01],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None]
}

# Seeded like the random-forest cells; without a seed the randomised splits
# make both the selected hyper-parameters and the scores change on every run.
model_for_grid = ExtraTreesClassifier(random_state=42)

nestedCV(df1, label_column='label', param_grid=param_grid, model_for_grid=model_for_grid, outer_splits=5, inner_splits=2, scoring='accuracy')

Acurácia do fold: 83.87%
Acurácia balanceada do fold: 82.14%
F1-score do fold: 83.15%
Número de features selecionadas: 1021
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.0, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}
Acurácia do fold: 74.19%
Acurácia balanceada do fold: 72.69%
F1-score do fold: 73.40%
Número de features selecionadas: 1286
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': 10, 'max_features': None, 'min_samples_split': 5, 'n_estimators': 200}
Acurácia do fold: 76.67%
Acurácia balanceada do fold: 77.23%
F1-score do fold: 76.59%
Número de features selecionadas: 207
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
Acurácia do fold: 70.00%
Acurácia balanceada do fold: 70.09%
F1-score do fold: 70.03%
Número de features selecionadas: 220
Melhores parâ

{'accuracy_mean': 0.7361290322580645,
 'accuracy_std': 0.06835395152680092,
 'balanced_accuracy_mean': 0.7284138655462186,
 'balanced_accuracy_std': 0.0677894252607053,
 'f1_score_mean': 0.7300008915661192,
 'f1_score_std': 0.07067853133329868}

## PD vs DD

In [None]:
# Keep every row except label 0, as an independent copy so the relabelling
# below neither raises SettingWithCopyWarning nor depends on view/copy semantics.
df2 = df[df['label'] != 0].copy()
# Relabel 2 as 0 so the remaining two classes are encoded as 0/1.
df2['label'] = df2['label'].replace(2, 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['label'] = df2['label'].replace(2, 0)


In [None]:
# SVM
# NOTE(review): 'gamma' and 'degree' have no effect on the linear kernel (and
# 'degree' none on rbf), so part of this grid refits equivalent models.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    gamma=['scale', 'auto', 1, 0.1, 0.01, 0.001],
    kernel=['linear', 'rbf', 'poly'],
    degree=[2, 3, 4],
    shrinking=[True, False],
)
model_for_grid = SVC()

# Spelled out for readability; these are also nestedCV's default values.
nestedCV(df2, 'label', param_grid, model_for_grid, outer_splits=5, inner_splits=2, scoring='accuracy')

Acurácia do fold: 69.70%
Acurácia balanceada do fold: 69.85%
F1-score do fold: 69.64%
Número de features selecionadas: 38
Melhores parâmetros do GridSearch: {'C': 0.1, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear', 'shrinking': True}
Acurácia do fold: 63.64%
Acurácia balanceada do fold: 63.79%
F1-score do fold: 63.57%
Número de features selecionadas: 56
Melhores parâmetros do GridSearch: {'C': 100, 'degree': 2, 'gamma': 0.001, 'kernel': 'rbf', 'shrinking': True}
Acurácia do fold: 54.55%
Acurácia balanceada do fold: 54.60%
F1-score do fold: 54.55%
Número de features selecionadas: 876
Melhores parâmetros do GridSearch: {'C': 10, 'degree': 2, 'gamma': 0.001, 'kernel': 'rbf', 'shrinking': True}
Acurácia do fold: 57.58%
Acurácia balanceada do fold: 57.17%
F1-score do fold: 56.78%
Número de features selecionadas: 177
Melhores parâmetros do GridSearch: {'C': 0.01, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear', 'shrinking': True}
Acurácia do fold: 56.25%
Acurácia balanceada do fold: 56

{'accuracy_mean': 0.6034090909090909,
 'accuracy_std': 0.05592222889078965,
 'balanced_accuracy_mean': 0.6033088235294117,
 'balanced_accuracy_std': 0.056948517544524645,
 'f1_score_mean': 0.600180207649248,
 'f1_score_std': 0.05754620495510946}

In [None]:
# k-NN grid: neighbourhood size 1-30, leaf size, Minkowski power and vote weighting.
k_range = [*range(1, 31)]
param_grid = dict(
    n_neighbors=k_range,
    leaf_size=[1, 20, 40],
    p=[1, 2],
    weights=['uniform', 'distance'],
)
model_for_grid = KNeighborsClassifier()

# The fold counts and scoring are left at nestedCV's defaults (5 outer, 2 inner, accuracy).
nestedCV(df2, 'label', param_grid, model_for_grid)

Acurácia do fold: 63.64%
Acurácia balanceada do fold: 63.97%
F1-score do fold: 63.23%
Número de features selecionadas: 38
Melhores parâmetros do GridSearch: {'leaf_size': 1, 'n_neighbors': 4, 'p': 2, 'weights': 'uniform'}
Acurácia do fold: 54.55%
Acurácia balanceada do fold: 54.78%
F1-score do fold: 54.29%
Número de features selecionadas: 56
Melhores parâmetros do GridSearch: {'leaf_size': 1, 'n_neighbors': 8, 'p': 2, 'weights': 'distance'}
Acurácia do fold: 60.61%
Acurácia balanceada do fold: 60.29%
F1-score do fold: 60.17%
Número de features selecionadas: 876
Melhores parâmetros do GridSearch: {'leaf_size': 1, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
Acurácia do fold: 54.55%
Acurácia balanceada do fold: 54.41%
F1-score do fold: 54.46%
Número de features selecionadas: 177
Melhores parâmetros do GridSearch: {'leaf_size': 1, 'n_neighbors': 2, 'p': 2, 'weights': 'uniform'}
Acurácia do fold: 53.12%
Acurácia balanceada do fold: 53.12%
F1-score do fold: 52.71%
Número de features sele

{'accuracy_mean': 0.5729166666666666,
 'accuracy_std': 0.04091084451909276,
 'balanced_accuracy_mean': 0.5731617647058824,
 'balanced_accuracy_std': 0.041372955578338796,
 'f1_score_mean': 0.5697258757103029,
 'f1_score_std': 0.04025744956254527}

In [None]:
# LDA: least-squares solver, with and without automatic shrinkage.
param_grid = [dict(solver=['lsqr'], shrinkage=[None, 'auto'])]
model_for_grid = LinearDiscriminantAnalysis()

# The fold counts and scoring are left at nestedCV's defaults (5 outer, 2 inner, accuracy).
nestedCV(df2, 'label', param_grid, model_for_grid)

Acurácia do fold: 69.70%
Acurácia balanceada do fold: 69.85%
F1-score do fold: 69.64%
Número de features selecionadas: 38
Melhores parâmetros do GridSearch: {'shrinkage': None, 'solver': 'lsqr'}
Acurácia do fold: 66.67%
Acurácia balanceada do fold: 66.91%
F1-score do fold: 66.48%
Número de features selecionadas: 56
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}
Acurácia do fold: 60.61%
Acurácia balanceada do fold: 60.29%
F1-score do fold: 60.17%
Número de features selecionadas: 876
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}
Acurácia do fold: 57.58%
Acurácia balanceada do fold: 56.80%
F1-score do fold: 54.55%
Número de features selecionadas: 177
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}
Acurácia do fold: 50.00%
Acurácia balanceada do fold: 50.00%
F1-score do fold: 48.18%
Número de features selecionadas: 354
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}

Desempenho mé

{'accuracy_mean': 0.6090909090909091,
 'accuracy_std': 0.06936680692278543,
 'balanced_accuracy_mean': 0.6077205882352941,
 'balanced_accuracy_std': 0.07101966806874808,
 'f1_score_mean': 0.5980253746242186,
 'f1_score_std': 0.0780148183845914}

In [None]:
# Decision tree grid: depth limit, minimum split size, cost-complexity pruning and split criterion.
param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'ccp_alpha': [0.0, 0.001, 0.01],
    'criterion': ['gini', 'entropy']
}

# Seeded like the random-forest cells so that re-running the cell reproduces
# the same trees (tie-breaking between equally good splits is random otherwise).
model_for_grid = DecisionTreeClassifier(random_state=42)

nestedCV(df2, label_column='label', param_grid=param_grid, model_for_grid=model_for_grid, outer_splits=5, inner_splits=2, scoring='accuracy')

Acurácia do fold: 48.48%
Acurácia balanceada do fold: 48.35%
F1-score do fold: 48.39%
Número de features selecionadas: 38
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.001, 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 5}
Acurácia do fold: 60.61%
Acurácia balanceada do fold: 60.48%
F1-score do fold: 60.53%
Número de features selecionadas: 56
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.0, 'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 5}
Acurácia do fold: 54.55%
Acurácia balanceada do fold: 54.23%
F1-score do fold: 54.04%
Número de features selecionadas: 876
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.0, 'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 10}
Acurácia do fold: 60.61%
Acurácia balanceada do fold: 60.66%
F1-score do fold: 60.61%
Número de features selecionadas: 177
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.001, 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2}
Acurácia do fold: 40.62%
Acurácia ba

{'accuracy_mean': 0.5297348484848484,
 'accuracy_std': 0.07636968620816455,
 'balanced_accuracy_mean': 0.5286764705882353,
 'balanced_accuracy_std': 0.0762688231914159,
 'f1_score_mean': 0.5282669289343371,
 'f1_score_std': 0.07633661948912067}

In [None]:
# Gaussian NB: the 1e-9 value first, followed by ten log-spaced values from 1e-8 down to 1e-10.
# NOTE(review): every recorded fold picks 1e-09, so this narrow range may not
# discriminate between candidates — consider whether a wider grid is intended.
smoothing_candidates = [1e-09, *np.logspace(-8, -10, num=10).tolist()]
param_grid = dict(var_smoothing=smoothing_candidates)

model_for_grid = GaussianNB()

# The fold counts and scoring are left at nestedCV's defaults (5 outer, 2 inner, accuracy).
nestedCV(df2, 'label', param_grid, model_for_grid)

Acurácia do fold: 60.61%
Acurácia balanceada do fold: 61.21%
F1-score do fold: 59.03%
Número de features selecionadas: 38
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}
Acurácia do fold: 60.61%
Acurácia balanceada do fold: 61.03%
F1-score do fold: 59.87%
Número de features selecionadas: 56
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}
Acurácia do fold: 54.55%
Acurácia balanceada do fold: 54.23%
F1-score do fold: 54.04%
Número de features selecionadas: 876
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}
Acurácia do fold: 54.55%
Acurácia balanceada do fold: 53.68%
F1-score do fold: 50.41%
Número de features selecionadas: 177
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}
Acurácia do fold: 62.50%
Acurácia balanceada do fold: 62.50%
F1-score do fold: 62.35%
Número de features selecionadas: 354
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}

Desempenho médio do outer loop:
Acurácia: 58.56% (±3.35%)
Acurácia balanceada: 58.53% 

{'accuracy_mean': 0.5856060606060606,
 'accuracy_std': 0.033505067253781814,
 'balanced_accuracy_mean': 0.5852941176470587,
 'balanced_accuracy_std': 0.03775504388953014,
 'f1_score_mean': 0.5714113434256373,
 'f1_score_std': 0.04316510591349}

In [None]:
# Random forest grid: ensemble size, features tried per split, depth limit and split criterion.
param_grid = dict(
    n_estimators=[10, 100, 200, 500],
    max_features=['sqrt', 'log2', None],
    max_depth=[None, 4, 5, 6, 7, 8],
    criterion=['gini', 'entropy', 'log_loss'],
)

model_for_grid = RandomForestClassifier(random_state=42)

# The fold counts and scoring are left at nestedCV's defaults (5 outer, 2 inner, accuracy).
nestedCV(df2, 'label', param_grid, model_for_grid)

Acurácia do fold: 69.70%
Acurácia balanceada do fold: 69.85%
F1-score do fold: 69.64%
Número de features selecionadas: 38
Melhores parâmetros do GridSearch: {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'n_estimators': 500}
Acurácia do fold: 75.76%
Acurácia balanceada do fold: 76.10%
F1-score do fold: 75.49%
Número de features selecionadas: 56
Melhores parâmetros do GridSearch: {'criterion': 'entropy', 'max_depth': 6, 'max_features': 'log2', 'n_estimators': 500}
Acurácia do fold: 51.52%
Acurácia balanceada do fold: 50.92%
F1-score do fold: 49.56%
Número de features selecionadas: 876
Melhores parâmetros do GridSearch: {'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 500}
Acurácia do fold: 66.67%
Acurácia balanceada do fold: 66.54%
F1-score do fold: 66.61%
Número de features selecionadas: 177
Melhores parâmetros do GridSearch: {'criterion': 'gini', 'max_depth': 5, 'max_features': 'log2', 'n_estimators': 100}
Acurácia do fold: 59.38%
Acurác

{'accuracy_mean': 0.6460227272727272,
 'accuracy_std': 0.08405421667399872,
 'balanced_accuracy_mean': 0.6455882352941177,
 'balanced_accuracy_std': 0.08696543331517123,
 'f1_score_mean': 0.6406131834037023,
 'f1_score_std': 0.08989114282675328}

In [None]:
# SVM
# Alternative, smaller grid: four kernels, two gamma heuristics, C from 0.1 to 1000.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=['scale', 'auto'],
    kernel=['linear', 'poly', 'rbf', 'sigmoid'],
)
model_for_grid = SVC()
# Spelled out for readability; these are also nestedCV's default values.
nestedCV(df2, 'label', param_grid, model_for_grid, outer_splits=5, inner_splits=2, scoring='accuracy')

Acurácia do fold: 69.70%
Número de features selecionadas: 38
Melhores parâmetros do GridSearch: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Acurácia do fold: 63.64%
Número de features selecionadas: 56
Melhores parâmetros do GridSearch: {'C': 1, 'gamma': 'scale', 'kernel': 'linear'}
Acurácia do fold: 54.55%
Número de features selecionadas: 876
Melhores parâmetros do GridSearch: {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
Acurácia do fold: 60.61%
Número de features selecionadas: 177
Melhores parâmetros do GridSearch: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Acurácia do fold: 56.25%
Número de features selecionadas: 354
Melhores parâmetros do GridSearch: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}

Desempenho médio do outer loop: 60.95%
Desvio padrão: 5.42%


(0.609469696969697, 0.054213002008529854)

In [None]:
# RF
# NOTE(review): same grid, model and data as the other Random Forest cell of
# this section — consider keeping only one of the two.
param_grid = dict(
    n_estimators=[10, 100, 200, 500],
    max_features=['sqrt', 'log2', None],
    max_depth=[None, 4, 5, 6, 7, 8],
    criterion=['gini', 'entropy', 'log_loss'],
)
model_for_grid = RandomForestClassifier(random_state=42)
# Spelled out for readability; these are also nestedCV's default values.
nestedCV(df2, 'label', param_grid, model_for_grid, outer_splits=5, inner_splits=2, scoring='accuracy')

Acurácia do fold: 69.70%
Número de features selecionadas: 38
Melhores parâmetros do GridSearch: {'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'n_estimators': 500}
Acurácia do fold: 75.76%
Número de features selecionadas: 56
Melhores parâmetros do GridSearch: {'criterion': 'entropy', 'max_depth': 6, 'max_features': 'log2', 'n_estimators': 500}
Acurácia do fold: 51.52%
Número de features selecionadas: 876
Melhores parâmetros do GridSearch: {'criterion': 'gini', 'max_depth': 4, 'max_features': 'sqrt', 'n_estimators': 500}
Acurácia do fold: 66.67%
Número de features selecionadas: 177
Melhores parâmetros do GridSearch: {'criterion': 'gini', 'max_depth': 5, 'max_features': 'log2', 'n_estimators': 100}
Acurácia do fold: 59.38%
Número de features selecionadas: 354
Melhores parâmetros do GridSearch: {'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 200}

Desempenho médio do outer loop: 64.60%
Desvio padrão: 8.41%


(0.6460227272727272, 0.08405421667399872)

In [None]:
# Extra-trees grid: ensemble size, depth limit, minimum split size, pruning,
# split criterion and number of features tried per split.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'ccp_alpha': [0.0, 0.001, 0.01],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None]
}

# Seeded like the random-forest cells; without a seed the randomised splits
# make both the selected hyper-parameters and the scores change on every run.
model_for_grid = ExtraTreesClassifier(random_state=42)

nestedCV(df2, label_column='label', param_grid=param_grid, model_for_grid=model_for_grid, outer_splits=5, inner_splits=2, scoring='accuracy')

Acurácia do fold: 69.70%
Acurácia balanceada do fold: 70.04%
F1-score do fold: 69.36%
Número de features selecionadas: 38
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.0, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 100}
Acurácia do fold: 69.70%
Acurácia balanceada do fold: 70.04%
F1-score do fold: 69.36%
Número de features selecionadas: 56
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 20, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 200}
Acurácia do fold: 45.45%
Acurácia balanceada do fold: 45.04%
F1-score do fold: 44.43%
Número de features selecionadas: 876
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_split': 5, 'n_estimators': 200}
Acurácia do fold: 72.73%
Acurácia balanceada do fold: 72.43%
F1-score do fold: 72.42%
Número de features selecionadas: 177
Melhores parâmet

{'accuracy_mean': 0.6464015151515152,
 'accuracy_std': 0.09854965816260829,
 'balanced_accuracy_mean': 0.6463235294117647,
 'balanced_accuracy_std': 0.1004127511237258,
 'f1_score_mean': 0.640673647933176,
 'f1_score_std': 0.10119049050948378}

## Multiclassificação

In [None]:
# SVM
# Multiclass run: the full df (all labels) is passed to nestedCV.
# NOTE(review): 'gamma' and 'degree' have no effect on the linear kernel (and
# 'degree' none on rbf), so part of this grid refits equivalent models.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    gamma=['scale', 'auto', 1, 0.1, 0.01, 0.001],
    kernel=['linear', 'rbf', 'poly'],
    degree=[2, 3, 4],
    shrinking=[True, False],
)
model_for_grid = SVC()

# Spelled out for readability; these are also nestedCV's default values.
nestedCV(df, 'label', param_grid, model_for_grid, outer_splits=5, inner_splits=2, scoring='accuracy')

Acurácia do fold: 61.70%
Acurácia balanceada do fold: 62.54%
F1-score do fold: 60.80%
Número de features selecionadas: 628
Melhores parâmetros do GridSearch: {'C': 0.01, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear', 'shrinking': True}
Acurácia do fold: 44.68%
Acurácia balanceada do fold: 44.31%
F1-score do fold: 43.92%
Número de features selecionadas: 797
Melhores parâmetros do GridSearch: {'C': 0.01, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear', 'shrinking': True}
Acurácia do fold: 55.32%
Acurácia balanceada do fold: 56.04%
F1-score do fold: 55.32%
Número de features selecionadas: 645
Melhores parâmetros do GridSearch: {'C': 0.01, 'degree': 2, 'gamma': 'scale', 'kernel': 'linear', 'shrinking': True}
Acurácia do fold: 68.09%
Acurácia balanceada do fold: 68.05%
F1-score do fold: 68.34%
Número de features selecionadas: 1898
Melhores parâmetros do GridSearch: {'C': 10, 'degree': 2, 'gamma': 'auto', 'kernel': 'rbf', 'shrinking': True}
Acurácia do fold: 41.30%
Acurácia balanceada 

{'accuracy_mean': 0.5421831637372804,
 'accuracy_std': 0.10072149294369356,
 'balanced_accuracy_mean': 0.544607843137255,
 'balanced_accuracy_std': 0.10264243318236675,
 'f1_score_mean': 0.5390891663600968,
 'f1_score_std': 0.10199063553699665}

In [None]:
# k-NN grid: neighbourhood size 1-30, leaf size, Minkowski power and vote weighting.
k_range = [*range(1, 31)]
param_grid = dict(
    n_neighbors=k_range,
    leaf_size=[1, 20, 40],
    p=[1, 2],
    weights=['uniform', 'distance'],
)
model_for_grid = KNeighborsClassifier()

# The fold counts and scoring are left at nestedCV's defaults (5 outer, 2 inner, accuracy).
nestedCV(df, 'label', param_grid, model_for_grid)

Acurácia do fold: 48.94%
Acurácia balanceada do fold: 48.84%
F1-score do fold: 48.90%
Número de features selecionadas: 628
Melhores parâmetros do GridSearch: {'leaf_size': 1, 'n_neighbors': 9, 'p': 1, 'weights': 'distance'}
Acurácia do fold: 51.06%
Acurácia balanceada do fold: 49.61%
F1-score do fold: 47.51%
Número de features selecionadas: 797
Melhores parâmetros do GridSearch: {'leaf_size': 1, 'n_neighbors': 9, 'p': 1, 'weights': 'uniform'}
Acurácia do fold: 51.06%
Acurácia balanceada do fold: 50.42%
F1-score do fold: 42.28%
Número de features selecionadas: 645
Melhores parâmetros do GridSearch: {'leaf_size': 1, 'n_neighbors': 2, 'p': 1, 'weights': 'uniform'}
Acurácia do fold: 46.81%
Acurácia balanceada do fold: 45.13%
F1-score do fold: 44.13%
Número de features selecionadas: 1898
Melhores parâmetros do GridSearch: {'leaf_size': 1, 'n_neighbors': 11, 'p': 1, 'weights': 'distance'}
Acurácia do fold: 47.83%
Acurácia balanceada do fold: 47.32%
F1-score do fold: 44.92%
Número de features

{'accuracy_mean': 0.49139685476410727,
 'accuracy_std': 0.017091517380027357,
 'balanced_accuracy_mean': 0.48266806722689076,
 'balanced_accuracy_std': 0.01870940879812464,
 'f1_score_mean': 0.4554527618898444,
 'f1_score_std': 0.02373169076492136}

In [None]:
# Linear Discriminant Analysis: only the 'lsqr' solver is searched, comparing
# no shrinkage (None) against the automatic shrinkage setting ('auto').
model_for_grid = LinearDiscriminantAnalysis()
param_grid = [
    dict(solver=['lsqr'], shrinkage=[None, 'auto']),
]

# nestedCV is defined earlier in the notebook; the call stays the last
# expression of the cell so that its returned summary is displayed.
nestedCV(
    df,
    label_column='label',
    param_grid=param_grid,
    model_for_grid=model_for_grid,
    outer_splits=5,
    inner_splits=2,
    scoring='accuracy',
)

Acurácia do fold: 63.83%
Acurácia balanceada do fold: 65.04%
F1-score do fold: 62.09%
Número de features selecionadas: 628
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}
Acurácia do fold: 46.81%
Acurácia balanceada do fold: 46.81%
F1-score do fold: 46.59%
Número de features selecionadas: 797
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}
Acurácia do fold: 48.94%
Acurácia balanceada do fold: 49.37%
F1-score do fold: 47.77%
Número de features selecionadas: 645
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}
Acurácia do fold: 59.57%
Acurácia balanceada do fold: 60.08%
F1-score do fold: 59.88%
Número de features selecionadas: 1898
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}
Acurácia do fold: 41.30%
Acurácia balanceada do fold: 41.07%
F1-score do fold: 41.40%
Número de features selecionadas: 915
Melhores parâmetros do GridSearch: {'shrinkage': 'auto', 'solver': 'lsqr'}

Desempen

{'accuracy_mean': 0.5209065679925995,
 'accuracy_std': 0.0834279245435113,
 'balanced_accuracy_mean': 0.5247549019607842,
 'balanced_accuracy_std': 0.0880310594848911,
 'f1_score_mean': 0.5154558300421159,
 'f1_score_std': 0.08029642145684841}

In [None]:
# Decision tree: search over depth limit, minimum split size, cost-complexity
# pruning strength and the impurity criterion.
param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'ccp_alpha': [0.0, 0.001, 0.01],
    'criterion': ['gini', 'entropy']
}

# Seed the estimator: scikit-learn permutes the candidate features at every
# split, so equally good splits can be resolved differently from run to run.
# A fixed random_state makes the reported scores reproducible and matches the
# seed already used in the RandomForest cell of this notebook.
model_for_grid = DecisionTreeClassifier(random_state=42)

# nestedCV is defined earlier in the notebook; the call stays the last
# expression of the cell so that its returned summary is displayed.
nestedCV(
    df,
    label_column='label',
    param_grid=param_grid,
    model_for_grid=model_for_grid,
    outer_splits=5,
    inner_splits=2,
    scoring='accuracy',
)

Acurácia do fold: 53.19%
Acurácia balanceada do fold: 54.03%
F1-score do fold: 52.49%
Número de features selecionadas: 628
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.01, 'criterion': 'entropy', 'max_depth': 10, 'min_samples_split': 2}
Acurácia do fold: 27.66%
Acurácia balanceada do fold: 27.29%
F1-score do fold: 26.35%
Número de features selecionadas: 797
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': 20, 'min_samples_split': 10}
Acurácia do fold: 55.32%
Acurácia balanceada do fold: 55.99%
F1-score do fold: 54.94%
Número de features selecionadas: 645
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.01, 'criterion': 'gini', 'max_depth': 20, 'min_samples_split': 5}
Acurácia do fold: 42.55%
Acurácia balanceada do fold: 43.31%
F1-score do fold: 42.89%
Número de features selecionadas: 1898
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.001, 'criterion': 'gini', 'max_depth': 10, 'min_samples_split': 2}
Acurácia do fold: 60.87%
Acurá

{'accuracy_mean': 0.479185938945421,
 'accuracy_std': 0.11743208342085258,
 'balanced_accuracy_mean': 0.48385854341736695,
 'balanced_accuracy_std': 0.12060301672387926,
 'f1_score_mean': 0.47508516561446756,
 'f1_score_std': 0.12062910863122375}

In [None]:
# Gaussian Naive Bayes: the only hyper-parameter searched is `var_smoothing`.
# The candidate list is 1e-09 followed by ten log-spaced values running from
# 1e-08 down to 1e-10.
param_grid = {
    'var_smoothing': [1e-09, *np.logspace(-8, -10, num=10).tolist()],
}

model_for_grid = GaussianNB()

# nestedCV is defined earlier in the notebook; the call stays the last
# expression of the cell so that its returned summary is displayed.
nestedCV(
    df,
    label_column='label',
    param_grid=param_grid,
    model_for_grid=model_for_grid,
    outer_splits=5,
    inner_splits=2,
    scoring='accuracy',
)

Acurácia do fold: 36.17%
Acurácia balanceada do fold: 37.55%
F1-score do fold: 33.70%
Número de features selecionadas: 628
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}
Acurácia do fold: 61.70%
Acurácia balanceada do fold: 63.73%
F1-score do fold: 60.05%
Número de features selecionadas: 797
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}
Acurácia do fold: 59.57%
Acurácia balanceada do fold: 59.66%
F1-score do fold: 59.45%
Número de features selecionadas: 645
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}
Acurácia do fold: 44.68%
Acurácia balanceada do fold: 46.78%
F1-score do fold: 41.80%
Número de features selecionadas: 1898
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}
Acurácia do fold: 39.13%
Acurácia balanceada do fold: 40.48%
F1-score do fold: 36.68%
Número de features selecionadas: 915
Melhores parâmetros do GridSearch: {'var_smoothing': 1e-09}

Desempenho médio do outer loop:
Acurácia: 48.25% (±10.50%)
Acurácia balanceada: 49.

{'accuracy_mean': 0.4825161887141536,
 'accuracy_std': 0.10497903184721354,
 'balanced_accuracy_mean': 0.49639355742296914,
 'balanced_accuracy_std': 0.10364829355146131,
 'f1_score_mean': 0.463374253901628,
 'f1_score_std': 0.11257478012652784}

In [None]:
# Random forest: search over ensemble size, features considered per split,
# depth limit and the impurity criterion.
param_grid = {
    'n_estimators': [10, 100, 200, 500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 4, 5, 6, 7, 8],
    # 'log_loss' is intentionally omitted: scikit-learn treats it as the same
    # Shannon-entropy criterion as 'entropy', so with the fixed seed below it
    # only re-fitted identical forests (one third of the whole grid) and,
    # being listed after 'entropy', could never be the selected candidate.
    'criterion': ['gini', 'entropy']
}

# Fixed seed so that the search and the reported scores are reproducible.
model_for_grid = RandomForestClassifier(random_state=42)

# nestedCV is defined earlier in the notebook; the call stays the last
# expression of the cell so that its returned summary is displayed.
nestedCV(
    df,
    label_column='label',
    param_grid=param_grid,
    model_for_grid=model_for_grid,
    outer_splits=5,
    inner_splits=2,
    scoring='accuracy',
)

Acurácia do fold: 61.70%
Acurácia balanceada do fold: 62.11%
F1-score do fold: 61.16%
Número de features selecionadas: 628
Melhores parâmetros do GridSearch: {'criterion': 'entropy', 'max_depth': 6, 'max_features': 'sqrt', 'n_estimators': 200}
Acurácia do fold: 63.83%
Acurácia balanceada do fold: 63.41%
F1-score do fold: 63.98%
Número de features selecionadas: 797
Melhores parâmetros do GridSearch: {'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'n_estimators': 500}
Acurácia do fold: 55.32%
Acurácia balanceada do fold: 56.04%
F1-score do fold: 55.96%
Número de features selecionadas: 645
Melhores parâmetros do GridSearch: {'criterion': 'gini', 'max_depth': None, 'max_features': 'log2', 'n_estimators': 500}
Acurácia do fold: 57.45%
Acurácia balanceada do fold: 57.83%
F1-score do fold: 57.37%
Número de features selecionadas: 1898
Melhores parâmetros do GridSearch: {'criterion': 'entropy', 'max_depth': 4, 'max_features': 'log2', 'n_estimators': 500}
Acurácia do fold: 58

{'accuracy_mean': 0.5939870490286772,
 'accuracy_std': 0.030294292060673866,
 'balanced_accuracy_mean': 0.5960434173669468,
 'balanced_accuracy_std': 0.02742439438876429,
 'f1_score_mean': 0.5938316058256016,
 'f1_score_std': 0.028631561497031003}

In [None]:
# Extremely randomized trees: search over ensemble size, depth limit, minimum
# split size, cost-complexity pruning strength, impurity criterion and the
# number of features considered per split.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'ccp_alpha': [0.0, 0.001, 0.01],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None]
}

# Seed the estimator: ExtraTrees draws its split thresholds at random, so
# without a fixed random_state both the selected hyper-parameters and the
# reported scores change on every run. The seed matches the one already used
# in the RandomForest cell of this notebook.
model_for_grid = ExtraTreesClassifier(random_state=42)

# nestedCV is defined earlier in the notebook; the call stays the last
# expression of the cell so that its returned summary is displayed.
nestedCV(
    df,
    label_column='label',
    param_grid=param_grid,
    model_for_grid=model_for_grid,
    outer_splits=5,
    inner_splits=2,
    scoring='accuracy',
)

Acurácia do fold: 48.94%
Acurácia balanceada do fold: 49.09%
F1-score do fold: 47.14%
Número de features selecionadas: 628
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.01, 'criterion': 'gini', 'max_depth': 20, 'max_features': 'log2', 'min_samples_split': 2, 'n_estimators': 100}
Acurácia do fold: 57.45%
Acurácia balanceada do fold: 56.69%
F1-score do fold: 57.35%
Número de features selecionadas: 797
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
Acurácia do fold: 51.06%
Acurácia balanceada do fold: 51.58%
F1-score do fold: 50.83%
Número de features selecionadas: 645
Melhores parâmetros do GridSearch: {'ccp_alpha': 0.001, 'criterion': 'entropy', 'max_depth': None, 'max_features': None, 'min_samples_split': 2, 'n_estimators': 50}
Acurácia do fold: 65.96%
Acurácia balanceada do fold: 66.21%
F1-score do fold: 65.93%
Número de features selecionadas: 1898
Melhores par

{'accuracy_mean': 0.5685476410730804,
 'accuracy_std': 0.06254071097355762,
 'balanced_accuracy_mean': 0.5685574229691877,
 'balanced_accuracy_std': 0.06174763320259053,
 'f1_score_mean': 0.5634460053910723,
 'f1_score_std': 0.06708251673448959}

In [None]:
# Class balance check: number of rows in `df` for each value of `label`.
df.loc[:, 'label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
2,82
1,82
0,70
