In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import (Model, 
                                     load_model, 
                                     Sequential)
from tensorflow.keras import layers

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import (accuracy_score, 
                             precision_score, 
                             recall_score,
                             confusion_matrix)

from sklearn.model_selection import LeaveOneGroupOut
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from table_data_utils import *

In [2]:
# List every GPU that TensorFlow can see.
devices = tf.config.list_physical_devices('GPU')
for device in devices:
    print(device)

# Report which device is currently in use; an empty GPU name means we fall back to "CPU".
nombre_gpu = tf.test.gpu_device_name()
print("Dispositivo actual:", nombre_gpu if nombre_gpu else "CPU")

PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')
Dispositivo actual: /device:GPU:0


In [3]:
# Load the acoustic-features table from the CSV file in the working directory.
datos_pacientes = pd.read_csv(
    filepath_or_buffer="ReplicatedAcousticFeatures-ParkinsonDatabase.csv"
)

# Preview the first five rows.
datos_pacientes.head(n=5)

Unnamed: 0,ID,Recording,Status,Gender,Jitter_rel,Jitter_abs,Jitter_RAP,Jitter_PPQ,Shim_loc,Shim_dB,...,Delta3,Delta4,Delta5,Delta6,Delta7,Delta8,Delta9,Delta10,Delta11,Delta12
0,CONT-01,1,0,1,0.25546,1.5e-05,0.001467,0.001673,0.030256,0.26313,...,1.407701,1.417218,1.380352,1.42067,1.45124,1.440295,1.403678,1.405495,1.416705,1.35461
1,CONT-01,2,0,1,0.36964,2.2e-05,0.001932,0.002245,0.023146,0.20217,...,1.331232,1.227338,1.213377,1.352739,1.354242,1.365692,1.32287,1.314549,1.318999,1.323508
2,CONT-01,3,0,1,0.23514,1.3e-05,0.001353,0.001546,0.019338,0.1671,...,1.412304,1.324674,1.276088,1.429634,1.455996,1.368882,1.438053,1.38891,1.305469,1.305402
3,CONT-02,1,0,0,0.2932,1.7e-05,0.001105,0.001444,0.024716,0.20892,...,1.5012,1.53417,1.323993,1.496442,1.472926,1.643177,1.551286,1.638346,1.604008,1.621456
4,CONT-02,2,0,0,0.23075,1.5e-05,0.001073,0.001404,0.013119,0.11607,...,1.508468,1.334511,1.610694,1.685021,1.417614,1.574895,1.640088,1.533666,1.297536,1.382023


In [4]:
# Candidate feature subsets evaluated in this notebook.
# Each list holds column names of the patients table; order is preserved as given.

# Subset named after a "dec gini" selection criterion.
subset_dec_gini_vars = [
    "HNR05", "HNR15", "HNR25", "HNR35", "HNR38",
    "MFCC0", "MFCC3", "MFCC4", "MFCC5", "MFCC6", "MFCC7",
    "MFCC8", "MFCC9", "MFCC10", "MFCC11", "MFCC12",
    "Delta0", "Delta1", "Delta2", "Delta3", "Delta5",
    "Delta7", "Delta9", "Delta10", "Delta11", "Delta12",
]

# Subset named after a "dec acc" selection criterion.
subset_dec_acc_vars = [
    "HNR05", "HNR15", "HNR25", "HNR35", "HNR38",
    "PPE",
    "MFCC3", "MFCC4", "MFCC5", "MFCC6", "MFCC7",
    "MFCC8", "MFCC9", "MFCC10", "MFCC11", "MFCC12",
    "Delta0", "Delta1", "Delta2", "Delta3", "Delta4",
    "Delta5", "Delta9", "Delta10", "Delta11", "Delta12",
]

# Subset named after an "imp xgb" selection criterion.
subset_imp_xgb_vars = [
    "Delta0", "HNR38", "MFCC4", "PPE", "HNR35",
    "GNE", "Delta11", "MFCC5", "Delta5", "MFCC3",
    "RPDE", "MFCC10", "Shim_loc", "Shi_APQ11", "MFCC9",
    "Delta7", "MFCC2", "MFCC6", "MFCC11", "MFCC1",
    "DFA", "Delta12", "Shim_APQ5", "MFCC7", "Delta6",
]

# Variables present in all three subsets above.
subset_interseccion_vars = [
    "HNR35", "HNR38",
    "MFCC3", "MFCC4", "MFCC5", "MFCC6", "MFCC7", "MFCC9", "MFCC10", "MFCC11",
    "Delta0", "Delta5", "Delta11", "Delta12",
]

In [20]:
# Gender of each patient, one entry per patient ID.
#
# The per-patient result vectors (y_group_true / y_group_pred) are filled in the
# order in which LeaveOneGroupOut yields its folds, i.e. the sorted order of
# np.unique(ids). Building `sex` in that same order keeps the per-sex masks
# aligned with those vectors even if the CSV rows are not sorted by ID
# (first-appearance order, as produced by drop_duplicates, would not be).
genero_por_id = datos_pacientes.groupby('ID')['Gender']

# Every recording of a patient must carry the same Gender value; otherwise the
# one-entry-per-patient vector below would be ill-defined.
assert (genero_por_id.nunique() == 1).all(), "Gender is not constant within some patient ID"

sex = genero_por_id.first().loc[np.unique(datos_pacientes['ID'].to_numpy())].to_numpy()

# Primer conjunto
subset_dec_gini_vars

In [25]:
# Patient-wise cross-validation for the first feature set (subset_dec_gini_vars).
# model(), compile_fit_model() and model_evaluate() are not defined in this
# notebook; presumably they come from the star import of table_data_utils.

# Grouping key (patient ID), feature matrix and target.
ids = datos_pacientes['ID'].to_numpy()
# Identifier/label columns are dropped first so the selection below fails loudly
# if a feature list ever names one of them.
X = datos_pacientes.drop(columns=['ID', 'Recording', 'Status', 'Gender'])
X = X[subset_dec_gini_vars]
y = np.array(datos_pacientes['Status'])

# Leave-one-group-out: each fold holds out every row that shares one patient ID.
logo = LeaveOneGroupOut()
# Number of folds == number of distinct IDs; derived from the data instead of
# hard-coding it, so the result buffers always match the number of folds.
n_grupos = logo.get_n_splits(X, y, groups=ids)

# Per-fold accuracy computed over the individual rows of the held-out patient.
test_metrics = np.zeros(n_grupos)

# One true label and one predicted label per held-out patient.
y_group_true = np.zeros(n_grupos)
y_group_pred = np.zeros(n_grupos)

for i, (train_index, test_index) in enumerate(logo.split(X, y, groups=ids)):

    # A different, but fixed, TensorFlow seed for each fold.
    tf.random.set_seed(i)

    X_train = X.iloc[train_index, :]; X_test = X.iloc[test_index, :]
    y_train = y[train_index]; y_test = y[test_index]

    # The scaler is fitted on the training rows only and then applied to the
    # held-out rows, so no statistics of the test patient reach the training data.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    modelo = model()

    trained_model, _ = compile_fit_model(modelo, X_train=X_train_scaled, y_train=y_train)

    # Assumes model_evaluate returns non-negative integer class labels, as
    # required by np.bincount below — TODO confirm against table_data_utils.
    y_test_preds = model_evaluate(model=trained_model, X_test=X_test_scaled)

    # Majority vote over the rows of the held-out patient. np.argmax returns the
    # first maximum, so a tie resolves to class 0.
    y_group_pred[i] = np.argmax(np.bincount(y_test_preds, minlength=2))
    y_group_true[i] = np.argmax(np.bincount(y_test, minlength=2))

    # Row-level accuracy for this fold (groups not taken into account).
    test_metrics[i] = accuracy_score(y_pred=y_test_preds, y_true=y_test)

    print(f"********************** iteracion {i + 1} **********************")

    # Drop the per-fold objects and reset the Keras session before the next fold.
    del modelo, trained_model, X_train, y_train, X_test, y_test, X_train_scaled, X_test_scaled
    tf.keras.backend.clear_session()

********************** iteracion 1 **********************
********************** iteracion 2 **********************
********************** iteracion 3 **********************
********************** iteracion 4 **********************
********************** iteracion 5 **********************
********************** iteracion 6 **********************
********************** iteracion 7 **********************
********************** iteracion 8 **********************
********************** iteracion 9 **********************
********************** iteracion 10 **********************
********************** iteracion 11 **********************
********************** iteracion 12 **********************
********************** iteracion 13 **********************
********************** iteracion 14 **********************
********************** iteracion 15 **********************
********************** iteracion 16 **********************
********************** iteracion 17 **********************
******

In [26]:
# Patient-level scores: compare the per-patient true labels with the per-patient
# predicted labels. The first line is the plain agreement rate; the metric
# functions follow in the order accuracy, precision, recall, specificity.
print(np.mean(y_group_true == y_group_pred))
for metrica in (accuracy_score, precision_score, recall_score, specificity_score):
    print(metrica(y_pred=y_group_pred, y_true=y_group_true))

0.775
0.775
0.7894736842105263
0.75
0.8


In [27]:
# Split the patient-level labels by sex: entries with sex == 0 go to the
# "hombres" arrays and entries with sex == 1 to the "mujeres" arrays.
es_hombre = sex == 0
es_mujer = sex == 1

hombres, hombres_preds = y_group_true[es_hombre], y_group_pred[es_hombre]
mujeres, mujeres_preds = y_group_true[es_mujer], y_group_pred[es_mujer]

# For each metric, print the "hombres" value first and the "mujeres" value second.
for metrica in (accuracy_score, precision_score, recall_score, specificity_score):
    print(metrica(y_pred=hombres_preds, y_true=hombres))
    print(metrica(y_pred=mujeres_preds, y_true=mujeres))

0.7083333333333334
0.875
0.8
0.7777777777777778
0.6153846153846154
1.0
0.8181818181818182
0.7777777777777778


# Tercer conjunto
subset_imp_xgb_vars

In [28]:
# Patient-wise cross-validation for the third feature set (subset_imp_xgb_vars).
# model(), compile_fit_model() and model_evaluate() are not defined in this
# notebook; presumably they come from the star import of table_data_utils.

# Grouping key (patient ID), feature matrix and target.
ids = datos_pacientes['ID'].to_numpy()
# Identifier/label columns are dropped first so the selection below fails loudly
# if a feature list ever names one of them.
X = datos_pacientes.drop(columns=['ID', 'Recording', 'Status', 'Gender'])
X = X[subset_imp_xgb_vars]
y = np.array(datos_pacientes['Status'])

# Leave-one-group-out: each fold holds out every row that shares one patient ID.
logo = LeaveOneGroupOut()
# Number of folds == number of distinct IDs; derived from the data instead of
# hard-coding it, so the result buffers always match the number of folds.
n_grupos = logo.get_n_splits(X, y, groups=ids)

# Per-fold accuracy computed over the individual rows of the held-out patient.
test_metrics = np.zeros(n_grupos)

# One true label and one predicted label per held-out patient.
y_group_true = np.zeros(n_grupos)
y_group_pred = np.zeros(n_grupos)

for i, (train_index, test_index) in enumerate(logo.split(X, y, groups=ids)):

    # A different, but fixed, TensorFlow seed for each fold.
    tf.random.set_seed(i)

    X_train = X.iloc[train_index, :]; X_test = X.iloc[test_index, :]
    y_train = y[train_index]; y_test = y[test_index]

    # The scaler is fitted on the training rows only and then applied to the
    # held-out rows, so no statistics of the test patient reach the training data.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    modelo = model()

    trained_model, _ = compile_fit_model(modelo, X_train=X_train_scaled, y_train=y_train)

    # Assumes model_evaluate returns non-negative integer class labels, as
    # required by np.bincount below — TODO confirm against table_data_utils.
    y_test_preds = model_evaluate(model=trained_model, X_test=X_test_scaled)

    # Majority vote over the rows of the held-out patient. np.argmax returns the
    # first maximum, so a tie resolves to class 0.
    y_group_pred[i] = np.argmax(np.bincount(y_test_preds, minlength=2))
    y_group_true[i] = np.argmax(np.bincount(y_test, minlength=2))

    # Row-level accuracy for this fold (groups not taken into account).
    test_metrics[i] = accuracy_score(y_pred=y_test_preds, y_true=y_test)

    print(f"********************** iteracion {i + 1} **********************")

    # Drop the per-fold objects and reset the Keras session before the next fold.
    del modelo, trained_model, X_train, y_train, X_test, y_test, X_train_scaled, X_test_scaled
    tf.keras.backend.clear_session()

In [29]:
# Patient-level scores: compare the per-patient true labels with the per-patient
# predicted labels. The first line is the plain agreement rate; the metric
# functions follow in the order accuracy, precision, recall, specificity.
print(np.mean(y_group_true == y_group_pred))
for metrica in (accuracy_score, precision_score, recall_score, specificity_score):
    print(metrica(y_pred=y_group_pred, y_true=y_group_true))

In [30]:
# Split the patient-level labels by sex: entries with sex == 0 go to the
# "hombres" arrays and entries with sex == 1 to the "mujeres" arrays.
es_hombre = sex == 0
es_mujer = sex == 1

hombres, hombres_preds = y_group_true[es_hombre], y_group_pred[es_hombre]
mujeres, mujeres_preds = y_group_true[es_mujer], y_group_pred[es_mujer]

# For each metric, print the "hombres" value first and the "mujeres" value second.
for metrica in (accuracy_score, precision_score, recall_score, specificity_score):
    print(metrica(y_pred=hombres_preds, y_true=hombres))
    print(metrica(y_pred=mujeres_preds, y_true=mujeres))

# Cuarto conjunto
subset_interseccion_vars

In [31]:
# Patient-wise cross-validation for the fourth feature set (subset_interseccion_vars).
# model(), compile_fit_model() and model_evaluate() are not defined in this
# notebook; presumably they come from the star import of table_data_utils.

# Grouping key (patient ID), feature matrix and target.
ids = datos_pacientes['ID'].to_numpy()
# Identifier/label columns are dropped first so the selection below fails loudly
# if a feature list ever names one of them.
X = datos_pacientes.drop(columns=['ID', 'Recording', 'Status', 'Gender'])
X = X[subset_interseccion_vars]
y = np.array(datos_pacientes['Status'])

# Leave-one-group-out: each fold holds out every row that shares one patient ID.
logo = LeaveOneGroupOut()
# Number of folds == number of distinct IDs; derived from the data instead of
# hard-coding it, so the result buffers always match the number of folds.
n_grupos = logo.get_n_splits(X, y, groups=ids)

# Per-fold accuracy computed over the individual rows of the held-out patient.
test_metrics = np.zeros(n_grupos)

# One true label and one predicted label per held-out patient.
y_group_true = np.zeros(n_grupos)
y_group_pred = np.zeros(n_grupos)

for i, (train_index, test_index) in enumerate(logo.split(X, y, groups=ids)):

    # A different, but fixed, TensorFlow seed for each fold.
    tf.random.set_seed(i)

    X_train = X.iloc[train_index, :]; X_test = X.iloc[test_index, :]
    y_train = y[train_index]; y_test = y[test_index]

    # The scaler is fitted on the training rows only and then applied to the
    # held-out rows, so no statistics of the test patient reach the training data.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    modelo = model()

    trained_model, _ = compile_fit_model(modelo, X_train=X_train_scaled, y_train=y_train)

    # Assumes model_evaluate returns non-negative integer class labels, as
    # required by np.bincount below — TODO confirm against table_data_utils.
    y_test_preds = model_evaluate(model=trained_model, X_test=X_test_scaled)

    # Majority vote over the rows of the held-out patient. np.argmax returns the
    # first maximum, so a tie resolves to class 0.
    y_group_pred[i] = np.argmax(np.bincount(y_test_preds, minlength=2))
    y_group_true[i] = np.argmax(np.bincount(y_test, minlength=2))

    # Row-level accuracy for this fold (groups not taken into account).
    test_metrics[i] = accuracy_score(y_pred=y_test_preds, y_true=y_test)

    print(f"********************** iteracion {i + 1} **********************")

    # Drop the per-fold objects and reset the Keras session before the next fold.
    del modelo, trained_model, X_train, y_train, X_test, y_test, X_train_scaled, X_test_scaled
    tf.keras.backend.clear_session()

In [32]:
# Patient-level scores: compare the per-patient true labels with the per-patient
# predicted labels. The first line is the plain agreement rate; the metric
# functions follow in the order accuracy, precision, recall, specificity.
print(np.mean(y_group_true == y_group_pred))
for metrica in (accuracy_score, precision_score, recall_score, specificity_score):
    print(metrica(y_pred=y_group_pred, y_true=y_group_true))

In [33]:
# Split the patient-level labels by sex: entries with sex == 0 go to the
# "hombres" arrays and entries with sex == 1 to the "mujeres" arrays.
es_hombre = sex == 0
es_mujer = sex == 1

hombres, hombres_preds = y_group_true[es_hombre], y_group_pred[es_hombre]
mujeres, mujeres_preds = y_group_true[es_mujer], y_group_pred[es_mujer]

# For each metric, print the "hombres" value first and the "mujeres" value second.
for metrica in (accuracy_score, precision_score, recall_score, specificity_score):
    print(metrica(y_pred=hombres_preds, y_true=hombres))
    print(metrica(y_pred=mujeres_preds, y_true=mujeres))