In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [13]:
# Conexión a un archivo CSV preprocesado perteneciente a la DB MIMIC-III
# Lectura del archivo CSV en un DataFrame de Pandas
df = pd.read_csv(r'C:\Users\rocio\OneDrive\Escritorio\henry\M6\Clase 01\mimic\modelo.csv')

In [14]:
df.head()

Unnamed: 0,subject_id,fecha,Arterial BP [Systolic],Arterial Blood Pressure mean,Arterial Blood Pressure systolic,Bicarbonate,"Bilirubin, Total",Creatinine,GCS Total,Heart Rate,Platelet Count,"Potassium, Whole Blood",Respiratory Rate,"Sodium, Whole Blood",Temperature Fahrenheit,Urea Nitrogen,Vancomycin/Random,WBC Count,pO2
0,10006,2164-09-24,,,,27.0,,7.0,,,157.0,,,,,31.0,,,
1,10006,2164-09-25,,,,28.0,,7.4,,,168.0,,,,,34.0,,,
2,10006,2164-09-26,,,,28.0,,8.3,,,152.0,,,,,42.0,,,
3,10006,2164-09-27,,,,,,,,,127.0,,,,,,,,
4,10006,2164-09-28,,,,27.0,,6.2,,,,,,,,28.0,,,


In [15]:
# Renombra las columnas del DataFrame
df = df.rename(columns={
    'subject_id': 'paciente',
    'fecha': 'fecha',
    'Arterial BP [Systolic]': 'PAS',
    'Arterial Blood Pressure mean': 'PAM',
    'Arterial Blood Pressure systolic': 'sistolica',
    'Bicarbonate': 'bicarbonato',
    'Bilirubin, Total': 'bilirrubina',
    'Creatinine': 'creatinina',
    'GCS Total': 'GCS',
    'Heart Rate': 'ritmo_cardiaco',
    'Platelet Count': 'plaquetas',
    'Potassium, Whole Blood': 'potasio',
    'Whole Blood': 'sangre_total',
    'Respiratory Rate': 'ritmo_respiratorio',
    'Sodium, Whole Blood': 'sodio',
    'Temperature Fahrenheit': 'temperatura_F',
    'Urea Nitrogen': 'urea',
    'Vancomycin/Random': 'vancomicina',
    'WBC Count': 'glucemia',
    'pO2': 'pO2'
})


In [16]:
#Visualizo
df.head()

Unnamed: 0,paciente,fecha,PAS,PAM,sistolica,bicarbonato,bilirrubina,creatinina,GCS,ritmo_cardiaco,plaquetas,potasio,ritmo_respiratorio,sodio,temperatura_F,urea,vancomicina,glucemia,pO2
0,10006,2164-09-24,,,,27.0,,7.0,,,157.0,,,,,31.0,,,
1,10006,2164-09-25,,,,28.0,,7.4,,,168.0,,,,,34.0,,,
2,10006,2164-09-26,,,,28.0,,8.3,,,152.0,,,,,42.0,,,
3,10006,2164-09-27,,,,,,,,,127.0,,,,,,,,
4,10006,2164-09-28,,,,27.0,,6.2,,,,,,,,28.0,,,


In [17]:
#Verifico si todos los registros son unicos
count = df.nunique()
print(count)

paciente               100
fecha                 1649
PAS                     80
PAM                     58
sistolica               73
bicarbonato             40
bilirrubina            116
creatinina              92
GCS                     14
ritmo_cardiaco          84
plaquetas              456
potasio                 35
ritmo_respiratorio      36
sodio                   18
temperatura_F           68
urea                   135
vancomicina             47
glucemia                 1
pO2                    182
dtype: int64


In [18]:
# Cuenta la cantidad de valores nulos en cada fila del DataFrame
null_counts = df.isnull().sum(axis=1)

# Imprime los resultados
print(null_counts)


0       13
1       13
2       13
3       16
4       14
        ..
1713    12
1714    13
1715    13
1716    13
1717    16
Length: 1718, dtype: int64


In [19]:
#Elimino aquellos registros cuyos nulos sean >60 % (ya que si tiene tantos valores faltantes no me va a servir para entrenar el modelo)

# Define el umbral de valores no nulos
threshold = len(df.columns) * 0.4

# Elimina las filas que no cumplen el umbral de valores no nulos
df = df.dropna(thresh=threshold)


In [20]:
#Corroboro
df.head()

Unnamed: 0,paciente,fecha,PAS,PAM,sistolica,bicarbonato,bilirrubina,creatinina,GCS,ritmo_cardiaco,plaquetas,potasio,ritmo_respiratorio,sodio,temperatura_F,urea,vancomicina,glucemia,pO2
20,10006,2164-10-23,,,,29.0,1.0,3.0,15.0,104.0,116.0,,25.0,,,9.0,,,
21,10006,2164-10-24,,,,31.0,,3.5,15.0,96.0,106.0,,20.0,,,11.0,21.6,,
22,10006,2164-10-25,,,,29.0,,5.3,15.0,87.0,108.0,,20.0,,,20.0,,,
30,10006,2164-11-16,,,,31.0,,3.2,,,160.0,3.6,,,,7.0,,,94.0
40,10006,2165-03-01,,,,25.0,0.3,8.6,,,166.0,4.8,,,,34.0,,,


In [21]:
# Cuenta la cantidad de valores nulos en cada fila del DataFrame
null_counts = df.isnull().sum(axis=1)

# Imprime los resultados
print(null_counts)

20       9
21       9
22      10
30      11
40      11
        ..
1708     8
1709     8
1710     8
1711    11
1712    11
Length: 584, dtype: int64


In [22]:
#Selecciono pacientes que hayan tenido como mínimo 3 mediciones de signos vitales

# Agrupar por paciente y contar registros no nulos por columna de signos vitales
counts = df.groupby('paciente').agg({'PAS': 'count',
                                       'PAM': 'count',
                                       'bicarbonato': 'count',
                                       'bilirrubina': 'count',
                                       'creatinina': 'count',
                                       'GCS': 'count',
                                       'ritmo_cardiaco': 'count',
                                       'plaquetas': 'count',
                                       'potasio': 'count',
                                       'ritmo_respiratorio': 'count',
                                       'sodio': 'count',
                                       'temperatura_F': 'count',
                                       'urea': 'count',
                                       'glucemia': 'count',
                                       'pO2': 'count'})

# Seleccionar solamente los registros de pacientes con al menos 3 registros de signos vitales
selected_subject_ids = counts[counts['PAS'] >= 3].index

# Filtrar los datos del DataFrame original por los pacientes seleccionados
df_selected = df[df['paciente'].isin(selected_subject_ids)]


In [None]:
df_selected.head()

Entrenamiento de modelo de machine learning


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Crear la variable objetivo
df_selected['condicion_medica'] = ...

# Separar las características y la variable objetivo
X = df_selected.drop(['paciente', 'fecha', 'condicion_medica'], axis=1)
y = df_selected['condicion_medica']

# Dividir los datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear y entrenar el modelo de árbol de decisión
tree_clf = DecisionTreeClassifier(max_depth=3, random_state=42)
tree_clf.fit(X_train, y_train)

# Evaluar el modelo en los datos de prueba
y_pred = tree_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Exactitud:", accuracy)


In [24]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

selected_subject_ids = counts[counts['PAS'] >= 3].index
df_selected = df[df['paciente'].isin(selected_subject_ids)]

# Definir la variable objetivo 'sepsis'
df_selected['sepsis'] = ((df_selected['ritmo_cardiaco'] >= 90) & 
                         (df_selected['PAS'] <= 90) &
                         (df_selected['temperatura_F'] <= 36.0)).astype(int)

# Seleccionar los signos vitales relevantes
X = df_selected[['PAS', 'ritmo_cardiaco', 'temperatura_F']]
y = df_selected['sepsis']

# Dividir los datos en conjuntos de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Crear y entrenar el modelo de árbol de decisión
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluar el modelo
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f'Train score: {train_score:.3f}, Test score: {test_score:.3f}')




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['sepsis'] = ((df_selected['ritmo_cardiaco'] >= 90) &


ValueError: Input X contains NaN.
DecisionTreeClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values