# Preprocesamiento

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the raw stroke dataset into a pandas DataFrame
raw_csv_path = "Data/healthcare-dataset-stroke-data.csv"
df = pd.read_csv(raw_csv_path)
# Transposed preview of the first rows: one row per column of the dataset
df.head().T

Unnamed: 0,0,1,2,3,4
id,9046,51676,31112,60182,1665
gender,Male,Female,Male,Female,Female
age,67.0,61.0,80.0,49.0,79.0
hypertension,0,0,0,0,1
heart_disease,1,0,1,0,0
ever_married,Yes,Yes,Yes,Yes,Yes
work_type,Private,Self-employed,Private,Private,Self-employed
Residence_type,Urban,Rural,Rural,Urban,Rural
avg_glucose_level,228.69,202.21,105.92,171.23,174.12
bmi,36.6,,32.5,34.4,24.0


In [3]:
# Drop the record identifier column; no later cell in this notebook reads it.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps the
# cell idempotent: re-running it no longer raises KeyError once "id" is gone.
df = df.drop(columns="id", errors="ignore")

In [4]:
# Number of missing values in each column
df.isna().sum()

gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

Fijémonos en los datos de tipo categórico.

In [5]:
# Summary statistics for every column that is neither float nor int64
categorical_summary = df.describe(exclude=['float', 'int64'])
categorical_summary.round(2)

Unnamed: 0,gender,ever_married,work_type,Residence_type,smoking_status
count,5110,5110,5110,5110,5110
unique,3,2,5,2,4
top,Female,Yes,Private,Urban,never smoked
freq,2994,3353,2925,2596,1892


## Información inicial sobre el conjunto de datos
Faltan datos en la variable bmi (201 valores nulos).
Hay datos categóricos y numéricos.
- Características categóricas: gender, ever_married, work_type, Residence_type, smoking_status
- Características numéricas binarias: hypertension,heart_disease, stroke
- Características numéricas continuas: age, avg_glucose_level, bmi

La mayoría de los datos son categóricos, por lo que hay que prestar especial atención a su visualización.

La extracción de características se puede realizar agrupando en intervalos las características continuas (edad, nivel de glucosa e IMC) y dejando el resto de las variables como está.

### Rellenar datos faltantes
Se imputan los valores faltantes en la variable del bmi con KNN.

El método KNN para imputación de datos busca los vecinos más cercanos para predecir los valores faltantes basándose en las características de los datos disponibles.

In [6]:
# Inspect the bmi column before imputation (it still contains NaN entries)
df.loc[:, "bmi"]

0       36.6
1        NaN
2       32.5
3       34.4
4       24.0
        ... 
5105     NaN
5106    40.0
5107    30.6
5108    25.6
5109    26.2
Name: bmi, Length: 5110, dtype: float64

In [7]:
from sklearn.impute import KNNImputer

In [8]:
# Columns handed to the imputer: bmi (the column with gaps) plus age and
# avg_glucose_level, which are used to find similar rows.
numeric_features = df[['bmi', 'age', 'avg_glucose_level']]

# KNN imputer configured with 3 neighbours
knn_imputer = KNNImputer(n_neighbors=3)

# Fit on the numeric columns and fill the gaps; the result is a plain array,
# so the row labels of `df` are not carried along.
imputed_data = knn_imputer.fit_transform(numeric_features)

# Rebuild a DataFrame that carries the ORIGINAL row index. Without index=...
# the new frame gets a fresh 0..n-1 index, and the assignment below (which
# aligns on index labels) would write NaN or misplaced values whenever `df`
# does not have that default index (e.g. after filtering or re-indexing).
imputed_df = pd.DataFrame(
    imputed_data,
    columns=numeric_features.columns,
    index=numeric_features.index,
)

# Write back only the bmi column; age and avg_glucose_level had no gaps
df['bmi'] = imputed_df['bmi']

# Confirm that no column reports missing values any more
df.isnull().sum()

gender               0
age                  0
hypertension         0
heart_disease        0
ever_married         0
work_type            0
Residence_type       0
avg_glucose_level    0
bmi                  0
smoking_status       0
stroke               0
dtype: int64

In [9]:
# Inspect the bmi column again after imputation
df.loc[:, "bmi"]

0       36.6
1       30.5
2       32.5
3       34.4
4       24.0
        ... 
5105    26.8
5106    40.0
5107    30.6
5108    25.6
5109    26.2
Name: bmi, Length: 5110, dtype: float64

En un notebook encontré la siguiente generación de características a partir de las que ya tenemos. Los valores de agrupación se toman de los siguientes artículos:
- https://www.medicalnewstoday.com/articles/323446#body-mass-index
- https://kidspicturedictionary.com/english-through-pictures/people-english-through-pictures/age-physical-description/
- https://agamatrix.com/blog/normal-blood-sugar-level-chart/

In [10]:
# Disabled feature engineering: each line would bin one continuous column
# (bmi, age, avg_glucose_level) into labelled categories with pd.cut.
# Left commented out, so no *_categ column is created in this notebook.
# df['bmi_categ'] = pd.cut(df['bmi'], bins = [0, 19, 25,30,10000], labels = ['Underweight', 'Ideal', 'Overweight', 'Obesity'])
# df['age_categ'] = pd.cut(df['age'], bins = [0,13,18, 45,60,200], labels = ['Children', 'Teens', 'Adults','Mid Adults','Elderly'])
# df['glucose_categ'] = pd.cut(df['avg_glucose_level'], bins = [0,90,160,230,500], labels = ['Low', 'Normal', 'High', 'Very High'])

In [11]:
# Preview the first rows of the frame after the bmi imputation
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,30.5,never smoked,1
2,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [12]:
# Continuous numeric features
numeric_columns = [
    'age',
    'bmi',
    'avg_glucose_level',
]

# Discrete columns: the text-valued features, the 0/1 flags and `stroke`
categorical_columns = [
    'gender',
    'hypertension',
    'heart_disease',
    'ever_married',
    'work_type',
    'Residence_type',
    'smoking_status',
    'stroke',
]

In [13]:
# Some statistical analysis (currently disabled).
# If re-enabled, each numeric column gets one row of a 3x3 grid: a KDE stacked
# by `stroke`, a boxplot, and a scatter plot of the column against `stroke`.
# i = 0
# fig, ax = plt.subplots(3, 3, figsize=(15, 8))
# plt.subplots_adjust(hspace = 0.5)
# for num_col in numeric_columns :
#     sns.kdeplot(x=num_col, hue='stroke', data=df, multiple='stack', ax=ax[i,0])
#     sns.boxplot(x=num_col, data=df, ax=ax[i, 1])
#     sns.scatterplot(x=num_col, y='stroke', data=df, ax=ax[i, 2])
#     i+=1
# plt.show()

### Cambiar variables categóricas por numéricas

In [14]:
# Integer code assigned to each label of every text-valued column
category_codes = {
    'gender': {'Male': 0, 'Female': 1, 'Other': 2},
    'ever_married': {'Yes': 0, 'No': 1},
    'work_type': {'Private': 0, 'Self-employed': 1, 'Govt_job': 2, 'children': 3, 'Never_worked': 4},
    'smoking_status': {'formerly smoked': 0, 'never smoked': 1, 'smokes': 2, 'Unknown': 3},
    'Residence_type': {'Urban': 0, 'Rural': 1},
    # Codes for the optional binned features (disabled, like the pd.cut cell that creates them):
    # 'bmi_categ':  {'Underweight':0, 'Ideal': 1, 'Overweight': 2, 'Obesity': 3},
    # 'age_categ':  {'Children':0, 'Teens': 1, 'Adults': 2,'Mid Adults': 3,'Elderly': 4},
    # 'glucose_categ':  {'Low': 0, 'Normal': 1, 'High': 2, 'Very High': 3},
}

# Encode with Series.map rather than DataFrame.replace: turning strings into
# ints through replace() depends on pandas silently downcasting the object
# columns, which is deprecated and emits a FutureWarning. map() builds the
# integer columns directly, so the result does not depend on that behaviour.
data = df.copy()
for column, codes in category_codes.items():
    encoded = df[column].map(codes)
    # map() yields NaN for any label missing from the mapping; stop here with a
    # clear message instead of writing a partially encoded column to disk later.
    if encoded.isna().any():
        unknown = sorted(df.loc[encoded.isna(), column].astype(str).unique())
        raise ValueError(f"Unmapped values in column {column!r}: {unknown}")
    data[column] = encoded


  data = df.replace({


In [15]:
# Display the encoded frame (the rendered table elides the middle rows)
data

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,0,0,0,228.69,36.6,0,1
1,1,61.0,0,0,0,1,1,202.21,30.5,1,1
2,0,80.0,0,1,0,0,1,105.92,32.5,1,1
3,1,49.0,0,0,0,0,0,171.23,34.4,2,1
4,1,79.0,1,0,0,1,1,174.12,24.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,1,80.0,1,0,0,0,0,83.75,26.8,1,0
5106,1,81.0,0,0,0,1,0,125.20,40.0,1,0
5107,1,35.0,0,0,0,1,1,82.99,30.6,1,0
5108,0,51.0,0,0,0,0,1,166.29,25.6,0,0


In [16]:
# Persist the encoded dataset; the row index is not written to the file
encoded_csv_path = "Data/stroke_data.csv"
data.to_csv(encoded_csv_path, index=False)

In [17]:
# Read the saved CSV back to verify the export
df_loaded = pd.read_csv("Data/stroke_data.csv")

# Displaying the frame alone verifies nothing; check explicitly that the file
# round-trips with the same shape and the same column order as `data`.
assert df_loaded.shape == data.shape, "reloaded CSV has a different shape than `data`"
assert list(df_loaded.columns) == list(data.columns), "reloaded CSV columns differ from `data`"

df_loaded

Unnamed: 0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,67.0,0,1,0,0,0,228.69,36.6,0,1
1,1,61.0,0,0,0,1,1,202.21,30.5,1,1
2,0,80.0,0,1,0,0,1,105.92,32.5,1,1
3,1,49.0,0,0,0,0,0,171.23,34.4,2,1
4,1,79.0,1,0,0,1,1,174.12,24.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
5105,1,80.0,1,0,0,0,0,83.75,26.8,1,0
5106,1,81.0,0,0,0,1,0,125.20,40.0,1,0
5107,1,35.0,0,0,0,1,1,82.99,30.6,1,0
5108,0,51.0,0,0,0,0,1,166.29,25.6,0,0
