In [1]:
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

file_path = '../Ficheiros CSV/CVD_cleaned.csv'  # chamar file com data reduction 
data = pd.read_csv(file_path)

data_discretized = data.copy()

numeric_columns = [
    "Height_(cm)",
    "Weight_(kg)",
    "BMI",
    "Alcohol_Consumption",
    "Fruit_Consumption",
    "Green_Vegetables_Consumption",
    "FriedPotato_Consumption",
]

# Configurar o discretizador
n_bins = 10  # Número de bins
discretizer = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='kmeans')

# Aplicar a discretização nas colunas numéricas
data_discretized[numeric_columns] = discretizer.fit_transform(data_discretized[numeric_columns])

# Mostrar os limites dos bins para cada coluna
for col in numeric_columns:
    print(f"Intervalos para {col}: {discretizer.bin_edges_}")

# Guardar o dataset discretizado num novo arquivo
output_file_path = '../Ficheiros CSV/CVD_discretized.csv'  # Nome do arquivo de saída
data_discretized.to_csv(output_file_path, index=False)

print(f"Dataset discretizado salvo como: {output_file_path}")

data_discretized.head()

Intervalos para Height_(cm): [array([ 91.        , 114.16637529, 134.95423424, 149.8065543 ,
        158.84785405, 167.0437888 , 176.12371207, 185.35376896,
        193.8421687 , 208.00602872, 241.        ])
 array([ 25.4       ,  63.45764499,  78.71478367,  95.5299662 ,
        114.62206791, 136.26746441, 159.28002091, 182.89784807,
        209.46355568, 243.275     , 293.02      ])
 array([12.02      , 22.5105613 , 26.05520107, 29.27837861, 32.49629865,
        36.06552477, 40.33654574, 45.76990169, 53.54182428, 67.39755762,
        99.33      ])
 array([ 0.        ,  2.152728  ,  5.74025613,  9.28771889, 12.9790314 ,
        15.51777397, 18.07362393, 21.88470726, 24.46896815, 27.26205172,
        30.        ])
 array([  0.        ,  10.08185288,  22.46519399,  34.756196  ,
         43.48481511,  53.58687928,  69.36016772,  84.36368695,
         94.40746488, 109.40303719, 120.        ])
 array([  0.        ,   9.51086364,  22.08399954,  35.0996624 ,
         50.36784385,  63.48023578

Unnamed: 0,General_Health,Checkup,Exercise,Heart_Disease,Skin_Cancer,Other_Cancer,Depression,Diabetes,Arthritis,Sex,Age_Category,Height_(cm),Weight_(kg),BMI,Smoking_History,Alcohol_Consumption,Fruit_Consumption,Green_Vegetables_Consumption,FriedPotato_Consumption
0,Poor,Within the past 2 years,No,No,No,No,No,No,Yes,Female,70-74,3.0,0.0,0.0,Yes,0.0,2.0,1.0,1.0
1,Very Good,Within the past year,No,Yes,No,No,No,Yes,No,Female,70-74,4.0,1.0,2.0,No,0.0,2.0,0.0,0.0
2,Very Good,Within the past year,Yes,No,No,No,No,Yes,No,Female,60-64,4.0,2.0,4.0,No,1.0,1.0,0.0,1.0
3,Poor,Within the past year,Yes,Yes,No,No,No,Yes,No,Male,75-79,6.0,2.0,2.0,No,0.0,2.0,2.0,0.0
4,Good,Within the past year,No,No,No,No,No,No,No,Male,80+,7.0,2.0,1.0,Yes,0.0,0.0,0.0,0.0
