In [1]:
"""
En este cuaderno la intención es discretizar una serie de datasets en formato .ord usando KBinsDiscretizer de sklearn
"""
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
import os


In [2]:
# Load every dataset: keep only the '.data' files found in the current
# working directory.
# (Alternative source directory: os.listdir('0_Datasets/No_discretizados'))
datasets = [entry for entry in os.listdir(os.getcwd()) if entry.endswith('.data')]

# Each file is read as a comma-separated table without a header row.
data_list = [pd.read_csv(data, sep=",", header=None) for data in datasets]

datasets


['pyrim.data',
 'auto.data',
 'triazines.data',
 'abalone.data',
 'housing.data',
 'wpbc.data',
 'diabetes.data',
 'stock.data',
 'machine.data']

In [3]:
# Report what was loaded: for each dataset print its file name, its
# (rows, columns) shape, its first rows, and a blank separator.
print("\n Se cargaron los datasets: \n")
for name, data in zip(datasets, data_list):
    for item in (name, data.shape, data.head(), "\n"):
        print(item)


 Se cargaron los datasets: 

pyrim.data
(74, 28)
    0     1    2    3    4    5    6      7     8     9   ...     18     19  \
0  0.5  0.26  0.1  0.9  0.9  0.9  0.1  0.367  0.42  0.26  ...  0.367  0.100   
1  0.5  0.26  0.1  0.1  0.1  0.5  0.1  0.367  0.58  0.42  ...  0.900  0.367   
2  0.3  0.42  0.1  0.1  0.5  0.5  0.1  0.367  0.26  0.26  ...  0.633  0.633   
3  0.1  0.74  0.7  0.1  0.5  0.1  0.1  0.367  0.10  0.26  ...  0.367  0.100   
4  0.1  0.42  0.4  0.1  0.5  0.1  0.1  0.367  0.10  0.26  ...  0.367  0.100   

    20   21   22   23   24     25     26     27  
0  0.1  0.1  0.1  0.1  0.0  0.100  0.100  0.571  
1  0.1  0.1  0.1  0.5  0.0  0.367  0.900  0.900  
2  0.1  0.1  0.5  0.5  0.0  0.367  0.367  0.833  
3  0.1  0.1  0.1  0.1  0.0  0.100  0.100  0.582  
4  0.1  0.1  0.1  0.1  0.0  0.100  0.100  0.587  

[5 rows x 28 columns]


auto.data
(392, 8)
   0      1      2       3     4   5  6     7
0  8  307.0  130.0  3504.0  12.0  70  1  18.0
1  8  350.0  165.0  3693.0  11.5  70  1

In [4]:
def discretize_last_column(X, bins):
    """Return a copy of ``X`` whose last column is replaced by bin labels.

    The last column is binned with ``KBinsDiscretizer`` using equal-width
    intervals (``strategy='uniform'``) and ordinal encoding; the resulting
    labels are cast to ``int`` before being stored.

    Parameters
    ----------
    X : array-like, 2-D
        Full dataset; the column to discretize is the last one. Anything
        ``np.array`` can turn into a 2-D array is accepted.
    bins : int
        Number of bins passed to ``KBinsDiscretizer`` as ``n_bins``.

    Returns
    -------
    numpy.ndarray
        A new array with the same shape as ``X``. All columns but the last
        are copied unchanged. The labels are stored in the array's own
        dtype, so a float array holds them as floats.

    Notes
    -----
    The input is never modified. Writing into the caller's array would
    also alter any DataFrame sharing its memory, so a second call on the
    "same" data would bin the labels of the first call instead of the raw
    values.
    """
    # np.array copies by default, so everything below touches only our copy.
    result = np.array(X)

    # KBinsDiscretizer expects a 2-D input: make the column (n_samples, 1).
    target = result[:, -1].reshape(-1, 1)

    est = KBinsDiscretizer(n_bins=bins, encode='ordinal', strategy='uniform')

    # Fit on the target column and convert the labels to integers.
    labels = est.fit_transform(target).astype(int)

    result[:, -1] = labels.ravel()

    return result

In [13]:
# y=pyrim.iloc[:,-1].values.reshape(-1,1)
# est=KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='quantile')
# y_binned = est.fit_transform(y)
# pyrim.iloc[:,-1]=y_binned.ravel()+1

# #ordenamos segun el valor de la variable de respuesta (menor a mayor)
# pyrim.sort_values(by=pyrim.columns[-1], inplace=True)

# #se guarda como un texto plano (txt)
# pyrim.to_csv('pyrim_ord.data', header=False, index=False, sep=' ', mode='w')



In [5]:
# Discretize the last column of every dataset into 5 and into 10 bins and
# save each result as a header-less, comma-separated file.
ruta_5bins="Discretizados/5_bins"
ruta_10bins="Discretizados/10_bins"

# to_csv does not create missing parent directories, so create them up front.
for ruta in (ruta_5bins, ruta_10bins):
    os.makedirs(ruta, exist_ok=True)

for data,name in zip(data_list,datasets):
    # Output paths for the discretized versions of this dataset
    ruta5=os.path.join(ruta_5bins,name)
    ruta10=os.path.join(ruta_10bins,name)
    # Pass a fresh copy to each call: `data.values` can share memory with the
    # DataFrame, and an in-place write by discretize_last_column would make
    # the 10-bin pass operate on the 5-bin labels instead of the raw values.
    data_aux5 = pd.DataFrame(discretize_last_column(data.to_numpy(copy=True), 5))
    data_aux10 = pd.DataFrame(discretize_last_column(data.to_numpy(copy=True), 10))
    # Save the discretized datasets as CSV
    data_aux5.to_csv(ruta5, header=False, index=False, sep=',', mode='w')
    data_aux10.to_csv(ruta10, header=False, index=False, sep=',', mode='w')



