In [16]:
# Librerías
# Para leer/guardar archivos en formato pickle
import pickle

# Para el manejo y análisis de estructuras de datos
import pandas as pd

# Para realizar One-Hot Encoding
from sklearn.preprocessing import OneHotEncoder 

In [19]:
# Leer el archivo
df = pd.read_pickle("datos/1-bikes-eda.pkl")
df.head()

Unnamed: 0_level_0,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
instant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,2018-01-01,winter,0,1,1,0,0,2,14.110847,18.18125,80.5833,10.749882,331,654,985
2,2018-01-02,winter,0,1,0,1,1,2,14.902598,17.68695,69.6087,16.652113,131,670,801
3,2018-01-03,winter,0,1,0,2,1,1,8.050924,9.47025,43.7273,16.636703,120,1229,1349
4,2018-01-04,winter,0,1,0,3,1,1,8.2,10.6061,59.0435,10.739832,108,1454,1562
5,2018-01-05,winter,0,1,0,4,1,1,9.305237,11.4635,43.6957,12.5223,82,1518,1600


In [20]:
df.dtypes

dteday        datetime64[ns]
season              category
yr                  category
mnth                category
holiday             category
weekday             category
workingday          category
weathersit          category
temp                 float64
atemp                float64
hum                  float64
windspeed            float64
casual                 int64
registered             int64
cnt                    int64
dtype: object

Encoding columna categórica "season", no le damos orden, indistintamente se puede utilizar el método One-Hot Encoding o get_dummies, ya que ambos métodos crean tantas columnas como opciones, y el valor es: 
- 1 cuando está en la columna original
- 0 cuando no está en la columna original

In [21]:
# Función para aplicar el método One-Hot Encoding
def aplicar_one_hot_encoding(dfe, columna):
    
    # Iniciar el método
    oh = OneHotEncoder()
    
    # Hacer la codificación de los datos para la variable dada 
    transformados = oh.fit_transform(dfe[[columna]])
    
    # Convertir nuestro array con la codificación hecha en un dataframe
    oh_df = pd.DataFrame(transformados.toarray())
    
    # El método get_feature_names nos va a dar el nombre de las columnas nuevas que se nos generarán
    oh_df.columns = oh.get_feature_names_out()
    
    # Concatenar el dataframe original con el dataframe que acabamos de crear
    final = pd.concat([dfe,oh_df], axis=1)
    
    # Eliminar la columna original 
    final.drop(columna, axis=1, inplace=True)

    # Convertir a tipo entero las columnas generadas
    lista_columnas = list(oh_df.columns)
    for col in lista_columnas:
        final[col] = final[col].astype("Int64")

    return final

In [22]:
# Aplicar el método One-Hot Encoding
df = aplicar_one_hot_encoding(df, "season")

In [23]:
df.head()

Unnamed: 0,dteday,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,season_autumn,season_spring,season_summer,season_winter
1,2018-01-01,0,1,1,0,0,2,14.110847,18.18125,80.5833,10.749882,331.0,654.0,985.0,0,0,0,1
2,2018-01-02,0,1,0,1,1,2,14.902598,17.68695,69.6087,16.652113,131.0,670.0,801.0,0,0,0,1
3,2018-01-03,0,1,0,2,1,1,8.050924,9.47025,43.7273,16.636703,120.0,1229.0,1349.0,0,0,0,1
4,2018-01-04,0,1,0,3,1,1,8.2,10.6061,59.0435,10.739832,108.0,1454.0,1562.0,0,0,0,1
5,2018-01-05,0,1,0,4,1,1,9.305237,11.4635,43.6957,12.5223,82.0,1518.0,1600.0,0,0,0,1


In [24]:
df.dtypes

dteday           datetime64[ns]
yr                     category
mnth                   category
holiday                category
weekday                category
workingday             category
weathersit             category
temp                    float64
atemp                   float64
hum                     float64
windspeed               float64
casual                  float64
registered              float64
cnt                     float64
season_autumn             Int64
season_spring             Int64
season_summer             Int64
season_winter             Int64
dtype: object

In [27]:
# Reorganizar el orden de las columnas
df = df.reindex(columns=['dteday', 'season_autumn', 'season_spring', 'season_summer', 'season_winter', 
                         'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit', 'temp', 'atemp', 
                         'hum', 'windspeed', 'casual', 'registered', 'cnt'])

In [28]:
df.head()

Unnamed: 0,dteday,season_autumn,season_spring,season_summer,season_winter,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
1,2018-01-01,0,0,0,1,0,1,1,0,0,2,14.110847,18.18125,80.5833,10.749882,331.0,654.0,985.0
2,2018-01-02,0,0,0,1,0,1,0,1,1,2,14.902598,17.68695,69.6087,16.652113,131.0,670.0,801.0
3,2018-01-03,0,0,0,1,0,1,0,2,1,1,8.050924,9.47025,43.7273,16.636703,120.0,1229.0,1349.0
4,2018-01-04,0,0,0,1,0,1,0,3,1,1,8.2,10.6061,59.0435,10.739832,108.0,1454.0,1562.0
5,2018-01-05,0,0,0,1,0,1,0,4,1,1,9.305237,11.4635,43.6957,12.5223,82.0,1518.0,1600.0


In [29]:
# Guardar en pickle
df.to_pickle('datos/3-bikes-encoding-sin-orden.pkl')