In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (
    OneHotEncoder, StandardScaler
)

In [3]:
df_cacao = pd.read_csv("flavors_of_cacao.csv")
df_cacao.head()

Unnamed: 0,Company \n(Maker-if known),Specific Bean Origin\nor Bar Name,REF,Review\nDate,Cocoa\nPercent,Company\nLocation,Rating,Bean\nType,Broad Bean\nOrigin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru


In [6]:
# Limpiamos los nombres de las columnas
df_cacao.columns = df_cacao.columns.str.replace(r"[\n\s]+", "_", regex=True)
df_cacao.head()

Unnamed: 0,Company_(Maker-if_known),Specific_Bean_Origin_or_Bar_Name,REF,Review_Date,Cocoa_Percent,Company_Location,Rating,Bean_Type,Broad_Bean_Origin
0,A. Morin,Agua Grande,1876,2016,63%,France,3.75,,Sao Tome
1,A. Morin,Kpime,1676,2015,70%,France,2.75,,Togo
2,A. Morin,Atsane,1676,2015,70%,France,3.0,,Togo
3,A. Morin,Akata,1680,2015,70%,France,3.5,,Togo
4,A. Morin,Quilla,1704,2015,70%,France,3.5,,Peru


In [7]:
# Identificamos las columnas númericas y categoricas
num_features = df_cacao.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_features = df_cacao.select_dtypes(include=['object']).columns.tolist()

In [10]:
# Definimos las transformaciones para cada tipo de dato
num_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Escalado de datos numéricos
])

cat_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Codificación OneHot
])

In [11]:
# Hacemos una combinación en ColumnTransformer
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, num_features),
    ('cat', cat_transformer, cat_features)
])

# Crear el pipeline final
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

In [12]:
# Aplicamos el Pipeline
df_cacao_transformed = pipeline.fit_transform(df_cacao)

# Convertir a DataFrame si es necesario
df_cacao_transformed = pd.DataFrame(df_cacao_transformed.toarray()) 

In [13]:
# Visualizamos las primeras filas del DataFrame transformado
df_cacao_transformed.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1696,1697,1698,1699,1700,1701,1702,1703,1704,1705
0,1.519895,1.255692,1.180231,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.158056,0.913975,-0.912129,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.158056,0.913975,-0.389039,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.165293,0.913975,0.657141,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.208714,0.913975,0.657141,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
