In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.2-py2.py3-none-any.whl (81 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━[0m [32m71.7/81.8 kB[0m [31m2.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.8/81.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.2


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from category_encoders import BinaryEncoder, CountEncoder, TargetEncoder

In [None]:
# Toy dataset used by every encoder demo below, written one record per row:
# two nominal columns (color, clase), one ordered column (tamaño) and a
# numeric column (precio) that later serves as the target.
data = pd.DataFrame(
    [
        ('rojo', 'grande', 100, 'A'),
        ('verde', 'pequeño', 200, 'B'),
        ('verde', 'mediano', 150, 'D'),
        ('verde', 'grande', 300, 'C'),
        ('rojo', 'mediano', 250, 'E'),
    ],
    columns=['color', 'tamaño', 'precio', 'clase'],
)

In [None]:
# Keep an independent snapshot of the raw frame
data_original = data.copy(deep=True)

In [None]:
# Preview the first five rows of the dataset
data.head(n=5)

Unnamed: 0,color,tamaño,precio,clase
0,rojo,grande,100,A
1,verde,pequeño,200,B
2,verde,mediano,150,D
3,verde,grande,300,C
4,rojo,mediano,250,E


In [None]:
# Working copy used by the second one-hot example below
df = data.copy(deep=True)

# One Hot Encoder

## Sklearn

In [None]:
# Create the encoder and learn the categories present in 'color'
one_hot_encoder = OneHotEncoder()
one_hot_encoder.fit(data[['color']])

# Encode the column with the fitted encoder
one_hot_encoded = one_hot_encoder.transform(data[['color']])

# Densify the encoded matrix and wrap it in a DataFrame.
# categories_ is passed as-is for the column labels; the join cell further
# down flattens those labels with '_'.join(...).
one_hot_encoded_df = pd.DataFrame(
    data=one_hot_encoded.toarray(),
    columns=one_hot_encoder.categories_,
)

# Show the result
print("One-Hot Encoder:")
print(one_hot_encoded_df)

One-Hot Encoder:
  rojo verde
0  1.0   0.0
1  0.0   1.0
2  0.0   1.0
3  0.0   1.0
4  1.0   0.0


In [None]:
# Second one-hot example: fit on 'clase', then encode the same column
Encoder_1 = OneHotEncoder()
Encoder_1.fit(df[['clase']])
Encoding_1 = Encoder_1.transform(df[['clase']])

In [None]:
# Densify the encoded 'clase' matrix and label its columns with the fitted categories
df_nuevo = pd.DataFrame(
    data=Encoding_1.toarray(),
    columns=Encoder_1.categories_,
)

In [None]:
# Display the one-hot encoded 'clase' columns
df_nuevo

Unnamed: 0,A,B,C,D,E
0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0


## Pandas

In [None]:
# Put the sklearn one-hot columns next to the original data
df_ohe = pd.concat([data, one_hot_encoded_df], axis='columns')

# Flatten the encoded column labels into plain strings (each label is joined with '_')
df_ohe.columns = [
    *data.columns,
    *('_'.join(etiqueta) for etiqueta in one_hot_encoded_df.columns),
]

# The source column is redundant once it has been encoded
df_ohe = df_ohe.drop(columns='color')
df_ohe.head()


Unnamed: 0,tamaño,precio,clase,rojo,verde
0,grande,100,A,1.0,0.0
1,pequeño,200,B,0.0,1.0
2,mediano,150,D,0.0,1.0
3,grande,300,C,0.0,1.0
4,mediano,250,E,1.0,0.0


In [None]:
# One-hot encode 'clase' with pd.get_dummies(). drop_first=True omits the
# first level (clase_A), so the five categories yield four indicator columns.
# dtype=int keeps the indicators as 0/1 on every pandas version; pandas >= 2.0
# would otherwise return booleans and no longer match the recorded output.
one_hot_encoded_df2 = pd.get_dummies(
    data['clase'], prefix='clase', drop_first=True, dtype=int
)

# Join the indicators to the original frame (get_dummies already produces flat
# string column names) and drop the source column without mutating in place,
# so the cell can be re-run safely.
df_ohe2 = pd.concat([data, one_hot_encoded_df2], axis=1).drop(columns='clase')

df_ohe2.head()

Unnamed: 0,color,tamaño,precio,clase_B,clase_C,clase_D,clase_E
0,rojo,grande,100,0,0,0,0
1,verde,pequeño,200,1,0,0,0
2,verde,mediano,150,0,0,1,0
3,verde,grande,300,0,1,0,0
4,rojo,mediano,250,0,0,0,1


# Label Encoder

## Sklearn

In [None]:
# Create the encoder and learn the distinct values of 'tamaño'
label_encoder = LabelEncoder()
label_encoder.fit(data['tamaño'])

# Replace every value with its integer code
label_encoded = label_encoder.transform(data['tamaño'])

# Show the result
print("Label Encoder:")
print(label_encoded)

Label Encoder:
[0 2 1 4 3]


In [None]:
# Attach the label codes as a new column on a copy of the data
df_le = data.assign(**{'tamaño_encoded': label_encoded})

df_le.head()

Unnamed: 0,color,tamaño,precio,clase,tamaño_encoded
0,rojo,grande,100,A,0
1,verde,pequeño,200,B,2
2,verde,mediano,150,D,1
3,verde,grande,300,C,4
4,rojo,mediano,250,E,3


## Pandas

In [None]:
# Label-encode 'tamaño' with pd.factorize(); codes are assigned in order of
# first appearance. Index the result with [0] rather than unpacking into `_`,
# which would shadow IPython's last-output variable for the rest of the session.
label_encoded2 = pd.factorize(data['tamaño'])[0]

# Add the codes to a copy of the original frame. The column is named after
# 'tamaño', the column that was actually encoded.
df_lee = data.copy()
df_lee['tamaño_label_encoded'] = label_encoded2

# Show the result
print("DataFrame con Label Encoder:")
df_lee.head()

DataFrame con Label Encoder:


Unnamed: 0,color,tamaño,precio,clase,clase_label_encoded
0,rojo,grande,100,A,0
1,verde,pequeño,200,B,1
2,verde,mediano,150,D,2
3,verde,grande,300,C,0
4,rojo,mediano,250,E,2


# Ordinal Encoder

## Sklearn

In [None]:
# Explicit ranking of the sizes, smallest first; the position in this list becomes the code
orden_categorias = ['pequeño', 'mediano', 'grande']

# Create the encoder with that ranking and fit it on 'tamaño'
ordinal_encoder = OrdinalEncoder(categories=[orden_categorias])
ordinal_encoder.fit(data[['tamaño']])

# Encode the column with the fitted encoder
ordinal_encoded = ordinal_encoder.transform(data[['tamaño']])

# Wrap the encoded array in a single-column DataFrame
ordinal_encoded_df = pd.DataFrame(data=ordinal_encoded, columns=['tamaño'])

# Show the result
print("Ordinal Encoder:")
print(ordinal_encoded_df)

Ordinal Encoder:
   tamaño
0     2.0
1     0.0
2     1.0
3     2.0
4     1.0


In [None]:
# Attach the ordinal codes as a new column on a copy of the data
df_oe = data.assign(**{'tamaño_ordinal': ordinal_encoded_df})

df_oe.head()

Unnamed: 0,color,tamaño,precio,clase,tamaño_ordinal
0,rojo,grande,100,A,2.0
1,verde,pequeño,200,B,0.0
2,verde,mediano,150,D,1.0
3,verde,grande,300,C,2.0
4,rojo,mediano,250,E,1.0


## Mapeo con ciclo for

In [None]:
# Ranking of the classes; the position in this list becomes the code
orden_categorias = ['A', 'B', 'C', 'D', 'E']

# Build the category -> position lookup
mapeo = {etiqueta: posicion for posicion, etiqueta in enumerate(orden_categorias)}

# Translate every 'clase' value through the lookup and keep it as a one-column frame
ordinal_encoded_df2 = data['clase'].map(mapeo).to_frame(name='clase')

# Show the result
print("Ordinal Encoder:")
print(ordinal_encoded_df2)

Ordinal Encoder:
   clase
0      0
1      1
2      3
3      2
4      4


In [None]:
# Attach the mapped codes as a new column on a copy of the data
df_oe2 = data.assign(clase_ordinal=ordinal_encoded_df2)

df_oe2.head()

Unnamed: 0,color,tamaño,precio,clase,clase_ordinal
0,rojo,grande,100,A,0
1,verde,pequeño,200,B,1
2,verde,mediano,150,D,3
3,verde,grande,300,C,2
4,rojo,mediano,250,E,4


# Binary Encoder

In [None]:
# Create the encoder and fit it on 'clase'
binary_encoder = BinaryEncoder()
binary_encoder.fit(data[['clase']])

# Encode the column with the fitted encoder
binary_encoded = binary_encoder.transform(data[['clase']])

# Wrap the result in a DataFrame labelled with the encoder's output feature names
binary_encoded_df = pd.DataFrame(
    data=binary_encoded,
    columns=binary_encoder.get_feature_names_out(),
)

# Show the result
print("Binary Encoder:")
print(binary_encoded_df)

Binary Encoder:
   clase_0  clase_1  clase_2
0        0        0        1
1        0        1        0
2        0        1        1
3        1        0        0
4        1        0        1


In [None]:
# Place the binary-encoded columns next to the original data
df_be = pd.concat([data, binary_encoded_df], axis='columns')

df_be.head()

Unnamed: 0,color,tamaño,precio,clase,clase_0,clase_1,clase_2
0,rojo,grande,100,A,0,0,1
1,verde,pequeño,200,B,0,1,0
2,verde,mediano,150,D,0,1,1
3,verde,grande,300,C,1,0,0
4,rojo,mediano,250,E,1,0,1


# Count Encoder

## Category Encoders

In [None]:
# Create the encoder and fit it on 'color'
count_encoder = CountEncoder()
count_encoder.fit(data['color'])

# Replace every value using the fitted encoder
count_encoded = count_encoder.transform(data['color'])

# Show the result
print("Count Encoder:")
print(count_encoded)

Count Encoder:
   color
0      2
1      3
2      3
3      3
4      2


In [None]:
# Attach the count-encoded values as a new column on a copy of the data
df_ce = data.assign(color_encoded=count_encoded)

df_ce.head(n=10)

Unnamed: 0,color,tamaño,precio,clase,color_encoded
0,rojo,grande,100,A,2
1,verde,pequeño,200,B,3
2,verde,mediano,150,D,3
3,verde,grande,300,C,3
4,rojo,mediano,250,E,2


## Mapeo

In [None]:
# Count encoding with plain pandas: build a value -> frequency lookup and map
# every 'color' entry through it
count_encoded2 = data['color'].map(data['color'].value_counts().to_dict())

# Show the result
print("Count Encoder:")
print(count_encoded2)

Count Encoder:
0    2
1    3
2    3
3    3
4    2
Name: color, dtype: int64


In [None]:
# Attach the frequency column on a copy of the data
df_ce2 = data.assign(color_count=count_encoded2)

df_ce2.head(n=10)

Unnamed: 0,color,tamaño,precio,clase,color_count
0,rojo,grande,100,A,2
1,verde,pequeño,200,B,3
2,verde,mediano,150,D,3
3,verde,grande,300,C,3
4,rojo,mediano,250,E,2


# Target Encoder

## Category Encoders

In [None]:
# Create the encoder
target_encoder = TargetEncoder()

# Fit on 'color' with 'precio' as the target and encode the same rows.
# NOTE(review): the recorded values differ from the raw per-colour means shown
# in the "Mean & Group by" section, which looks like the encoder's smoothing
# at work; confirm against the category_encoders documentation.
target_encoded = target_encoder.fit_transform(
    data['color'],
    data['precio'],
)

# Show the result
print("Target Encoder:")
print(target_encoded)

Target Encoder:
        color
0  196.453723
1  202.574421
2  202.574421
3  202.574421
4  196.453723


In [None]:
# Attach the target-encoded values as a new column on a copy of the data
df_te = data.assign(color_encoded=target_encoded)

df_te.head(n=10)

Unnamed: 0,color,tamaño,precio,clase,color_encoded
0,rojo,grande,100,A,196.453723
1,verde,pequeño,200,B,202.574421
2,verde,mediano,150,D,202.574421
3,verde,grande,300,C,202.574421
4,rojo,mediano,250,E,196.453723


## Mean & Group by

In [None]:
# Per-row mean of 'precio' within each 'color' group
mean_target = data['precio'].groupby(data['color']).transform('mean')

# Show the result
print("Target Encoder:")
print(mean_target)


Target Encoder:
0    175.000000
1    216.666667
2    216.666667
3    216.666667
4    175.000000
Name: precio, dtype: float64


In [None]:
# Attach the group means as a new column on a copy of the data
df_te2 = data.assign(**{'mean encoded': mean_target})

df_te2.head(n=10)

Unnamed: 0,color,tamaño,precio,clase,mean encoded
0,rojo,grande,100,A,175.0
1,verde,pequeño,200,B,216.666667
2,verde,mediano,150,D,216.666667
3,verde,grande,300,C,216.666667
4,rojo,mediano,250,E,175.0
