# Datos Categóricos

In [1]:
import pandas as pd

datos = {"nombre" : ["Mariana", "Ana", "Elsa", "Gustavo",
                     "Pedro", "Raúl", "Carlos", "José", "Luis"],
         
         "saldo" : [10000.00, 8000.00, 9000.00, 2000.00,
                    2100.00, 12000.00, 5000.00, 10000.00, 200.00],
         
         "pais" : ["Argentina", "Bolivia", "Chile", "Colombia",
                   "Costa Rica", "Ecuador", "México", "Perú", "Perú"]}

In [2]:
# Crear el dataframe
datos = pd.DataFrame(datos)

print("\033[1;35m" + "DATAFRAME" + "\033[0m\n")
datos

[1;35mDATAFRAME[0m



Unnamed: 0,nombre,saldo,pais
0,Mariana,10000.0,Argentina
1,Ana,8000.0,Bolivia
2,Elsa,9000.0,Chile
3,Gustavo,2000.0,Colombia
4,Pedro,2100.0,Costa Rica
5,Raúl,12000.0,Ecuador
6,Carlos,5000.0,México
7,José,10000.0,Perú
8,Luis,200.0,Perú


In [3]:
# Observar que la columna pais es de tipo object, cambiarla por el tipo category
datos['pais'] = datos['pais'].astype('category')

print("\033[1;33m" + "Cambiar la columna pais a tipo de dato category" + "\033[0m\n")
datos.info()

[1;33mCambiar la columna pais a tipo de dato category[0m

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   nombre  9 non-null      object  
 1   saldo   9 non-null      float64 
 2   pais    9 non-null      category
dtypes: category(1), float64(1), object(1)
memory usage: 649.0+ bytes


# INAPROPIADA
### Codificación con Reemplazo

In [8]:
# Obtener una copia de los datos
datos_sesgados = datos.copy()
datos_sesgados

# Reemplazar los datos de 'pais' por valores numéricos
# Crear el JSON de reemplazo
reemplazo = {
    "Argentina": 1,
    "Bolivia": 2,
    "Chile": 3,
    "Colombia": 4,
    "Costa Rica": 5,
    "Ecuador": 6,
    "México": 7,
    "Perú": 8
}

datos_sesgados.replace( {'pais': reemplazo}, inplace = True )
print("\033[1;35m" + "COPIA DE LOS DATOS" + "\033[0m")
datos_sesgados

[1;35mCOPIA DE LOS DATOS[0m


  datos_sesgados.replace( {'pais': reemplazo}, inplace = True )
  datos_sesgados.replace( {'pais': reemplazo}, inplace = True )


Unnamed: 0,nombre,saldo,pais
0,Mariana,10000.0,1
1,Ana,8000.0,2
2,Elsa,9000.0,3
3,Gustavo,2000.0,4
4,Pedro,2100.0,5
5,Raúl,12000.0,6
6,Carlos,5000.0,7
7,José,10000.0,8
8,Luis,200.0,8


# Codificación de Categorías One-hot

In [11]:
# Importar de la librería sklearn.preprocessing el módulo OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# Crear un codificador de tipo OneHotEncoder()
codificador = OneHotEncoder()

# Crear una codificación con valores numéricos sin introducir un sesgo con el método fit_transform()
codificacion = codificador.fit_transform(datos[['pais']]) # trabaja con DataFrames, por ello lleva doble corchete

# Convertir a una matriz de Numpy
print("\033[1;35m" + "MATRIZ" + "\033[0m")
print(codificacion.toarray())

# Construir un DataFrame de nombre nuevas_columnas
nuevas_cols = pd.DataFrame( codificacion.toarray(),
                          columns = codificador.categories_)
print("\n\033[1;35m" + "DATAFRAME NUEVAS COLUMNAS" + "\033[0m")
print(nuevas_cols)

# De esta manera se evita un sesgo, no existe distancia entre los valores

# Incorporar las nuevas columnas a nuestro DataFrame original
datos = pd.concat( [datos, nuevas_cols], axis = 'columns' )

[1;35mMATRIZ[0m
[[1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1.]]

[1;35mDATAFRAME NUEVAS COLUMNAS[0m
  Argentina Bolivia Chile Colombia Costa Rica Ecuador México Perú
0       1.0     0.0   0.0      0.0        0.0     0.0    0.0  0.0
1       0.0     1.0   0.0      0.0        0.0     0.0    0.0  0.0
2       0.0     0.0   1.0      0.0        0.0     0.0    0.0  0.0
3       0.0     0.0   0.0      1.0        0.0     0.0    0.0  0.0
4       0.0     0.0   0.0      0.0        1.0     0.0    0.0  0.0
5       0.0     0.0   0.0      0.0        0.0     1.0    0.0  0.0
6       0.0     0.0   0.0      0.0        0.0     0.0    1.0  0.0
7       0.0     0.0   0.0      0.0        0.0     0.0    0.0  1.0
8       0.0     0.0   0.0      0.0        0.0     0.0    0.0  1.0


In [12]:
# Verificar el nuevo contenido

print("\033[1;33m" + "Nuevo contenido de DataFrame" + "\033[0m\n")
datos

[1;33mNuevo contenido de DataFrame[0m



Unnamed: 0,nombre,saldo,pais,"(Argentina,)","(Bolivia,)","(Chile,)","(Colombia,)","(Costa Rica,)","(Ecuador,)","(México,)","(Perú,)"
0,Mariana,10000.0,Argentina,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ana,8000.0,Bolivia,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Elsa,9000.0,Chile,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Gustavo,2000.0,Colombia,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Pedro,2100.0,Costa Rica,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,Raúl,12000.0,Ecuador,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,Carlos,5000.0,México,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,José,10000.0,Perú,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,Luis,200.0,Perú,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [15]:
# Eliminar la columna 'pais'
datos.drop('pais', axis = 1, inplace = True)

In [17]:
# Verificar el nuevo contenido

print("\033[1;33m" + "Después de eliminar la columna país" + "\033[0m\n")
datos

[1;33mDespués de eliminar la columna país[0m



Unnamed: 0,nombre,saldo,"(Argentina,)","(Bolivia,)","(Chile,)","(Colombia,)","(Costa Rica,)","(Ecuador,)","(México,)","(Perú,)"
0,Mariana,10000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ana,8000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Elsa,9000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Gustavo,2000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Pedro,2100.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,Raúl,12000.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,Carlos,5000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,José,10000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,Luis,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


# Datos Categóricos Ordinales

### Nominales
Son categorías que no tienen un orden o jerarquía inherente. Simplemente representan grupos o etiquetas sin una secuencia lógica, como los colores.

### Ordinales
Son categorías que tienen un orden o jerarquía específica. Aunque no son numéricos, siguen una secuencia lógica, como los colores del arcoíris o el nivel educativo.

In [408]:
# Eliminar columnas duplicadas
datos = datos.loc[:, ~datos.columns.duplicated()]

print("\033[1;35m" + "ELIMINAR COLUMNAS DUPLICADAS" + "\033[0m\n")
datos

[1;35mELIMINAR COLUMNAS DUPLICADAS[0m



Unnamed: 0,nombre,saldo,"(Argentina,)","(Bolivia,)","(Chile,)","(Colombia,)","(Costa Rica,)","(Ecuador,)","(México,)","(Perú,)"
0,Mariana,10000.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Ana,8000.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Elsa,9000.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,Gustavo,2000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,Pedro,2100.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,Raúl,12000.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,Carlos,5000.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,José,10000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,Luis,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [410]:
# Renombrar columnas
nuevos_paises = {
    ('Argentina',): 'Argentina',
    ('Bolivia',): 'Bolivia',
    ('Chile',): 'Chile',
    ('Colombia',): 'Colombia',
    ('Costa Rica',): 'Costa Rica',
    ('Ecuador',): 'Ecuador',
    ('México',): 'México',
    ('Perú',): 'Perú'
}

datos.rename(columns = nuevos_paises, inplace = True)
print("\033[1;35m" + "RENOMBRAR COLUMNAS" + "\033[0m\n")
print(datos.columns.tolist())

[1;35mRENOMBRAR COLUMNAS[0m

['nombre', 'saldo', 'Argentina', 'Bolivia', 'Chile', 'Colombia', 'Costa Rica', 'Ecuador', 'México', 'Perú']


In [412]:
# Lista de países (nombres de las columnas generadas por OneHotEncoder)
paises = codificador.categories_[0]

# Mapear los valores 1.0 y 0.0 a las categorías
for pais in paises:
    datos[pais] = datos[pais].map({1.0: 1, 0.0: 0})

# Verificar los valores únicos en cada columna
for pais in paises:
    print(f"\033[1;33m Valores en {pais}:\033[0m {datos[pais].unique()}")

[1;33m Valores en Argentina:[0m [1 0]
[1;33m Valores en Bolivia:[0m [0 1]
[1;33m Valores en Chile:[0m [0 1]
[1;33m Valores en Colombia:[0m [0 1]
[1;33m Valores en Costa Rica:[0m [0 1]
[1;33m Valores en Ecuador:[0m [0 1]
[1;33m Valores en México:[0m [0 1]
[1;33m Valores en Perú:[0m [0 1]


In [414]:
# Definir el orden de las categorías
orden = [1, 0]

# Convertir las columnas a tipo categórico ordinal
for pais in paises:
    datos[pais] = pd.Categorical(datos[pais], categories = orden, ordered = True)

print("\033[1;35m" + "TIPO DE DATO CATEGÓRICO ORDINAL" + "\033[0m\n")
datos.info()

[1;35mTIPO DE DATO CATEGÓRICO ORDINAL[0m

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   nombre      9 non-null      object  
 1   saldo       9 non-null      float64 
 2   Argentina   9 non-null      category
 3   Bolivia     9 non-null      category
 4   Chile       9 non-null      category
 5   Colombia    9 non-null      category
 6   Costa Rica  9 non-null      category
 7   Ecuador     9 non-null      category
 8   México      9 non-null      category
 9   Perú        9 non-null      category
dtypes: category(8), float64(1), object(1)
memory usage: 1.3+ KB


In [416]:
print("\033[1;33m" + "Verificar que son de tipo categórico ordinal" + "\033[0m")
print(datos["Argentina"].cat.ordered)

[1;33mVerificar que son de tipo categórico ordinal[0m
True


In [424]:
print("\033[1;33m" + "Verificar DataFrame" + "\033[0m\n")
datos

[1;33mVerificar DataFrame[0m



Unnamed: 0,nombre,saldo,Argentina,Bolivia,Chile,Colombia,Costa Rica,Ecuador,México,Perú
0,Mariana,10000.0,1,0,0,0,0,0,0,0
1,Ana,8000.0,0,1,0,0,0,0,0,0
2,Elsa,9000.0,0,0,1,0,0,0,0,0
3,Gustavo,2000.0,0,0,0,1,0,0,0,0
4,Pedro,2100.0,0,0,0,0,1,0,0,0
5,Raúl,12000.0,0,0,0,0,0,1,0,0
6,Carlos,5000.0,0,0,0,0,0,0,1,0
7,José,10000.0,0,0,0,0,0,0,0,1
8,Luis,200.0,0,0,0,0,0,0,0,1


### Técnica para cambiar datos ordinales
Práctica de clase

In [24]:
# Crear el Dataframe
categorias_servicio = ["Muy insatisfecho", "Insatisfecho", "Neutral", "Satisfecho", "Muy satisfecho"]

categorias_calidad = ["Mala", "Buena", "Muy buena", "Excelente"]

encuesta = {
    "Servicio": ["Muy insatisfecho", "Insatisfecho", "Neutral", "Satisfecho", "Muy satisfecho", "Muy insatisfecho"],
    "Alimentos": ["Mala", "Buena", "Muy buena", "Excelente", "Mala", "Buena"]
}

# Tipos de Cliente
# 0 ---> Cliente Esporádico
# 1 ---> Cliente Frecuente

df_encuesta = pd.DataFrame(encuesta)
df_encuesta

Unnamed: 0,Servicio,Alimentos
0,Muy insatisfecho,Mala
1,Insatisfecho,Buena
2,Neutral,Muy buena
3,Satisfecho,Excelente
4,Muy satisfecho,Mala
5,Muy insatisfecho,Buena


### Codificador Ordinal
Evita la dimensionalidad

In [30]:
# Importar la librería para codificar de forma ordinal
from sklearn.preprocessing import OrdinalEncoder

datos_ord = pd.DataFrame(encuesta)

# Crear codificador
codificador = OrdinalEncoder(categories = [categorias_servicio, categorias_calidad])

datos_ord = pd.DataFrame(codificador.fit_transform(datos_ord), columns = ['servicio', 'alimentos'])
datos_ord

Unnamed: 0,servicio,alimentos
0,0.0,0.0
1,1.0,1.0
2,2.0,2.0
3,3.0,3.0
4,4.0,0.0
5,0.0,1.0


### Codificador OneHotEncoder

In [45]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder

datos_one = pd.DataFrame(encuesta)
codificador = OneHotEncoder()

# Mostrar la matriz dispersa
# print(codificador.fit_transform(datos_one).toarray())

# Mostrar títulos de las columnas
# print(np.concatenate(codificador.categories_))

# Generar el DataFrame
datos_one = pd.DataFrame(codificador.fit_transform(datos_one).toarray(),
                        columns = np.concatenate(codificador.categories_))
datos_one

Unnamed: 0,Insatisfecho,Muy insatisfecho,Muy satisfecho,Neutral,Satisfecho,Buena,Excelente,Mala,Muy buena
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
