# Preprocesamiento de Datos - Ejemplo Práctico

In [22]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder

In [23]:
# Importemos los datos:
df = pd.read_csv('census.csv')


In [24]:
# Veamos el dataset:
df.head()


Unnamed: 0,age,workclass,education,race,sex,hours_per_week,USA_born,label
0,39.0,State-gov,Bachelors,White,Male,40.0,1.0,<=50K
1,50.0,Self-emp-not-inc,Bachelors,White,Male,13.0,1.0,<=50K
2,38.0,Private,High-school,White,Male,40.0,1.0,<=50K
3,53.0,Private,Some-high-school,Black,Male,40.0,1.0,<=50K
4,28.0,Private,Bachelors,Black,Female,40.0,0.0,<=50K


In [25]:
# Descripción de las columnas:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41716 entries, 0 to 41715
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41617 non-null  float64
 1   workclass       41705 non-null  object 
 2   education       41702 non-null  object 
 3   race            41700 non-null  object 
 4   sex             41701 non-null  object 
 5   hours_per_week  41631 non-null  float64
 6   USA_born        41701 non-null  float64
 7   label           41716 non-null  object 
dtypes: float64(3), object(5)
memory usage: 2.5+ MB
None


In [26]:
# Verifiquemos si hay datos faltantes:
df.isnull().sum()

age               99
workclass         11
education         14
race              16
sex               15
hours_per_week    85
USA_born          15
label              0
dtype: int64

In [27]:
# Descartemos las filas que tengan 3 o más datos faltantes:
df1 = df[df.isnull().sum(axis=1) < 3]
print(df1)
print(f"Número de filas restantes: {len(df1)}")

        age         workclass         education                race     sex  \
0      39.0         State-gov         Bachelors               White    Male   
1      50.0  Self-emp-not-inc         Bachelors               White    Male   
2      38.0           Private       High-school               White    Male   
3      53.0           Private  Some-high-school               Black    Male   
4      28.0           Private         Bachelors               Black  Female   
...     ...               ...               ...                 ...     ...   
41711  33.0           Private         Bachelors               White    Male   
41712  39.0           Private         Bachelors               White  Female   
41713  38.0           Private         Bachelors               White    Male   
41714  44.0           Private         Bachelors  Asian-Pac-Islander    Male   
41715  35.0      Self-emp-inc         Bachelors               White    Male   

       hours_per_week  USA_born  label  
0         

In [28]:
# Contemos de nuevo los datos faltantes:
print(df1.isnull().sum())

age               85
workclass          0
education          0
race               0
sex                0
hours_per_week    68
USA_born           0
label              0
dtype: int64


In [29]:
# Imputemos los datos faltantes de edad y horas trabajadas por semana con la mediana de cada una de esas columnas
df1 = df1.copy()  # Crear copia para evitar warnings

# Calcular las medianas antes de imputar
mediana_edad = df1['age'].median()
mediana_horas = df1['hours_per_week'].median()

# Imputar los valores faltantes
df1['age'] = df1['age'].fillna(mediana_edad)
df1['hours_per_week'] = df1['hours_per_week'].fillna(mediana_horas)

# Mostrar las medianas usadas
print("Mediana de edad:", mediana_edad)
print("Mediana de horas por semana:", mediana_horas)

# Verificar que no quedan valores nulos en estas columnas
print("\nValores nulos en 'age':", df1['age'].isnull().sum())
print("Valores nulos en 'hours_per_week':", df1['hours_per_week'].isnull().sum())


Mediana de edad: 37.0
Mediana de horas por semana: 40.0

Valores nulos en 'age': 0
Valores nulos en 'hours_per_week': 0


In [30]:
# Contemos de nuevo los datos faltantes:
print("Datos faltantes por columna:")
print(df1.isnull().sum())

# O solo mostrar las columnas que tienen datos faltantes:
print("\nColumnas con datos faltantes:")
nulos = df1.isnull().sum()
print(nulos[nulos > 0])

Datos faltantes por columna:
age               0
workclass         0
education         0
race              0
sex               0
hours_per_week    0
USA_born          0
label             0
dtype: int64

Columnas con datos faltantes:
Series([], dtype: int64)


In [31]:
# Apliquemos one-hot encoding a la columna "workclass":
cat_cols = ['workclass', 'race']

print(f"Columnas antes: {len(df1.columns)}")
df1 = pd.get_dummies(df1, columns=cat_cols, prefix=cat_cols, drop_first=True)
print(f"Columnas después: {len(df1.columns)}")

print("\nDataset después de One-Hot Encoding:")
display(df1.head())

Columnas antes: 8
Columnas después: 16

Dataset después de One-Hot Encoding:


Unnamed: 0,age,education,sex,hours_per_week,USA_born,label,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,39.0,Bachelors,Male,40.0,1.0,<=50K,False,False,False,False,True,False,False,False,False,True
1,50.0,Bachelors,Male,13.0,1.0,<=50K,False,False,False,True,False,False,False,False,False,True
2,38.0,High-school,Male,40.0,1.0,<=50K,False,True,False,False,False,False,False,False,False,True
3,53.0,Some-high-school,Male,40.0,1.0,<=50K,False,True,False,False,False,False,False,True,False,False
4,28.0,Bachelors,Female,40.0,0.0,<=50K,False,True,False,False,False,False,False,True,False,False


In [32]:
# Hallemos los valores que toma la columna "education":
df['education'].unique()

array(['Bachelors', 'High-school', 'Some-high-school', 'Masters',
       'Some-college', 'Middle-school', 'Doctorate', 'Some-middle-school',
       'Preschool', 'Elementary-school', nan], dtype=object)

In [33]:

# Apliquemos ordinal encoding a la columna "education":
oe= OrdinalEncoder()
df1['education'] = oe.fit_transform(df1[['education']])
df1.head()

Unnamed: 0,age,education,sex,hours_per_week,USA_born,label,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,race_Asian-Pac-Islander,race_Black,race_Other,race_White
0,39.0,0.0,Male,40.0,1.0,<=50K,False,False,False,False,True,False,False,False,False,True
1,50.0,0.0,Male,13.0,1.0,<=50K,False,False,False,True,False,False,False,False,False,True
2,38.0,3.0,Male,40.0,1.0,<=50K,False,True,False,False,False,False,False,False,False,True
3,53.0,8.0,Male,40.0,1.0,<=50K,False,True,False,False,False,False,False,True,False,False
4,28.0,0.0,Female,40.0,0.0,<=50K,False,True,False,False,False,False,False,True,False,False


In [34]:
# Verifiquemos que la columna "education" tenga los valores apropiados:
print("Información de la columna education:")
print(f"Valores únicos: {sorted(df1['education'].dropna().unique())}")
print(f"Cantidad de valores nulos: {df1['education'].isnull().sum()}")
print(f"Tipo de dato: {df1['education'].dtype}")

print("\nEstadísticas descriptivas:")
print(df1['education'].describe())

print("\nDistribución de valores:")
print(df1['education'].value_counts().sort_index())

print(df1['education'].unique())
print(df1['education'].nunique())


Información de la columna education:
Valores únicos: [np.float64(0.0), np.float64(1.0), np.float64(2.0), np.float64(3.0), np.float64(4.0), np.float64(5.0), np.float64(6.0), np.float64(7.0), np.float64(8.0), np.float64(9.0)]
Cantidad de valores nulos: 0
Tipo de dato: float64

Estadísticas descriptivas:
count    41694.000000
mean         4.043316
std          2.747751
min          0.000000
25%          3.000000
50%          3.000000
75%          7.000000
max          9.000000
Name: education, dtype: float64

Distribución de valores:
education
0.0     7767
1.0      575
2.0      229
3.0    14968
4.0     2590
5.0      843
6.0       73
7.0    10028
8.0     4153
9.0      468
Name: count, dtype: int64
[0. 3. 8. 4. 7. 5. 1. 9. 6. 2.]
10


In [35]:
# Apliquemos one-hot encoding a la columna 'race':

target = df1 if 'df1' in globals() else df

if 'race' in target.columns:
    target = pd.get_dummies(target, columns=['race'], prefix='race', drop_first=True)
    # Reasignar a la variable global correspondiente para que el resto del notebook use el resultado
    if 'df1' in globals():
        df1 = target
    else:
        df = target
    print("One-hot encoding aplicado para 'race'. Columnas resultantes:")
    display(pd.DataFrame(target.columns.tolist(), columns=['Columnas']))
else:
    print("La columna 'race' no está presente en el DataFrame objetivo. Ningún cambio realizado.")
    print('Columnas actuales:')
    display(pd.DataFrame(target.columns.tolist(), columns=['Columnas']))

La columna 'race' no está presente en el DataFrame objetivo. Ningún cambio realizado.
Columnas actuales:


Unnamed: 0,Columnas
0,age
1,education
2,sex
3,hours_per_week
4,USA_born
5,label
6,workclass_Local-gov
7,workclass_Private
8,workclass_Self-emp-inc
9,workclass_Self-emp-not-inc


In [36]:
# Apliquemos binary encoding a la columna "sex":
# Aplicar binary encoding a la columna "sex" en df1
le= LabelEncoder()
df1["sex"]= le.fit_transform(df1["sex"])
df1.head()


Unnamed: 0,age,workclass,education,race,sex,hours_per_week,USA_born,label
0,39.0,State-gov,Bachelors,White,1,40.0,1.0,<=50K
1,50.0,Self-emp-not-inc,Bachelors,White,1,13.0,1.0,<=50K
2,38.0,Private,High-school,White,1,40.0,1.0,<=50K
3,53.0,Private,Some-high-school,Black,1,40.0,1.0,<=50K
4,28.0,Private,Bachelors,Black,0,40.0,0.0,<=50K


In [37]:
#Apliquemos binary encoding a la etiqueta:
df1['label'] = oe.fit_transform(df1[['label']])
df1.head()

Unnamed: 0,age,workclass,education,race,sex,hours_per_week,USA_born,label
0,39.0,State-gov,Bachelors,White,1,40.0,1.0,0.0
1,50.0,Self-emp-not-inc,Bachelors,White,1,13.0,1.0,0.0
2,38.0,Private,High-school,White,1,40.0,1.0,0.0
3,53.0,Private,Some-high-school,Black,1,40.0,1.0,0.0
4,28.0,Private,Bachelors,Black,0,40.0,0.0,0.0


In [38]:
# Veamos el dataset resultante:
print(df1.shape)
df1.head()

(41694, 16)


Unnamed: 0,age,workclass,education,race,sex,hours_per_week,USA_born,label
0,39.0,State-gov,Bachelors,White,1,40.0,1.0,0.0
1,50.0,Self-emp-not-inc,Bachelors,White,1,13.0,1.0,0.0
2,38.0,Private,High-school,White,1,40.0,1.0,0.0
3,53.0,Private,Some-high-school,Black,1,40.0,1.0,0.0
4,28.0,Private,Bachelors,Black,0,40.0,0.0,0.0


In [39]:
# Veamos la nueva descripción del dataset:
df1.info()
df1.describe(include='all')


<class 'pandas.core.frame.DataFrame'>
Index: 41694 entries, 0 to 41715
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   age                         41694 non-null  float64
 1   education                   41694 non-null  float64
 2   sex                         41694 non-null  object 
 3   hours_per_week              41694 non-null  float64
 4   USA_born                    41694 non-null  float64
 5   label                       41694 non-null  object 
 6   workclass_Local-gov         41694 non-null  bool   
 7   workclass_Private           41694 non-null  bool   
 8   workclass_Self-emp-inc      41694 non-null  bool   
 9   workclass_Self-emp-not-inc  41694 non-null  bool   
 10  workclass_State-gov         41694 non-null  bool   
 11  workclass_Without-pay       41694 non-null  bool   
 12  race_Asian-Pac-Islander     41694 non-null  bool   
 13  race_Black                  41694 no

Unnamed: 0,age,education,sex,hours_per_week,USA_born,label,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,race_Asian-Pac-Islander,race_Black,race_Other,race_White
count,41694.0,41694.0,41694,41694.0,41694.0,41694,41694,41694,41694,41694,41694,41694,41694,41694,41694,41694
unique,,,2,,,2,2,2,2,2,2,2,2,2,2,2
top,,,Male,,,<=50K,False,True,False,False,False,False,False,False,False,True
freq,,,28187,,,31797,38854,30954,40246,38310,39897,41675,40426,37676,41343,35665
mean,38.471651,4.043316,,40.742529,0.895045,,,,,,,,,,,
std,13.351404,2.747751,,11.990589,0.306499,,,,,,,,,,,
min,17.0,0.0,,1.0,0.0,,,,,,,,,,,
25%,28.0,3.0,,40.0,1.0,,,,,,,,,,,
50%,37.0,3.0,,40.0,1.0,,,,,,,,,,,
75%,47.0,7.0,,45.0,1.0,,,,,,,,,,,


In [40]:
# Verifiquemos el tipo de dato de cada columna:
print("\nTipos de datos por columna:")
print(df1.dtypes)


Tipos de datos por columna:
age                           float64
education                     float64
sex                            object
hours_per_week                float64
USA_born                      float64
label                          object
workclass_Local-gov              bool
workclass_Private                bool
workclass_Self-emp-inc           bool
workclass_Self-emp-not-inc       bool
workclass_State-gov              bool
workclass_Without-pay            bool
race_Asian-Pac-Islander          bool
race_Black                       bool
race_Other                       bool
race_White                       bool
dtype: object


In [41]:
# Carguemos el dataset preprocesado a un nuevo archivo (CSV):
df1.to_csv('census_cleaned.csv', index=False)
print(" Archivo guardado exitosamente")


 Archivo guardado exitosamente
