In [186]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans 
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder


In [187]:
# Load the evaluation dataset from 'data_evaluacion.csv' (make sure the file has been
# uploaded to the Google Colab environment first).
# header=None: the file is read without a header row, so pandas labels the columns
# with integers.
df = pd.read_csv('data_evaluacion.csv', header=None)
df.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [188]:
# Summarise column dtypes and non-null counts of the freshly loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       48842 non-null  int64 
 1   1       48842 non-null  object
 2   2       48842 non-null  int64 
 3   3       48842 non-null  object
 4   4       48842 non-null  int64 
 5   5       48842 non-null  object
 6   6       48842 non-null  object
 7   7       48842 non-null  object
 8   8       48842 non-null  object
 9   9       48842 non-null  object
 10  10      48842 non-null  int64 
 11  11      48842 non-null  int64 
 12  12      48842 non-null  int64 
 13  13      48842 non-null  object
 14  14      48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [189]:
# Descriptive labels for the 15 positional columns, listed in file order.
nuevas_columnas = [
    'edad',
    'clase_trabajo',
    'fnlwgt',
    'educacion',
    'educacion_num',
    'estado_civil',
    'ocupacion',
    'relacion',
    'raza',
    'genero',
    'ganancia-capital',
    'perdida-capital',
    'horas_semana',
    'pais',
    'salario',
]

# Apply the new labels and rebuild a fresh 0..n-1 row index in one chained expression.
df = df.set_axis(nuevas_columnas, axis='columns').reset_index(drop=True)


In [190]:
# Treat the '?' placeholder as a missing value (pd.NA) throughout the frame.
df = df.replace(to_replace='?', value=pd.NA)

In [191]:
columnas_categoricas = [
    'clase_trabajo', 'educacion', 'estado_civil', 'ocupacion',
    'relacion', 'raza', 'genero', 'pais',
]
# Impute missing categorical values with each column's most frequent value
# (the first one returned by mode() when several values tie).
modas = {nombre: df[nombre].mode()[0] for nombre in columnas_categoricas}
df = df.fillna(value=modas)

In [192]:
columnas_numericas = [
    'edad', 'fnlwgt', 'educacion_num',
    'ganancia-capital', 'perdida-capital', 'horas_semana',
]
# Impute missing numeric values with each column's median; all medians are
# computed in one pass and then applied column by column.
medianas = df[columnas_numericas].median()
for nombre, mediana in medianas.items():
    df[nombre] = df[nombre].fillna(mediana)

In [193]:
# Preview the first rows after renaming the columns and imputing missing values.
df.head()

Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [194]:
# Re-check dtypes and non-null counts now that the columns are named and imputed.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   edad              48842 non-null  int64 
 1   clase_trabajo     48842 non-null  object
 2   fnlwgt            48842 non-null  int64 
 3   educacion         48842 non-null  object
 4   educacion_num     48842 non-null  int64 
 5   estado_civil      48842 non-null  object
 6   ocupacion         48842 non-null  object
 7   relacion          48842 non-null  object
 8   raza              48842 non-null  object
 9   genero            48842 non-null  object
 10  ganancia-capital  48842 non-null  int64 
 11  perdida-capital   48842 non-null  int64 
 12  horas_semana      48842 non-null  int64 
 13  pais              48842 non-null  object
 14  salario           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [195]:
# Encode the income label as an integer: '<=50K' -> 1, '>50K' -> 0.
# Surrounding whitespace is trimmed first; a label missing from the mapping becomes NaN.
salary_map = {'<=50K': 1, '>50K': 0}
df['salario'] = df['salario'].str.strip().map(salary_map)


In [196]:
# Inspect the first ten rows to check the encoded 'salario' column.
df.head(10)

Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,1
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,1
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,1
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,0
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,0
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,0


In [197]:
# Binary-encode 'genero' (Male -> 1, Female -> 0) after trimming surrounding whitespace.
codigos_genero = {'Male': 1, 'Female': 0}
df['genero'] = df['genero'].str.strip().map(codigos_genero)
print(df.head())

   edad     clase_trabajo  fnlwgt  educacion  educacion_num  \
0    39         State-gov   77516  Bachelors             13   
1    50  Self-emp-not-inc   83311  Bachelors             13   
2    38           Private  215646    HS-grad              9   
3    53           Private  234721       11th              7   
4    28           Private  338409  Bachelors             13   

         estado_civil          ocupacion       relacion   raza  genero  \
0       Never-married       Adm-clerical  Not-in-family  White       1   
1  Married-civ-spouse    Exec-managerial        Husband  White       1   
2            Divorced  Handlers-cleaners  Not-in-family  White       1   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black       1   
4  Married-civ-spouse     Prof-specialty           Wife  Black       0   

   ganancia-capital  perdida-capital  horas_semana           pais  salario  
0              2174                0            40  United-States        1  
1                 0   

In [198]:
# Inspect the first ten rows to check the encoded 'genero' column.
df.head(10)

Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,United-States,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,United-States,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,United-States,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,United-States,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Cuba,1
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,0,0,0,40,United-States,1
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,0,0,0,16,Jamaica,1
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,45,United-States,0
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,0,14084,0,50,United-States,0
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,5178,0,40,United-States,0


In [199]:
# Count how many records carry each value of the 'pais' column.
country_counts = df['pais'].value_counts()
print(country_counts)


pais
United-States                 44689
Mexico                          951
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua                        49
Peru                             46
Ecuador                

In [200]:
# Collapse the country of origin into two groups: 'US' for 'United-States' and
# 'Non-US' for every other value. Labels are compared after trimming surrounding
# whitespace, so padded values still match.
es_estados_unidos = df['pais'].str.strip().eq('United-States')
df['pais'] = es_estados_unidos.map({True: 'US', False: 'Non-US'})

df.head(10)


Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,US,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,US,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,US,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,US,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,Non-US,1
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,0,0,0,40,US,1
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,0,0,0,16,Non-US,1
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,45,US,0
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,0,14084,0,50,US,0
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,5178,0,40,US,0


In [201]:
# Turn the two-level 'pais' label into an integer flag: US -> 1, Non-US -> 0.
codigos_pais = {'US': 1, 'Non-US': 0}
df['pais'] = df['pais'].map(codigos_pais).astype(int)
df.head(10)


Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,1,2174,0,40,1,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,13,1,1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,1,0,0,40,1,1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,1,0,0,40,1,1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,0,40,0,1
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,0,0,0,40,1,1
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,0,0,0,16,0,1
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,1,0,0,45,1,0
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,0,14084,0,50,1,0
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,1,5178,0,40,1,0


In [202]:
# Collapse marital status into two groups, 'Single' and 'Couple', using one lookup
# table applied after trimming surrounding whitespace.
sin_pareja = ['Divorced', 'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed']
con_pareja = ['Married-AF-spouse', 'Married-civ-spouse']
agrupacion = {
    **dict.fromkeys(sin_pareja, 'Single'),
    **dict.fromkeys(con_pareja, 'Couple'),
}
df['estado_civil'] = df['estado_civil'].str.strip().replace(agrupacion)
df.head(10)


Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario
0,39,State-gov,77516,Bachelors,13,Single,Adm-clerical,Not-in-family,White,1,2174,0,40,1,1
1,50,Self-emp-not-inc,83311,Bachelors,13,Couple,Exec-managerial,Husband,White,1,0,0,13,1,1
2,38,Private,215646,HS-grad,9,Single,Handlers-cleaners,Not-in-family,White,1,0,0,40,1,1
3,53,Private,234721,11th,7,Couple,Handlers-cleaners,Husband,Black,1,0,0,40,1,1
4,28,Private,338409,Bachelors,13,Couple,Prof-specialty,Wife,Black,0,0,0,40,0,1
5,37,Private,284582,Masters,14,Couple,Exec-managerial,Wife,White,0,0,0,40,1,1
6,49,Private,160187,9th,5,Single,Other-service,Not-in-family,Black,0,0,0,16,0,1
7,52,Self-emp-not-inc,209642,HS-grad,9,Couple,Exec-managerial,Husband,White,1,0,0,45,1,0
8,31,Private,45781,Masters,14,Single,Prof-specialty,Not-in-family,White,0,14084,0,50,1,0
9,42,Private,159449,Bachelors,13,Couple,Exec-managerial,Husband,White,1,5178,0,40,1,0


In [203]:
# Binary-encode the grouped marital status: Couple -> 0, Single -> 1.
codigos_estado_civil = {'Couple': 0, 'Single': 1}
df['estado_civil'] = df['estado_civil'].map(codigos_estado_civil)
df.head(10)

Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario
0,39,State-gov,77516,Bachelors,13,1,Adm-clerical,Not-in-family,White,1,2174,0,40,1,1
1,50,Self-emp-not-inc,83311,Bachelors,13,0,Exec-managerial,Husband,White,1,0,0,13,1,1
2,38,Private,215646,HS-grad,9,1,Handlers-cleaners,Not-in-family,White,1,0,0,40,1,1
3,53,Private,234721,11th,7,0,Handlers-cleaners,Husband,Black,1,0,0,40,1,1
4,28,Private,338409,Bachelors,13,0,Prof-specialty,Wife,Black,0,0,0,40,0,1
5,37,Private,284582,Masters,14,0,Exec-managerial,Wife,White,0,0,0,40,1,1
6,49,Private,160187,9th,5,1,Other-service,Not-in-family,Black,0,0,0,16,0,1
7,52,Self-emp-not-inc,209642,HS-grad,9,0,Exec-managerial,Husband,White,1,0,0,45,1,0
8,31,Private,45781,Masters,14,1,Prof-specialty,Not-in-family,White,0,14084,0,50,1,0
9,42,Private,159449,Bachelors,13,0,Exec-managerial,Husband,White,1,5178,0,40,1,0


In [204]:
# Integer-encode 'relacion'. The fitted encoder stays bound to `label_encoder`
# so that its classes_ can be inspected afterwards.
label_encoder = LabelEncoder().fit(df['relacion'])
df['relacion'] = label_encoder.transform(df['relacion'])
df.head(10)

Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario
0,39,State-gov,77516,Bachelors,13,1,Adm-clerical,1,White,1,2174,0,40,1,1
1,50,Self-emp-not-inc,83311,Bachelors,13,0,Exec-managerial,0,White,1,0,0,13,1,1
2,38,Private,215646,HS-grad,9,1,Handlers-cleaners,1,White,1,0,0,40,1,1
3,53,Private,234721,11th,7,0,Handlers-cleaners,0,Black,1,0,0,40,1,1
4,28,Private,338409,Bachelors,13,0,Prof-specialty,5,Black,0,0,0,40,0,1
5,37,Private,284582,Masters,14,0,Exec-managerial,5,White,0,0,0,40,1,1
6,49,Private,160187,9th,5,1,Other-service,1,Black,0,0,0,16,0,1
7,52,Self-emp-not-inc,209642,HS-grad,9,0,Exec-managerial,0,White,1,0,0,45,1,0
8,31,Private,45781,Masters,14,1,Prof-specialty,1,White,0,14084,0,50,1,0
9,42,Private,159449,Bachelors,13,0,Exec-managerial,0,White,1,5178,0,40,1,0


In [205]:
# Original category labels, as stored by the most recently fitted encoder.
categories = label_encoder.classes_

# Tabulate every label next to its integer code (its position in classes_).
mapping_df = pd.Series(categories, name='Categoría').to_frame()
mapping_df['Código'] = range(len(categories))

print(mapping_df)

        Categoría  Código
0         Husband       0
1   Not-in-family       1
2  Other-relative       2
3       Own-child       3
4       Unmarried       4
5            Wife       5


In [206]:
# Integer-encode 'raza'. The fitted encoder stays bound to `label_encoder`
# so that its classes_ can be inspected afterwards.
label_encoder = LabelEncoder().fit(df['raza'])
df['raza'] = label_encoder.transform(df['raza'])
df.head(10)

Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario
0,39,State-gov,77516,Bachelors,13,1,Adm-clerical,1,4,1,2174,0,40,1,1
1,50,Self-emp-not-inc,83311,Bachelors,13,0,Exec-managerial,0,4,1,0,0,13,1,1
2,38,Private,215646,HS-grad,9,1,Handlers-cleaners,1,4,1,0,0,40,1,1
3,53,Private,234721,11th,7,0,Handlers-cleaners,0,2,1,0,0,40,1,1
4,28,Private,338409,Bachelors,13,0,Prof-specialty,5,2,0,0,0,40,0,1
5,37,Private,284582,Masters,14,0,Exec-managerial,5,4,0,0,0,40,1,1
6,49,Private,160187,9th,5,1,Other-service,1,2,0,0,0,16,0,1
7,52,Self-emp-not-inc,209642,HS-grad,9,0,Exec-managerial,0,4,1,0,0,45,1,0
8,31,Private,45781,Masters,14,1,Prof-specialty,1,4,0,14084,0,50,1,0
9,42,Private,159449,Bachelors,13,0,Exec-managerial,0,4,1,5178,0,40,1,0


In [207]:
# Original category labels, as stored by the most recently fitted encoder.
categories = label_encoder.classes_

# Tabulate every label next to its integer code (its position in classes_).
mapping_df = pd.Series(categories, name='Categoría').to_frame()
mapping_df['Código'] = range(len(categories))

print(mapping_df)


            Categoría  Código
0  Amer-Indian-Eskimo       0
1  Asian-Pac-Islander       1
2               Black       2
3               Other       3
4               White       4


In [208]:
def definir_tipo_empleo(x):
    """Map a record's work class to a coarse employment type.

    Parameters
    ----------
    x : mapping-like
        Record that can be indexed with 'clase_trabajo' (for example a DataFrame row).

    Returns
    -------
    str
        'govt' for 'Federal-gov', 'Local-gov' or 'State-gov';
        'private' for 'Private';
        'self_employed' for 'Self-emp-inc' or 'Self-emp-not-inc';
        'without_pay' for any other value.
    """
    clase = x['clase_trabajo']
    if clase in ('Federal-gov', 'Local-gov', 'State-gov'):
        return 'govt'
    if clase == 'Private':
        return 'private'
    if clase in ('Self-emp-inc', 'Self-emp-not-inc'):
        return 'self_employed'
    # Every value not matched above lands in this catch-all bucket.
    return 'without_pay'
 
# Trim surrounding whitespace from the work-class labels before categorising them.
df['clase_trabajo'] = df['clase_trabajo'].str.strip()
# Derive the coarse employment type row by row. Only the 'clase_trabajo' column is
# handed to the function because that is the only field it reads.
df['tipo_empleo'] = df[['clase_trabajo']].apply(definir_tipo_empleo, axis=1)

df.head(10)

Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario,tipo_empleo
0,39,State-gov,77516,Bachelors,13,1,Adm-clerical,1,4,1,2174,0,40,1,1,govt
1,50,Self-emp-not-inc,83311,Bachelors,13,0,Exec-managerial,0,4,1,0,0,13,1,1,self_employed
2,38,Private,215646,HS-grad,9,1,Handlers-cleaners,1,4,1,0,0,40,1,1,private
3,53,Private,234721,11th,7,0,Handlers-cleaners,0,2,1,0,0,40,1,1,private
4,28,Private,338409,Bachelors,13,0,Prof-specialty,5,2,0,0,0,40,0,1,private
5,37,Private,284582,Masters,14,0,Exec-managerial,5,4,0,0,0,40,1,1,private
6,49,Private,160187,9th,5,1,Other-service,1,2,0,0,0,16,0,1,private
7,52,Self-emp-not-inc,209642,HS-grad,9,0,Exec-managerial,0,4,1,0,0,45,1,0,self_employed
8,31,Private,45781,Masters,14,1,Prof-specialty,1,4,0,14084,0,50,1,0,private
9,42,Private,159449,Bachelors,13,0,Exec-managerial,0,4,1,5178,0,40,1,0,private


In [209]:
# Integer-encode 'tipo_empleo'. The fitted encoder stays bound to `label_encoder`
# so that its classes_ can be inspected afterwards.
label_encoder = LabelEncoder().fit(df['tipo_empleo'])
df['tipo_empleo'] = label_encoder.transform(df['tipo_empleo'])
df.head(10)

Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario,tipo_empleo
0,39,State-gov,77516,Bachelors,13,1,Adm-clerical,1,4,1,2174,0,40,1,1,0
1,50,Self-emp-not-inc,83311,Bachelors,13,0,Exec-managerial,0,4,1,0,0,13,1,1,2
2,38,Private,215646,HS-grad,9,1,Handlers-cleaners,1,4,1,0,0,40,1,1,1
3,53,Private,234721,11th,7,0,Handlers-cleaners,0,2,1,0,0,40,1,1,1
4,28,Private,338409,Bachelors,13,0,Prof-specialty,5,2,0,0,0,40,0,1,1
5,37,Private,284582,Masters,14,0,Exec-managerial,5,4,0,0,0,40,1,1,1
6,49,Private,160187,9th,5,1,Other-service,1,2,0,0,0,16,0,1,1
7,52,Self-emp-not-inc,209642,HS-grad,9,0,Exec-managerial,0,4,1,0,0,45,1,0,2
8,31,Private,45781,Masters,14,1,Prof-specialty,1,4,0,14084,0,50,1,0,1
9,42,Private,159449,Bachelors,13,0,Exec-managerial,0,4,1,5178,0,40,1,0,1


In [210]:
# Original category labels, as stored by the most recently fitted encoder.
categories = label_encoder.classes_

# Tabulate every label next to its integer code (its position in classes_).
mapping_df = pd.Series(categories, name='Categoría').to_frame()
mapping_df['Código'] = range(len(categories))

print(mapping_df)

       Categoría  Código
0           govt       0
1        private       1
2  self_employed       2
3    without_pay       3


In [211]:
# Replace the integer 0 with the placeholder label 'vacio' in 'ocupacion'.
# NOTE(review): at this point the column still appears to hold text labels (it is
# label-encoded only in the following cell), so this presumably matches nothing —
# confirm and consider removing.
df['ocupacion'] = df['ocupacion'].replace(0, 'vacio')

In [212]:
# Integer-encode 'ocupacion'. The fitted encoder stays bound to `label_encoder`
# so that its classes_ can be inspected afterwards.
label_encoder = LabelEncoder().fit(df['ocupacion'])
df['ocupacion'] = label_encoder.transform(df['ocupacion'])
df.head(10)

Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario,tipo_empleo
0,39,State-gov,77516,Bachelors,13,1,0,1,4,1,2174,0,40,1,1,0
1,50,Self-emp-not-inc,83311,Bachelors,13,0,3,0,4,1,0,0,13,1,1,2
2,38,Private,215646,HS-grad,9,1,5,1,4,1,0,0,40,1,1,1
3,53,Private,234721,11th,7,0,5,0,2,1,0,0,40,1,1,1
4,28,Private,338409,Bachelors,13,0,9,5,2,0,0,0,40,0,1,1
5,37,Private,284582,Masters,14,0,3,5,4,0,0,0,40,1,1,1
6,49,Private,160187,9th,5,1,7,1,2,0,0,0,16,0,1,1
7,52,Self-emp-not-inc,209642,HS-grad,9,0,3,0,4,1,0,0,45,1,0,2
8,31,Private,45781,Masters,14,1,9,1,4,0,14084,0,50,1,0,1
9,42,Private,159449,Bachelors,13,0,3,0,4,1,5178,0,40,1,0,1


In [213]:
# Original category labels, as stored by the most recently fitted encoder.
categories = label_encoder.classes_

# Tabulate every label next to its integer code (its position in classes_).
mapping_df = pd.Series(categories, name='Categoría').to_frame()
mapping_df['Código'] = range(len(categories))

print(mapping_df)

            Categoría  Código
0        Adm-clerical       0
1        Armed-Forces       1
2        Craft-repair       2
3     Exec-managerial       3
4     Farming-fishing       4
5   Handlers-cleaners       5
6   Machine-op-inspct       6
7       Other-service       7
8     Priv-house-serv       8
9      Prof-specialty       9
10    Protective-serv      10
11              Sales      11
12       Tech-support      12
13   Transport-moving      13


In [214]:
# Show the text education level side by side with its numeric counterpart.
df[['educacion','educacion_num']]

Unnamed: 0,educacion,educacion_num
0,Bachelors,13
1,Bachelors,13
2,HS-grad,9
3,11th,7
4,Bachelors,13
...,...,...
48837,Bachelors,13
48838,HS-grad,9
48839,Bachelors,13
48840,Bachelors,13


In [215]:
# Integer-encode 'educacion'. The fitted encoder stays bound to `label_encoder`
# so that its classes_ can be inspected afterwards.
label_encoder = LabelEncoder().fit(df['educacion'])
df['educacion'] = label_encoder.transform(df['educacion'])
df.head(10)

Unnamed: 0,edad,clase_trabajo,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario,tipo_empleo
0,39,State-gov,77516,9,13,1,0,1,4,1,2174,0,40,1,1,0
1,50,Self-emp-not-inc,83311,9,13,0,3,0,4,1,0,0,13,1,1,2
2,38,Private,215646,11,9,1,5,1,4,1,0,0,40,1,1,1
3,53,Private,234721,1,7,0,5,0,2,1,0,0,40,1,1,1
4,28,Private,338409,9,13,0,9,5,2,0,0,0,40,0,1,1
5,37,Private,284582,12,14,0,3,5,4,0,0,0,40,1,1,1
6,49,Private,160187,6,5,1,7,1,2,0,0,0,16,0,1,1
7,52,Self-emp-not-inc,209642,11,9,0,3,0,4,1,0,0,45,1,0,2
8,31,Private,45781,12,14,1,9,1,4,0,14084,0,50,1,0,1
9,42,Private,159449,9,13,0,3,0,4,1,5178,0,40,1,0,1


In [216]:
# Original category labels, as stored by the most recently fitted encoder.
categories = label_encoder.classes_

# Tabulate every label next to its integer code (its position in classes_).
mapping_df = pd.Series(categories, name='Categoría').to_frame()
mapping_df['Código'] = range(len(categories))

print(mapping_df)

       Categoría  Código
0           10th       0
1           11th       1
2           12th       2
3        1st-4th       3
4        5th-6th       4
5        7th-8th       5
6            9th       6
7     Assoc-acdm       7
8      Assoc-voc       8
9      Bachelors       9
10     Doctorate      10
11       HS-grad      11
12       Masters      12
13     Preschool      13
14   Prof-school      14
15  Some-college      15


In [217]:
# Continue with a new frame that leaves out the text column 'clase_trabajo';
# `df` itself keeps the column.
df_new = df.drop('clase_trabajo', axis='columns')
df_new.head()

Unnamed: 0,edad,fnlwgt,educacion,educacion_num,estado_civil,ocupacion,relacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,salario,tipo_empleo
0,39,77516,9,13,1,0,1,4,1,2174,0,40,1,1,0
1,50,83311,9,13,0,3,0,4,1,0,0,13,1,1,2
2,38,215646,11,9,1,5,1,4,1,0,0,40,1,1,1
3,53,234721,1,7,0,5,0,2,1,0,0,40,1,1,1
4,28,338409,9,13,0,9,5,2,0,0,0,40,0,1,1


In [218]:
# Rearrange the columns so that 'salario' ends up as the last one.
orden_columnas = [
    'edad', 'fnlwgt', 'educacion', 'educacion_num', 'estado_civil',
    'relacion', 'ocupacion', 'raza', 'genero',
    'ganancia-capital', 'perdida-capital', 'horas_semana',
    'pais', 'tipo_empleo', 'salario',
]
df_new = df_new[orden_columnas]
df_new.head()

Unnamed: 0,edad,fnlwgt,educacion,educacion_num,estado_civil,relacion,ocupacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,tipo_empleo,salario
0,39,77516,9,13,1,1,0,4,1,2174,0,40,1,0,1
1,50,83311,9,13,0,0,3,4,1,0,0,13,1,2,1
2,38,215646,11,9,1,1,5,4,1,0,0,40,1,1,1
3,53,234721,1,7,0,0,5,2,1,0,0,40,1,1,1
4,28,338409,9,13,0,5,9,2,0,0,0,40,0,1,1


In [219]:
# Binarise capital gain: every positive amount becomes 1; all other values are left
# untouched, so zeros stay 0. A single mask() covers both cases, which makes a
# separate "set 0 where the value is already 0" statement unnecessary.
tiene_ganancia = df_new['ganancia-capital'] > 0
df_new['ganancia-capital'] = df_new['ganancia-capital'].mask(tiene_ganancia, 1)

df_new.head(10)

Unnamed: 0,edad,fnlwgt,educacion,educacion_num,estado_civil,relacion,ocupacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,tipo_empleo,salario
0,39,77516,9,13,1,1,0,4,1,1,0,40,1,0,1
1,50,83311,9,13,0,0,3,4,1,0,0,13,1,2,1
2,38,215646,11,9,1,1,5,4,1,0,0,40,1,1,1
3,53,234721,1,7,0,0,5,2,1,0,0,40,1,1,1
4,28,338409,9,13,0,5,9,2,0,0,0,40,0,1,1
5,37,284582,12,14,0,5,3,4,0,0,0,40,1,1,1
6,49,160187,6,5,1,1,7,2,0,0,0,16,0,1,1
7,52,209642,11,9,0,0,3,4,1,0,0,45,1,2,0
8,31,45781,12,14,1,1,9,4,0,1,0,50,1,1,0
9,42,159449,9,13,0,0,3,4,1,1,0,40,1,1,0


In [220]:
# Binarise capital loss: every positive amount becomes 1; all other values are left
# untouched, so zeros stay 0. A single mask() covers both cases, which makes a
# separate "set 0 where the value is already 0" statement unnecessary.
tiene_perdida = df_new['perdida-capital'] > 0
df_new['perdida-capital'] = df_new['perdida-capital'].mask(tiene_perdida, 1)

df_new.head(10)

Unnamed: 0,edad,fnlwgt,educacion,educacion_num,estado_civil,relacion,ocupacion,raza,genero,ganancia-capital,perdida-capital,horas_semana,pais,tipo_empleo,salario
0,39,77516,9,13,1,1,0,4,1,1,0,40,1,0,1
1,50,83311,9,13,0,0,3,4,1,0,0,13,1,2,1
2,38,215646,11,9,1,1,5,4,1,0,0,40,1,1,1
3,53,234721,1,7,0,0,5,2,1,0,0,40,1,1,1
4,28,338409,9,13,0,5,9,2,0,0,0,40,0,1,1
5,37,284582,12,14,0,5,3,4,0,0,0,40,1,1,1
6,49,160187,6,5,1,1,7,2,0,0,0,16,0,1,1
7,52,209642,11,9,0,0,3,4,1,0,0,45,1,2,0
8,31,45781,12,14,1,1,9,4,0,1,0,50,1,1,0
9,42,159449,9,13,0,0,3,4,1,1,0,40,1,1,0


In [221]:
# Final check of dtypes and non-null counts before exporting.
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   edad              48842 non-null  int64
 1   fnlwgt            48842 non-null  int64
 2   educacion         48842 non-null  int32
 3   educacion_num     48842 non-null  int64
 4   estado_civil      48842 non-null  int64
 5   relacion          48842 non-null  int32
 6   ocupacion         48842 non-null  int32
 7   raza              48842 non-null  int32
 8   genero            48842 non-null  int64
 9   ganancia-capital  48842 non-null  int64
 10  perdida-capital   48842 non-null  int64
 11  horas_semana      48842 non-null  int64
 12  pais              48842 non-null  int32
 13  tipo_empleo       48842 non-null  int32
 14  salario           48842 non-null  int64
dtypes: int32(6), int64(9)
memory usage: 4.5 MB


In [222]:
# Write the processed frame to 'data_evaluacion_decoded.csv'.
# NOTE(review): to_csv is called with its default index handling, so the row index is
# presumably written as an extra unnamed first column — confirm whether readers of
# this file expect that, or pass index=False.
df_new.to_csv("data_evaluacion_decoded.csv") 