In [1]:
import pandas as pd
import numpy as np

## Funciones

In [2]:
# Obtiene los nombres de las columnas categóricas
def get_categorical_col_name(df):
    columns = df.columns
    categorical_cols = []

    for col in columns:
        if df[col].dtype == 'O':
            categorical_cols.append(col)
            
    return categorical_cols

# Crea nuevas columnas con 1s y 0s en base a la cantidad de valores que hayan en la columna original (col_name)
def one_hot_encoding(df, col_name):
    values = df[col_name].value_counts()
    
    for i in range(len(values)):
        new_col_name = col_name + "_" + values.index[i]
        df[new_col_name] = np.where(df[col_name] == values.index[i], 1, 0)
            
# Transforma las columnas categóricas en columnas de 1s y 0s
def categorical_encoder(df):
    categorical_cols = get_categorical_col_name(df)
    
    for col in categorical_cols:
        one_hot_encoding(df, col)
        
    df.drop(columns=categorical_cols, inplace=True)

## Train

In [72]:
train = pd.read_csv("../Train/train_set_sin_building_id.csv")

train.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
1,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
2,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
3,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
4,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3


In [74]:
categorical_encoder(train)

train.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_c,plan_configuration_a,plan_configuration_o,plan_configuration_m,plan_configuration_n,plan_configuration_f,legal_ownership_status_v,legal_ownership_status_a,legal_ownership_status_w,legal_ownership_status_r
0,6,487,12198,2,30,6,5,1,1,0,...,0,0,0,0,0,0,1,0,0,0
1,8,900,2812,2,10,8,7,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,21,363,8973,2,10,5,5,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,22,418,10694,2,10,6,5,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,11,131,1488,3,30,8,9,1,0,0,...,0,0,0,0,0,0,1,0,0,0


In [75]:
cols = list(train.columns)

cols.remove('damage_grade')
cols.append('damage_grade')

train = train[cols]

train.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_a,plan_configuration_o,plan_configuration_m,plan_configuration_n,plan_configuration_f,legal_ownership_status_v,legal_ownership_status_a,legal_ownership_status_w,legal_ownership_status_r,damage_grade
0,6,487,12198,2,30,6,5,1,1,0,...,0,0,0,0,0,1,0,0,0,3
1,8,900,2812,2,10,8,7,0,1,0,...,0,0,0,0,0,1,0,0,0,2
2,21,363,8973,2,10,5,5,0,1,0,...,0,0,0,0,0,1,0,0,0,3
3,22,418,10694,2,10,6,5,0,1,0,...,0,0,0,0,0,1,0,0,0,2
4,11,131,1488,3,30,8,9,1,0,0,...,0,0,0,0,0,1,0,0,0,3


In [76]:
train.to_csv('../Train/train_set_one_hot_encoding.csv', index=False)

## Test

In [3]:
test = pd.read_csv("../Test/test_set_sin_building_id.csv")

categorical_encoder(test)

test.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,plan_configuration_s,plan_configuration_a,plan_configuration_o,plan_configuration_m,plan_configuration_n,plan_configuration_f,legal_ownership_status_v,legal_ownership_status_a,legal_ownership_status_w,legal_ownership_status_r
0,17,596,11307,3,20,7,6,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,6,141,11987,2,25,13,5,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,22,19,10044,2,5,4,5,0,1,0,...,0,0,0,0,0,0,1,0,0,0
3,26,39,633,1,0,19,3,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,17,289,7970,3,15,8,7,0,1,0,...,0,0,0,0,0,0,1,0,0,0


In [4]:
test.to_csv("../Test/test_set_one_hot_encoding.csv", index=False)