In [1]:
import pandas as pd
import numpy as np
import math

## Funciones

In [2]:
def get_categorical_col_name(df):
    """Return the names of the categorical columns of ``df``.

    A column counts as categorical when its dtype is ``object`` (``'O'``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected.

    Returns
    -------
    list
        Names of the object-dtype columns, in the frame's column order.
    """
    return [name for name in df.columns if df[name].dtype == 'O']

In [3]:
def get_encoding(df, categorical_cols):
    """Build the binary code assigned to every value of each categorical column.

    The values of a column are numbered 1, 2, 3, ... in the order returned by
    ``value_counts()`` (most frequent first by default), and each number is
    written as a fixed-width list of bits, most significant bit first.
    Code 0 (all zeros) is never assigned to a value. Missing values are not
    counted by ``value_counts()`` and therefore receive no code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    categorical_cols : iterable
        Names of the columns of ``df`` to build codes for.

    Returns
    -------
    dict
        ``{column: {value: [bit, bit, ...]}}``; all codes of one column have
        the same length.
    """
    enc_cols = {}

    for col in categorical_cols:
        values = df[col].value_counts()
        # Width needed to write the largest code, len(values). bit_length() is
        # exact integer arithmetic, whereas ceil(log(k + 1, 2)) goes through
        # floats and can overshoot by one at exact powers of two.
        n_bits = len(values).bit_length()
        spec = '0{}b'.format(n_bits)

        enc_cols[col] = {
            value: [int(bit) for bit in format(num, spec)]
            for num, value in enumerate(values.index, start=1)
        }

    return enc_cols

In [4]:
def get_conditions(df, col, values):
    """Return one boolean mask per entry of ``values``.

    The i-th mask is ``df[col] == values[i]``, i.e. it flags the rows whose
    ``col`` equals that value. Masks are returned in the same order as
    ``values``.
    """
    return [df[col] == candidate for candidate in values]

def get_values(enc_values, values, i):
    """Return the ``i``-th bit of the code of each entry of ``values``.

    ``enc_values`` maps a value to its list of bits. The result follows the
    order of ``values``, which must match the order of the conditions it is
    paired with.
    """
    return [enc_values[key][i] for key in values]

def encode_cols(df, categorical_cols, enc_cols):
    """Add the binary-encoding columns for each categorical column, in place.

    For every column ``col`` the frame gains one 0/1 column per bit of its
    codes, named ``<col>_b<k>``, where ``k`` counts down to 0 for the last
    bit of the code. The original column is left in the frame.

    The categories and the code width are read from ``enc_cols[col]`` rather
    than recomputed from ``df``, so an encoding built on one frame can be
    applied to another. Rows whose value matches no encoded category get 0 in
    every bit column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; modified in place.
    categorical_cols : iterable of str
        Columns of ``df`` to encode.
    enc_cols : dict
        ``{column: {value: [bit, ...]}}``; the codes of one column are
        expected to share a single length.
    """
    for col in categorical_cols:
        enc_values = enc_cols[col]
        categories = list(enc_values)
        # Width is taken from the first code; an empty mapping yields no bits.
        n_bits = len(next(iter(enc_values.values()), []))
        conditions = get_conditions(df, col, categories)

        for i in range(n_bits):
            new_col_name = col + "_b" + str(n_bits - i - 1)
            bit_values = get_values(enc_values, categories, i)
            # np.select uses its default (0) where no condition holds.
            df[new_col_name] = np.select(conditions, bit_values)

In [5]:
def categorical_encoder(df):
    """Binary-encode the categorical columns of ``df`` in place.

    Every object-dtype column is expanded into 0/1 bit columns and the
    original column is then dropped. The codes are derived from the value
    counts of ``df`` itself, so each call builds its own encoding.

    Returns None; ``df`` is modified.
    """
    object_cols = get_categorical_col_name(df)
    encode_cols(df, object_cols, get_encoding(df, object_cols))
    df.drop(columns=object_cols, inplace=True)
    

## Train

In [6]:
# Load the training set from CSV and preview the first rows.
train = pd.read_csv('../Train/train_set_sin_building_id.csv')

train.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
0,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
1,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,2
2,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,3
3,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,2
4,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,3


In [7]:
# Replace the categorical columns of `train` with their binary-encoded bit
# columns. The frame is modified in place, so re-running this cell on the
# already-encoded frame finds no categorical columns left to encode.
categorical_encoder(train)

train.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,position_b2,position_b1,position_b0,plan_configuration_b3,plan_configuration_b2,plan_configuration_b1,plan_configuration_b0,legal_ownership_status_b2,legal_ownership_status_b1,legal_ownership_status_b0
0,6,487,12198,2,30,6,5,1,1,0,...,0,1,0,0,0,0,1,0,0,1
1,8,900,2812,2,10,8,7,0,1,0,...,0,0,1,0,0,0,1,0,0,1
2,21,363,8973,2,10,5,5,0,1,0,...,0,1,0,0,0,0,1,0,0,1
3,22,418,10694,2,10,6,5,0,1,0,...,0,0,1,0,0,0,1,0,0,1
4,11,131,1488,3,30,8,9,1,0,0,...,0,0,1,0,0,0,1,0,0,1


In [8]:
# Move the target column to the end: the encoder appended the new bit
# columns after it.
cols = train.columns.tolist()
cols.append(cols.pop(cols.index('damage_grade')))

train = train[cols]

train.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,position_b1,position_b0,plan_configuration_b3,plan_configuration_b2,plan_configuration_b1,plan_configuration_b0,legal_ownership_status_b2,legal_ownership_status_b1,legal_ownership_status_b0,damage_grade
0,6,487,12198,2,30,6,5,1,1,0,...,1,0,0,0,0,1,0,0,1,3
1,8,900,2812,2,10,8,7,0,1,0,...,0,1,0,0,0,1,0,0,1,2
2,21,363,8973,2,10,5,5,0,1,0,...,1,0,0,0,0,1,0,0,1,3
3,22,418,10694,2,10,6,5,0,1,0,...,0,1,0,0,0,1,0,0,1,2
4,11,131,1488,3,30,8,9,1,0,0,...,0,1,0,0,0,1,0,0,1,3


In [9]:
# Write the encoded training set; index=False keeps the row index out of the file.
train.to_csv('../Train/train_set_binary_encoding.csv', index=False)

## Test

In [154]:
# Load the test set and binary-encode its categorical columns in place.
# NOTE(review): the codes are rebuilt from the test set's own value counts,
# independently of the training set. A category gets the same bits in both
# files only if each column's categories rank identically by frequency in
# train and test — confirm this, or build the encoding once on train and
# reuse it here.
test = pd.read_csv('../Test/test_set_sin_building_id.csv')

categorical_encoder(test)

test.head()

Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,position_b2,position_b1,position_b0,plan_configuration_b3,plan_configuration_b2,plan_configuration_b1,plan_configuration_b0,legal_ownership_status_b2,legal_ownership_status_b1,legal_ownership_status_b0
0,17,596,11307,3,20,7,6,0,1,0,...,0,0,1,0,0,0,1,0,0,1
1,6,141,11987,2,25,13,5,0,1,0,...,0,0,1,0,0,0,1,0,0,1
2,22,19,10044,2,5,4,5,0,1,0,...,0,0,1,0,0,0,1,0,0,1
3,26,39,633,1,0,19,3,0,0,0,...,0,1,0,0,0,0,1,0,0,1
4,17,289,7970,3,15,8,7,0,1,0,...,0,1,0,0,0,0,1,0,0,1


In [155]:
# Write the encoded test set; index=False keeps the row index out of the file.
test.to_csv('../Test/test_set_binary_encoding.csv', index=False)