In [1]:
import pandas as pd
import numpy as np

# Column name -> dtype for the only two columns this notebook reads.
types = {'id': int, 'provincia': str}

pd.set_option('display.max_columns', 30)

# Load the train and test splits with the same dtype / column selection.
training_data, evaluation_data = (
    pd.read_csv(csv_path, dtype=types, usecols=list(types))
    for csv_path in ('../../data/TP2/train.csv', '../../data/TP2/test.csv')
)

training_data.head()

Unnamed: 0,id,provincia
0,254099,Distrito Federal
1,53461,Distrito Federal
2,247984,Jalisco
3,209067,Edo. de México
4,185997,Jalisco


In [2]:
# Replace missing provinces with an explicit placeholder label.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a column selection is a chained in-place update, which pandas deprecates and
# which leaves the parent frame untouched when Copy-on-Write is enabled.
training_data['provincia'] = training_data['provincia'].fillna('sin_provincia')
evaluation_data['provincia'] = evaluation_data['provincia'].fillna('sin_provincia')

training_data.head()

Unnamed: 0,id,provincia
0,254099,Distrito Federal
1,53461,Distrito Federal
2,247984,Jalisco
3,209067,Edo. de México
4,185997,Jalisco


In [3]:
# Region name -> list of provinces that belong to it.
# Membership is tested by exact string equality, so the spellings here must
# match the values found in the `provincia` column.
region_economica = dict(
    noroeste=[
        'Baja California Norte',
        'Baja California Sur',
        'Chihuahua',
        'Durango',
        'Sinaloa',
        'Sonora',
    ],
    noreste=['Coahuila', 'Nuevo León', 'Tamaulipas'],
    occidente=['Colima', 'Jalisco', 'Michoacán', 'Nayarit'],
    oriente=['Hidalgo', 'Puebla', 'Tlaxcala', 'Veracruz'],
    centronorte=[
        'Aguascalientes',
        'Guanajuato',
        'Querétaro',
        'San luis Potosí',
        'Zacatecas',
    ],
    centrosur=['Edo. de México', 'Distrito Federal', 'Morelos'],
    suroeste=['Chiapas', 'Guerrero', 'Oaxaca'],
    sureste=['Campeche', 'Quintana Roo', 'Tabasco', 'Yucatán'],
)

In [4]:
def region_clasification(provincia):
    """Return the region whose province list contains ``provincia``.

    The lookup walks the module-level ``region_economica`` mapping in its
    insertion order and returns the first key whose list contains the value.
    When no list contains it, the string ``'sindatos'`` is returned.
    """
    matching_regions = (
        nombre
        for nombre, provincias in region_economica.items()
        if provincia in provincias
    )
    return next(matching_regions, 'sindatos')

In [5]:
# Map every province to its region label, on both splits.
training_data['region'] = training_data['provincia'].apply(region_clasification)
evaluation_data['region'] = evaluation_data['provincia'].apply(region_clasification)

In [6]:
# Remove the raw province column from both splits, keeping `id` and `region`.
training_data = training_data.drop(columns=['provincia'])
evaluation_data = evaluation_data.drop(columns=['provincia'])

In [7]:
# Preview the first rows of the training frame.
training_data.head()

Unnamed: 0,id,region
0,254099,centrosur
1,53461,centrosur
2,247984,occidente
3,209067,centrosur
4,185997,occidente


In [8]:
# Persist the region feature for both splits.
# index=True is the pandas default, spelled out here because it writes the
# row index as an extra unnamed first column of the CSV.
training_data.to_csv('../../res/ftr/regions_of_mexico_train.csv', index=True)
evaluation_data.to_csv('../../res/ftr/regions_of_mexico_evaluation.csv', index=True)

In [9]:
# Gather the distinct region labels that occur in either split.
training_entities = training_data.loc[:, ['region']].to_numpy()
evaluation_entities = evaluation_data.loc[:, ['region']].to_numpy()
# The single-column selections yield 2-D arrays; flatten them before joining.
all_entities = np.unique(
    np.concatenate([training_entities.ravel(), evaluation_entities.ravel()])
)

In [10]:
# One-hot encode `region`: one 0/1 column per label in all_entities, added to
# both frames from the same label list so their columns line up.
for entity in all_entities:
    column = 'region_' + entity
    # Vectorized equality test instead of a per-row Python lambda; the cast
    # turns the boolean mask into integer 0/1 values.
    training_data[column] = (training_data['region'] == entity).astype('int64')
    evaluation_data[column] = (evaluation_data['region'] == entity).astype('int64')

# The categorical column is dropped now that the indicator columns exist.
training_data = training_data.drop(columns=['region'])
evaluation_data = evaluation_data.drop(columns=['region'])
training_data.head()

Unnamed: 0,id,region_centronorte,region_centrosur,region_noreste,region_noroeste,region_occidente,region_oriente,region_sindatos,region_sureste,region_suroeste
0,254099,0,1,0,0,0,0,0,0,0
1,53461,0,1,0,0,0,0,0,0,0
2,247984,0,0,0,0,1,0,0,0,0
3,209067,0,1,0,0,0,0,0,0,0
4,185997,0,0,0,0,1,0,0,0,0


In [11]:
# Persist the one-hot encoded region feature for both splits.
# index=True is the pandas default, spelled out here because it writes the
# row index as an extra unnamed first column of the CSV.
training_data.to_csv('../../res/ftr/encoded_regions_of_mexico_train.csv', index=True)
evaluation_data.to_csv('../../res/ftr/encoded_regions_of_mexico_evaluation.csv', index=True)