In [1]:
import pandas as pd
import numpy as np
from statistics import mean

# Show up to 30 columns when a frame is rendered in the notebook.
pd.set_option('display.max_columns', 30)

# Only the row identifier and the province name are loaded; the same dtype
# mapping is applied to both CSV files.
types = {'id': int, 'provincia': str}

training_data = pd.read_csv(
    '../../data/TP2/train.csv',
    usecols=['id', 'provincia'],
    dtype=types,
)
evaluation_data = pd.read_csv(
    '../../data/TP2/test.csv',
    usecols=['id', 'provincia'],
    dtype=types,
)

training_data.head()

Unnamed: 0,id,provincia
0,254099,Distrito Federal
1,53461,Distrito Federal
2,247984,Jalisco
3,209067,Edo. de México
4,185997,Jalisco


In [2]:
# Lookup table: province name -> population figure for that province.
# The keys are looked up with the raw values of the 'provincia' column, so
# their spelling must stay exactly as written here (e.g. "San luis Potosí").
# NOTE(review): the source and reference year of these figures are not
# recorded in this notebook -- confirm before updating any value.
population_by_entity = {
    "Distrito Federal": 8851080,
    "Jalisco": 7350682,
    "Edo. de México": 15175862,
    "Oaxaca": 3801962,
    "Quintana Roo": 1325578,
    "Colima": 650555,
    "San luis Potosí": 2585518,
    "Nuevo León": 4653458,
    "Querétaro": 1827937,
    "Tamaulipas": 3268554,
    "Puebla": 5779829,
    "Yucatán": 1955577,
    "Morelos": 1777227,
    "Guerrero": 3388768,
    "Tabasco": 2238603,
    "Guanajuato": 5486372,
    "Hidalgo": 2665018,
    "Veracruz": 7643194,
    "Chihuahua": 3406465,
    "Aguascalientes": 1184996,
    "Sonora": 2662480,
    "Michoacán": 4351037,
    "Baja California Norte": 3155070,
    "Baja California Sur": 637026,
    "Coahuila": 2748391,
    "Durango": 1632934,
    "Sinaloa": 2767761,
    "Chiapas": 4796580,
    "Nayarit": 1084979,
    "Tlaxcala": 1169936,
    "Campeche": 822441,
    "Zacatecas": 1490668,
    # str() of a missing province is 'nan'; it deliberately maps to no value.
    "nan": None,
}

# Attach the population of each row's province to both data sets.
for frame in (training_data, evaluation_data):
    # Missing provinces become the literal string 'nan' so that they hit the
    # explicit 'nan' entry of the lookup tables (later cells rely on this too).
    frame['provincia'] = frame['provincia'].apply(str)

    # Fail fast and name every unmapped province, instead of surfacing a bare
    # KeyError from inside a per-row lookup.
    unknown = set(frame['provincia'].unique()).difference(population_by_entity)
    if unknown:
        raise KeyError(
            'provinces missing from population_by_entity: {}'.format(sorted(unknown))
        )

    # Vectorised dictionary lookup; the 'nan' entry (None) becomes NaN.  The
    # explicit cast keeps the column float64 in both frames, whether or not a
    # given frame actually contains a missing province.
    frame['population_of_entity'] = (
        frame['provincia'].map(population_by_entity).astype('float64')
    )

training_data.head()

Unnamed: 0,id,provincia,population_of_entity
0,254099,Distrito Federal,8851080.0
1,53461,Distrito Federal,8851080.0
2,247984,Jalisco,7350682.0
3,209067,Edo. de México,15175862.0
4,185997,Jalisco,7350682.0


In [3]:
# Lookup table: province name -> GDP figure for that province.
# The keys are looked up with the raw values of the 'provincia' column, so
# their spelling must stay exactly as written here (e.g. "San luis Potosí").
# NOTE(review): the unit, source and reference year of these figures are not
# recorded in this notebook -- confirm before updating any value.
gdp_by_entity = {
    "Distrito Federal": 2254840,
    "Edo. de México": 1209424,
    "Nuevo León": 999407,
    "Jalisco": 850237,
    "Campeche": 610623,
    "Veracruz": 675465,
    "Tabasco": 434948,
    "Guanajuato": 557382,
    "Puebla": 424856,
    "Coahuila": 454047,
    "Tamaulipas": 413971,
    "Chihuahua": 382604,
    "Sonora": 397033,
    "Baja California Norte": 382383,
    "Michoacán": 318308,
    "Querétaro": 292723,
    "Sinaloa": 276520,
    "San luis Potosí": 257972,
    "Chiapas": 234618,
    "Hidalgo": 216423,
    "Quintana Roo": 213670,
    "Oaxaca": 208967,
    "Guerrero": 196501,
    "Yucatán": 195471,
    "Durango": 158956,
    "Aguascalientes": 163567,
    "Morelos": 155936,
    "Zacatecas": 126156,
    "Nayarit": 87956,
    "Colima": 79809,
    "Baja California Sur": 97833,
    "Tlaxcala": 72689,
    # 'nan' is the string form of a missing province; it maps to no value.
    "nan": None,
}

# Attach the GDP of each row's province to both data sets.
for frame in (training_data, evaluation_data):
    # Fail fast and name every unmapped province, instead of surfacing a bare
    # KeyError from inside a per-row lookup.  Real missing values are skipped
    # here: they simply map to NaN below.
    unknown = set(frame['provincia'].dropna().unique()).difference(gdp_by_entity)
    if unknown:
        raise KeyError(
            'provinces missing from gdp_by_entity: {}'.format(sorted(unknown))
        )

    # Vectorised dictionary lookup; the 'nan' entry (None) becomes NaN.  The
    # explicit cast keeps the column float64 in both frames, whether or not a
    # given frame actually contains a missing province.
    frame['gdp_of_entity'] = frame['provincia'].map(gdp_by_entity).astype('float64')

training_data.head()

Unnamed: 0,id,provincia,population_of_entity,gdp_of_entity
0,254099,Distrito Federal,8851080.0,2254840.0
1,53461,Distrito Federal,8851080.0,2254840.0
2,247984,Jalisco,7350682.0,850237.0
3,209067,Edo. de México,15175862.0,1209424.0
4,185997,Jalisco,7350682.0,850237.0


In [4]:
# GDP per inhabitant of each row's province: the plain ratio of the two
# looked-up columns.
# NOTE(review): the unit of this ratio depends on the unit of gdp_by_entity,
# which is not recorded in this notebook -- confirm before interpreting it.
training_data['gdp_per_capita_of_entity'] = training_data['gdp_of_entity']/training_data['population_of_entity']
evaluation_data['gdp_per_capita_of_entity'] = evaluation_data['gdp_of_entity']/evaluation_data['population_of_entity']

# The province name is no longer needed once the numeric features exist.
# A non-inplace drop with errors='ignore' keeps this cell re-runnable: running
# it a second time no longer raises KeyError because 'provincia' is gone.
training_data = training_data.drop(columns=['provincia'], errors='ignore')
evaluation_data = evaluation_data.drop(columns=['provincia'], errors='ignore')

training_data.head()

Unnamed: 0,id,population_of_entity,gdp_of_entity,gdp_per_capita_of_entity
0,254099,8851080.0,2254840.0,0.254753
1,53461,8851080.0,2254840.0,0.254753
2,247984,7350682.0,850237.0,0.115668
3,209067,15175862.0,1209424.0,0.079694
4,185997,7350682.0,850237.0,0.115668


In [5]:
# Persist both feature tables.  to_csv is called with its defaults, so the
# frame index is written as the first (unnamed) column of each file.
for frame, destination in (
    (training_data, '../../res/ftr/provincia_economy_train.csv'),
    (evaluation_data, '../../res/ftr/provincia_economy_evaluation.csv'),
):
    frame.to_csv(destination)