In [1]:
import pandas as pd
import numpy as np
from statistics import mean

pd.set_option('display.max_columns', 30)

types = {'id': int, 'provincia': str}

training_data = pd.read_csv(r'../../data/TP2/train.csv', dtype = types, usecols=['id', 'provincia'])
evaluation_data = pd.read_csv('../../data/TP2/test.csv', dtype = types, usecols=['id', 'provincia'])

training_data.head()

Unnamed: 0,id,provincia
0,254099,Distrito Federal
1,53461,Distrito Federal
2,247984,Jalisco
3,209067,Edo. de México
4,185997,Jalisco


In [2]:
na_shared_border = {#northernmost
    'Chihuahua': 1,
    'Sonora': 1,
    'Baja California Norte': 1,
    'Cohauila':1,
    'Nuevo León':1,
    'Tamaulipas':1,
    'nan' : None
}

pacific_ocean_access = {
    'Jalisco': 1,
    'Oaxaca': 1,
    'Colima': 1,
    'Guerrero': 1,
    'Sonora': 1,
    'Michoacán': 1,
    'Baja California Norte': 1,
    'Baja California Sur': 1,
    'Sinaloa': 1,
    'Chiapas': 1,
    'Nayarit': 1,
    'nan' : None
}

atlantic_ocean_access = {
    'Quintana Roo': 1,
    'Tamaulipas': 1,
    'Yucatán': 1,
    'Tabasco': 1,
    'Veracruz': 1,
    'Campeche': 1,
    'nan' : None
}

sa_shared_border = {#southernmost
    'Quintana Roo': 1,
    'Tabasco': 1,
    'Chiapas': 1,
    'Campeche': 1,
    'nan' : None
}

border_with_capital = {
    'Distrito Federal': 1,
    'Edo. de México': 1,
    'Morelos': 1,
    'nan' : None
}

In [3]:
def get(t, v):
    if v in t:
        return t[v]
    return 0

In [4]:
training_data['provincia'] = training_data['provincia'].apply(lambda x: str(x))
evaluation_data['provincia'] = evaluation_data['provincia'].apply(lambda x: str(x))

training_data['na_border'] = training_data['provincia'].apply(lambda x: get(na_shared_border, x))
evaluation_data['na_border'] = evaluation_data['provincia'].apply(lambda x: get(na_shared_border, x))

training_data['sa_border'] = training_data['provincia'].apply(lambda x: get(sa_shared_border, x))
evaluation_data['sa_border'] = evaluation_data['provincia'].apply(lambda x: get(sa_shared_border, x))

training_data['pacific_o'] = training_data['provincia'].apply(lambda x: get(atlantic_ocean_access, x))
evaluation_data['pacific_o'] = evaluation_data['provincia'].apply(lambda x: get(atlantic_ocean_access, x))

training_data['atlantic_o'] = training_data['provincia'].apply(lambda x: get(pacific_ocean_access, x))
evaluation_data['atlantic_o'] = evaluation_data['provincia'].apply(lambda x: get(pacific_ocean_access, x))

training_data['cap_border'] = training_data['provincia'].apply(lambda x: get(border_with_capital, x))
evaluation_data['cap_border'] = evaluation_data['provincia'].apply(lambda x: get(border_with_capital, x))


training_data.drop(columns=['provincia'],inplace=True)
evaluation_data.drop(columns=['provincia'],inplace=True)

training_data.head()

Unnamed: 0,id,na_border,sa_border,pacific_o,atlantic_o,cap_border
0,254099,0.0,0.0,0.0,0.0,1.0
1,53461,0.0,0.0,0.0,0.0,1.0
2,247984,0.0,0.0,0.0,1.0,0.0
3,209067,0.0,0.0,0.0,0.0,1.0
4,185997,0.0,0.0,0.0,1.0,0.0


In [5]:
training_data.to_csv('../../res/ftr/provincia_borders_analysis_train.csv')
evaluation_data.to_csv('../../res/ftr/provincia_borders_analysis_evaluation.csv')