In [1]:
import pandas as pd

# Allow up to 30 columns when a DataFrame is rendered.
pd.set_option('display.max_columns', 30)

# Only the `id` and `direccion` columns are read, with explicit dtypes.
types = {'id': int, 'direccion': str}
read_options = dict(dtype=types, usecols=['id', 'direccion'])

# Both files are loaded with the same column selection and dtypes.
training_data = pd.read_csv(r'../../data/TP2/train.csv', **read_options)
evaluation_data = pd.read_csv('../../data/TP2/test.csv', **read_options)

# Preview the first rows of the training frame.
training_data.head()

Unnamed: 0,id,direccion
0,254099,Avenida Division del Norte 2005
1,53461,AV. MEXICO
2,247984,Urbi Tonala
3,209067,IGNACIO MANUEL ALTAMIRANO 128
4,185997,PASEOS DEL SOL


In [2]:
import html
import re
from unicodedata import normalize

def clean_text(text):
    """Reduce a string to lowercase letters (a-z plus ñ) separated by single spaces.

    Steps, in order:
      1. Decode HTML entities.
      2. Decompose to NFD and remove combining diacritical marks
         (U+0300 to U+036F). An ``n``/``N`` followed by a lone combining
         tilde is left untouched, so ñ/Ñ survive.
      3. Recompose to NFC.
      4. Replace every run of characters other than a-z, A-Z, ñ, Ñ with one space.
      5. Lowercase.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text. A leading or trailing run of non-letters becomes a
        single space; the result is not stripped.
    """
    text = html.unescape(text)

    # Group 1 captures the base character to keep; the trailing character
    # class consumes the combining marks that follow it.
    # `count` and `flags` are passed by keyword: passing them positionally is
    # deprecated since Python 3.13 and is easy to misread.
    text = re.sub(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
        r"\1",
        normalize("NFD", text),
        count=0,
        flags=re.I,
    )

    text = normalize('NFC', text)
    text = re.sub('[^a-zA-ZñÑ]+', ' ', text)
    return text.lower()

In [3]:
# Replace missing addresses with a placeholder token so the text cleaning
# step always receives a string.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the selected Series: that chained in-place
# form emits a FutureWarning on pandas 2.x and does not update the DataFrame
# when Copy-on-Write is enabled (the default from pandas 3.0).
training_data['direccion'] = training_data['direccion'].fillna("sindireccion")
evaluation_data['direccion'] = evaluation_data['direccion'].fillna("sindireccion")

In [4]:
# Inspect the training frame after missing `direccion` values were filled with "sindireccion".
training_data

Unnamed: 0,id,direccion
0,254099,Avenida Division del Norte 2005
1,53461,AV. MEXICO
2,247984,Urbi Tonala
3,209067,IGNACIO MANUEL ALTAMIRANO 128
4,185997,PASEOS DEL SOL
...,...,...
239995,119879,BOSQUES
239996,259178,Filiberto Navas 325
239997,131932,Nicolas San Juan
239998,146867,Javier Rojo Gomez 120


In [5]:
# Normalize the address text of both frames with `clean_text`, element by element.
for frame in (training_data, evaluation_data):
    frame['direccion'] = frame['direccion'].apply(clean_text)

In [6]:
# Inspect the training frame after `clean_text` has been applied to `direccion`.
training_data

Unnamed: 0,id,direccion
0,254099,avenida division del norte
1,53461,av mexico
2,247984,urbi tonala
3,209067,ignacio manuel altamirano
4,185997,paseos del sol
...,...,...
239995,119879,bosques
239996,259178,filiberto navas
239997,131932,nicolas san juan
239998,146867,javier rojo gomez


In [7]:
# Spellings (including misspellings and plurals) that count as "avenida".
avenida_vector = ['avenida', 'av', 'abenida', 'havenida', 'habenida', 'avenidas', 'avs', 'havenidas', 'abenidas', 'habenidas'  ]
avenida_words = frozenset(avenida_vector)


def has_avenida_word(direccion):
    """Return 1 if any whitespace-separated word of ``direccion`` is an avenida spelling, else 0.

    Whole words are compared rather than substrings: with ``word in text`` the
    short entry 'av' also matches unrelated words such as "javier" or "navas",
    flagging addresses that do not mention an avenue.
    """
    return 0 if avenida_words.isdisjoint(direccion.split()) else 1


# Binary feature: 1 when the address contains one of the avenida words.
training_data['avenida'] = training_data['direccion'].apply(has_avenida_word)
evaluation_data['avenida'] = evaluation_data['direccion'].apply(has_avenida_word)

In [8]:
# Remove the raw address text from both frames, in place.
for frame in (training_data, evaluation_data):
    frame.drop(columns=['direccion'], inplace=True)

In [9]:
# Preview the remaining columns: `id` and the `avenida` flag.
training_data.head()

Unnamed: 0,id,avenida
0,254099,1
1,53461,1
2,247984,0
3,209067,0
4,185997,0


In [10]:
# Write each feature frame to its CSV file (evaluation first, then training),
# using pandas' default `to_csv` options.
for frame, destination in (
    (evaluation_data, '../../res/ftr/avenida_in_direction_evaluation.csv'),
    (training_data, '../../res/ftr/avenida_in_direction_train.csv'),
):
    frame.to_csv(destination)