In [1]:
import pandas as pd

# Let wide frames render up to 30 columns before pandas truncates them.
pd.set_option('display.max_columns', 30)

# Only these two columns are read from each CSV, with explicit dtypes.
types = {'id': int, 'descripcion': str}

# Same read for both files: train first, then test.
training_data, evaluation_data = (
    pd.read_csv(csv_path, dtype=types, usecols=['id', 'descripcion'])
    for csv_path in ('../../data/TP2/train.csv', '../../data/TP2/test.csv')
)

training_data.head()

Unnamed: 0,id,descripcion
0,254099,"depto. interior de 80.15m2, consta de sala com..."
1,53461,"<p>entre sonora y guerrero, atr&aacute;s del h..."
2,247984,descripcion \nla mejor ubicacion residencial e...
3,209067,casa en privada con caseta de vigilancia casas...
4,185997,bonito departamento en excelentes condiciones ...


In [2]:
# Replace missing descriptions with a placeholder token so that the text
# cleaning step always receives a string.
# The result is assigned back instead of calling fillna(..., inplace=True) on
# the selected column: that form is chained assignment, which warns on recent
# pandas versions and does not modify the frame under Copy-on-Write.
training_data['descripcion'] = training_data['descripcion'].fillna("sindescripcion")
evaluation_data['descripcion'] = evaluation_data['descripcion'].fillna("sindescripcion")

In [3]:
import html
import re
from unicodedata import normalize

# Removes combining marks (U+0300-U+036F) that follow a base character, except
# for a decomposed "ñ" (n + U+0303 with no further marks stacked on it), which
# is left intact. Case-insensitive so that "Ñ" is protected as well.
_DIACRITICS_RE = re.compile(
    r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
    flags=re.I,
)

# Any run of characters that is not an ASCII letter or ñ/Ñ.
_NON_LETTERS_RE = re.compile('[^a-zA-ZñÑ]+')


def clean_text(text):
    """Normalise a free-text description into lowercase letters and spaces.

    Steps, in order:
      1. Decode HTML entities (``&aacute;`` -> ``á``).
      2. Decompose to NFD and strip accents/diacritics, keeping ``ñ``.
      3. Recompose to NFC so ``ñ`` is a single code point again.
      4. Collapse every run of non-letter characters into one space.
      5. Lowercase the result.

    HTML tags are not removed as such: only ``<`` and ``>`` are replaced, so
    tag names (e.g. ``p``) survive as words. Leading/trailing spaces are not
    trimmed.

    Parameters
    ----------
    text : str
        Raw description text.

    Returns
    -------
    str
        The cleaned text.
    """
    text = html.unescape(text)
    # Pattern flags are bound at compile time; passing count/flags to re.sub
    # positionally is deprecated since Python 3.13.
    text = _DIACRITICS_RE.sub(r"\1", normalize("NFD", text))
    text = normalize('NFC', text)
    text = _NON_LETTERS_RE.sub(' ', text)
    return text.lower()

In [4]:
# Run every description through clean_text, element by element.
training_data['descripcion'] = training_data['descripcion'].apply(clean_text)
evaluation_data['descripcion'] = evaluation_data['descripcion'].apply(clean_text)

In [5]:
adjetivos_vector = ['linda', 'hermosa', 'amplia', 'preciosa', 'bonita', 'comoda', 'lujosa']

# Whole-word match with an optional plural "s". A plain substring test
# (`word in text`) also fires on unrelated words that merely contain an
# adjective, e.g. "colinda" (linda), "ampliacion" (amplia), "acomoda" (comoda).
adjetivos_pattern = r'\b(?:' + '|'.join(re.escape(word) for word in adjetivos_vector) + r')s?\b'


def flag_adjectives(descriptions):
    """Return 1 where a description uses any listed adjective, else 0.

    Parameters
    ----------
    descriptions : pandas.Series
        Description texts; missing values are treated as "no adjective".

    Returns
    -------
    pandas.Series
        int64 series of 0/1 flags, aligned with ``descriptions``.
    """
    # A non-capturing group is used in the pattern so that str.contains does
    # not warn about match groups.
    return (
        descriptions
        .str.contains(adjetivos_pattern, regex=True, na=False)
        .astype('int64')
    )


training_data['uso_de_adjetivos'] = flag_adjectives(training_data['descripcion'])
evaluation_data['uso_de_adjetivos'] = flag_adjectives(evaluation_data['descripcion'])

In [6]:
# The raw text is no longer needed once the flag has been derived; remove it
# from both frames in place.
training_data.drop('descripcion', axis='columns', inplace=True)
evaluation_data.drop('descripcion', axis='columns', inplace=True)

In [7]:
# Preview the first five rows of the training frame.
training_data.head(5)

Unnamed: 0,id,uso_de_adjetivos
0,254099,0
1,53461,0
2,247984,1
3,209067,0
4,185997,0


In [8]:
# Write both feature tables to disk (evaluation first, then training).
# No index option is passed, so pandas' default index handling applies.
evaluation_data.to_csv(
    path_or_buf='../../res/ftr/qualificative_adjectives_in_description_evaluation.csv'
)
training_data.to_csv(
    path_or_buf='../../res/ftr/qualificative_adjectives_in_description_train.csv'
)