In [1]:
import pandas as pd
import numpy as np
import os
import ast
import nltk
import re
import unidecode
from nltk.corpus import stopwords

In [2]:
# Spanish stop-word list taken from the NLTK stopwords corpus.
stop_words = stopwords.words('spanish')

# Datos

In [3]:
# Move the working directory one level up (presumably so the relative 'data/...' paths below resolve).
# NOTE: not idempotent - every re-run of this cell climbs one more directory.
os.chdir(os.pardir)

In [4]:
# Load the processed reviews table and the place-types table, in that order.
df, df_1 = (
    pd.read_csv(csv_path)
    for csv_path in ('data/places_reviews_processed.csv', 'data/places_types.csv')
)

# Formato

In [5]:
# Keep only the columns used by the analysis.
df = df.loc[:, ['place_id', 'url', 'text']]

In [6]:
# Attach the columns of df_1 to every review row via place_id (left join keeps all reviews).
df = df.merge(df_1, on='place_id', how='left')

In [7]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [8]:
# Distinct place types present after the join.
df.loc[:, 'tipo_lugar'].unique()

array(['Museo', 'Comida mexicana', 'Templo', 'Teatro', 'Ecoturismo',
       'Pirámides'], dtype=object)

# Temática: Restaurantes

In [9]:
# Reviews whose place type is 'Comida mexicana'.
restaurantes = df.loc[df['tipo_lugar'].eq('Comida mexicana')]

In [10]:
# Review text to analyse, plus the identifying columns to re-attach later by index.
corpus = restaurantes['text']
data_corpus = restaurantes.loc[:, ['place_id', 'url']]

# Análisis

### Limpiar el texto

In [11]:
# Spanish stop words, loaded once and folded to ASCII so they can match the
# accent-stripped tokens built in doClean (e.g. 'más' -> 'mas').
_STOPWORDS_ES = frozenset(
    unidecode.unidecode(word) for word in stopwords.words('spanish')
)


def doClean(text):
    """Normalise one review into lowercase, accent-free text without stop words.

    Parameters
    ----------
    text : any
        Raw review; it is passed through ``str()`` first, so non-string
        values are cleaned as their string representation.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Remove bracketed and parenthesised spans.
    # NOTE(review): both patterns are greedy, so everything between the first
    # opening and the last closing delimiter on a line is removed - confirm
    # that this is the intended behaviour before relying on it.
    cleaned = re.sub(r'\[.+\]', ' ', str(text))
    cleaned = re.sub(r'\(.+\)', ' ', cleaned)
    # Strip accents / transliterate to ASCII.
    cleaned = unidecode.unidecode(cleaned)
    # Replace every non-word character with a space, then lowercase.
    cleaned = re.sub(r'\W', ' ', cleaned).lower()
    # split() already collapses whitespace runs; the stop-word set is built once
    # at definition time instead of being re-read from the corpus for every token.
    return " ".join(
        word for word in cleaned.split() if word not in _STOPWORDS_ES
    )

In [12]:
# Clean every review; the original index is preserved.
corpus_clean = corpus.map(doClean)

# TF-IDF Model

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned restaurant reviews, limited to the 300 top terms and
# ignoring terms present in more than 90% of the reviews.
tv = TfidfVectorizer(min_df=0., max_df=.9, use_idf=True, max_features=300)
tv_matrix = tv.fit_transform(corpus_clean).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older releases.
if hasattr(tv, 'get_feature_names_out'):
    vocab = tv.get_feature_names_out().tolist()
else:
    vocab = tv.get_feature_names()
prueba = pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

In [14]:
# Keyword vocabularies used to score each restaurant review. Words are written
# without accents because the cleaned text is accent-free. Duplicate entries
# were removed: they have no effect on membership tests.
comida_words = [
    'buen', 'buena', 'buenas', 'bueno', 'buenos', 'calidad', 'delicia',
    'deliciosa', 'deliciosas', 'delicioso', 'deliciosos', 'delicious',
    'exquisito', 'presentacion', 'rica', 'ricas', 'rico', 'ricos',
    'riquisima', 'riquisimo', 'sabor', 'sabores', 'sabroso', 'sazon',
]
servicio_words = [
    'amable', 'amables', 'atencion', 'atento', 'atentos', 'atienden',
    'limpieza', 'higiene', 'limpio', 'rapido', 'service', 'servicio',
]
ambiente_words = [
    'acogedor', 'agradable', 'ambiente', 'bonito', 'decoracion',
    'familiar', 'musica', 'tranquilo',
]
satisfaccion_words = [
    'encanta', 'encanto', 'espectacular', 'especial', 'excelente',
    'excelentes', 'exelente', 'good', 'great', 'increible', 'nice',
    'recomendable', 'recomendado', 'recomiendo',
]
# Dish names collected per review into the 'specials' column.
specials_words = [
    'barbacoa', 'birria', 'carne', 'carnitas', 'chocolate', 'cochinita',
    'enchiladas', 'mariscos', 'mole', 'nogada', 'pozole', 'tacos', 'tortas',
    'yucateca',
]

In [15]:
# Promote the cleaned Series to a one-column DataFrame.
corpus_clean_df = corpus_clean.to_frame()

In [16]:
# Reset the index; the previous labels are kept in a new 'index' column.
corpus_clean_df = corpus_clean_df.reset_index(drop=False)

In [17]:
# Independent working copy that will receive the score columns.
corpus_clean_df_2 = corpus_clean_df.copy(deep=True)

In [18]:
# Score columns start as NaN (float) and the specials column as an empty string.
corpus_clean_df_2 = corpus_clean_df_2.assign(
    comida=float('nan'),
    servicio=float('nan'),
    ambiente=float('nan'),
    satisfaccion=float('nan'),
    specials='',
)

In [19]:
# Score every cleaned review: count the tokens that belong to each keyword
# vocabulary and collect the dish names it mentions.
# Sets give constant-time membership tests instead of scanning the lists.
count_vocabularies = {
    'comida': set(comida_words),
    'servicio': set(servicio_words),
    'ambiente': set(ambiente_words),
    'satisfaccion': set(satisfaccion_words),
}
special_vocabulary = set(specials_words)

# Tokenise each review once; rows are handled positionally, matching the
# 0..n-1 index produced by reset_index().
review_tokens = [review.split() for review in corpus_clean_df['text']]

# One assignment per column replaces one .iloc write (plus a get_loc lookup)
# per row. dtype=float keeps the columns float64, as the NaN-initialised
# columns were.
for column, vocabulary in count_vocabularies.items():
    corpus_clean_df_2[column] = np.array(
        [sum(token in vocabulary for token in tokens) for tokens in review_tokens],
        dtype=float,
    )

# Dishes are kept in order of appearance (repeats included), joined with '-'.
corpus_clean_df_2['specials'] = np.array(
    ['-'.join(token for token in tokens if token in special_vocabulary)
     for tokens in review_tokens],
    dtype=object,
)

In [20]:
# Restore the original row labels so the scores can be joined back to the restaurant data.
corpus_clean_df_2 = corpus_clean_df_2.set_index(keys='index', drop=True)

In [21]:
# Index-on-index left join: place_id/url next to the per-review scores.
restaurantes_final = data_corpus.merge(
    corpus_clean_df_2, left_index=True, right_index=True, how='left'
)

## Agrupar a nivel restaurante 

In [22]:
# Distinct place ids in order of first appearance, and an empty accumulator frame.
place_id = restaurantes_final['place_id'].drop_duplicates().tolist()
restaurantes_final_2 = pd.DataFrame()

In [24]:
# Aggregate the per-review scores to one row per restaurant in a single groupby.
# - sort=False keeps places in order of first appearance, as the loop did.
# - 'url' is counted to obtain the number of reviews per place.
# - The specials are de-duplicated and sorted, so the joined string is
#   reproducible (iterating a bare set of strings is not ordered consistently
#   between interpreter runs).
# Rebinding the result (no concat onto the previous frame) also makes this cell
# safe to re-run.
restaurantes_final_2 = restaurantes_final.groupby('place_id', sort=False).agg({
    'url': 'count',
    'comida': 'sum',
    'servicio': 'sum',
    'ambiente': 'sum',
    'satisfaccion': 'sum',
    'specials': lambda tags: '-'.join(sorted({
        word
        for entry in tags if entry != ''
        for word in entry.split("-")
    })),
})

In [26]:
# The counted 'url' column holds the number of reviews per place.
restaurantes_final_2 = restaurantes_final_2.rename({'url': 'conteo_reviews'}, axis='columns')

In [28]:
# Average keyword hits per review for each score.
restaurantes_final_2['comida_norm'] = restaurantes_final_2['comida'].div(
    restaurantes_final_2['conteo_reviews'])
restaurantes_final_2['servicio_norm'] = restaurantes_final_2['servicio'].div(
    restaurantes_final_2['conteo_reviews'])
restaurantes_final_2['ambiente_norm'] = restaurantes_final_2['ambiente'].div(
    restaurantes_final_2['conteo_reviews'])
restaurantes_final_2['satisfaccion_norm'] = restaurantes_final_2['satisfaccion'].div(
    restaurantes_final_2['conteo_reviews'])

# Guardar análisis

In [30]:
# place_id only exists in the index after the groupby('place_id') aggregation,
# so the index has to be written; index_label names that column in the header.
restaurantes_final_2.to_csv('data/restaurants_classification.csv',
                            index=True, index_label='place_id')

# Ecoturismo

In [87]:
# Reviews whose place type is 'Ecoturismo'.
ecoturismo = df.loc[df['tipo_lugar'].eq('Ecoturismo')]

In [88]:
# Review text to analyse, plus the identifying columns to re-attach later by index.
corpus = ecoturismo['text']
data_corpus = ecoturismo.loc[:, ['place_id', 'url']]

### Limpiar texto

In [89]:
# Clean every ecotourism review; the original index is preserved.
corpus_clean = corpus.map(doClean)

### Modelo

In [90]:
# TF-IDF over the cleaned ecotourism reviews, limited to the 200 top terms and
# ignoring terms present in more than 70% of the reviews.
tv = TfidfVectorizer(min_df=0., max_df=.7, use_idf=True, max_features=200)
tv_matrix = tv.fit_transform(corpus_clean).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older releases.
if hasattr(tv, 'get_feature_names_out'):
    vocab = tv.get_feature_names_out().tolist()
else:
    vocab = tv.get_feature_names()
prueba = pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

In [91]:
# Keyword vocabularies used to score each ecotourism review (accent-free, to
# match the cleaned text).
# Fixed: a missing comma after 'cabanas' silently merged it with 'correr' into
# the single entry 'cabanascorrer', so neither word was ever counted as an
# activity. The duplicated 'bello' was also removed.
actividades = [
    'acampar', 'actividades', 'alimentos', 'animales', 'atracciones', 'bici',
    'bicicleta', 'cabanas', 'correr', 'divertido', 'ejercicio',
    'instalaciones', 'juegos', 'lago', 'pasear', 'tirolesa',
]
recomendado = [
    'agradable', 'amable', 'atencion', 'bello', 'bien', 'bonito', 'buena',
    'buenas', 'bueno', 'buenos', 'calidad', 'disfrutar', 'encanta', 'encanto',
    'excelente', 'exelente', 'genial', 'great', 'gusta', 'gusto', 'hermosa',
    'hermoso', 'increible', 'lindo', 'magico', 'maravilloso', 'perfecto',
    'recomendable', 'recomendado', 'recomiendo', 'servicio',
]
naturaleza = [
    'amplio', 'bosque', 'cabanas', 'caminar', 'correr', 'familia', 'familiar',
    'fresco', 'grande', 'lago', 'naturaleza', 'natural', 'paisaje', 'paisajes',
    'parque', 'peces', 'relajarse',
]
# Attractions collected per review into the 'specials' column.
# NOTE: this rebinds the same name used for the restaurant dish list.
specials_words = [
    'acampar', 'animales', 'bici', 'bicicleta', 'cabanas', 'lago', 'peces',
    'tirolesa',
]

In [92]:
# Promote the cleaned Series to a one-column DataFrame.
corpus_clean_df = corpus_clean.to_frame()

In [93]:
# Reset the index; the previous labels are kept in a new 'index' column.
corpus_clean_df = corpus_clean_df.reset_index(drop=False)

In [94]:
# Independent working copy that will receive the score columns.
corpus_clean_df_2 = corpus_clean_df.copy(deep=True)

In [95]:
# Score columns start as NaN (float) and the specials column as an empty string.
corpus_clean_df_2 = corpus_clean_df_2.assign(
    actividades=float('nan'),
    recomendado=float('nan'),
    naturaleza=float('nan'),
    specials='',
)

In [96]:
# Score every cleaned review: count the tokens that belong to each keyword
# vocabulary and collect the attractions it mentions.
# Sets give constant-time membership tests instead of scanning the lists.
count_vocabularies = {
    'actividades': set(actividades),
    'recomendado': set(recomendado),
    'naturaleza': set(naturaleza),
}
special_vocabulary = set(specials_words)

# Tokenise each review once; rows are handled positionally, matching the
# 0..n-1 index produced by reset_index().
review_tokens = [review.split() for review in corpus_clean_df['text']]

# One assignment per column replaces one .iloc write (plus a get_loc lookup)
# per row. dtype=float keeps the columns float64, as the NaN-initialised
# columns were.
for column, vocabulary in count_vocabularies.items():
    corpus_clean_df_2[column] = np.array(
        [sum(token in vocabulary for token in tokens) for tokens in review_tokens],
        dtype=float,
    )

# Attractions are kept in order of appearance (repeats included), joined with '-'.
corpus_clean_df_2['specials'] = np.array(
    ['-'.join(token for token in tokens if token in special_vocabulary)
     for tokens in review_tokens],
    dtype=object,
)

In [97]:
# Restore the original row labels so the scores can be joined back to the place data.
corpus_clean_df_2 = corpus_clean_df_2.set_index(keys='index', drop=True)

In [98]:
# Index-on-index left join: place_id/url next to the per-review scores.
ecoturismo_final = data_corpus.merge(
    corpus_clean_df_2, left_index=True, right_index=True, how='left'
)

# Agrupar

In [99]:
# Distinct place ids in order of first appearance, and an empty accumulator frame.
place_id = ecoturismo_final['place_id'].drop_duplicates().tolist()
ecoturismo_final_2 = pd.DataFrame()

In [100]:
# Aggregate the per-review scores to one row per place in a single groupby.
# - sort=False keeps places in order of first appearance, as the loop did.
# - 'url' is counted to obtain the number of reviews per place.
# - The specials are de-duplicated and sorted, so the joined string is
#   reproducible (iterating a bare set of strings is not ordered consistently
#   between interpreter runs).
# Rebinding the result (no concat onto the previous frame) also makes this cell
# safe to re-run.
ecoturismo_final_2 = ecoturismo_final.groupby('place_id', sort=False).agg({
    'url': 'count',
    'actividades': 'sum',
    'recomendado': 'sum',
    'naturaleza': 'sum',
    'specials': lambda tags: '-'.join(sorted({
        word
        for entry in tags if entry != ''
        for word in entry.split("-")
    })),
})

In [102]:
# The counted 'url' column holds the number of reviews per place.
ecoturismo_final_2 = ecoturismo_final_2.rename({'url': 'conteo_reviews'}, axis='columns')

In [104]:
# Normalise: average keyword hits per review for each score.
ecoturismo_final_2['actividades_norm'] = ecoturismo_final_2['actividades'].div(
    ecoturismo_final_2['conteo_reviews'])
ecoturismo_final_2['recomendado_norm'] = ecoturismo_final_2['recomendado'].div(
    ecoturismo_final_2['conteo_reviews'])
ecoturismo_final_2['naturaleza_norm'] = ecoturismo_final_2['naturaleza'].div(
    ecoturismo_final_2['conteo_reviews'])

### Guardar

In [106]:
# place_id only exists in the index after the groupby('place_id') aggregation,
# so the index has to be written; index_label names that column in the header.
ecoturismo_final_2.to_csv('data/ecoturismo_classification.csv',
                          index=True, index_label='place_id')

# Cultural

In [108]:
# Place types grouped together as "cultural" venues.
list_ = [
    'Museo',
    'Templo',
    'Teatro',
    'Pirámides',
]

In [109]:
# Reviews whose place type is one of the cultural types in list_.
cultural = df.loc[df['tipo_lugar'].isin(list_)]

In [111]:
# Review text to analyse, plus the identifying columns to re-attach later by index.
corpus = cultural['text']
data_corpus = cultural.loc[:, ['place_id', 'url']]

### Limpiar texto

In [112]:
# Clean every cultural-venue review; the original index is preserved.
corpus_clean = corpus.map(doClean)

### Análisis

In [122]:
# TF-IDF over the cleaned cultural reviews, limited to the 250 top terms and
# ignoring terms present in more than 50% of the reviews.
tv = TfidfVectorizer(min_df=0., max_df=.5, use_idf=True, max_features=250)
tv_matrix = tv.fit_transform(corpus_clean).toarray()

# get_feature_names() was removed in scikit-learn 1.2; use its replacement when
# available and fall back to the old name on older releases.
if hasattr(tv, 'get_feature_names_out'):
    vocab = tv.get_feature_names_out().tolist()
else:
    vocab = tv.get_feature_names()
prueba = pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

In [124]:
# Keyword vocabularies used to score each cultural-venue review (accent-free,
# to match the cleaned text). The empty-string placeholder entries were
# removed: str.split() never yields '' so they could not match any token.
ambiente = [
    'acogedor', 'agradable', 'amable', 'amables', 'ambiente', 'atencion',
    'comodo', 'disfrutar', 'divertido', 'entretenido', 'interesante',
    'interesantes', 'limpio', 'servicio', 'tranquilo',
]
recomendable = [
    'apreciar', 'bello', 'bien', 'bonita', 'bonito', 'buena', 'buenas',
    'bueno', 'buenos', 'calidad', 'encanta', 'encanto', 'espectacular',
    'excelente', 'excelentes', 'exelente', 'genial', 'hermosa', 'hermoso',
    'impresionante', 'increible', 'lindo', 'maravilloso', 'perfecto',
    'precios', 'precioso', 'recomendable', 'recomendado', 'recomiendo',
]

In [125]:
# Promote the cleaned Series to a one-column DataFrame.
corpus_clean_df = corpus_clean.to_frame()

In [126]:
# Reset the index; the previous labels are kept in a new 'index' column.
corpus_clean_df = corpus_clean_df.reset_index(drop=False)

In [127]:
# Independent working copy that will receive the score columns.
corpus_clean_df_2 = corpus_clean_df.copy(deep=True)

In [128]:
# Score columns start as NaN (float).
corpus_clean_df_2 = corpus_clean_df_2.assign(
    ambiente=float('nan'),
    recomendable=float('nan'),
)

In [129]:
# Score every cleaned review: count the tokens that belong to each keyword
# vocabulary. Sets give constant-time membership tests instead of list scans.
count_vocabularies = {
    'ambiente': set(ambiente),
    'recomendable': set(recomendable),
}

# Tokenise each review once; rows are handled positionally, matching the
# 0..n-1 index produced by reset_index().
review_tokens = [review.split() for review in corpus_clean_df['text']]

# One assignment per column replaces one .iloc write (plus a get_loc lookup)
# per row. dtype=float keeps the columns float64, as the NaN-initialised
# columns were.
for column, vocabulary in count_vocabularies.items():
    corpus_clean_df_2[column] = np.array(
        [sum(token in vocabulary for token in tokens) for tokens in review_tokens],
        dtype=float,
    )
    

In [130]:
# Restore the original row labels so the scores can be joined back to the place data.
corpus_clean_df_2 = corpus_clean_df_2.set_index(keys='index', drop=True)

In [132]:
# Index-on-index left join: place_id/url next to the per-review scores.
cultural_final = data_corpus.merge(
    corpus_clean_df_2, left_index=True, right_index=True, how='left'
)

# Agrupar

In [141]:
# Distinct place ids in order of first appearance, and an empty accumulator frame.
place_id = cultural_final['place_id'].drop_duplicates().tolist()
cultural_final_2 = pd.DataFrame()

In [142]:
# Aggregate the per-review scores to one row per place in a single groupby,
# instead of filtering, grouping and concatenating once per place (quadratic,
# and it appended duplicate rows whenever the cell was re-run).
# sort=False keeps places in order of first appearance, as the loop did;
# 'url' is counted to obtain the number of reviews per place.
cultural_final_2 = cultural_final.groupby('place_id', sort=False).agg({
    'url': 'count',
    'ambiente': 'sum',
    'recomendable': 'sum',
})

In [144]:
# The counted 'url' column holds the number of reviews per place.
cultural_final_2 = cultural_final_2.rename({'url': 'conteo_reviews'}, axis='columns')

In [146]:
# Normalise: average keyword hits per review for each score.
cultural_final_2['ambiente_norm'] = cultural_final_2['ambiente'].div(
    cultural_final_2['conteo_reviews'])
cultural_final_2['recomendable_norm'] = cultural_final_2['recomendable'].div(
    cultural_final_2['conteo_reviews'])

### Guardar

In [148]:
# place_id only exists in the index after the groupby('place_id') aggregation,
# so the index has to be written; index_label names that column in the header.
cultural_final_2.to_csv('data/cultural_classification.csv',
                        index=True, index_label='place_id')