In [15]:
import pandas as pd
import numpy as np
import os
import ast
import nltk
import re
import unidecode
from nltk.corpus import stopwords

In [3]:
# Spanish stopword list from the NLTK stopwords corpus
stop_words = stopwords.words('spanish')

# Datos

In [4]:
# Move the working directory one level up
# (presumably so the relative 'data/...' paths used below resolve -- confirm)
os.chdir(os.pardir)

In [59]:
# Input files, relative to the current working directory
reviews_path = 'data/places_reviews_processed.csv'
place_types_path = 'data/places_types.csv'

df = pd.read_csv(reviews_path)
df_1 = pd.read_csv(place_types_path)

# Formato

In [60]:
# Keep only the columns used by the rest of the notebook
df = df.loc[:, ['place_id', 'url', 'text']]

In [61]:
# Left-join the second table onto the reviews by place_id
df = df.merge(df_1, on='place_id', how='left')

In [62]:
# Discard every row that has a missing value in any column
df = df.dropna(axis='index', how='any')

In [63]:
# Distinct values of the tipo_lugar column
pd.unique(df['tipo_lugar'])

array(['Museo', 'Comida mexicana', 'Templo', 'Teatro', 'Ecoturismo',
       'Pirámides'], dtype=object)

# Temática: Restaurantes

In [64]:
# Rows whose tipo_lugar is exactly 'Comida mexicana'
restaurantes = df.loc[df['tipo_lugar'].eq('Comida mexicana')]

In [65]:
# Review text to analyse, plus the identifying columns kept for the later join
corpus = restaurantes['text']
data_corpus = restaurantes.loc[:, ['place_id', 'url']]

# Análisis

### Limpiar el texto

In [66]:
# Accent-stripped stopword sets keyed by language, built once and reused by doClean.
_STOPWORDS_CACHE = {}


def _normalized_stopwords(language='spanish'):
    """Return the NLTK stopwords for `language`, lowercased and accent-stripped.

    doClean removes accents from the text before filtering stopwords, so the
    stopwords need the same normalization to match (e.g. 'más' -> 'mas').
    The resulting set is cached so the NLTK corpus is read once per language
    instead of once per token.
    """
    if language not in _STOPWORDS_CACHE:
        _STOPWORDS_CACHE[language] = frozenset(
            unidecode.unidecode(word).lower()
            for word in stopwords.words(language)
        )
    return _STOPWORDS_CACHE[language]


def doClean(text):
    """Normalize one review into a space-separated string of lowercase tokens.

    Steps, in order:
      1. drop bracketed ``[...]`` and parenthesised ``(...)`` asides,
      2. transliterate to ASCII (removes accents),
      3. replace every non-word character with a space,
      4. lowercase and split on whitespace,
      5. remove Spanish stopwords (compared in accent-stripped form).

    Parameters
    ----------
    text : object
        The raw text; it is converted with ``str()`` first.

    Returns
    -------
    str
        The remaining tokens joined by single spaces ('' if nothing is left).
    """
    spanish_stopwords = _normalized_stopwords('spanish')

    # Non-greedy on purpose: each bracket/parenthesis pair is removed on its
    # own, so the text between two separate asides is preserved.
    a = re.sub(r'\[.+?\]', ' ', str(text))
    a = re.sub(r'\(.+?\)', ' ', a)
    # Strip accents
    a = unidecode.unidecode(a)
    # Anything that is not a word character becomes a separator
    a = re.sub(r'\W', ' ', a)
    a = a.lower()

    # split() already collapses runs of whitespace
    kept = [word for word in a.split() if word not in spanish_stopwords]
    return " ".join(kept)

In [67]:
# Clean every review with doClean
corpus_clean = corpus.map(doClean)

# TF-IDF Model

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the cleaned reviews: terms present in more than 90% of the
# documents are ignored and the vocabulary is capped at 300 features.
tv = TfidfVectorizer(min_df=0., max_df=.9, use_idf=True, max_features=300)
tv_matrix = tv.fit_transform(corpus_clean).toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old method only on versions that predate it.
if hasattr(tv, 'get_feature_names_out'):
    vocab = list(tv.get_feature_names_out())
else:
    vocab = tv.get_feature_names()

# Dense document-term table, rounded to two decimals for inspection
prueba = pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

In [100]:
# Keyword groups whose occurrences are counted in each review.
# The words are written without accents because doClean strips accents
# from the text before the counting step.
comida_words = ['buen', 'buena', 'buenas', 'bueno', 'buenos', 'calidad', 'delicia', 'deliciosa', 'deliciosas',
                'delicioso', 'deliciosos', 'delicious', 'exquisito', 'presentacion', 'rica', 'ricas',
                'rico', 'ricos', 'riquisima', 'riquisimo', 'sabor', 'sabores', 'sabroso', 'sazon']
servicio_words = ['amable', 'amables', 'atencion', 'atento', 'atentos', 'atienden', 'limpieza', 'higiene',
                  'limpio', 'rapido', 'service', 'servicio']
ambiente_words = ['acogedor', 'agradable', 'ambiente', 'bonito', 'decoracion',
                  'familiar', 'musica', 'tranquilo']
# Duplicate entries ('espectacular', 'recomendable') removed; 'exelente' is
# kept as-is (presumably there to catch the common misspelling -- confirm).
satisfaccion_words = ['encanta', 'encanto', 'espectacular', 'especial', 'excelente',
                      'excelentes', 'exelente', 'good', 'great', 'increible', 'nice',
                      'recomendable', 'recomendado', 'recomiendo']
specials_words = ['barbacoa', 'birria', 'carne', 'carnitas', 'chocolate', 'cochinita', 'enchiladas',
                  'mariscos', 'mole', 'nogada', 'pozole', 'tacos', 'tortas',
                  'yucateca']

In [87]:
# Wrap the cleaned-text Series in a one-column DataFrame
corpus_clean_df = corpus_clean.to_frame()

In [90]:
# Reset the index; the previous index is kept as a regular column
corpus_clean_df = corpus_clean_df.reset_index(drop=False)

In [116]:
# Independent (deep) copy that will receive the keyword counts
corpus_clean_df_2 = corpus_clean_df.copy(deep=True)

In [122]:
# Result columns: four numeric counters start as NaN, specials as an empty string
for counter_column in ('comida', 'servicio', 'ambiente', 'satisfaccion'):
    corpus_clean_df_2[counter_column] = np.nan
corpus_clean_df_2['specials'] = ''

In [125]:
# Count, for every cleaned review, how many tokens fall in each keyword group
# and collect the speciality dishes it mentions.
# Whole columns are built at once instead of writing one cell at a time through
# .iloc, and membership is tested against sets rather than lists.
keyword_groups = {
    'comida': set(comida_words),
    'servicio': set(servicio_words),
    'ambiente': set(ambiente_words),
    'satisfaccion': set(satisfaccion_words),
}
specials_lookup = set(specials_words)

# One token list per review, in row order
tokens_per_review = [review.split() for review in corpus_clean_df['text']]

for column, keywords in keyword_groups.items():
    # Stored as floats so the columns keep the float dtype they are initialised with
    corpus_clean_df_2[column] = [
        float(sum(1 for token in tokens if token in keywords))
        for tokens in tokens_per_review
    ]

# Specials keep their order of appearance (repeats included), joined with '-'
corpus_clean_df_2['specials'] = [
    '-'.join(token for token in tokens if token in specials_lookup)
    for tokens in tokens_per_review
]

In [129]:
# Prepare the join with the restaurant data: make the 'index' column the index again
corpus_clean_df_2 = corpus_clean_df_2.set_index('index', drop=True)

In [131]:
# Index-on-index left join of the restaurant identifiers with the keyword counts
restaurantes_final = data_corpus.merge(
    corpus_clean_df_2,
    how='left',
    left_index=True,
    right_index=True,
)

## Agrupar a nivel restaurante 

In [134]:
# Distinct place ids in order of first appearance, and an empty frame for the results
place_id = restaurantes_final['place_id'].drop_duplicates().tolist()
restaurantes_final_2 = pd.DataFrame()

In [154]:
# Collapse the review-level rows into one row per restaurant.
# The per-place frames are collected in a list and concatenated once, which
# avoids the quadratic cost of pd.concat inside the loop and makes the cell
# safe to re-run (it no longer appends to a previously filled frame).
grouped_frames = []
for current_place in place_id:
    place_rows = restaurantes_final[restaurantes_final['place_id'] == current_place]

    # NOTE(review): sum() is applied to every non-key column, so the string
    # columns (url, text) are presumably concatenated rather than kept once --
    # confirm that this is the intended content of the saved file.
    place_summary = place_rows.groupby('place_id').sum().reset_index()

    # Distinct specials mentioned in any review of this place
    place_specials = {
        dish
        for review_specials in place_rows['specials']
        if isinstance(review_specials, str) and review_specials != ''
        for dish in review_specials.split('-')
    }
    # Sorted so the saved string is identical from run to run
    # (the iteration order of a set of strings is not stable across sessions)
    place_summary['specials'] = '-'.join(sorted(place_specials))

    grouped_frames.append(place_summary)

restaurantes_final_2 = pd.concat(grouped_frames) if grouped_frames else pd.DataFrame()

# Guardar análisis

In [156]:
# Write the per-restaurant table to disk without the index column
output_path = 'data/restaurants_classification.csv'
restaurantes_final_2.to_csv(output_path, index=False)

Unnamed: 0,comida,servicio,ambiente,satisfaccion
comida,1.0,0.795213,0.432159,0.750158
servicio,0.795213,1.0,0.549758,0.895431
ambiente,0.432159,0.549758,1.0,0.594126
satisfaccion,0.750158,0.895431,0.594126,1.0
