In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


## Cargar estados

In [None]:
# Lista con nombre de los estados
estados = []
ruta = '../datasets/raw/Google Maps/reviews-estados'
for estado in os.listdir(ruta):
        estado_path = os.path.join(ruta, estado)
        
        if os.path.isdir(estado_path) and estado.startswith('review-'):
            nombre_estado = estado[len('review-'):]
            estados.append(nombre_estado)

In [None]:
# Funcion para leer todos los json de una carpeta y unirlos en un datafram
def read_all_json(folder_ruth):
    dataframes = []
    for archivo_json in os.listdir(folder_ruth):
        if archivo_json.endswith('.json'):
            # Construye la ruta completa del archivo JSON
            ruta_archivo_json = os.path.join(folder_ruth, archivo_json)
            
            # Lee el archivo JSON en un DataFrame
            df = pd.read_json(ruta_archivo_json,lines=True)
            
            # Agrega el DataFrame a la lista
            dataframes.append(df)

    # Concatena todos los DataFrames en uno solo
    return pd.concat(dataframes, ignore_index=True)

In [None]:
# Extraer la fecha del diccionario
def extraer_resp_time(diccionario):
    if diccionario is not None:
        return diccionario['time']
    else:
        return None

# Función para extraer 'resp_text'
def extraer_resp_text(diccionario):
    if diccionario is not None:
        return diccionario['text']
    else:
        return None
    
#Transformo los formatos de time, extraigo time y texto de respuesta
def format_time(df):
    df['time'] =  pd.to_datetime(df['time'], unit='ms')
    df['resp_time'] = df['resp'].apply(extraer_resp_time)
    df['resp_time'] = pd.to_datetime(df['resp_time'], unit='ms')
    df['resp_text'] = df['resp'].apply(extraer_resp_text)
    df['resp_text'] = df['resp'].apply(extraer_resp_text)
    df.drop(columns=['resp'],inplace=True)

In [None]:
for estado in estados:
    rutas = f'{ruta}/review-{estado}'
    df = read_all_json(rutas)
    format_time(df)
    df.to_parquet(f'./{rutas}/all_{estado}.parquet',index=False)

## Reviews

In [2]:
ruta ='../datasets/extras/Google Maps/reviews-estados'

In [3]:
metadata_google = pd.read_json('../datasets/extras/Google Maps/bussiness_google.json.gz', compression='gzip')


## Carga de datos por criterio

### Criterio: Lugar

In [4]:
# Discriminar por estado
estados = ['California', 'New_Jersey', 'Florida', 'Illinois'] 

df_states = pd.DataFrame()

for estado in estados:
    ruta_archivo = f'{ruta}/review-{estado}/all_{estado}.parquet'
    review_estado = pd.read_parquet(ruta_archivo)

    # Agrega la columna 'state' con el valor del estado actual
    review_estado['state'] = estado

    df_states = pd.concat([df_states, review_estado], ignore_index=True)
    

# df_states ahora contiene la concatenación de todos los DataFrames de los diferentes estados, con la columna 'state' agregada


Normalizo un estado

In [5]:
df_states['state'].replace('New_Jersey','New Jersey',inplace=True)

In [6]:
df_states.head()

Unnamed: 0,user_id,name,time,rating,text,pics,gmap_id,resp_time,resp_text,state
0,1.089912e+20,Song Ro,2021-01-06 05:12:07.056,5,Love there korean rice cake.,,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,,California
1,1.112903e+20,Rafa Robles,2021-02-09 05:47:28.663,5,Good very good,,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,,California
2,1.126404e+20,David Han,2020-03-08 05:04:42.296,4,They make Korean traditional food very properly.,,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,,California
3,1.174403e+20,Anthony Kim,2019-03-07 05:56:56.355,5,Short ribs are very delicious.,,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,,California
4,1.005808e+20,Mario Marzouk,2017-05-16 05:01:41.933,5,Great food and prices the portions are large,,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,,California


In [7]:
# Elimino duplicados en user_id time y gmap_id en conjunto
df_states.drop_duplicates(subset=['user_id','time','gmap_id'],inplace=True)

In [8]:
# Elimino la columna pics que no sera usada
df_states.drop(columns='pics',inplace=True)


In [9]:
df_states.sample(5)

Unnamed: 0,user_id,name,time,rating,text,gmap_id,resp_time,resp_text,state
3062262,1.183387e+20,john spond,2019-07-30 20:43:33.112,5,It's right by home. I fill up there once a wee...,0x89c3c49d5c0fb3e7:0x853600658f48534e,NaT,,New Jersey
9521039,1.07794e+20,Craig Friedinger,2020-01-23 04:44:40.496,5,,0x880af5c608de0a99:0x6d065876dda0fb2b,NaT,,Illinois
2174676,1.118758e+20,Clara Landers,2019-06-23 22:50:09.913,4,"Ok do not like where it's at, but they do good...",0x808fc09c71f8f77f:0x522c2121da1d7053,NaT,,California
348102,1.044098e+20,Angry Son,2019-05-16 05:28:52.536,5,,0x80c2bf51ea454bad:0x425d1c2aefdbdf0c,NaT,,California
6688469,1.136236e+20,Kim Boff,2020-12-15 21:29:19.730,5,,0x88e634df665a5d2f:0xfb8749b27ecc44a0,NaT,,Florida


### Criterio: Tiempo

In [None]:
# Convertir la columna 'time' a tipo datetime
df_states['time'] = pd.to_datetime(df_states['time'])

# Filtrar los datos por año desde 2015 inclusive en adelante
df_filtered = df_states[df_states['time'].dt.year >= 2015]

# Eliminar las milésimas de segundo de la columna 'time'
df_filtered['time'] = df_filtered['time'].dt.strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# Renombrar columna de tiempo de respuesta
df_states.rename(columns={'resp_time':'resp_date'},inplace=True)

### Criterio: Rubro

In [12]:
metadata_google[['gmap_id','category']]

Unnamed: 0,gmap_id,category
4,0x88db4147b1d9e6f3:0x943dbd10a92ba1b1,['Restaurant']
5,0x88c2e4e34f1ed783:0x76c5da381c499d79,['Buffet restaurant']
46,0x8890b9241e704667:0x3a1e565c17c00993,['Restaurant']
81,0x88e635378f43352f:0xa1b53c63436fa428,['Gas station' 'ATM' 'Convenience store' 'Rest...
156,0x88d9ab98b5baad79:0x1a2e3f0642a55246,['Mexican restaurant']
...,...,...
693466,0x89c3b2b8e9c8c357:0xb1b07826ef85a598,['Bar' 'Italian restaurant' 'Pub' 'Restaurant']
693471,0x89c24d60acc3ffff:0xc8048546518281f6,['Restaurant' 'Family restaurant']
693475,0x89c24d7a198a3541:0x3dcb8f429c053248,['American restaurant']
693478,0x89c3b290dcc12d6d:0x34ad59938efed577,['Latin American restaurant' 'Mexican restaura...


Encuentro las review que por gmap_id pertenezcan a los locales de restaurante.

In [13]:
df_states = df_states[df_states['gmap_id'].isin(metadata_google['gmap_id'])]

## Analisis de sentimiento usando TextBlob

In [14]:
from textblob import TextBlob

In [15]:
df_states['sentiment_textblob'] = df_states['text'].apply(lambda x:0 if x is None else TextBlob(x).sentiment.polarity)
df_states['sentiment_textblob_resp'] = df_states['resp_text'].apply(lambda x:0 if x is None else TextBlob(x).sentiment.polarity)

Exploro y comparo ratings con score del sentimiento

In [16]:
df_states.sample(5)

Unnamed: 0,user_id,name,date,rating,text,gmap_id,resp_time,resp_text,state,sentiment_textblob,sentiment_textblob_resp
68408,1.178356e+20,Luis Barragan,2020-10-31 07:07:43,5,,0x80c336d43db2fbcd:0xb0357b8e2775d93f,NaT,,California,0.0,0.0
7826740,1.13141e+20,ellen stefanits,2019-07-25 15:30:58,5,,0x880fcc90842cda71:0xed06f34a062a8f65,NaT,,Illinois,0.0,0.0
8454240,1.144431e+20,Max Mroczkowski,2021-06-24 20:06:55,5,Best fried rice in the city. Make sure to get ...,0x880fd22d565378db:0x151569f439632f53,NaT,,Illinois,0.8125,0.0
7838789,1.145706e+20,Kev 1253,2020-05-24 14:25:55,5,,0x880ee5b151a6dc7d:0xbcee9aeb2b2a8d6c,NaT,,Illinois,0.0,0.0
1310202,1.116675e+20,John Garcia,2017-05-08 20:34:27,1,I got the breaded fish tacos and they tasted l...,0x80c36622c42bb5fd:0xc8c8fffdbd830105,NaT,,California,0.0,0.0


In [17]:
df_states[df_states['sentiment_textblob']<0].sample(5)

Unnamed: 0,user_id,name,date,rating,text,gmap_id,resp_time,resp_text,state,sentiment_textblob,sentiment_textblob_resp
7469756,1.084714e+20,Osvaldo Lopez,2019-11-18 20:53:58,1,(Translated by Google) The lousy service and t...,0x88dd83e495ac3927:0x4aa1ff529526162a,NaT,,Florida,-0.0625,0.0
6252090,1.115745e+20,Florida,2020-03-08 03:28:27,5,Off menu - The Bayside. Prime Rib piled high w...,0x88eb7c4460bf23a5:0xd7b0a47f7f7f246,NaT,,Florida,-0.164141,0.0
4168213,1.066834e+20,Andrea Merino,2019-05-01 18:13:20,3,It's ok but a little expensive.,0x89c2531ecd0f1beb:0xbbbb2efcad9bdce9,NaT,,New Jersey,-0.0625,0.0
6193896,1.155862e+20,Anthony Cartagena,2019-09-23 02:33:25,1,Horrible costumer service...Sad\nIf don't like...,0x88def1abcc188757:0xa865e0d1a6a1d210,NaT,,Florida,-1.0,0.0
5884416,1.082846e+20,Jackie St.George,2018-02-15 17:22:04,1,We went on Valentine's Day. Despite the fact t...,0x88c290186e7d21ff:0x9f4453666e9d71e0,NaT,,Florida,-0.098571,0.0


### Criterio de sentiment

Voy a generar el criterio que en el sentimiento se obtenga un valor cuantificable mas real, normalizando el ranting y sumandole al socre

In [18]:
df_states['sentiment'] = df_states['rating'] / 5. + df_states['sentiment_textblob']

In [19]:
df_states.sample(5)


Unnamed: 0,user_id,name,date,rating,text,gmap_id,resp_time,resp_text,state,sentiment_textblob,sentiment_textblob_resp,sentiment
7370330,1.114002e+20,Robert DePrez jr,2019-08-10 15:01:56,2,"Its fast food, expectations aren't very high",0x88c2be09855a30db:0x787b50d7d029b70b,NaT,,Florida,0.204,0.0,0.604
1948645,1.049375e+20,Jamie Toelle,2015-02-08 02:08:36,5,,0x80dd313a43846cc3:0x129019255c97a1e4,NaT,,California,0.0,0.0,1.0
5185394,1.142369e+20,Ladonna Lippincott,2021-03-09 21:11:47,4,,0x88db47c5edc27223:0xef209fd63c769776,NaT,,Florida,0.0,0.0,0.8
6026130,1.056228e+20,Nateia Lazarz,2018-05-10 12:27:30,4,,0x88d9ac5ec941a141:0xe40bf80fe06f9334,NaT,,Florida,0.0,0.0,0.8
675358,1.138117e+20,Jay Casares,2012-01-28 20:15:34,5,One of the best little bars I've been to.\nRem...,0x80952f64b086cf09:0x55bf7bfbedf25f26,NaT,,California,0.270833,0.0,1.270833


### En base al valor obtenido defino el criterio para score ponderado <br>
 mayor a 1.5 en un sentimiento muy bueno y asigno 2, <br>
 mayor a 1 y menor a 1.5 en un sentimiento bueno y asigno 1,<br>
 menor a 0.3 es un score negativo y asigno -1,<br>
 entre 1 y 0.3 es un score neutro y asigno 0

In [20]:
df_states['sentiment'] = df_states['sentiment'].apply(lambda x: 2 if x >= 1.5 else (1 if x >= 1 else (-1 if x <= 0.3 else 0)))


In [21]:
df_states.head()

Unnamed: 0,user_id,name,date,rating,text,gmap_id,resp_time,resp_text,state,sentiment_textblob,sentiment_textblob_resp,sentiment
0,1.089912e+20,Song Ro,2021-01-06 05:12:07,5,Love there korean rice cake.,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,,California,0.5,0.0,2
1,1.112903e+20,Rafa Robles,2021-02-09 05:47:28,5,Good very good,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,,California,0.805,0.0,2
2,1.126404e+20,David Han,2020-03-08 05:04:42,4,They make Korean traditional food very properly.,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,,California,0.0,0.0,0
3,1.174403e+20,Anthony Kim,2019-03-07 05:56:56,5,Short ribs are very delicious.,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,,California,0.5,0.0,2
4,1.005808e+20,Mario Marzouk,2017-05-16 05:01:41,5,Great food and prices the portions are large,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,,California,0.507143,0.0,2


Elimino las columnas que ya no se usan debido al analisis de sentimiento

In [22]:
df_states.drop(columns=['sentiment_textblob','text','resp_text'],inplace=True)

In [23]:
df_states.head()

Unnamed: 0,user_id,name,date,gmap_id,resp_time,state,sentiment_textblob_resp,sentiment
0,1.089912e+20,Song Ro,2021-01-06 05:12:07,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,California,0.0,2
1,1.112903e+20,Rafa Robles,2021-02-09 05:47:28,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,California,0.0,2
2,1.126404e+20,David Han,2020-03-08 05:04:42,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,California,0.0,0
3,1.174403e+20,Anthony Kim,2019-03-07 05:56:56,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,California,0.0,2
4,1.005808e+20,Mario Marzouk,2017-05-16 05:01:41,0x80c2c778e3b73d33:0xbdc58662a4a97d49,NaT,California,0.0,2


### Por último exporto el dataset a parquet particionado por estados

In [26]:
df_states.to_parquet('../datasets/processed/google/reviews_google.parquet.gz', compression='gzip')