# Importación de librerías
====================================================================================================================================

In [1]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
from math import factorial
from scipy import stats as st
import json
import gzip
import pickle

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime

# Carga de datos
====================================================================================================================================

In [2]:
# Sites
dfg_rest = pd.read_parquet('dataset_g_restaurants.parquet')

In [3]:
# Reviews
# Este dataset se carga más adelante en su sección de pre-procesamiento

# Preprocesamiento
====================================================================================================================================

## Dataset SITES

In [4]:
print(dfg_rest.info())
dfg_rest.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 248852 entries, 2 to 274996
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   name              248852 non-null  object 
 1   address           247751 non-null  object 
 2   gmap_id           248852 non-null  object 
 3   description       79018 non-null   object 
 4   latitude          248852 non-null  float64
 5   longitude         248852 non-null  float64
 6   category          248852 non-null  object 
 7   avg_rating        248852 non-null  float64
 8   num_of_reviews    248852 non-null  int64  
 9   price             115831 non-null  object 
 10  hours             220683 non-null  object 
 11  MISC              246706 non-null  object 
 12  state             221749 non-null  object 
 13  relative_results  203743 non-null  object 
 14  url               248852 non-null  object 
dtypes: float64(3), int64(1), object(11)
memory usage: 30.4+ MB
None


Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
215672,Genesee Valley Hunt Races - Annually 2nd Satur...,Genesee Valley Hunt Races - Annually 2nd Satur...,0x89d15da29fe2038f:0x228b856b094b6e3b,,42.84093,-77.817175,[Restaurant],4.4,8,,,"{'Accessibility': None, 'Activities': None, 'A...",,"[0x89d15d90521692bf:0xf7af3c271052b2ca, 0x89d1...",https://www.google.com/maps/place//data=!4m2!3...
252563,Conshy Seafood Co,"Conshy Seafood Co, 3024 Butler Pike, Conshohoc...",0x89c6bf84dba7e2d7:0x8d5c6558920fbe53,,40.095487,-75.285114,[Seafood restaurant],4.3,48,,"[[Saturday, 11AM–9:30PM], [Sunday, 11AM–8:30PM...",{'Accessibility': ['Wheelchair accessible entr...,Closed ⋅ Opens 11AM,,https://www.google.com/maps/place//data=!4m2!3...


In [5]:
# Adecuación de tipos de dato
dfg_rest = dfg_rest[['name', 'address', 'gmap_id', 'latitude', 'longitude', 'avg_rating', 'num_of_reviews', 'price']]
dfg_rest = dfg_rest.rename(columns={'address': 'address_full'})
dfg_rest[['address', 'city', 'postal_code']] = dfg_rest['address_full'].str.extract(r'.*,\s*([^,]+),\s*([^,]+),\s*([^,]+)')
dfg_rest['state'] = dfg_rest['postal_code'].str.split(' ').str[0]
dfg_rest['postal_code'] = dfg_rest['postal_code'].str.split(' ').str[1]

dfg_rest['postal_code'] = pd.to_numeric(dfg_rest['postal_code'], errors='coerce')
dfg_rest = dfg_rest.dropna(subset=['postal_code'])
dfg_rest['postal_code'] = dfg_rest['postal_code'].astype(int)

In [6]:
print(dfg_rest.info())
dfg_rest.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 231427 entries, 2 to 274996
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   name            231427 non-null  object 
 1   address_full    231427 non-null  object 
 2   gmap_id         231427 non-null  object 
 3   latitude        231427 non-null  float64
 4   longitude       231427 non-null  float64
 5   avg_rating      231427 non-null  float64
 6   num_of_reviews  231427 non-null  int64  
 7   price           104084 non-null  object 
 8   address         231427 non-null  object 
 9   city            231427 non-null  object 
 10  postal_code     231427 non-null  int64  
 11  state           231427 non-null  object 
dtypes: float64(3), int64(2), object(7)
memory usage: 23.0+ MB
None


Unnamed: 0,name,address_full,gmap_id,latitude,longitude,avg_rating,num_of_reviews,price,address,city,postal_code,state
7224,Five Corner Store,"Five Corner Store, 4971 N Greenville Rd, Lakev...",0x8818953792be391d:0x4baa437973a845f9,43.365219,-85.262947,4.6,188,$,4971 N Greenville Rd,Lakeview,48850,MI
146474,Subway,"Subway, 490 W 6th Ave, Eugene, OR 97401",0x54c11e6d6f4a7c85:0x1651d62913f23589,44.052996,-123.100031,3.7,118,$,490 W 6th Ave,Eugene,97401,OR


In [7]:
# Convertir precio a formato numerico
price_mapping = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4}
dfg_rest['price'] = dfg_rest['price'].map(price_mapping)

# Reemplazar valores nulos del campo "precio" con el promedio de precios basado en cercanía, utilzando el código postal
dfg_rest['postal_code'] = dfg_rest['postal_code'].astype(str)
average_price_by_zip = dfg_rest.groupby('postal_code')['price'].apply(lambda x: x.dropna().astype(float).mean())
dfg_rest['price'] = dfg_rest.apply(lambda row: average_price_by_zip.get(row['postal_code']) if pd.isnull(row['price']) else row['price'], axis=1)

dfg_rest = dfg_rest.dropna(subset=['price'])

In [8]:
print(dfg_rest.info())
dfg_rest.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 223485 entries, 2 to 274996
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   name            223485 non-null  object 
 1   address_full    223485 non-null  object 
 2   gmap_id         223485 non-null  object 
 3   latitude        223485 non-null  float64
 4   longitude       223485 non-null  float64
 5   avg_rating      223485 non-null  float64
 6   num_of_reviews  223485 non-null  int64  
 7   price           223485 non-null  float64
 8   address         223485 non-null  object 
 9   city            223485 non-null  object 
 10  postal_code     223485 non-null  object 
 11  state           223485 non-null  object 
dtypes: float64(4), int64(1), object(7)
memory usage: 22.2+ MB
None


Unnamed: 0,name,address_full,gmap_id,latitude,longitude,avg_rating,num_of_reviews,price,address,city,postal_code,state
273200,Marilu's,"Marilu's, 73511 Twentynine Palms Highway, Twen...",0x80dac74cb11c93f5:0x5b7291c0bcf6f62f,34.135345,-116.058143,4.3,18,2.0,73511 Twentynine Palms Highway,Twentynine Palms,92277,CA
163114,Harvest Bistro and Wine Bar,"Harvest Bistro and Wine Bar, 3410 Telford St, ...",0x8841b3871604b955:0x69a47228cc8830c2,39.144062,-84.520401,4.3,36,2.0,3410 Telford St,Cincinnati,45220,OH


In [9]:
#dfg_rest.to_parquet('dfg_rest.parquet')

### Creacion del dataset de categorias

In [10]:
dfg_restaurants = pd.read_parquet('dataset_g_restaurants.parquet')
dfg_categories = dfg_restaurants[['gmap_id', 'category']]
dfg_categories = dfg_categories.explode('category')
dfg_categories.rename(columns={'gmap_id': 'site_id'}, inplace=True)

dfg_categories.info()
dfg_categories.sample(10)

<class 'pandas.core.frame.DataFrame'>
Index: 626995 entries, 2 to 274996
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   site_id   626995 non-null  object
 1   category  626995 non-null  object
dtypes: object(2)
memory usage: 14.4+ MB


Unnamed: 0,site_id,category
5263,0x881302d2fee622a3:0xe3a91c7062e3853a,Asian restaurant
266929,0x885398299bf78c27:0xe9643e8cc2cb9853,Pizza delivery
226794,0x8824c39b16ab9847:0xccacb3ad92b6f5ba,Hookah bar
237465,0x865c642265b9f799:0xda5e53f3fb0ca9e4,American restaurant
28717,0x87ee9f4d66a776c9:0x78248ed720bc058b,Bar
64591,0x8834efc8ccb43dd1:0x97a436e6bc7f9166,Pizza delivery
164485,0x883b35d3c50f4e01:0x9afb1ed584262b33,Traditional American restaurant
112299,0x87c4c541d9cace43:0x875c361d288e48fd,Bubble tea store
187958,0x888b278124016295:0x77f0f1bae26bd679,Traditional American restaurant
63775,0x89e7d8e6570b556b:0xb75ea2b3af1dbdc7,Chinese restaurant


In [11]:
dfg_categories_grouped = dfg_categories.groupby(['site_id'])['category'].count().reset_index()
dfg_categories_grouped.sort_values(by='category', ascending=False, inplace=True)
dfg_categories_grouped.head()

Unnamed: 0,site_id,category
24829,0x808f7792bd0c683d:0xefefd9d96f8fcb11,22
100420,0x87c5696c2304eb11:0xf60dbc11df643d20,22
211649,0x89c28801576a92a1:0xe8183528656d5f08,22
224911,0x89c6cf1e78429c4d:0xffccfa7176afdc7e,20
224636,0x89c6c9d5df403af7:0xa304c9f7a08ac8e3,20


In [12]:
# Exportar
#dfg_categories.to_parquet('dfg_site_categories.parquet')
#dfg_categories.to_excel('dfg_site_categories.xlsx')

### Creacion del dataset de atributos

In [13]:
dfg_restaurants = pd.read_parquet('dataset_g_restaurants.parquet')
dfg_attributes = dfg_restaurants[['gmap_id', 'MISC']]

dfg_categories.rename(columns={'gmap_id': 'site_id'}, inplace=True)

dfg_attributes.info()
dfg_attributes.sample(10)

<class 'pandas.core.frame.DataFrame'>
Index: 248852 entries, 2 to 274996
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   gmap_id  248852 non-null  object
 1   MISC     246706 non-null  object
dtypes: object(2)
memory usage: 5.7+ MB


Unnamed: 0,gmap_id,MISC
35055,0x89e7ff74564a06c5:0x93459026b7578831,{'Accessibility': ['Wheelchair accessible entr...
97203,0x89c2975ca16f3545:0x8b674abe6e7ee52d,"{'Accessibility': None, 'Activities': None, 'A..."
240118,0x883822ad990dc575:0xba8ed2d361abeb50,{'Accessibility': ['Wheelchair accessible entr...
46088,0x862362a27a9e6c5b:0xe79ab731ea7527a6,{'Accessibility': ['Wheelchair accessible entr...
140046,0x80dca584abb3a539:0xbee450fcf6234e7d,"{'Accessibility': None, 'Activities': None, 'A..."
241394,0x8824ce095857a031:0x5af941ac44f011e4,"{'Accessibility': None, 'Activities': None, 'A..."
58299,0x86409456cd92f70b:0x42002a35dc37fe47,"{'Accessibility': None, 'Activities': None, 'A..."
36083,0x864097d6e4c59d53:0xc9337089a338ca12,{'Accessibility': ['Wheelchair accessible entr...
42525,0x86d6ac2d5c1b8f25:0x97cee23ab0f77ce4,"{'Accessibility': None, 'Activities': None, 'A..."
249879,0x89e434c253f45c57:0x1ef8ad4cc09e16b9,{'Accessibility': ['Wheelchair accessible entr...


In [14]:
# Convertir el campo MISC en un DataFrame separado
misc_df = dfg_attributes['MISC'].apply(pd.Series)

# Dividir el diccionario en columnas separadas
dfg_attributes[['Accessibility', 'Activities', 'Amenities']] = misc_df[['Accessibility', 'Activities', 'Amenities']]

# Eliminar la columna MISC original
dfg_attributes.drop(columns=['MISC'], inplace=True)

dfg_attributes.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfg_attributes[['Accessibility', 'Activities', 'Amenities']] = misc_df[['Accessibility', 'Activities', 'Amenities']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfg_attributes[['Accessibility', 'Activities', 'Amenities']] = misc_df[['Accessibility', 'Activities', 'Amenities']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

Unnamed: 0,gmap_id,Accessibility,Activities,Amenities
2,0x80c2c778e3b73d33:0xbdc58662a4a97d49,[Wheelchair accessible entrance],,[Good for kids]
6,0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,[Wheelchair accessible entrance],,[Good for kids]
8,0x87ec235c54d25b31:0x3b75fb5facc602f,,,
68,0x89c6c89efcaed69d:0xded973f6033e7dba,,,[Good for kids]
75,0x87fd0e70c5f5d87b:0xdf340eeb75040ef3,,,[Restroom]
96,0x7c00456eecad3111:0x8217f9600c51f33,[Wheelchair accessible entrance],,[Good for kids]
114,0x88db4147b1d9e6f3:0x943dbd10a92ba1b1,"[Wheelchair accessible entrance, Wheelchair ac...",,[Good for kids]
115,0x88c2e4e34f1ed783:0x76c5da381c499d79,"[Wheelchair accessible entrance, Wheelchair ac...",,[Good for kids]
123,0x8644b59b8fe872e5:0x5e638876caa84cc3,[Wheelchair accessible entrance],,
126,0x80c8be4e73e8263f:0x3edb275a351e6266,[Wheelchair accessible entrance],,[Restroom]


In [15]:
dfg_attributes.to_csv("dfg_attributes.csv")

In [16]:
dfg_attributes['Accessibility'].head(10)

2                       [Wheelchair accessible entrance]
6                       [Wheelchair accessible entrance]
8                                                   None
68                                                  None
75                                                  None
96                      [Wheelchair accessible entrance]
114    [Wheelchair accessible entrance, Wheelchair ac...
115    [Wheelchair accessible entrance, Wheelchair ac...
123                     [Wheelchair accessible entrance]
126                     [Wheelchair accessible entrance]
Name: Accessibility, dtype: object

In [1]:
# Hasta aca va okey, CODIGO A DESARROLLAR

In [22]:
'''import pandas as pd

# Función para procesar los valores de la columna Accessibility
def process_value(value):
    # Verificar si el valor es nulo
    if pd.isnull(value):
        return None
    # Verificar si el valor es una lista
    if isinstance(value, list):
        return value
    # Dividir la cadena por líneas y eliminar elementos vacíos
    return value.splitlines() if isinstance(value, str) else None

# Aplicar la función a la columna Accessibility
dfg_attributes['Accessibility'] = dfg_attributes['Accessibility'].apply(process_value)
dfg_attributes['Accessibility'].head(5)'''

"import pandas as pd\n\n# Función para procesar los valores de la columna Accessibility\ndef process_value(value):\n    # Verificar si el valor es nulo\n    if pd.isnull(value):\n        return None\n    # Verificar si el valor es una lista\n    if isinstance(value, list):\n        return value\n    # Dividir la cadena por líneas y eliminar elementos vacíos\n    return value.splitlines() if isinstance(value, str) else None\n\n# Aplicar la función a la columna Accessibility\ndfg_attributes['Accessibility'] = dfg_attributes['Accessibility'].apply(process_value)\ndfg_attributes['Accessibility'].head(5)"

In [23]:
'''# Dividir y limpiar los valores basados en el espacio en blanco
dfg_attributes['Accessibility'] = dfg_attributes['Accessibility'].apply(lambda x: x.split() if pd.notnull(x) else [])
dfg_attributes['Activities'] = dfg_attributes['Activities'].apply(lambda x: x.split() if pd.notnull(x) else [])
dfg_attributes['Amenities'] = dfg_attributes['Amenities'].apply(lambda x: x.split() if pd.notnull(x) else [])

# Muestra el DataFrame resultante
dfg_attributes.head()'''

"# Dividir y limpiar los valores basados en el espacio en blanco\ndfg_attributes['Accessibility'] = dfg_attributes['Accessibility'].apply(lambda x: x.split() if pd.notnull(x) else [])\ndfg_attributes['Activities'] = dfg_attributes['Activities'].apply(lambda x: x.split() if pd.notnull(x) else [])\ndfg_attributes['Amenities'] = dfg_attributes['Amenities'].apply(lambda x: x.split() if pd.notnull(x) else [])\n\n# Muestra el DataFrame resultante\ndfg_attributes.head()"

In [20]:
dfg_attributes.head()

Unnamed: 0,gmap_id,Accessibility,Activities,Amenities
2,0x80c2c778e3b73d33:0xbdc58662a4a97d49,[Wheelchair accessible entrance],,[Good for kids]
6,0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,[Wheelchair accessible entrance],,[Good for kids]
8,0x87ec235c54d25b31:0x3b75fb5facc602f,,,
68,0x89c6c89efcaed69d:0xded973f6033e7dba,,,[Good for kids]
75,0x87fd0e70c5f5d87b:0xdf340eeb75040ef3,,,[Restroom]


### Creacion del dataset de precios

In [24]:
dfg_rest_prices_by_zip = dfg_rest.groupby('postal_code')['price'].apply(lambda x: x.dropna().astype(float).mean())

dfg_rest_prices_by_zip.reset_index()

Unnamed: 0,postal_code,price
0,1,1.000000
1,10001,1.639344
2,10002,1.710744
3,10003,1.828025
4,10004,1.750000
...,...,...
17612,99829,2.000000
17613,99833,2.000000
17614,99835,2.000000
17615,99901,1.444444


In [25]:
dfg_rest_prices_by_zip_df = dfg_rest_prices_by_zip.to_frame()
dfg_rest_prices_by_zip_df.to_parquet('dfg_rest_prices_by_zip.parquet')

### Creación de dataset de coordenadas

In [None]:
dfg_rest_coord = dfg_rest[['gmap_id', 'latitude', 'longitude', 'name', 'state', 'city', 'postal_code',]]
dfg_rest_coord = dfg_rest_coord.rename(columns={'gmap_id': 'business_id'})
dfg_rest_coord['source'] = 'google'

print(dfg_rest_coord.info())
dfg_rest_coord.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 223485 entries, 2 to 274996
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   name            223485 non-null  object 
 1   address_full    223485 non-null  object 
 2   gmap_id         223485 non-null  object 
 3   latitude        223485 non-null  float64
 4   longitude       223485 non-null  float64
 5   avg_rating      223485 non-null  float64
 6   num_of_reviews  223485 non-null  int64  
 7   price           223485 non-null  float64
 8   address         223485 non-null  object 
 9   city            223485 non-null  object 
 10  postal_code     223485 non-null  object 
 11  state           223485 non-null  object 
dtypes: float64(4), int64(1), object(7)
memory usage: 22.2+ MB
None


Unnamed: 0,business_id,latitude,longitude,name,state,city,postal_code,source
20298,0x80c2d7d0dbd83f77:0x1b319fcb8b0cfc8d,34.07122,-117.93172,Wokcano Asian Grill,CA,West Covina,91790,google
109954,0x865ca3a9ab4898e7:0x7cd58aa81914a15d,29.7491,-98.060456,Taqueria Taco Rico,TX,New Braunfels,78130,google


In [27]:
# Función para convertir latitud y longitud a coordenadas cartesianas
def lat_lon_to_cartesian(lat, lon):
    R = 6371  # Radio de la Tierra en kilómetros
    x = R * math.cos(math.radians(lat)) * math.cos(math.radians(lon))
    y = R * math.cos(math.radians(lat)) * math.sin(math.radians(lon))
    return x, y

# Suponiendo que tienes un DataFrame llamado df con las columnas business_id, latitud y longitud
# Agrega campos de coordenadas cartesianas x e y al DataFrame
dfg_rest_coord['x'], dfg_rest_coord['y'] = zip(*dfg_rest_coord.apply(lambda row: lat_lon_to_cartesian(row['latitude'], row['longitude']), axis=1))

print(dfg_rest_coord.info())
dfg_rest_coord.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 223485 entries, 2 to 274996
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   business_id  223485 non-null  object 
 1   latitude     223485 non-null  float64
 2   longitude    223485 non-null  float64
 3   name         223485 non-null  object 
 4   state        223485 non-null  object 
 5   city         223485 non-null  object 
 6   postal_code  223485 non-null  object 
 7   source       223485 non-null  object 
 8   x            223485 non-null  float64
 9   y            223485 non-null  float64
dtypes: float64(4), object(6)
memory usage: 18.8+ MB
None


Unnamed: 0,business_id,latitude,longitude,name,state,city,postal_code,source,x,y
100938,0x880fcdb76315de7f:0x105e24999d85f137,41.953489,-87.727317,Top Dog II,IL,Chicago,60618,google,187.888631,-4734.307852
202198,0x89de0b8a0905153d:0x976fc4a006084f03,42.707354,-73.777065,Pearl of the Orient,NY,Loudonville,12211,google,1307.920384,-4495.174583


In [28]:
# Exportar
#dfg_rest_coord.to_parquet('dfg_rest_coord.parquet', index=False)
#dfg_rest_coord.to_excel('dfg_rest_coord.xlsx', index=False)

## Dataset REVIEW

In [29]:
dfg_rest_ids = dfg_rest[['gmap_id', 'state']]
print(dfg_rest_ids.info())
dfg_rest_ids.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 223485 entries, 2 to 274996
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   gmap_id  223485 non-null  object
 1   state    223485 non-null  object
dtypes: object(2)
memory usage: 5.1+ MB
None


Unnamed: 0,gmap_id,state
199383,0x89d940dadf3f7833:0x3168c2c8fce67d5,NY
225919,0x808e31ed56832b03:0x7cf3622892ff571b,CA


In [30]:
# Carga de datos desde los datasets .json para cada estado de USA

states = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
    'District_of_Columbia'
]
df_list = []

for state in states:
    filename = f'/Users/Juan/Documents/Academics/DataScience/PjFinal/Google/GoogleReviews/revgoo_{state}.parquet'
    dfgrev = pd.read_parquet(filename)
    dfgrev = dfgrev.dropna(subset=['text'])

    # Filtrado por restaurantes
    dfgrev = dfgrev[dfgrev['gmap_id'].isin(dfg_rest_ids['gmap_id'])]
    # Análisis de sentimientos con VADER
    analyzer = SentimentIntensityAnalyzer()
    dfgrev['vader_polarity'] = dfgrev['text'].apply(lambda text: analyzer.polarity_scores(text)['compound'])
    dfgrev['vader_sentiment'] = pd.cut(dfgrev['vader_polarity'], bins=[-float('inf'), -0.001, 0.0, float('inf')], labels=[-1, 0, 1])
    # Seleccion de campos
    dfgrev = dfgrev[['gmap_id', 'state', 'user_id', 'time', 'rating', 'vader_polarity', 'vader_sentiment']]

    df_list.append(dfgrev)
dfg_reviews = pd.concat(df_list, ignore_index=True)

print(dfg_reviews.shape)
dfg_reviews.sample(2)


(4271426, 7)


Unnamed: 0,gmap_id,state,user_id,time,rating,vader_polarity,vader_sentiment
1000462,0x880cd08ac1a65b51:0xfbab4ab3b79e19d3,IL,1.061677e+20,1593914905788,5,0.4588,1
183888,0x87d2bb5ba812efd7:0x5af0a4c3a31f5b99,AR,1.116707e+20,1602611271401,5,0.6114,1


In [31]:
dfg_reviews_usa = dfg_reviews

# Adecuacion de campos
dfg_reviews_usa['time'] = dfg_reviews_usa['time'].astype(int)
dfg_reviews_usa['time'] = dfg_reviews_usa['time'] / 1000

# Convertir la marca de tiempo a un objeto de fecha y hora
dfg_reviews_usa['datetime'] = dfg_reviews_usa['time'].apply(lambda x: datetime.fromtimestamp(x))
dfg_reviews_usa = dfg_reviews_usa[['user_id', 'gmap_id', 'state', 'datetime', 'rating', 'vader_polarity', 'vader_sentiment']]

In [32]:
print(dfg_reviews_usa.info())
dfg_reviews_usa.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4271426 entries, 0 to 4271425
Data columns (total 7 columns):
 #   Column           Dtype         
---  ------           -----         
 0   user_id          float64       
 1   gmap_id          object        
 2   state            object        
 3   datetime         datetime64[ns]
 4   rating           int64         
 5   vader_polarity   float64       
 6   vader_sentiment  category      
dtypes: category(1), datetime64[ns](1), float64(2), int64(1), object(2)
memory usage: 199.6+ MB
None


Unnamed: 0,user_id,gmap_id,state,datetime,rating,vader_polarity,vader_sentiment
875609,1.149884e+20,0x88f5f2347bc60e2f:0x370d11b5b62d9fc,GA,2020-06-24 14:02:11.325,1,-0.0176,-1
4225644,1.182243e+20,0x88057ff5d86654c5:0x7b9bb5d08f3c8308,WI,2018-10-18 13:29:25.000,5,0.8271,1
758460,1.075899e+20,0x88db1eeccf582187:0x217fd35656a42fd,FL,2011-07-07 08:51:37.443,5,0.8016,1
1293542,1.011439e+20,0x87e4f740dd790d9b:0xbc5da9eed103a1b9,IA,2018-05-29 11:44:04.571,5,0.9259,1
2758491,1.154013e+20,0x89acf75dc1309461:0x1feb31ad2a3422a8,NC,2017-05-02 08:39:05.721,5,0.8377,1


In [None]:
# Exportación de datos
#dfg_reviews_usa.to_parquet('dfgrevall.parquet')

# Conclusiones

* Dataset SITES
    * Conversión del campo "precio" a formato numérico
    * Obtención de los campos "state" y "postal_code" a partir del campo "address" - esto será fundamental para combinar luego el dataset de google y yelp
    * Creación del dataset de categorías a partir del campo "category" cuyos valores originales están en formato tipo "lista" - será de utilidad para el modelo de machine learning
    * Creación del dataset de atributos a partir del campo "MISC" cuyos valores originales están en formato tipo "diccionario de listas" - será de utilidad para el modelo de machine learning
    * Creación del dataset de precios - será utilizado para crear el campo "precio" en el dataset de yelp
    * Creación del dataset de coordenadas - se utilizará para concatenar un listado único de sites entre ambos datasets de google y yelp
* Dataset REVIEWS
    * Mediante un código se leen todos los archivos de reviews de cada estado, concatenando los registros que correspondan a restaurantes 