# Importación de librerías
====================================================================================================================================

In [1]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
from math import factorial
from scipy import stats as st
import json
import gzip
import pickle

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from datetime import datetime

# Carga de datos
====================================================================================================================================

In [2]:
# Sites
dfg_rest = pd.read_parquet('dataset_g_restaurants.parquet')

In [3]:
# Reviews
# Este dataset se carga más adelante en su sección de pre-procesamiento

# Preprocesamiento
====================================================================================================================================

## Dataset SITES

In [4]:
print(dfg_rest.info())
dfg_rest.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 248852 entries, 2 to 274996
Data columns (total 15 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   name              248852 non-null  object 
 1   address           247751 non-null  object 
 2   gmap_id           248852 non-null  object 
 3   description       79018 non-null   object 
 4   latitude          248852 non-null  float64
 5   longitude         248852 non-null  float64
 6   category          248852 non-null  object 
 7   avg_rating        248852 non-null  float64
 8   num_of_reviews    248852 non-null  int64  
 9   price             115831 non-null  object 
 10  hours             220683 non-null  object 
 11  MISC              246706 non-null  object 
 12  state             221749 non-null  object 
 13  relative_results  203743 non-null  object 
 14  url               248852 non-null  object 
dtypes: float64(3), int64(1), object(11)
memory usage: 30.4+ MB
None


Unnamed: 0,name,address,gmap_id,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
126759,Julie's Cafe,"Julie's Cafe, 2562 Bancroft Way, Berkeley, CA ...",0x80857c2f652686cd:0x32ee8401769072a4,Relaxed haunt showcasing a diverse menu of Tur...,37.868725,-122.257582,"[Cafe, Deli, Italian restaurant, Mediterranean...",4.3,44,$,"[[Tuesday, 9AM–6PM], [Wednesday, 9AM–6PM], [Th...",{'Accessibility': ['Wheelchair accessible entr...,Permanently closed,[0x80857e9e823662bd:0x7802cda49027336f],https://www.google.com/maps/place//data=!4m2!3...
101830,Lam Taste,"Lam Taste, 892 S Rochester Rd, Rochester Hills...",0x8824e9c528cfa47b:0x1ddc0fa07a02c5a,,42.668593,-83.13477,"[Delivery Chinese restaurant, Asian restaurant...",4.1,53,,"[[Friday, 11AM–9PM], [Saturday, 11AM–9PM], [Su...",{'Accessibility': ['Wheelchair accessible entr...,Opens soon ⋅ 11AM,"[0x8824e9a5df7a2dc3:0xeea3536fe24f7a89, 0x8824...",https://www.google.com/maps/place//data=!4m2!3...


In [5]:
dfg_rest = dfg_rest[['name', 'address', 'gmap_id', 'latitude', 'longitude', 'avg_rating', 'num_of_reviews', 'price']]
dfg_rest = dfg_rest.rename(columns={'address': 'address_full'})
dfg_rest[['address', 'city', 'postal_code']] = dfg_rest['address_full'].str.extract(r'.*,\s*([^,]+),\s*([^,]+),\s*([^,]+)')
dfg_rest['state'] = dfg_rest['postal_code'].str.split(' ').str[0]
dfg_rest['postal_code'] = dfg_rest['postal_code'].str.split(' ').str[1]

dfg_rest['postal_code'] = pd.to_numeric(dfg_rest['postal_code'], errors='coerce')
dfg_rest = dfg_rest.dropna(subset=['postal_code'])
dfg_rest['postal_code'] = dfg_rest['postal_code'].astype(int)

In [6]:
print(dfg_rest.info())
dfg_rest.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 231427 entries, 2 to 274996
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   name            231427 non-null  object 
 1   address_full    231427 non-null  object 
 2   gmap_id         231427 non-null  object 
 3   latitude        231427 non-null  float64
 4   longitude       231427 non-null  float64
 5   avg_rating      231427 non-null  float64
 6   num_of_reviews  231427 non-null  int64  
 7   price           104084 non-null  object 
 8   address         231427 non-null  object 
 9   city            231427 non-null  object 
 10  postal_code     231427 non-null  int64  
 11  state           231427 non-null  object 
dtypes: float64(3), int64(2), object(7)
memory usage: 23.0+ MB
None


Unnamed: 0,name,address_full,gmap_id,latitude,longitude,avg_rating,num_of_reviews,price,address,city,postal_code,state
139410,E.T.'s Bar,"E.T.'s Bar, 7300 Secor Rd, Lambertville, MI 48144",0x883c8079df011a55:0x6b4f5092d2d35540,41.746554,-83.624354,4.5,58,$,7300 Secor Rd,Lambertville,48144,MI
202590,Foolish Club,"Foolish Club, 1 Arrowhead Dr, Kansas City, MO ...",0x87c0e4a1d0441ed3:0x64b22c24d43e5a7a,39.048171,-94.483084,4.4,8,,1 Arrowhead Dr,Kansas City,64129,MO


In [7]:
# Convertir precio a formato numerico
price_mapping = {'$': 1, '$$': 2, '$$$': 3, '$$$$': 4}
dfg_rest['price'] = dfg_rest['price'].map(price_mapping)

# Reemplazar valores nulos del campo "precio" con el promedio de precios basado en cercanía, utilzando el código postal
dfg_rest['postal_code'] = dfg_rest['postal_code'].astype(str)
average_price_by_zip = dfg_rest.groupby('postal_code')['price'].apply(lambda x: x.dropna().astype(float).mean())
dfg_rest['price'] = dfg_rest.apply(lambda row: average_price_by_zip.get(row['postal_code']) if pd.isnull(row['price']) else row['price'], axis=1)

dfg_rest = dfg_rest.dropna(subset=['price'])


In [8]:
print(dfg_rest.info())
dfg_rest.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 223485 entries, 2 to 274996
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   name            223485 non-null  object 
 1   address_full    223485 non-null  object 
 2   gmap_id         223485 non-null  object 
 3   latitude        223485 non-null  float64
 4   longitude       223485 non-null  float64
 5   avg_rating      223485 non-null  float64
 6   num_of_reviews  223485 non-null  int64  
 7   price           223485 non-null  float64
 8   address         223485 non-null  object 
 9   city            223485 non-null  object 
 10  postal_code     223485 non-null  object 
 11  state           223485 non-null  object 
dtypes: float64(4), int64(1), object(7)
memory usage: 22.2+ MB
None


Unnamed: 0,name,address_full,gmap_id,latitude,longitude,avg_rating,num_of_reviews,price,address,city,postal_code,state
85506,Warren's Drive Inn,"Warren's Drive Inn, 5331 S 1900 W, Roy, UT 84067",0x8753055fadf0ab17:0x5abcb806b4fc084f,41.166331,-112.026444,3.9,215,2.0,5331 S 1900 W,Roy,84067,UT
28584,TOP FIVE! BBQ,"TOP FIVE! BBQ, 1365 N Dupont Hwy, Dover, DE 19901",0x89b8970721e48c0f:0x8081e8692b41bcda,39.194847,-75.540409,4.8,24,1.214286,1365 N Dupont Hwy,Dover,19901,DE


In [9]:
#dfg_rest.to_parquet('dfg_rest.parquet')

### Creacion del dataset de categorias

In [10]:
dfg_restaurants = pd.read_parquet('dataset_g_restaurants.parquet')
dfg_categories = dfg_restaurants[['gmap_id', 'category']]
dfg_categories = dfg_categories.explode('category')
dfg_categories.rename(columns={'gmap_id': 'site_id'}, inplace=True)

dfg_categories.info()
dfg_categories.sample(10)

<class 'pandas.core.frame.DataFrame'>
Index: 626995 entries, 2 to 274996
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   site_id   626995 non-null  object
 1   category  626995 non-null  object
dtypes: object(2)
memory usage: 14.4+ MB


Unnamed: 0,site_id,category
166710,0x80952f426db182d3:0x6528e86745ac118b,Restaurant
206381,0x8635f0af43738047:0x9419f8a4c9b0f59f,Mexican restaurant
262934,0x885131994b0ad17d:0x915b4f1cf5c42da9,Night club
75517,0x54d3d9853e36314f:0x64a95c9ff30e6e59,Cocktail bar
53595,0x880fd3bad48a1ab1:0xa4d6fb632243008e,Wi-Fi spot
41376,0x7eab33a2dd8904b7:0xf247ed1e4dfd7588,Restaurant
206144,0x8626a4758871dd07:0x599aae5dedbfba55,Restaurant
36870,0x89c1863671fe9ebb:0x7a1685dd8fb41e40,Caterer
177556,0x5495a1bdfaffec21:0x95abe43c3019e4,Restaurant
207595,0x89c2a04e59180fd9:0x445314e46670ee40,Pizza delivery


In [11]:
dfg_categories_grouped = dfg_categories.groupby(['site_id'])['category'].count().reset_index()
dfg_categories_grouped.sort_values(by='category', ascending=False, inplace=True)
dfg_categories_grouped.head()

Unnamed: 0,site_id,category
24829,0x808f7792bd0c683d:0xefefd9d96f8fcb11,22
100420,0x87c5696c2304eb11:0xf60dbc11df643d20,22
211649,0x89c28801576a92a1:0xe8183528656d5f08,22
224911,0x89c6cf1e78429c4d:0xffccfa7176afdc7e,20
224636,0x89c6c9d5df403af7:0xa304c9f7a08ac8e3,20


In [12]:
# Exportar
#dfg_categories.to_parquet('dfg_site_categories.parquet')
dfg_categories.to_excel('dfg_site_categories.xlsx')

### Creacion del dataset de atributos

In [13]:
dfg_restaurants = pd.read_parquet('dataset_g_restaurants.parquet')
dfg_attributes = dfg_restaurants[['gmap_id', 'MISC']]

dfg_categories.rename(columns={'gmap_id': 'site_id'}, inplace=True)

dfg_attributes.info()
dfg_attributes.sample(10)

<class 'pandas.core.frame.DataFrame'>
Index: 248852 entries, 2 to 274996
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   gmap_id  248852 non-null  object
 1   MISC     246706 non-null  object
dtypes: object(2)
memory usage: 5.7+ MB


Unnamed: 0,gmap_id,MISC
29194,0x80c2b65845258b3d:0x927632fa8e5bac71,{'Accessibility': ['Wheelchair accessible entr...
265503,0x880503e86086c7e1:0xd361e724b24390c0,{'Accessibility': ['Wheelchair accessible entr...
32465,0x89d373d1ad84b7fd:0x1093c7fdbb1327e9,{'Accessibility': ['Wheelchair accessible entr...
74328,0x872b3fdf60ab530f:0x48fbe7ba3265bed4,{'Accessibility': ['Wheelchair accessible entr...
211415,0x89c25bd397d1a133:0x7550ad928cd9edb1,"{'Accessibility': None, 'Activities': None, 'A..."
233228,0x88c3417ee6481fd9:0x210f9b2c90a6df0,{'Accessibility': ['Wheelchair accessible entr...
16614,0x8807aec4b66fc267:0x85f67fdaa960ffce,{'Accessibility': ['Wheelchair accessible entr...
40136,0x88f48a69b8449d8d:0xb2b3da4a70188baa,{'Accessibility': ['Wheelchair accessible entr...
178674,0x87c0ef8a8abafc0f:0x18cdd5fea0cb9e43,{'Accessibility': ['Wheelchair accessible entr...
66901,0x880e5c2357066d0b:0x3846ff20e115f90,{'Accessibility': ['Wheelchair accessible entr...


In [14]:
# Convertir el campo MISC en un DataFrame separado
misc_df = dfg_attributes['MISC'].apply(pd.Series)

# Dividir el diccionario en columnas separadas
dfg_attributes[['Accessibility', 'Activities', 'Amenities']] = misc_df[['Accessibility', 'Activities', 'Amenities']]

# Eliminar la columna MISC original
dfg_attributes.drop(columns=['MISC'], inplace=True)

dfg_attributes.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfg_attributes[['Accessibility', 'Activities', 'Amenities']] = misc_df[['Accessibility', 'Activities', 'Amenities']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfg_attributes[['Accessibility', 'Activities', 'Amenities']] = misc_df[['Accessibility', 'Activities', 'Amenities']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

Unnamed: 0,gmap_id,Accessibility,Activities,Amenities
2,0x80c2c778e3b73d33:0xbdc58662a4a97d49,[Wheelchair accessible entrance],,[Good for kids]
6,0x80dd2b4c8555edb7:0xfc33d65c4bdbef42,[Wheelchair accessible entrance],,[Good for kids]
8,0x87ec235c54d25b31:0x3b75fb5facc602f,,,
68,0x89c6c89efcaed69d:0xded973f6033e7dba,,,[Good for kids]
75,0x87fd0e70c5f5d87b:0xdf340eeb75040ef3,,,[Restroom]
96,0x7c00456eecad3111:0x8217f9600c51f33,[Wheelchair accessible entrance],,[Good for kids]
114,0x88db4147b1d9e6f3:0x943dbd10a92ba1b1,"[Wheelchair accessible entrance, Wheelchair ac...",,[Good for kids]
115,0x88c2e4e34f1ed783:0x76c5da381c499d79,"[Wheelchair accessible entrance, Wheelchair ac...",,[Good for kids]
123,0x8644b59b8fe872e5:0x5e638876caa84cc3,[Wheelchair accessible entrance],,
126,0x80c8be4e73e8263f:0x3edb275a351e6266,[Wheelchair accessible entrance],,[Restroom]


In [15]:
dfg_attributes.to_csv("dfg_attributes.csv")

In [None]:
dfg_attributes['Accessibility'].head(10)

In [None]:
# Hasta aca va okey

In [None]:
txt = ['Wheelchair accessible entrance' 'Wheelchair accessible parking lot' 'Wheelchair accessible seating']
x = [s.split(" ", 3) for s in txt]

print(x)

In [None]:
import pandas as pd

# Función para procesar los valores de la columna Accessibility
def process_value(value):
    # Verificar si el valor es nulo
    if pd.isnull(value):
        return None
    # Verificar si el valor es una lista
    if isinstance(value, list):
        return value
    # Dividir la cadena por líneas y eliminar elementos vacíos
    return value.splitlines() if isinstance(value, str) else None

# Aplicar la función a la columna Accessibility
dfg_attributes['Accessibility'] = dfg_attributes['Accessibility'].apply(process_value)
dfg_attributes['Accessibility'].head(5)

In [None]:
'''# Dividir y limpiar los valores basados en el espacio en blanco
dfg_attributes['Accessibility'] = dfg_attributes['Accessibility'].apply(lambda x: x.split() if pd.notnull(x) else [])
dfg_attributes['Activities'] = dfg_attributes['Activities'].apply(lambda x: x.split() if pd.notnull(x) else [])
dfg_attributes['Amenities'] = dfg_attributes['Amenities'].apply(lambda x: x.split() if pd.notnull(x) else [])

# Muestra el DataFrame resultante
dfg_attributes.head()'''

In [None]:
import pandas as pd

# Función para procesar los valores de la columna Accessibility
def process_value(value):
    # Verificar si el valor es nulo
    if pd.isnull(value):
        return None
    # Verificar si el valor es una lista
    if isinstance(value, list):
        return value
    # Dividir la cadena por espacio vacío y eliminar elementos vacíos
    return value.split() if isinstance(value, str) else None

# Aplicar la función a la columna Accessibility
dfg_attributes['Accessibility'] = dfg_attributes['Accessibility'].apply(process_value)

dfg_attributes['Accessibility'].sample(10)

In [None]:
import pandas as pd

# Dividir las listas en elementos individuales y eliminar los elementos vacíos
dfg_attributes['Accessibility'] = dfg_attributes['Accessibility'].apply(lambda x: [item.strip() for item in x.split() if item.strip()] if isinstance(x, str) else 0)
#dfg_attributes['Activities'] = dfg_attributes['Activities'].apply(lambda x: [item.strip() for item in x.split('') if item.strip()] if isinstance(x, str) else [])
#dfg_attributes['Amenities'] = dfg_attributes['Amenities'].apply(lambda x: [item.strip() for item in x.split('') if item.strip()] if isinstance(x, str) else [])

# Muestra el DataFrame resultante

dfg_attributes['Accessibility'].info()

dfg_attributes[dfg_attributes['Accessibility']!=0].sample(10)


In [None]:
import re

import numpy as np

def process_value(value):
    # Verificar si el valor es una cadena vacía o un valor nulo
    if pd.isnull(value):
        return np.nan  # Devolver NaN si el valor es nulo
    
    # Separar los elementos utilizando el espacio como delimitador
    return value.strip('][').split(' ')  # Eliminar los corchetes y luego dividir por espacios

dfg_attributes['Accessibility'] = dfg_attributes['Accessibility'].astype(str).apply(process_value)
dfg_attributes['Activities'] = dfg_attributes['Activities'].astype(str).apply(process_value)
dfg_attributes['Amenities'] = dfg_attributes['Amenities'].astype(str).apply(process_value)

dfg_attributes['Accessibility'].info()
dfg_attributes['Accessibility'].sample(10)

In [None]:
import re

# Definir una función para procesar los valores
def process_value(x):
    if pd.notnull(x):
        # Utilizar expresiones regulares para encontrar los elementos entre corchetes
        values = re.findall(r'\[\'(.*?)\'\]', x)
        return [value.strip() for value in values]
    else:
        return []

# Aplicar la función a cada columna
dfg_attributes['Accessibility'] = dfg_attributes['Accessibility'].apply(process_value)
dfg_attributes['Activities'] = dfg_attributes['Activities'].apply(process_value)
dfg_attributes['Amenities'] = dfg_attributes['Amenities'].apply(process_value)



In [None]:
import pandas as pd

# Dividir las listas en elementos individuales y eliminar los elementos vacíos
dfg_attributes['Accessibility'] = dfg_attributes['Accessibility'].apply(lambda x: [item.strip() for item in x.split(',') if item.strip()] if isinstance(x, str) else [])
dfg_attributes['Activities'] = dfg_attributes['Activities'].apply(lambda x: [item.strip() for item in x.split(',') if item.strip()] if isinstance(x, str) else [])
dfg_attributes['Amenities'] = dfg_attributes['Amenities'].apply(lambda x: [item.strip() for item in x.split(',') if item.strip()] if isinstance(x, str) else [])

# Muestra el DataFrame resultante
dfg_attributes.head()


In [None]:
import ast
import pandas as pd

# Eliminar comillas dobles y convertir las listas de texto a listas de Python
dfg_attributes['Accessibility'] = dfg_attributes['Accessibility'].apply(lambda x: ast.literal_eval(x.replace("'", '')) if pd.notnull(x) else None)
dfg_attributes['Activities'] = dfg_attributes['Activities'].apply(lambda x: ast.literal_eval(x.replace("'", '')) if pd.notnull(x) else None)
dfg_attributes['Amenities'] = dfg_attributes['Amenities'].apply(lambda x: ast.literal_eval(x.replace("'", '')) if pd.notnull(x) else None)


In [None]:
#dfg_attributes.to_excel('dfg_attributes.xlsx', sheet_name='dfg_attributes')

In [None]:
dfg_attributes.head()

In [None]:
dfg_attributes = dfg_attributes.explode(['Accesibility', 'Activities', 'Amenities'])
dfg_attributes.head(2)

### Creacion del dataset de precios

In [None]:
dfg_rest_prices_by_zip = dfg_rest.groupby('postal_code')['price'].apply(lambda x: x.dropna().astype(float).mean())

dfg_rest_prices_by_zip.reset_index()

In [None]:
dfg_rest_prices_by_zip_df = dfg_rest_prices_by_zip.to_frame()
dfg_rest_prices_by_zip_df.to_parquet('dfg_rest_prices_by_zip.parquet')


### Creación de dataset de coordenadas

In [None]:
dfg_rest_coord = dfg_rest[['gmap_id', 'latitude', 'longitude', 'name', 'state', 'city', 'postal_code',]]
dfg_rest_coord = dfg_rest_coord.rename(columns={'gmap_id': 'business_id'})
dfg_rest_coord['source'] = 'google'

print(dfg_rest.info())
dfg_rest_coord.sample(2)

In [None]:
# Función para convertir latitud y longitud a coordenadas cartesianas
def lat_lon_to_cartesian(lat, lon):
    R = 6371  # Radio de la Tierra en kilómetros
    x = R * math.cos(math.radians(lat)) * math.cos(math.radians(lon))
    y = R * math.cos(math.radians(lat)) * math.sin(math.radians(lon))
    return x, y

# Suponiendo que tienes un DataFrame llamado df con las columnas business_id, latitud y longitud
# Agrega campos de coordenadas cartesianas x e y al DataFrame
dfg_rest_coord['x'], dfg_rest_coord['y'] = zip(*dfg_rest_coord.apply(lambda row: lat_lon_to_cartesian(row['latitude'], row['longitude']), axis=1))

print(dfg_rest_coord.info())
dfg_rest_coord.sample(2)

In [None]:
# Exportar
#dfg_rest_coord.to_parquet('dfg_rest_coord.parquet', index=False)
#dfg_rest_coord.to_excel('dfg_rest_coord.xlsx', index=False)

## Dataset REVIEW

In [None]:
dfg_rest_ids = dfg_rest[['gmap_id', 'state']]
print(dfg_rest_ids.info())
dfg_rest_ids.sample(2)

In [None]:
states = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA', 
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY',
    'District_of_Columbia'
]
df_list = []

for state in states:
    filename = f'/Users/Juan/Documents/Academics/DataScience/PjFinal/Google/GoogleReviews/revgoo_{state}.parquet'
    dfgrev = pd.read_parquet(filename)
    dfgrev = dfgrev.dropna(subset=['text'])

    # Filtrado por restaurantes
    dfgrev = dfgrev[dfgrev['gmap_id'].isin(dfg_rest_ids['gmap_id'])]
    # Análisis de sentimientos con VADER
    analyzer = SentimentIntensityAnalyzer()
    dfgrev['vader_polarity'] = dfgrev['text'].apply(lambda text: analyzer.polarity_scores(text)['compound'])
    dfgrev['vader_sentiment'] = pd.cut(dfgrev['vader_polarity'], bins=[-float('inf'), -0.001, 0.0, float('inf')], labels=[-1, 0, 1])
    # Seleccion de campos
    dfgrev = dfgrev[['gmap_id', 'state', 'user_id', 'time', 'rating', 'vader_polarity', 'vader_sentiment']]

    df_list.append(dfgrev)
dfg_reviews = pd.concat(df_list, ignore_index=True)

print(dfg_reviews.shape)
dfg_reviews.sample(2)


In [None]:
dfg_reviews_usa = dfg_reviews

# Adecuacion de campos
dfg_reviews_usa['time'] = dfg_reviews_usa['time'].astype(int)
dfg_reviews_usa['time'] = dfg_reviews_usa['time'] / 1000

# Convertir la marca de tiempo a un objeto de fecha y hora
dfg_reviews_usa['datetime'] = dfg_reviews_usa['time'].apply(lambda x: datetime.fromtimestamp(x))
dfg_reviews_usa = dfg_reviews_usa[['user_id', 'gmap_id', 'state', 'datetime', 'rating', 'vader_polarity', 'vader_sentiment']]

In [None]:
print(dfg_reviews_usa.info())
dfg_reviews_usa.sample(5)

In [None]:
#dfg_reviews_usa.to_parquet('dfgrevall.parquet')