# Importación de librerías
====================================================================================================================================

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from scipy import stats as st
import json

from joblib import Parallel, delayed
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Carga de datos
====================================================================================================================================

In [2]:
# Restaurantes
dfyrst = pd.read_parquet('dfyrst_gastronomics.parquet')

In [3]:
# Tips
tip = pd.read_json('dataset_y_tips.json', lines=True)

In [4]:
# Review
review = pd.read_parquet('dataset_y_reviews.parquet')

# Preprocesamiento
====================================================================================================================================

## Dataset RESTAURANTES

In [5]:
dfyrst.sample()

Unnamed: 0,business_id,name,city,postal_code,latitude,longitude,stars,review_count,is_open,state,state_city,city_postalcode,state_city_postalcode,categories,food
75101,lSwPJt2ZPX03tEpfXE1BFw,New Orleans Cake Café & Bakery,New Orleans,70117,29.963752,-90.052964,4.5,696,0,LA,LA - New Orleans,New Orleans - 70117,LA - New Orleans - 70117,Restaurants,yes


## Dataset TIP

In [6]:
tip.sample(2)

Unnamed: 0,user_id,business_id,text,date,compliment_count
6888,s6QghJT6IHNxV5MX8a66tw,d5gNZfKLqCKmqV0PmxgXYw,"For craft beer fans, perhaps the best deal in ...",2013-08-17 01:32:33,0
84136,X2uuhn5hChyYrbYPJUDZvg,ltBBYdNzkeKdCNPDAsxwAA,the scrimshaw pilsner is disappointing brew wi...,2013-03-03 22:45:14,0


In [7]:
# Selección de campos
dfytip = tip

dfytip['year'] = dfytip['date'].dt.year
dfytip['month'] = dfytip['date'].dt.month
dfytip['year_month'] = dfytip['year'].astype(str).str.slice(-2) + dfytip['month'].astype(str).str.zfill(2)

# Filtrado por restaurantes
dfytip = dfytip[dfytip['business_id'].isin(dfyrst['business_id'])]
dfytip

dfytip.info()
dfytip.sample(5)

<class 'pandas.core.frame.DataFrame'>
Index: 737451 entries, 1 to 908914
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   user_id           737451 non-null  object        
 1   business_id       737451 non-null  object        
 2   text              737451 non-null  object        
 3   date              737451 non-null  datetime64[ns]
 4   compliment_count  737451 non-null  int64         
 5   year              737451 non-null  int32         
 6   month             737451 non-null  int32         
 7   year_month        737451 non-null  object        
dtypes: datetime64[ns](1), int32(2), int64(1), object(4)
memory usage: 45.0+ MB


Unnamed: 0,user_id,business_id,text,date,compliment_count,year,month,year_month
741238,aGHJvZOBsseRY0Jgt7L-Vg,zhn28iqIxCTD7hQX7t04gg,"super good food, super good service! prices ar...",2018-05-16 16:04:44,0,2018,5,1805
44418,VwiYJN6BxqtW_zF8hefhmQ,00raCkd-MBw7n7lVKOb_oQ,Early bday dinner with family (well some of th...,2012-04-04 00:11:13,0,2012,4,1204
900710,rNMb41ZjOT-YFiiIYIgF7w,7NITtPelRe_oQRsWACI3yA,"Great coffee, pizza, atmosphere, and service. ...",2021-10-07 21:39:30,0,2021,10,2110
354870,hyifHBwxOAwQiIfB-2DBfw,bp5Mk2d0qofUeF5uLauIbg,Excellent food. Clean and nice servers. Would ...,2015-07-31 17:16:15,0,2015,7,1507
71910,y2eVOTPpiH0n8VYg023Y6g,ZDV6ocQSe6dkMt-u011xcQ,Chicken and dumplings are to die for.,2013-03-09 02:45:58,0,2013,3,1303


In [8]:
# Análisis de sentimientos a partir del campo "text"
dfytip['text'] = dfytip['text'].astype(str)

analyzer = SentimentIntensityAnalyzer()
dfytip['polarity'] = dfytip['text'].apply(lambda text: analyzer.polarity_scores(text)['compound'])
dfytip['sentiment'] = pd.cut(dfytip['polarity'], bins=[-float('inf'), -0.001, 0.0, float('inf')], labels=[-1, 0, 1])

dfytip.info()
dfytip.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfytip['text'] = dfytip['text'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfytip['polarity'] = dfytip['text'].apply(lambda text: analyzer.polarity_scores(text)['compound'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfytip['sentiment'] = pd.cut(dfytip['polarity'], bins=[-float(

<class 'pandas.core.frame.DataFrame'>
Index: 737451 entries, 1 to 908914
Data columns (total 10 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   user_id           737451 non-null  object        
 1   business_id       737451 non-null  object        
 2   text              737451 non-null  object        
 3   date              737451 non-null  datetime64[ns]
 4   compliment_count  737451 non-null  int64         
 5   year              737451 non-null  int32         
 6   month             737451 non-null  int32         
 7   year_month        737451 non-null  object        
 8   polarity          737451 non-null  float64       
 9   sentiment         737451 non-null  category      
dtypes: category(1), datetime64[ns](1), float64(1), int32(2), int64(1), object(4)
memory usage: 51.3+ MB


Unnamed: 0,user_id,business_id,text,date,compliment_count,year,month,year_month,polarity,sentiment
846975,CRsJSxpXL_WzedGeTdEcZw,zea3cUOcoQ_wGynJkZ0nLg,Excellent food.....New York Style....fast serv...,2017-06-03 16:00:27,0,2017,6,1706,0.6114,1
451354,Rx-113l9m4ex64C8DZPEbA,UOiqYBdIPRrMm-lnEbbjBg,Their pumpkin latte is delicious - not too swe...,2011-10-20 20:16:59,0,2011,10,1110,-0.123,-1
758032,FlU3NZ_ZtBbRnRV_RKX23Q,wj8XtPyuREj8_0GQz3LZ6w,A lot more pricey than 2 dollar signs!!,2017-09-24 19:31:59,0,2017,9,1709,0.0,0
362528,KRNbW2TCluXsd8VtNJ8uew,Df60BZXw3Kj9Xt1UlAidqA,Cute bunnies at the pet shop.,2015-11-23 03:35:52,0,2015,11,1511,0.4588,1
44379,5Y5KbsI5buMcNh2hTFaRpA,_V3YkIkqAn4V6U-cAykVtw,Half-price sushi menu available all week! Grea...,2013-02-08 19:27:19,0,2013,2,1302,0.6588,1


In [9]:
dfytip.to_parquet('dfy_tips.parquet', index=False)

## Dataset REVIEW

In [10]:
print(review.shape)
review.sample(2)

(6990280, 9)


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date
3408575,sSBVBV8n2EiV9ugru3zZAg,How400tgmS48fbXKdDCO9w,Pud2rnFwby48KD1qSaAAXg,5.0,1,0,1,Cutest Cuban spot for a quick bite to eat! We ...,2021-06-11 21:11:21
3055224,DnXnDHcwUp-t_YH7K8MTjw,YwTtuahNDjMtTWG6kY4POg,AdPk3z9_tGW8wlLnQBAPDw,1.0,0,1,0,Just purchased a package of General Mills Gard...,2016-07-02 17:21:40


In [11]:
# Selección de campos
dfyrev = review
# Adecuación de campos
dfyrev['date'] = pd.to_datetime(dfyrev['date'], errors='coerce')
dfyrev.reset_index(drop=True, inplace=True)

# Eliminación de duplicados: No tiene duplicados, se ha analizado fuera de este archivo

# Eliminación de nulos
dfyrev = dfyrev.dropna()

dfyrev['year'] = dfyrev['date'].dt.year
dfyrev['month'] = dfyrev['date'].dt.month
dfyrev['year_month'] = dfyrev['year'].astype(str).str.slice(-2) + dfyrev['month'].astype(str).str.zfill(2)

dfyrev = dfyrev[(dfyrev['year'] >= 2010) & (dfyrev['year'] <= 2021)]

# Filtrado por restaurantes
dfyrev = dfyrev[dfyrev['business_id'].isin(dfyrst['business_id'])]
dfyrev

dfyrev.info()
dfyrev.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 5086663 entries, 0 to 6990279
Data columns (total 12 columns):
 #   Column       Dtype         
---  ------       -----         
 0   review_id    object        
 1   user_id      object        
 2   business_id  object        
 3   stars        float64       
 4   useful       int64         
 5   funny        int64         
 6   cool         int64         
 7   text         object        
 8   date         datetime64[ns]
 9   year         int32         
 10  month        int32         
 11  year_month   object        
dtypes: datetime64[ns](1), float64(1), int32(2), int64(3), object(5)
memory usage: 465.7+ MB


Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,year,month,year_month
1684537,3iiPFcBvoengKOqhD3WShQ,qjfMBIZpQT9DDtw_BWCopQ,I7SkoqN88fpKagzKA059Fw,4.0,1,0,0,"No way, haven't reviewed this place yet. Okay ...",2013-04-04 19:11:29,2013,4,1304
2894751,A5NxHEK4oqCMoTcwN3c_rA,6dD34xst7plVVdWpwh0KlA,rJuz-Y_80o9jXBRb7B1bQg,3.0,2,2,0,I don't get the fascination with this place. M...,2012-06-04 03:27:12,2012,6,1206


In [13]:
# Especificar la semilla aleatoria para reproducibilidad
random_state = 42
dfyrev_sample = dfyrev.sample(n=1000000, random_state=random_state)
dfyrev_sample.head(2)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,year,month,year_month
2254991,kAvTYPTeyG_USQBzRd5hYw,PGas3x06gHXGQITDMnFyow,uZsStnH9w2xY15og9VsQZA,1.0,0,0,0,Blech! Only went there cuz we had a gift cert...,2012-04-10 23:19:02,2012,4,1204
944067,60HMcgQ9EJZkhicMtnZ6JQ,D9wybQ24_bpA1WCadfqEig,ngCSdj_2csgsfgpLipCaMg,1.0,0,1,0,Me and my wife ate the chili cheese taquitos a...,2013-11-26 06:31:39,2013,11,1311


In [14]:
analyzer = SentimentIntensityAnalyzer()

dfyrev_sample['polarity'] = [analyzer.polarity_scores(text)['compound'] for text in dfyrev_sample['text']]
dfyrev_sample['sentiment'] = pd.cut(dfyrev_sample['polarity'], bins=[-float('inf'), -0.001, 0.0, float('inf')], labels=[-1, 0, 1])

print(dfyrev_sample.info())
dfyrev_sample.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 1000000 entries, 2254991 to 6314535
Data columns (total 14 columns):
 #   Column       Non-Null Count    Dtype         
---  ------       --------------    -----         
 0   review_id    1000000 non-null  object        
 1   user_id      1000000 non-null  object        
 2   business_id  1000000 non-null  object        
 3   stars        1000000 non-null  float64       
 4   useful       1000000 non-null  int64         
 5   funny        1000000 non-null  int64         
 6   cool         1000000 non-null  int64         
 7   text         1000000 non-null  object        
 8   date         1000000 non-null  datetime64[ns]
 9   year         1000000 non-null  int32         
 10  month        1000000 non-null  int32         
 11  year_month   1000000 non-null  object        
 12  polarity     1000000 non-null  float64       
 13  sentiment    1000000 non-null  category      
dtypes: category(1), datetime64[ns](1), float64(2), int32(2), int64(3)

Unnamed: 0,review_id,user_id,business_id,stars,useful,funny,cool,text,date,year,month,year_month,polarity,sentiment
163124,rOfG9EyF8jl0aCqG8d7ERQ,X35YxESH-fHOIVkRPJce3w,9A1ve-8MfJcUcOGQYCLlKA,5.0,2,2,2,Oh my! First time eating here. The food was ab...,2017-02-04 18:28:09,2017,2,1702,0.9506,1
2540040,EJFaJJsaMTu1gpvbe_MvMA,wjjF-2xZXVqIDTFTLhJL_g,9AdqkHZTVTwGygpHY8NT3A,5.0,0,0,1,This place was so good and the vibe was super ...,2019-08-11 06:58:53,2019,8,1908,0.9914,1


In [15]:
dfyrev_sample.to_parquet('dfy_reviews.parquet', index=False)

# Conclusiones
====================================================================================================================================

* El dataset RESTAURANTES se encuentra ya procesado y se utilizará para filtrar los locales deseados utilizando el campo 'site_id"
* Dataset TIP 
    * Contiene información de reseñas con un texto un poco más corto pero con sugerencias más rápidas
    * Se adecúan los tipos de datos, no se observan valores nulos ni duplicados
    * Se aplica un análisis de sentimientos utilizando "vader"
* Dataset REVIEWS 
    * Se adecúan los tipos de datos, no se observan valores nulos, se eliminan los pocos duplicados que tiene
    * Se aplica un análisis de sentimientos utilizando "vader" tomando una muestra de 1 millón de registros (contiene aprox 6 millones)