# Importación de librerías
====================================================================================================================================

In [2]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from math import factorial
from scipy import stats as st
import json
import gzip
import pickle

# Carga de datos
====================================================================================================================================

In [3]:
# Yelp
business = pd.read_pickle('dataset_y_business.pkl')
checkin = pd.read_json('dataset_y_checkin.json', lines=True)
user = pd.read_parquet('dataset_y_user.parquet')

In [4]:
# Base de datos externa
df_uszip = pd.read_excel('dataset_e_uszips.xlsx')

# Preprocesamiento
====================================================================================================================================

## Dataset uszip (externo)

In [5]:
print(df_uszip.info())
df_uszip.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33787 entries, 0 to 33786
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   postal_code       33787 non-null  int64  
 1   city              33787 non-null  object 
 2   state             33787 non-null  object 
 3   state_name        33787 non-null  object 
 4   population        33770 non-null  float64
 5   density           33770 non-null  float64
 6   county_fips       33787 non-null  int64  
 7   county_name       33787 non-null  object 
 8   county_weights    33787 non-null  object 
 9   county_names_all  33787 non-null  object 
 10  county_fips_all   33787 non-null  object 
 11  timezone          33787 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 3.1+ MB
None


Unnamed: 0,postal_code,city,state,state_name,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,timezone
19727,57047,Monroe,SD,South Dakota,293.0,7.7,46087,McCook,"{""46087"": 77.04, ""46125"": 22.96}",McCook|Turner,46087|46125,America/Chicago
5422,17240,Newburg,PA,Pennsylvania,3405.0,27.1,42041,Cumberland,"{""42041"": 60.96, ""42055"": 39.04}",Cumberland|Franklin,42041|42055,America/New_York


In [6]:
df_uszip = df_uszip[['postal_code', 'state']]
df_uszip['postal_code'] = df_uszip['postal_code'].astype(str)

df_uszip.info()
df_uszip.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33787 entries, 0 to 33786
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   postal_code  33787 non-null  object
 1   state        33787 non-null  object
dtypes: object(2)
memory usage: 528.1+ KB


Unnamed: 0,postal_code,state
22024,62670,IL
1186,4450,ME


## Dataset BUSINESS

In [7]:
print(business.info())
business.sample(5)

<class 'pandas.core.frame.DataFrame'>
Index: 150346 entries, 0 to 150345
Data columns (total 28 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   business_id   150346 non-null  object
 1   name          150346 non-null  object
 2   address       150346 non-null  object
 3   city          150346 non-null  object
 4   state         150343 non-null  object
 5   postal_code   150346 non-null  object
 6   latitude      150346 non-null  object
 7   longitude     150346 non-null  object
 8   stars         150346 non-null  object
 9   review_count  150346 non-null  object
 10  is_open       150346 non-null  object
 11  attributes    136602 non-null  object
 12  categories    150243 non-null  object
 13  hours         127123 non-null  object
 14  business_id   5 non-null       object
 15  name          5 non-null       object
 16  address       5 non-null       object
 17  city          5 non-null       object
 18  state         5 non-null     

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,state.1,postal_code.1,latitude.1,longitude.1,stars.1,review_count.1,is_open,attributes,categories,hours
81721,mMZSlfax3GiUn0kCF7OOHQ,DeHart's Barber Shop,3253 S Holt Rd,Indianapolis,MO,46221,39.715212,-86.224831,5.0,8,...,,,,,,,,,,
92457,W-hiQed5M778PKaFIdxLuQ,Tampa Dentistry,1311 W Busch Blvd,Tampa,IL,33612,28.033744,-82.472455,4.5,6,...,,,,,,,,,,
145868,DeY0JL4MpfKfB4jf2Bx68w,Sponge Docks,761 Dodecanese Blvd,Tarpon Springs,LA,34689,28.155486,-82.760903,4.5,41,...,,,,,,,,,,
34868,6YVzV8cIn28pOPmXoQra1g,The Pour House - Exton,116 N Pottstown Pike,Exton,AB,19341,40.029818,-75.630395,3.5,323,...,,,,,,,,,,
100043,lNSnj5PqYdrMunAqlBzSRg,China Taste,1832 Bruce B Downs Blvd,Wesley Chapel,PA,33544,28.184164,-82.350903,3.0,24,...,,,,,,,,,,


In [8]:
business_copy = business.copy()
business_copy = business_copy.loc[:,~business_copy.columns.duplicated()]
business_copy.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [9]:
# Selección de campos
dfybsn = business[['business_id', 'name', 'state', 'city', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open']]

# Eliminación de columnas duplicadas
dfybsn = dfybsn.loc[:,~dfybsn.columns.duplicated()]

# Adcuación de tipos de datos
dfybsn['latitude'] = pd.to_numeric(dfybsn['latitude'], errors='coerce')
dfybsn['longitude'] = pd.to_numeric(dfybsn['longitude'], errors='coerce')
dfybsn['stars'] = pd.to_numeric(dfybsn['stars'], errors='coerce')
dfybsn['review_count'] = pd.to_numeric(dfybsn['review_count'], errors='coerce')
dfybsn['is_open'] = pd.to_numeric(dfybsn['is_open'], errors='coerce')

# Eliminación de duplicados
dfybsn.drop_duplicates()

'''Reemplazo de valores del campo "state": los valores del dataset original no se corresponden con los códigos postales, 
por lo cual se utilizará una base de datos externa para corregir esta serie.'''

dfybsn.drop(columns=['state'], inplace=True)
dfybsn = pd.merge(dfybsn, df_uszip, on='postal_code', how='left')

# Campos auxiliares
dfybsn.dropna(subset=['state'], inplace=True)
dfybsn['state_city'] = dfybsn['state'].str.cat(dfybsn['city'], sep=' - ')
dfybsn['city_postalcode'] = dfybsn['city'].str.cat(dfybsn['postal_code'], sep=' - ')
dfybsn['state_city_postalcode'] = dfybsn['state'].str.cat(dfybsn['city'], sep=' - ').str.cat(dfybsn['postal_code'], sep=' - ')

dfybsn.info()
dfybsn.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 135158 entries, 0 to 150345
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   business_id            135158 non-null  object 
 1   name                   135158 non-null  object 
 2   city                   135158 non-null  object 
 3   postal_code            135158 non-null  object 
 4   latitude               135158 non-null  float64
 5   longitude              135158 non-null  float64
 6   stars                  135158 non-null  float64
 7   review_count           135158 non-null  int64  
 8   is_open                135158 non-null  int64  
 9   state                  135158 non-null  object 
 10  state_city             135158 non-null  object 
 11  city_postalcode        135158 non-null  object 
 12  state_city_postalcode  135158 non-null  object 
dtypes: float64(3), int64(2), object(8)
memory usage: 14.4+ MB


Unnamed: 0,business_id,name,city,postal_code,latitude,longitude,stars,review_count,is_open,state,state_city,city_postalcode,state_city_postalcode
55768,OZkBdAPOE7INwhAV8ZQ-sg,DuPont Environmental Education Center,Wilmington,19801,39.723961,-75.561764,4.5,5,1,DE,DE - Wilmington,Wilmington - 19801,DE - Wilmington - 19801
150175,3u5qUgOrgczTdCUhb0kKow,Verona Pizza,Upper Darby,19082,39.967261,-75.289625,3.0,66,1,PA,PA - Upper Darby,Upper Darby - 19082,PA - Upper Darby - 19082


## Creación de dataset RESTAURANTES

In [10]:
# Selección de campos
dfybct = business[['business_id', 'categories']]

# Eliminación de columnas duplicadas
dfybct = dfybct.loc[:,~dfybct.columns.duplicated()]

# Adecuación de campos
dfybct = dfybct.explode('categories')
dfybct = dfybct.assign(categories=dfybct['categories'].str.split(', ')).explode('categories')
dfybct.reset_index(drop=True, inplace=True)

# Eliminación de nulos
dfybct = dfybct.dropna(subset=['categories'])

dfybct.info()
dfybct.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 668592 entries, 0 to 668694
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  668592 non-null  object
 1   categories   668592 non-null  object
dtypes: object(2)
memory usage: 15.3+ MB


Unnamed: 0,business_id,categories
202336,b_jaIVcqgHf2BElsGy_hHw,Print Media
519556,rcIFKLwbmTdniOHIPfWIVQ,Art Galleries


In [11]:
# CATEGORIAS DISPONIBLES
#categories_yelp = pd.DataFrame(dfybct['categories'].unique())
#categories_yelp.to_csv('categories_yelp.csv', index=False)

In [12]:
# CATEGORIAS TIPO "FOOD" (LOCALES QUE TIENEN POR LO MENOS UNA CATEGORIA GASTRONOMICA)
categories_food = pd.read_csv('dataframe_categories_food.csv')
dfyfct = pd.merge(dfybct, categories_food, on='categories', how='inner')

dfyfct.info()
dfyfct.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149643 entries, 0 to 149642
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  149643 non-null  object
 1   categories   149643 non-null  object
 2   food         149643 non-null  object
dtypes: object(3)
memory usage: 3.4+ MB


Unnamed: 0,business_id,categories,food
37933,59j7DxoQ304a3NV-u2rieg,Fast Food,yes
30233,KRKqh3DK7sYanI7O8_3YLA,Burgers,yes


In [13]:
dfy_site_categories = dfyfct.copy()
dfy_site_categories.rename(columns={"business_id":"site_id"}, inplace=True)
dfy_site_categories.to_parquet("dfy_site_categories.parquet")

In [14]:
dfyfct['categories'].nunique()

131

In [15]:
# LOCALES QUE SON RESTAURANTE
dfyrst = pd.merge(dfybsn, dfyfct, on='business_id', how='left')
# Eliminación de duplicados
#dfyrst = dfyrst.drop_duplicates(subset='business_id', keep='first')

dfyrst.info()
dfyrst.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212998 entries, 0 to 212997
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   business_id            212998 non-null  object 
 1   name                   212998 non-null  object 
 2   city                   212998 non-null  object 
 3   postal_code            212998 non-null  object 
 4   latitude               212998 non-null  float64
 5   longitude              212998 non-null  float64
 6   stars                  212998 non-null  float64
 7   review_count           212998 non-null  int64  
 8   is_open                212998 non-null  int64  
 9   state                  212998 non-null  object 
 10  state_city             212998 non-null  object 
 11  city_postalcode        212998 non-null  object 
 12  state_city_postalcode  212998 non-null  object 
 13  categories             133777 non-null  object 
 14  food                   133777 non-nu

Unnamed: 0,business_id,name,city,postal_code,latitude,longitude,stars,review_count,is_open,state,state_city,city_postalcode,state_city_postalcode,categories,food
67100,Am6LrTyW0ijOfeGzFc7hTQ,LA Fitness,Seminole,33772,27.840785,-82.788396,2.5,17,0,FL,FL - Seminole,Seminole - 33772,FL - Seminole - 33772,,
152936,VzwXTzOxwEidp16HpQwtHg,Ent Allergy of Delaware,Wilmington,19803,39.800873,-75.523939,1.5,7,1,DE,DE - Wilmington,Wilmington - 19803,DE - Wilmington - 19803,,


In [16]:
# Agregado de campo "precio", basado en el dataset de google
dfg_rest_prices_by_zip = pd.read_parquet("dfg_rest_prices_by_zip.parquet")
dfg_rest_prices_by_zip.sample(5)
dfyrst = pd.merge(dfyrst, dfg_rest_prices_by_zip, on=['postal_code'], how='left')

dfyrst.info()
dfyrst.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212998 entries, 0 to 212997
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   business_id            212998 non-null  object 
 1   name                   212998 non-null  object 
 2   city                   212998 non-null  object 
 3   postal_code            212998 non-null  object 
 4   latitude               212998 non-null  float64
 5   longitude              212998 non-null  float64
 6   stars                  212998 non-null  float64
 7   review_count           212998 non-null  int64  
 8   is_open                212998 non-null  int64  
 9   state                  212998 non-null  object 
 10  state_city             212998 non-null  object 
 11  city_postalcode        212998 non-null  object 
 12  state_city_postalcode  212998 non-null  object 
 13  categories             133777 non-null  object 
 14  food                   133777 non-nu

Unnamed: 0,business_id,name,city,postal_code,latitude,longitude,stars,review_count,is_open,state,state_city,city_postalcode,state_city_postalcode,categories,food,price
300,LVYAXWQB3t7tdwWteyjfhw,Option 1 Barber Shop,Tampa,33615,27.9987,-82.582253,4.0,16,0,FL,FL - Tampa,Tampa - 33615,FL - Tampa - 33615,,,1.0
118591,0lzdZFAyyiYVdQDqjH8FNQ,Exton Happy Nails Spa,Exton,19341,40.023909,-75.628333,3.5,117,1,PA,PA - Exton,Exton - 19341,PA - Exton - 19341,,,1.384615


In [17]:
dfyrst.to_parquet('dfy_rest.parquet')

### Creación de dataset de coordenadas

In [None]:
dfyrst_coord = dfyrst[['business_id', 'latitude', 'longitude', 'name', 'state', 'city', 'postal_code']]

dfyrst_coord['source'] = 'yelp'
dfyrst_coord.sample(2)

print(dfyrst_coord.info())
dfyrst_coord.sample(2)

In [None]:
import pandas as pd
import math

# Función para convertir latitud y longitud a coordenadas cartesianas
def lat_lon_to_cartesian(lat, lon):
    R = 6371  # Radio de la Tierra en kilómetros
    x = R * math.cos(math.radians(lat)) * math.cos(math.radians(lon))
    y = R * math.cos(math.radians(lat)) * math.sin(math.radians(lon))
    return x, y

# Suponiendo que tienes un DataFrame llamado df con las columnas business_id, latitud y longitud
# Agrega campos de coordenadas cartesianas x e y al DataFrame
dfyrst_coord['x'], dfyrst_coord['y'] = zip(*dfyrst_coord.apply(lambda row: lat_lon_to_cartesian(row['latitude'], row['longitude']), axis=1))

print(dfyrst_coord.info())
dfyrst_coord.sample(2)


In [None]:
# Exportar 
#dfyrst_coord.to_parquet('dfyrst_coord.parquet', index=False)
#dfyrst_coord.to_excel('dfyrst_coord.xlsx', index=False)

## Creación de dataset BUSINESS ATTRIBUTES

In [13]:
# Selección de campos
dfybat = business[['business_id','attributes']]

# Eliminación de columnas duplicadas
dfybat = dfybat.loc[:,~dfybat.columns.duplicated()]

dfybat.info()
dfybat.sample(5)


<class 'pandas.core.frame.DataFrame'>
Index: 150346 entries, 0 to 150345
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  150346 non-null  object
 1   attributes   136602 non-null  object
dtypes: object(2)
memory usage: 3.4+ MB


Unnamed: 0,business_id,attributes
140462,GhQnFehOiw4XMNEW5AWt2Q,"{'NoiseLevel': ''average'', 'OutdoorSeating': ..."
66615,pUZtLtnoNmXsfdji4ms0Pg,"{'RestaurantsDelivery': 'True', 'RestaurantsGo..."
44781,9F9RxnKQ_oi0nr1ImLAZMQ,"{'BusinessAcceptsCreditCards': 'True', 'Restau..."
135033,xFvkI8Uf_7NDZbChS6wSYQ,"{'RestaurantsPriceRange2': '1', 'BikeParking':..."
17,M0XSSHqrASOnhgbWDJIpQA,"{'BusinessParking': '{'garage': False, 'street..."


In [14]:
# Adecuación de campos
dfybat = dfybat.explode('attributes')

# Eliminación de duplicados
dfybat.drop_duplicates()

# Eliminación de nulos
dfybat = dfybat.dropna(subset=['attributes'])


In [15]:
dfybat.info()
dfybat.sample(5)

<class 'pandas.core.frame.DataFrame'>
Index: 1206820 entries, 0 to 150345
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   business_id  1206820 non-null  object
 1   attributes   1206820 non-null  object
dtypes: object(2)
memory usage: 27.6+ MB


Unnamed: 0,business_id,attributes
19359,eIOfChlA3XgQKL4nw1EBjQ,BYOB
122030,89DwcchNdIblNjRIXZrPlg,RestaurantsTakeOut
142586,VCyFKtBsba_FYwq358blBA,OutdoorSeating
70706,Gisap06SS9o51ukfnUUVWg,RestaurantsAttire
131680,n3JTWpcILPdkjr1yU2K6MA,WheelchairAccessible


In [None]:
#dfybat[['business_id', 'attributes']].to_parquet('dfy_attributes.parquet')
#dfybat[['business_id', 'attributes']].to_csv('dfy_attributes.csv')

## Precios

In [None]:
# Filtrar los valores del campo "attributes" relacionados con la palabra "precio"
df_prices = dfybat[dfybat['attributes'].str.contains('price', case=False)]
df_prices.rename(columns={'attributes': 'price'}, inplace=True)

print(df_prices['price'].nunique())
# Mostrar el DataFrame filtrado
#df_prices.sample(2)


## Dataset CHECKIN

In [None]:
print(checkin.shape)
checkin.head(2)

In [None]:
'''# Filtrado por restaurantes
dfychk = checkin[checkin['business_id'].isin(dfyrst['business_id'])]

print(dfychk.info())
dfychk.head(2) 
'''

In [None]:
# Filtrado por restarurants y agregado de características
dfychk = pd.merge(checkin, dfyrst, on='business_id', how='inner')

dfychk.info()
dfychk.head(2)

In [None]:
dfychk['categories'].nunique()

In [None]:
dfychk = dfychk.assign(date=dfychk['date'].str.split(', ')).explode('date')
dfychk.reset_index(drop=True, inplace=True)

dfychk.info()
dfychk.sample(2)

In [None]:
# Adecuación de campos
#dfychk['date'] = dfychk['date'].astype(str)

In [None]:
dfychk['date'] = pd.to_datetime(dfychk['date'], errors='coerce')
dfychk['year'] = dfychk['date'].dt.year
dfychk['month'] = dfychk['date'].dt.month

# Filtrado cronológico (los datos están entre dic2009 y ene2022, por lo tanto se quitan los años 2009 y 2022)
dfychk = dfychk[(dfychk['year'] >= 2010) & (dfychk['year'] <= 2021)]

dfychk.info()
dfychk.sample(2)

In [None]:
dfychk.to_parquet('dfy_checkins.parquet')

## Dataset USER

In [None]:
user.info()
user.sample(2)

In [None]:
# Selección de campos
dfyusr = user[['user_id', 'review_count', 'fans', 'yelping_since', 'useful', 'funny', 'cool', 'average_stars']]

dfyusr['yelping_since'] = pd.to_datetime(dfyusr['yelping_since'], errors='coerce')
dfyusr.reset_index(drop=True, inplace=True)

dfyusr['year'] = dfyusr['yelping_since'].dt.year
dfyusr['month'] = dfyusr['yelping_since'].dt.month
dfyusr['year_month'] = dfyusr['year'].astype(str).str.slice(-2) + dfyusr['month'].astype(str).str.zfill(2)

dfyusr.info()
dfyusr.sample(5)

In [None]:
dfyusr.to_parquet("dfy_user.parquet")