# Importación de librerías
====================================================================================================================================

In [1]:
import pandas as pd
import numpy as np
import math
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from math import factorial
from scipy import stats as st
import json
import gzip
import pickle

# Carga de datos
====================================================================================================================================

In [2]:
# Yelp
business = pd.read_pickle('dataset_y_business.pkl')
checkin = pd.read_json('dataset_y_checkin.json', lines=True)
user = pd.read_parquet('dataset_y_user.parquet')

In [3]:
# Base de datos externa
df_uszip = pd.read_excel('dataset_e_uszips.xlsx')

# Preprocesamiento
====================================================================================================================================

## Dataset uszip (externo)

In [4]:
print(df_uszip.info())
df_uszip.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33787 entries, 0 to 33786
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   postal_code       33787 non-null  int64  
 1   city              33787 non-null  object 
 2   state             33787 non-null  object 
 3   state_name        33787 non-null  object 
 4   population        33770 non-null  float64
 5   density           33770 non-null  float64
 6   county_fips       33787 non-null  int64  
 7   county_name       33787 non-null  object 
 8   county_weights    33787 non-null  object 
 9   county_names_all  33787 non-null  object 
 10  county_fips_all   33787 non-null  object 
 11  timezone          33787 non-null  object 
dtypes: float64(2), int64(2), object(8)
memory usage: 3.1+ MB
None


Unnamed: 0,postal_code,city,state,state_name,population,density,county_fips,county_name,county_weights,county_names_all,county_fips_all,timezone
22969,65233,Boonville,MO,Missouri,10903.0,29.7,29053,Cooper,"{""29053"": 100}",Cooper,29053,America/Chicago
9122,28521,Chinquapin,NC,North Carolina,1632.0,15.2,37061,Duplin,"{""37061"": 96.39, ""37133"": 3.61}",Duplin|Onslow,37061|37133,America/New_York


In [5]:
df_uszip = df_uszip[['postal_code', 'state']]
df_uszip['postal_code'] = df_uszip['postal_code'].astype(str)

df_uszip.info()
df_uszip.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33787 entries, 0 to 33786
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   postal_code  33787 non-null  object
 1   state        33787 non-null  object
dtypes: object(2)
memory usage: 528.1+ KB


Unnamed: 0,postal_code,state
27075,76664,TX
31979,95428,CA


## Dataset BUSINESS

In [6]:
print(business.info())
business.sample(5)

<class 'pandas.core.frame.DataFrame'>
Index: 150346 entries, 0 to 150345
Data columns (total 28 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   business_id   150346 non-null  object
 1   name          150346 non-null  object
 2   address       150346 non-null  object
 3   city          150346 non-null  object
 4   state         150343 non-null  object
 5   postal_code   150346 non-null  object
 6   latitude      150346 non-null  object
 7   longitude     150346 non-null  object
 8   stars         150346 non-null  object
 9   review_count  150346 non-null  object
 10  is_open       150346 non-null  object
 11  attributes    136602 non-null  object
 12  categories    150243 non-null  object
 13  hours         127123 non-null  object
 14  business_id   5 non-null       object
 15  name          5 non-null       object
 16  address       5 non-null       object
 17  city          5 non-null       object
 18  state         5 non-null     

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,...,state.1,postal_code.1,latitude.1,longitude.1,stars.1,review_count.1,is_open,attributes,categories,hours
41086,WQzQvusL15qZMxKlQq4CDA,Point Breakfast Restaurant,1438 Point Breeze Ave,Philadelphia,PA,19146,39.933739,-75.180541,4.0,9,...,,,,,,,,,,
24060,ABr07PGwhVlsnXNDUomNKQ,Lululemon Athletica,1527 Walnut St,Philadelphia,FL,19102,39.949938,-75.167153,4.0,42,...,,,,,,,,,,
2091,VKFWX_Cd7cTiV3_RPdcXPw,Bamboo Oriental Cuisine,331 Waldron Rd,La Vergne,FL,37086,36.011712,-86.593429,3.5,5,...,,,,,,,,,,
121845,9f7rbp9tNpinBGZV4IEqZA,H & H Heating and Air Conditioning,3 Industrial Hwy,Essington,PA,19029,39.86699,-75.30012,5.0,16,...,,,,,,,,,,
51215,GvgLd-yCE50BTyaW5yKwwQ,Holiday Inn Express & Suites Largo-Clearwater,210 Seminole Blvd,Largo,AZ,33770,27.913642,-82.787277,3.0,18,...,,,,,,,,,,


In [7]:
business_copy = business.copy()
business_copy = business_copy.loc[:,~business_copy.columns.duplicated()]
business_copy.head()

Unnamed: 0,business_id,name,address,city,state,postal_code,latitude,longitude,stars,review_count,is_open,attributes,categories,hours
0,Pns2l4eNsfO8kk83dixA6A,"Abby Rappoport, LAC, CMQ","1616 Chapala St, Ste 2",Santa Barbara,,93101,34.426679,-119.711197,5.0,7,0,{'ByAppointmentOnly': 'True'},"Doctors, Traditional Chinese Medicine, Naturop...",
1,mpf3x-BjTdTEA3yCZrAYPw,The UPS Store,87 Grasso Plaza Shopping Center,Affton,,63123,38.551126,-90.335695,3.0,15,1,{'BusinessAcceptsCreditCards': 'True'},"Shipping Centers, Local Services, Notaries, Ma...","{'Monday': '0:0-0:0', 'Tuesday': '8:0-18:30', ..."
2,tUFrWirKiKi_TAnsVWINQQ,Target,5255 E Broadway Blvd,Tucson,,85711,32.223236,-110.880452,3.5,22,0,"{'BikeParking': 'True', 'BusinessAcceptsCredit...","Department Stores, Shopping, Fashion, Home & G...","{'Monday': '8:0-22:0', 'Tuesday': '8:0-22:0', ..."
3,MTSW4McQd7CbVtyjqoe9mw,St Honore Pastries,935 Race St,Philadelphia,CA,19107,39.955505,-75.155564,4.0,80,1,"{'RestaurantsDelivery': 'False', 'OutdoorSeati...","Restaurants, Food, Bubble Tea, Coffee & Tea, B...","{'Monday': '7:0-20:0', 'Tuesday': '7:0-20:0', ..."
4,mWMc6_wTdE0EUBKIGXDVfA,Perkiomen Valley Brewery,101 Walnut St,Green Lane,MO,18054,40.338183,-75.471659,4.5,13,1,"{'BusinessAcceptsCreditCards': 'True', 'Wheelc...","Brewpubs, Breweries, Food","{'Wednesday': '14:0-22:0', 'Thursday': '16:0-2..."


In [8]:
# Selección de campos
dfybsn = business[['business_id', 'name', 'state', 'city', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open']]

# Eliminación de columnas duplicadas
dfybsn = dfybsn.loc[:,~dfybsn.columns.duplicated()]

# Adcuación de tipos de datos
dfybsn['latitude'] = pd.to_numeric(dfybsn['latitude'], errors='coerce')
dfybsn['longitude'] = pd.to_numeric(dfybsn['longitude'], errors='coerce')
dfybsn['stars'] = pd.to_numeric(dfybsn['stars'], errors='coerce')
dfybsn['review_count'] = pd.to_numeric(dfybsn['review_count'], errors='coerce')
dfybsn['is_open'] = pd.to_numeric(dfybsn['is_open'], errors='coerce')

# Eliminación de duplicados
dfybsn.drop_duplicates()

'''Reemplazo de valores del campo "state": los valores del dataset original no se corresponden con los códigos postales, 
por lo cual se utilizará una base de datos externa para corregir esta serie.'''

dfybsn.drop(columns=['state'], inplace=True)
dfybsn = pd.merge(dfybsn, df_uszip, on='postal_code', how='left')

# Campos auxiliares
dfybsn.dropna(subset=['state'], inplace=True)
dfybsn['state_city'] = dfybsn['state'].str.cat(dfybsn['city'], sep=' - ')
dfybsn['city_postalcode'] = dfybsn['city'].str.cat(dfybsn['postal_code'], sep=' - ')
dfybsn['state_city_postalcode'] = dfybsn['state'].str.cat(dfybsn['city'], sep=' - ').str.cat(dfybsn['postal_code'], sep=' - ')

dfybsn.info()
dfybsn.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 135158 entries, 0 to 150345
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   business_id            135158 non-null  object 
 1   name                   135158 non-null  object 
 2   city                   135158 non-null  object 
 3   postal_code            135158 non-null  object 
 4   latitude               135158 non-null  float64
 5   longitude              135158 non-null  float64
 6   stars                  135158 non-null  float64
 7   review_count           135158 non-null  int64  
 8   is_open                135158 non-null  int64  
 9   state                  135158 non-null  object 
 10  state_city             135158 non-null  object 
 11  city_postalcode        135158 non-null  object 
 12  state_city_postalcode  135158 non-null  object 
dtypes: float64(3), int64(2), object(8)
memory usage: 14.4+ MB


Unnamed: 0,business_id,name,city,postal_code,latitude,longitude,stars,review_count,is_open,state,state_city,city_postalcode,state_city_postalcode
39338,mlzUbkIgIvDOzHtA4hhhgA,Nails First,Tucson,85714,32.161584,-110.941276,3.5,22,0,AZ,AZ - Tucson,Tucson - 85714,AZ - Tucson - 85714
93708,CGWi0CL99ilYaUFAtND9IQ,The Eagle Cafe,St. Petersburg,33701,27.774379,-82.632677,2.5,14,0,FL,FL - St. Petersburg,St. Petersburg - 33701,FL - St. Petersburg - 33701


## Creación de dataset RESTAURANTES

In [9]:
# Selección de campos
dfybct = business[['business_id', 'categories']]

# Eliminación de columnas duplicadas
dfybct = dfybct.loc[:,~dfybct.columns.duplicated()]

# Adecuación de campos
dfybct = dfybct.explode('categories')
dfybct = dfybct.assign(categories=dfybct['categories'].str.split(', ')).explode('categories')
dfybct.reset_index(drop=True, inplace=True)

# Eliminación de nulos
dfybct = dfybct.dropna(subset=['categories'])

dfybct.info()
dfybct.sample(2)

<class 'pandas.core.frame.DataFrame'>
Index: 668592 entries, 0 to 668694
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  668592 non-null  object
 1   categories   668592 non-null  object
dtypes: object(2)
memory usage: 15.3+ MB


Unnamed: 0,business_id,categories
542304,8REjg7DzvyCw5QDNEi6_5A,Food
70296,5RtPmGW6efUuWIFEapaApA,Sporting Goods


In [10]:
# CATEGORIAS DISPONIBLES
#categories_yelp = pd.DataFrame(dfybct['categories'].unique())
#categories_yelp.to_csv('categories_yelp.csv', index=False)

In [11]:
# CATEGORIAS TIPO "FOOD" (LOCALES QUE TIENEN POR LO MENOS UNA CATEGORIA GASTRONOMICA)
categories_food = pd.read_csv('dataframe_categories_food.csv')
dfyfct = pd.merge(dfybct, categories_food, on='categories', how='inner')

dfyfct.info()
dfyfct.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149643 entries, 0 to 149642
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  149643 non-null  object
 1   categories   149643 non-null  object
 2   food         149643 non-null  object
dtypes: object(3)
memory usage: 3.4+ MB


Unnamed: 0,business_id,categories,food
20631,Nz97LDAke7SKDTTAdtbA_w,Food,yes
40762,9L7yi0YRdzbK1KCjBsalNw,Sandwiches,yes


In [12]:
dfy_site_categories = dfyfct.copy()
dfy_site_categories.rename(columns={"business_id":"site_id"}, inplace=True)
dfy_site_categories.to_parquet("dfy_site_categories.parquet")

In [13]:
dfyfct['categories'].nunique()

131

In [14]:
# LOCALES QUE SON RESTAURANTE
dfyrst = pd.merge(dfybsn, dfyfct, on='business_id', how='left')
# Eliminación de duplicados
#dfyrst = dfyrst.drop_duplicates(subset='business_id', keep='first')

dfyrst.info()
dfyrst.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212998 entries, 0 to 212997
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   business_id            212998 non-null  object 
 1   name                   212998 non-null  object 
 2   city                   212998 non-null  object 
 3   postal_code            212998 non-null  object 
 4   latitude               212998 non-null  float64
 5   longitude              212998 non-null  float64
 6   stars                  212998 non-null  float64
 7   review_count           212998 non-null  int64  
 8   is_open                212998 non-null  int64  
 9   state                  212998 non-null  object 
 10  state_city             212998 non-null  object 
 11  city_postalcode        212998 non-null  object 
 12  state_city_postalcode  212998 non-null  object 
 13  categories             133777 non-null  object 
 14  food                   133777 non-nu

Unnamed: 0,business_id,name,city,postal_code,latitude,longitude,stars,review_count,is_open,state,state_city,city_postalcode,state_city_postalcode,categories,food
59378,24LgPIiGIUH-03AqXAOhGw,Cheys Kitchen,Philadelphia,19103,39.950405,-75.169036,4.5,10,0,PA,PA - Philadelphia,Philadelphia - 19103,PA - Philadelphia - 19103,Sandwiches,yes
106792,y98y26WYSZqYCLXH-uQRLA,Subway,Indianapolis,46202,39.779847,-86.172692,2.0,20,1,IN,IN - Indianapolis,Indianapolis - 46202,IN - Indianapolis - 46202,Fast Food,yes


In [15]:
# Agregado de campo "precio", basado en el dataset de google
dfg_rest_prices_by_zip = pd.read_parquet("dfg_rest_prices_by_zip.parquet")
dfg_rest_prices_by_zip.sample(5)
dfyrst = pd.merge(dfyrst, dfg_rest_prices_by_zip, on=['postal_code'], how='left')

dfyrst.info()
dfyrst.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212998 entries, 0 to 212997
Data columns (total 16 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   business_id            212998 non-null  object 
 1   name                   212998 non-null  object 
 2   city                   212998 non-null  object 
 3   postal_code            212998 non-null  object 
 4   latitude               212998 non-null  float64
 5   longitude              212998 non-null  float64
 6   stars                  212998 non-null  float64
 7   review_count           212998 non-null  int64  
 8   is_open                212998 non-null  int64  
 9   state                  212998 non-null  object 
 10  state_city             212998 non-null  object 
 11  city_postalcode        212998 non-null  object 
 12  state_city_postalcode  212998 non-null  object 
 13  categories             133777 non-null  object 
 14  food                   133777 non-nu

Unnamed: 0,business_id,name,city,postal_code,latitude,longitude,stars,review_count,is_open,state,state_city,city_postalcode,state_city_postalcode,categories,food,price
58656,51BQzF_4dOOuIO9s-4Ropw,Golosa,Philadelphia,19147,39.938556,-75.153178,4.0,65,0,PA,PA - Philadelphia,Philadelphia - 19147,PA - Philadelphia - 19147,Food,yes,1.581818
119317,3rS9fNuUmpMdCP3AFiQKQw,Beyond Curry,Franklin,37067,35.954133,-86.819769,4.5,24,1,TN,TN - Franklin,Franklin - 37067,TN - Franklin - 37067,Desserts,yes,1.428571


In [16]:
dfyrst.to_parquet('dfy_rest.parquet')

### Creación de dataset de coordenadas

In [17]:
dfyrst_coord = dfyrst[['business_id', 'latitude', 'longitude', 'name', 'state', 'city', 'postal_code']]

dfyrst_coord['source'] = 'yelp'
dfyrst_coord.sample(2)

print(dfyrst_coord.info())
dfyrst_coord.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212998 entries, 0 to 212997
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   business_id  212998 non-null  object 
 1   latitude     212998 non-null  float64
 2   longitude    212998 non-null  float64
 3   name         212998 non-null  object 
 4   state        212998 non-null  object 
 5   city         212998 non-null  object 
 6   postal_code  212998 non-null  object 
 7   source       212998 non-null  object 
dtypes: float64(2), object(6)
memory usage: 13.0+ MB
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfyrst_coord['source'] = 'yelp'


Unnamed: 0,business_id,latitude,longitude,name,state,city,postal_code,source
85095,zMy3Pr6j2ZndQyD0Q0ToIw,39.953328,-75.171776,Gia Pronto,PA,Philadelphia,19103,yelp
58906,Mgaxy0pJAlhAjdNivhu82A,38.504194,-90.378747,Pad Thai,MO,Saint Louis,63128,yelp


In [18]:
import pandas as pd
import math

# Función para convertir latitud y longitud a coordenadas cartesianas
def lat_lon_to_cartesian(lat, lon):
    R = 6371  # Radio de la Tierra en kilómetros
    x = R * math.cos(math.radians(lat)) * math.cos(math.radians(lon))
    y = R * math.cos(math.radians(lat)) * math.sin(math.radians(lon))
    return x, y

# Suponiendo que tienes un DataFrame llamado df con las columnas business_id, latitud y longitud
# Agrega campos de coordenadas cartesianas x e y al DataFrame
dfyrst_coord['x'], dfyrst_coord['y'] = zip(*dfyrst_coord.apply(lambda row: lat_lon_to_cartesian(row['latitude'], row['longitude']), axis=1))

print(dfyrst_coord.info())
dfyrst_coord.sample(2)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212998 entries, 0 to 212997
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   business_id  212998 non-null  object 
 1   latitude     212998 non-null  float64
 2   longitude    212998 non-null  float64
 3   name         212998 non-null  object 
 4   state        212998 non-null  object 
 5   city         212998 non-null  object 
 6   postal_code  212998 non-null  object 
 7   source       212998 non-null  object 
 8   x            212998 non-null  float64
 9   y            212998 non-null  float64
dtypes: float64(4), object(6)
memory usage: 16.3+ MB
None


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfyrst_coord['x'], dfyrst_coord['y'] = zip(*dfyrst_coord.apply(lambda row: lat_lon_to_cartesian(row['latitude'], row['longitude']), axis=1))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfyrst_coord['x'], dfyrst_coord['y'] = zip(*dfyrst_coord.apply(lambda row: lat_lon_to_cartesian(row['latitude'], row['longitude']), axis=1))


Unnamed: 0,business_id,latitude,longitude,name,state,city,postal_code,source,x,y
210772,7rtbIYXGSFdZeKbIaif1vQ,38.622518,-90.335393,Camille's Sidewalk Cafe,MO,Saint Louis,63144,yelp,-29.136755,-4977.419128
105239,GWqPmrWu0kXB_-gB1H-j6A,39.967481,-75.136957,Love & Honey Fried Chicken,PA,Philadelphia,19123,yelp,1252.482235,-4719.42284


In [19]:
# Exportar 
#dfyrst_coord.to_parquet('dfyrst_coord.parquet', index=False)
#dfyrst_coord.to_excel('dfyrst_coord.xlsx', index=False)

## Creación de dataset BUSINESS ATTRIBUTES

In [20]:
# Selección de campos
dfybat = business[['business_id','attributes']]

# Eliminación de columnas duplicadas
dfybat = dfybat.loc[:,~dfybat.columns.duplicated()]

dfybat.info()
dfybat.sample(5)


<class 'pandas.core.frame.DataFrame'>
Index: 150346 entries, 0 to 150345
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   business_id  150346 non-null  object
 1   attributes   136602 non-null  object
dtypes: object(2)
memory usage: 3.4+ MB


Unnamed: 0,business_id,attributes
9889,tqgPVExgDIx9ZxXfIZWQUg,"{'BusinessAcceptsCreditCards': 'True', 'Outdoo..."
108307,565O3w2uSH4atWntebE-Nw,"{'ByAppointmentOnly': 'True', 'RestaurantsPric..."
128469,k1shNGktYpVZD8zYWsm6Rg,"{'ByAppointmentOnly': 'False', 'RestaurantsPri..."
69992,gnau2dVvNfrO-dWazIoOOg,"{'Alcohol': 'u'none'', 'NoiseLevel': 'u'averag..."
99384,SGiSLM8r8rT3oWa6qCwgbw,"{'RestaurantsPriceRange2': '1', 'BusinessAccep..."


In [21]:
# Adecuación de campos
dfybat = dfybat.explode('attributes')

# Eliminación de duplicados
dfybat.drop_duplicates()

# Eliminación de nulos
dfybat = dfybat.dropna(subset=['attributes'])


In [22]:
dfybat.info()
dfybat.sample(5)

<class 'pandas.core.frame.DataFrame'>
Index: 1206820 entries, 0 to 150345
Data columns (total 2 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   business_id  1206820 non-null  object
 1   attributes   1206820 non-null  object
dtypes: object(2)
memory usage: 27.6+ MB


Unnamed: 0,business_id,attributes
71113,MQIS2I-QNTMHY3JHPs36oA,BusinessParking
94618,SZ7-z5W7u30Z2-N0rGPN9A,RestaurantsTakeOut
35478,MaEdUVjCbAHKAF7jZ1t7HA,WiFi
97449,cgGB_EQZt_AZZH74TI4XnA,BusinessParking
41584,U-RlkCheEvJxDn9BCyiDWg,RestaurantsDelivery


In [23]:
#dfybat[['business_id', 'attributes']].to_parquet('dfy_attributes.parquet')
#dfybat[['business_id', 'attributes']].to_csv('dfy_attributes.csv')

## Precios

In [24]:
'''# Filtrar los valores del campo "attributes" relacionados con la palabra "precio"
df_prices = dfybat[dfybat['attributes'].str.contains('price', case=False)]
df_prices.rename(columns={'attributes': 'price'}, inplace=True)

print(df_prices['price'].nunique())
# Mostrar el DataFrame filtrado
df_prices.sample(2)'''
# Se ha encontrado que este campo no contiene información de precios ni de rangos de precios

'# Filtrar los valores del campo "attributes" relacionados con la palabra "precio"\ndf_prices = dfybat[dfybat[\'attributes\'].str.contains(\'price\', case=False)]\ndf_prices.rename(columns={\'attributes\': \'price\'}, inplace=True)\n\nprint(df_prices[\'price\'].nunique())\n# Mostrar el DataFrame filtrado\ndf_prices.sample(2)'

## Dataset CHECKIN

In [25]:
print(checkin.shape)
checkin.head(2)

(131930, 2)


Unnamed: 0,business_id,date
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020..."
1,--0iUa4sNDFiZFrAdIWhZQ,"2010-09-13 21:43:09, 2011-05-04 23:08:15, 2011..."


In [26]:
# Filtrado por restarurants y agregado de características
dfychk = pd.merge(checkin, dfyrst, on='business_id', how='inner')

dfychk.info()
dfychk.head(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 195264 entries, 0 to 195263
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   business_id            195264 non-null  object 
 1   date                   195264 non-null  object 
 2   name                   195264 non-null  object 
 3   city                   195264 non-null  object 
 4   postal_code            195264 non-null  object 
 5   latitude               195264 non-null  float64
 6   longitude              195264 non-null  float64
 7   stars                  195264 non-null  float64
 8   review_count           195264 non-null  int64  
 9   is_open                195264 non-null  int64  
 10  state                  195264 non-null  object 
 11  state_city             195264 non-null  object 
 12  city_postalcode        195264 non-null  object 
 13  state_city_postalcode  195264 non-null  object 
 14  categories             131641 non-nu

Unnamed: 0,business_id,date,name,city,postal_code,latitude,longitude,stars,review_count,is_open,state,state_city,city_postalcode,state_city_postalcode,categories,food,price
0,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020...",Frankie's Raw Bar,New Port Richey,34652,28.217288,-82.733344,4.5,24,1,FL,FL - New Port Richey,New Port Richey - 34652,FL - New Port Richey - 34652,Food,yes,1.454545
1,---kPU91CF4Lq2-WlRu9Lw,"2020-03-13 21:10:56, 2020-06-02 22:18:06, 2020...",Frankie's Raw Bar,New Port Richey,34652,28.217288,-82.733344,4.5,24,1,FL,FL - New Port Richey,New Port Richey - 34652,FL - New Port Richey - 34652,Food Trucks,yes,1.454545


In [27]:
dfychk['categories'].nunique()

131

In [None]:
#dfychk = dfychk.assign(date=dfychk['date'].str.split(', ')).explode('date')

In [28]:
dfychk['date'] = pd.to_datetime(dfychk['date'], errors='coerce')
dfychk['year'] = dfychk['date'].dt.year
dfychk['month'] = dfychk['date'].dt.month

# Filtrado cronológico (los datos están entre dic2009 y ene2022, por lo tanto se quitan los años 2009 y 2022)
dfychk = dfychk[(dfychk['year'] >= 2010) & (dfychk['year'] <= 2021)]

dfychk.info()
dfychk.sample(2)

  dfychk['date'] = pd.to_datetime(dfychk['date'], errors='coerce')


<class 'pandas.core.frame.DataFrame'>
Index: 9901 entries, 18 to 195263
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   business_id            9901 non-null   object        
 1   date                   9901 non-null   datetime64[ns]
 2   name                   9901 non-null   object        
 3   city                   9901 non-null   object        
 4   postal_code            9901 non-null   object        
 5   latitude               9901 non-null   float64       
 6   longitude              9901 non-null   float64       
 7   stars                  9901 non-null   float64       
 8   review_count           9901 non-null   int64         
 9   is_open                9901 non-null   int64         
 10  state                  9901 non-null   object        
 11  state_city             9901 non-null   object        
 12  city_postalcode        9901 non-null   object        
 13  state

Unnamed: 0,business_id,date,name,city,postal_code,latitude,longitude,stars,review_count,is_open,state,state_city,city_postalcode,state_city_postalcode,categories,food,price,year,month
28280,87F9S_fU192AI8XfF9Pc3Q,2018-10-27 01:55:48,Driftwood Music,St Charles,63301,38.790205,-90.477858,5.0,8,1,MO,MO - St Charles,St Charles - 63301,MO - St Charles - 63301,,,1.333333,2018.0,10.0
69132,LkIZ501yngBdsFoVd4eFpw,2019-10-07 00:51:17,Performance Pilates Tampa,Tampa,33611,27.894078,-82.520807,5.0,5,1,FL,FL - Tampa,Tampa - 33611,FL - Tampa - 33611,,,1.375,2019.0,10.0


In [29]:
#dfychk.to_parquet('dfy_checkins.parquet')

## Dataset USER

In [30]:
user.info()
user.sample(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2105597 entries, 0 to 2105596
Data columns (total 22 columns):
 #   Column              Dtype  
---  ------              -----  
 0   user_id             object 
 1   name                object 
 2   review_count        int64  
 3   yelping_since       object 
 4   useful              int64  
 5   funny               int64  
 6   cool                int64  
 7   elite               object 
 8   friends             object 
 9   fans                int64  
 10  average_stars       float64
 11  compliment_hot      int64  
 12  compliment_more     int64  
 13  compliment_profile  int64  
 14  compliment_cute     int64  
 15  compliment_list     int64  
 16  compliment_note     int64  
 17  compliment_plain    int64  
 18  compliment_cool     int64  
 19  compliment_funny    int64  
 20  compliment_writer   int64  
 21  compliment_photos   int64  
dtypes: float64(1), int64(16), object(5)
memory usage: 353.4+ MB


Unnamed: 0,user_id,name,review_count,yelping_since,useful,funny,cool,elite,friends,fans,...,compliment_more,compliment_profile,compliment_cute,compliment_list,compliment_note,compliment_plain,compliment_cool,compliment_funny,compliment_writer,compliment_photos
1794485,n2o4eJyfEN_prw6SwxphMw,Matt,53,2006-09-18 20:18:49,55,27,10,,"qzQ8BMZXJe6qe7k6xxzTWQ, dyUwFl7ZvEylbw3doXVL_Q...",2,...,0,0,0,0,4,3,1,1,0,0
1492358,0xhoTcEZS-hgCL7e8lwWxQ,Kristin,1,2016-01-03 15:09:04,0,0,0,,,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Selección de campos
dfyusr = user[['user_id', 'review_count', 'fans', 'yelping_since', 'useful', 'funny', 'cool', 'average_stars']]

dfyusr['yelping_since'] = pd.to_datetime(dfyusr['yelping_since'], errors='coerce')
dfyusr.reset_index(drop=True, inplace=True)

dfyusr['year'] = dfyusr['yelping_since'].dt.year
dfyusr['month'] = dfyusr['yelping_since'].dt.month
dfyusr['year_month'] = dfyusr['year'].astype(str).str.slice(-2) + dfyusr['month'].astype(str).str.zfill(2)

dfyusr.info()
dfyusr.sample(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfyusr['yelping_since'] = pd.to_datetime(dfyusr['yelping_since'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfyusr['year'] = dfyusr['yelping_since'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfyusr['month'] = dfyusr['yelping_since'].dt.month


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2105597 entries, 0 to 2105596
Data columns (total 11 columns):
 #   Column         Dtype         
---  ------         -----         
 0   user_id        object        
 1   review_count   int64         
 2   fans           int64         
 3   yelping_since  datetime64[ns]
 4   useful         int64         
 5   funny          int64         
 6   cool           int64         
 7   average_stars  float64       
 8   year           int32         
 9   month          int32         
 10  year_month     object        
dtypes: datetime64[ns](1), float64(1), int32(2), int64(5), object(2)
memory usage: 160.6+ MB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dfyusr['year_month'] = dfyusr['year'].astype(str).str.slice(-2) + dfyusr['month'].astype(str).str.zfill(2)


Unnamed: 0,user_id,review_count,fans,yelping_since,useful,funny,cool,average_stars,year,month,year_month
1700687,1IqJWql8cCal-9fMcqeTJQ,61,0,2014-06-21 11:53:28,19,6,8,4.03,2014,6,1406
75203,1Krb1ZJ0lPuyIGaZU79w-g,171,5,2013-09-13 21:39:56,208,30,67,3.79,2013,9,1309
1333981,FtnCj7cGVhByUTtfLCKR3g,1,0,2015-02-12 17:54:10,1,0,0,1.0,2015,2,1502
2003731,NL6HCB2k3ZUgO8GV58p6bg,140,15,2007-11-09 04:03:13,450,226,167,3.25,2007,11,711
1831978,gx7e26DgYavxvcqKFGBZIA,2,1,2015-12-13 19:29:04,4,0,0,4.0,2015,12,1512


In [32]:
dfyusr.to_parquet("dfy_user.parquet")

# Conclusiones

* Dataset BUSINESS
    * Se corrigen los códigos postales originales pues estaban erróneos
    * Se eliminan columnas duplicadas
    * Se adecúan los tipos de datos para su posterior procesamiento
    * Se eliminan registros duplicados
* Creación de dataset RESTAURANTES
    * Mediante un archivo que contiene las categorías deseadas de restaurantes y bares, se filtran los locales a considerar
    * Se agrega el campo precio tomado del dataset de google, con el cual se estima el rango de precios del local tomando el promedio del código postal al que pertenece
* Creación del dataset de COORDENADAS
    * Se calculan las coordenadas cartesianas y se exporta un archivo para luego crear un dataset de locales unívocos tomando en conjunto a yelp y google
* Creación del dataset ATRIBUTOS
    * Explotando el campo "attributes" se exporta un dataset con los atributos que se consideran relevantes para el modelo de machine learning
* Dataset CHECKIN
    * Se obtiene este registro de visitas mediante el filtrado de locales tipo restaurants & bares
* Dataset USER
    * Se adecúan los tipos de datos para su posterior procesamiento