# ETL para la base de datos

In [1]:
import pandas as pd
import unicodedata

In [2]:
# Load the pre-processed business tables for both sources.
# NOTE(review): these files are spelled 'business_*', while the "yelp" and
# "google" sections further down read 'bussiness_*' files — confirm which
# spelling points at the intended inputs.
google = pd.read_parquet('../datasets/processed/google/business_google.parquet.gz')
yelp = pd.read_parquet('../datasets/processed/yelp/business_yelp.parquet.gz')

In [5]:
# Preview the Google business table (pandas truncates the display).
google

Unnamed: 0,gmap_id,name,latitude,longitude,stars,categories,state
0,0x88d9ba5d65937567:0xbc27649cf513cc89,Bachata Rosa,25.848173,-80.299773,3.6,[restaurant],Florida
1,0x88d900575f0dd065:0x9b3638d2a80be4d,Zampini's Bottega,26.116549,-80.138561,4.7,[restaurant],Florida
2,0x88d9b719170a9f61:0x30d454a980f76ad3,Choices Cafe,25.779837,-80.239103,4.6,[restaurant],Florida
3,0x88e76652cd84272f:0x548abb9935d912ff,Subway,28.537201,-81.208736,4.0,"[sandwich shop, caterer, fast food restaurant,...",Florida
4,0x88d9b395c6179c89:0x914b0aafb453b3b5,Biscayne Backyard Barbecue,25.877904,-80.168434,3.9,[restaurant],Florida
...,...,...,...,...,...,...,...
24930,0x89c3b3a602b2fe29:0x35912eea537f1d13,Polmart,40.627265,-74.269879,4.4,"[deli, european restaurant]",New Jersey
24931,0x89c3b2b8e9c8c357:0xb1b07826ef85a598,Park View Tavern,40.653743,-74.234148,4.3,"[bar, italian restaurant, pub, restaurant]",New Jersey
24932,0x89c24d60acc3ffff:0xc8048546518281f6,The Brown Derby Cafe & Rental Hall,40.661250,-74.193971,4.6,"[restaurant, family restaurant]",New Jersey
24933,0x89c24d7a198a3541:0x3dcb8f429c053248,George's Lunch,40.654865,-74.177221,4.5,[american restaurant],New Jersey


# state

In [6]:
# Build the state dimension table: one row per distinct state found in the
# Google data, keyed by a 1-based surrogate id.
# The ids are derived from the number of states actually present instead of a
# hard-coded range(1, 5), which raised a length-mismatch error whenever the
# data did not contain exactly four states. Null states are skipped so they
# never become a row of their own.
state_names = google['state'].dropna().unique()
states = pd.DataFrame({
    'state_id': range(1, len(state_names) + 1),
    'state': state_names,
})
states

Unnamed: 0,state_id,state
0,1,Florida
1,2,California
2,3,Illinois
3,4,New Jersey


In [7]:
# Check column dtypes and non-null counts of the state table.
states.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   state_id  4 non-null      int64 
 1   state     4 non-null      object
dtypes: int64(1), object(1)
memory usage: 196.0+ bytes


In [24]:
# Persist the state table as a gzip-compressed parquet file.
states.to_parquet('../datasets/processed/bd/1_states.parquet.gz', compression='gzip')

# categories

Reunimos todas las categorías presentes en cualquiera de los dos datasets (Google y Yelp), sin repetirlas.

In [25]:
# Collect every distinct Google category, keeping first-seen order.
# explode() flattens the per-business category lists, dropna() skips
# businesses without categories, and dict.fromkeys() de-duplicates in linear
# time while preserving order (the previous `not in list` membership test
# inside the nested loop was quadratic).
listaCategorias = list(dict.fromkeys(google['categories'].explode().dropna()))

KeyError: 'categories'

In [None]:
# Append the Yelp categories that are not already in the list.
# Existing entries come first so their positions (and therefore the ids
# derived from them later) are unchanged; dict.fromkeys() de-duplicates in
# linear time while keeping first-seen order, and dropna() skips businesses
# without categories.
listaCategorias = list(
    dict.fromkeys([*listaCategorias, *yelp['categories'].explode().dropna()])
)

In [None]:
# Category dimension table: the positional index of each collected name is
# promoted to the 'categories_id' column.
categories = (
    pd.DataFrame()
    .assign(name=listaCategorias)
    .reset_index(names='categories_id')
)
categories

Unnamed: 0,categories_id,name
0,0,Restaurant
1,1,Buffet restaurant
2,2,Gas station
3,3,ATM
4,4,Convenience store
...,...,...
1653,1653,Botanical Gardens
1654,1654,Newspapers & Magazines
1655,1655,Parking
1656,1656,Georgian


In [None]:
# Normalise the category names (trim surrounding whitespace, then lower-case)
# so spelling variants collapse to one value, and keep only the first row for
# each distinct name.
categories = (
    categories
    .assign(name=categories['name'].str.strip().str.lower())
    .drop_duplicates(subset=['name'], keep='first')
)

In [None]:
# Check column dtypes and non-null counts of the category table.
categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1658 entries, 0 to 1657
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   categories_id  1658 non-null   int64 
 1   name           1658 non-null   object
dtypes: int64(1), object(1)
memory usage: 26.0+ KB


Exportamos

In [None]:
# Persist the category table as a gzip-compressed parquet file.
categories.to_parquet('../datasets/processed/bd/2_categories.parquet.gz', compression='gzip')

# categories_google y categories_yelp

In [None]:
# One row per (business, category) pair for each source: select the business
# key together with its category list, then unnest the list.
google_pairs = google.loc[:, ['gmap_id', 'categories']]
categories_google = google_pairs.explode('categories')

yelp_pairs = yelp.loc[:, ['business_id', 'categories']]
categories_yelp = yelp_pairs.explode('categories')

In [None]:
def corregir_codificacion(texto):
    """Repair text whose UTF-8 bytes were mis-decoded as Latin-1 (mojibake).

    The text is re-encoded as Latin-1 and decoded as UTF-8. Both steps are
    strict: if the text cannot be encoded as Latin-1, or the resulting bytes
    are not valid UTF-8, the text was not Latin-1 mojibake and is returned
    unchanged.

    The previous implementation decoded with ``errors='ignore'`` and fell back
    to an NFKD + Latin-1 round trip, both of which silently deleted characters
    from correctly encoded text (for example ``'Café'`` became ``'Caf'``).

    Parameters
    ----------
    texto : str
        Text to repair.

    Returns
    -------
    str
        The repaired text, or ``texto`` itself when no repair applies.
    """
    try:
        return texto.encode('latin-1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # Not mojibake: leave correctly encoded text untouched.
        return texto

In [None]:
def _link_categories(exploded, catalog):
    """Map each (business, category) row to its category id.

    `exploded` holds one raw category label per row in its 'categories'
    column; `catalog` is the category table, whose 'name' values were stripped
    and lower-cased. The raw labels are normalised the same way before the
    join; otherwise labels that differ only in case or surrounding whitespace
    (e.g. 'Restaurant' vs 'restaurant') find no match and are silently dropped
    by the inner merge.

    Returns the joined frame without the 'categories' and 'name' text columns.
    """
    normalised = exploded.assign(
        categories=exploded['categories'].str.strip().str.lower()
    )
    linked = (
        normalised
        .merge(catalog, left_on='categories', right_on='name')
        .drop(columns=['categories', 'name'])
    )
    # Run the encoding repair over every string cell of the remaining columns.
    for column in linked.columns:
        linked[column] = linked[column].apply(
            lambda x: corregir_codificacion(x) if isinstance(x, str) else x
        )
    return linked


categories_google = _link_categories(categories_google, categories)
categories_google.to_parquet('../datasets/processed/bd/7_categories_google.parquet.gz', compression='gzip')

categories_yelp = _link_categories(categories_yelp, categories)
categories_yelp.to_parquet('../datasets/processed/bd/8_categories_yelp.parquet.gz', compression='gzip')

# reviews_yelp

In [None]:
# Load the pre-processed Yelp reviews and preview them.
reviews_yelp = pd.read_parquet('../datasets/processed/yelp/reviews_yelp.parquet.gz')
reviews_yelp

Unnamed: 0,review_id,user_id,business_id,stars,sentiment,date
0,pUycOfUwM8vqX7KjRRhUEA,59MxRhNVhU9MYndMkz0wtw,gebiRewfieSdtt17PTW6Zg,3.0,2,2016-07-25 07:31:06
1,qRhOkdYNO1URgn1WJfK1cg,59MxRhNVhU9MYndMkz0wtw,W7gSJz80DywKnPRIGjA2Bw,5.0,2,2016-07-25 07:20:23
2,ZZbpYMY4s8sVQGEU1jAuVA,59MxRhNVhU9MYndMkz0wtw,l_slvEnh4v3W8BXF1gYlcQ,5.0,2,2016-07-23 00:13:36
3,_Ub20uO1MKy4XOVPOdzpqw,lUYboGI6aFbZ0dX27pijpA,gebiRewfieSdtt17PTW6Zg,1.0,1,2017-06-28 01:04:59
4,-DjIfoNFAiT5J4kF9hXocQ,SrfDRvGKI8FQq9LCr0dQuQ,gebiRewfieSdtt17PTW6Zg,3.0,1,2017-01-14 23:31:35
...,...,...,...,...,...,...
909981,HVYhSbKGyj2R39fTDY7sGg,7KBaoiKUhdTvOsRFOMGwZw,VnAJnVpXHOIBdg6qFJBc3g,1.0,1,2021-08-19 02:30:16
909982,CzJMAaOokvASWgVZg8B__w,L-h5y32VWEV60QXuDPW-hg,VnAJnVpXHOIBdg6qFJBc3g,1.0,1,2021-09-12 05:20:37
909983,9_ztYeoSwdz7S9TW8xkDQA,03q-tEfa2aJtKhf00ZZ-hg,VnAJnVpXHOIBdg6qFJBc3g,1.0,1,2021-04-25 13:44:37
909984,jDeNxby0ZI5UMsZzrywedA,NIlmQ38hxTR2w6lNSzJEuQ,VnAJnVpXHOIBdg6qFJBc3g,1.0,1,2021-10-06 01:52:41


In [None]:
# Store the review timestamp as a 'YYYY-MM-DD HH:MM:SS' string.
# pd.to_datetime() makes the cell safe to re-run: on the first run the column
# is already datetime (no-op); on later runs it parses the formatted strings
# back instead of failing on the `.dt` accessor. The former
# `.str.replace('\r', '')` was dropped because strftime with this format can
# never emit a carriage return.
reviews_yelp['date'] = pd.to_datetime(reviews_yelp['date']).dt.strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# Persist the Yelp reviews as a gzip-compressed parquet file.
reviews_yelp.to_parquet('../datasets/processed/bd/10_reviews_yelp.parquet.gz', compression='gzip')

# user_yelp

In [None]:
# Load the pre-processed Yelp users and preview them.
user_yelp = pd.read_parquet('../datasets/processed/yelp/user_yelp.parquet.gz')
user_yelp

Unnamed: 0,user_id,name,creation,review_count,fans,friends,stars
0,qVc8ODYU5SZjKXVBgXdI7w,Walker,2007-01-25 16:47:26,1,267,14995,2.00
1,4ZaqBJqt7laPPs8xfWvr6A,Nina,2008-08-16 22:43:21,11,75,492,3.55
2,NIhcRW6DWvk1JQhDhXwgOQ,Lia,2005-12-30 13:47:19,9,345,998,3.56
3,baSDvZweZk6qLY_kHPvYzQ,Michelle,2008-02-18 01:40:04,1,22,208,4.00
4,rppTTi-kfF8-qyiArNemag,Helen,2006-01-24 14:33:32,19,49,387,3.05
...,...,...,...,...,...,...,...
353240,xzt22UaBjy40CHgCjJ3Fgg,Don,2009-05-19 20:42:33,1,0,0,5.00
353241,_K4X0IBlwDSnClxOrOdxVw,Sarah,2019-02-28 14:10:05,1,0,0,5.00
353242,Pt3u_iZV4Lo--4yfYsBhGg,Jordan,2013-07-13 23:58:30,1,0,0,5.00
353243,Q5SlTMOwyHq4PIu7Ev-GVg,John,2018-01-13 21:55:39,1,0,0,5.00


In [None]:
# Store the account creation timestamp as a 'YYYY-MM-DD HH:MM:SS' string.
# pd.to_datetime() makes the cell safe to re-run: it is a no-op on a datetime
# column and parses the already formatted strings on later runs, where the
# bare `.dt` accessor would raise.
user_yelp['creation'] = pd.to_datetime(user_yelp['creation']).dt.strftime('%Y-%m-%d %H:%M:%S')

In [None]:
# Persist the Yelp users as a gzip-compressed parquet file.
user_yelp.to_parquet('../datasets/processed/bd/3_user_yelp.parquet.gz', compression='gzip')

# yelp

In [None]:
# Reload the Yelp business table; this rebinds the `yelp` name used earlier.
# NOTE(review): this reads 'bussiness_yelp...' while the first load cell reads
# 'business_yelp...' — confirm which file is the intended input.
yelp = pd.read_parquet('../datasets/processed/yelp/bussiness_yelp.parquet.gz')
yelp.head()

Unnamed: 0,business_id,name,latitude,longitude,stars,categories,state
0,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,27.955269,-82.45632,4.09,"[Vietnamese, Food, Restaurants, Food Trucks]",Florida
1,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,27.916116,-82.760461,4.45,"[Food, Delis, Italian, Bakeries, Restaurants]",Florida
2,uI9XODGY_2_ieTE6xJ0myw,Roman Forum,28.046203,-82.505053,4.0,"[Restaurants, American (New), Italian]",Florida
3,JgpnXv_0XhV3SfbfB50nxw,Joe's Pizza,27.960514,-82.506127,4.17,"[Restaurants, Pizza]",Florida
4,pJfh3Ct8iL58NZa8ta-a5w,Top Shelf Sports Lounge,28.196252,-82.380615,4.51,"[Burgers, Sports Bars, Bars, Lounges, Restaura...",Florida


In [None]:
# Remove the category list column from the business table.
yelp = yelp.drop(columns='categories')

### states

In [None]:
# Swap the state name for its id: inner-join on 'state', then drop the name.
yelp = yelp.merge(states, on='state').drop(columns='state')
yelp

Unnamed: 0,business_id,name,latitude,longitude,stars,categories,state_id
0,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,27.955269,-82.456320,4.09,"[Vietnamese, Food, Restaurants, Food Trucks]",1
1,0bPLkL0QhhPO5kt1_EXmNQ,Zios Italian Market,27.916116,-82.760461,4.45,"[Food, Delis, Italian, Bakeries, Restaurants]",1
2,uI9XODGY_2_ieTE6xJ0myw,Roman Forum,28.046203,-82.505053,4.00,"[Restaurants, American (New), Italian]",1
3,JgpnXv_0XhV3SfbfB50nxw,Joes Pizza,27.960514,-82.506127,4.17,"[Restaurants, Pizza]",1
4,pJfh3Ct8iL58NZa8ta-a5w,Top Shelf Sports Lounge,28.196252,-82.380615,4.51,"[Burgers, Sports Bars, Bars, Lounges, Restaura...",1
...,...,...,...,...,...,...,...
12912,Df-DdXqssj7ZeD8ypAF7Og,Crux Cafe,39.894469,-75.071001,4.00,"[Cafes, Restaurants]",4
12913,qjtELTt9fdIwoi_xGNN21g,Chipotle Mexican Grill,39.943481,-74.964743,2.43,"[Restaurants, Mexican, Fast Food]",4
12914,BzPTw9daJW8ToqTc7QiHFw,Taco Bell,40.210771,-74.756588,2.67,"[Restaurants, Tex-Mex, Fast Food, Mexican, Tac...",4
12915,UBQAksw81m0sMrAd8g-ECg,El Zarape Restaurant,39.718717,-74.971143,4.00,"[Restaurants, Mexican]",4


### Arreglamos errores

In [None]:
# Clean the Yelp business table in one chain:
#   1. rename 'stars' to 'avg_stars';
#   2. strip every character that is not an ASCII letter or whitespace from
#      the business name;
#   3. keep the first row per business_id.
# The trailing .copy() gives an independent frame, so the column assignments
# below do not trigger SettingWithCopyWarning (the result of drop_duplicates
# was previously assigned into directly).
# NOTE(review): step 2 also removes digits, apostrophes, '&' and accented
# letters from names — confirm this loss is intended.
yelp = (
    yelp
    .rename(columns={'stars': 'avg_stars'})
    .assign(name=lambda df: df['name'].replace(r'[^a-zA-Z\s]', '', regex=True))
    .drop_duplicates(subset=['business_id'], keep='first')
    .copy()
)

# Run the encoding repair over every string cell. corregir_codificacion is the
# helper defined in the categories section of this notebook; the identical
# second definition that used to live in this cell was removed.
for column in yelp.columns:
    yelp[column] = yelp[column].apply(
        lambda x: corregir_codificacion(x) if isinstance(x, str) else x
    )

In [53]:
# Persist the cleaned Yelp business table as a gzip-compressed parquet file.
yelp.to_parquet('../datasets/processed/bd/6_business_yelp.parquet.gz', compression='gzip')

# google

In [54]:
# Reload the Google business table; this rebinds the `google` name used earlier.
# NOTE(review): this reads 'bussiness_google...' while the first load cell reads
# 'business_google...' — confirm which file is the intended input.
google = pd.read_parquet('../datasets/processed/google/bussiness_google.parquet.gz')
google.head()

Unnamed: 0,gmap_id,name,latitude,longitude,stars,categories,state
0,0x88db4147b1d9e6f3:0x943dbd10a92ba1b1,Cape Seafood Shack,26.641377,-81.940545,5.0,[Restaurant],Florida
1,0x88c2e4e34f1ed783:0x76c5da381c499d79,Fresh Point Country Buffet,27.867489,-82.702971,5.0,[Buffet restaurant],Florida
2,0x8890b9241e704667:0x3a1e565c17c00993,Hot Box,30.391411,-87.26722,4.2,[Restaurant],Florida
3,0x88e635378f43352f:0xa1b53c63436fa428,Shell,29.183272,-81.889965,1.8,"[Gas station, ATM, Convenience store, Restaurant]",Florida
4,0x88d9ab98b5baad79:0x1a2e3f0642a55246,Baby Food Grill & Bar,26.01025,-80.148659,3.0,[Mexican restaurant],Florida


In [55]:
# Swap the state name for its id (inner join on 'state'), drop the text
# columns that are no longer needed and rename 'stars' to 'avg_stars'.
google = (
    google
    .merge(states, on='state')
    .drop(columns=['state', 'categories'])
    .rename(columns={'stars': 'avg_stars'})
)
google

Unnamed: 0,gmap_id,name,latitude,longitude,stars,categories,state_id
0,0x88db4147b1d9e6f3:0x943dbd10a92ba1b1,Cape Seafood Shack,26.641377,-81.940545,5.0,[Restaurant],0
1,0x88c2e4e34f1ed783:0x76c5da381c499d79,Fresh Point Country Buffet,27.867489,-82.702971,5.0,[Buffet restaurant],0
2,0x8890b9241e704667:0x3a1e565c17c00993,Hot Box,30.391411,-87.267220,4.2,[Restaurant],0
3,0x88e635378f43352f:0xa1b53c63436fa428,Shell,29.183272,-81.889965,1.8,"[Gas station, ATM, Convenience store, Restaurant]",0
4,0x88d9ab98b5baad79:0x1a2e3f0642a55246,Baby Food Grill & Bar,26.010250,-80.148659,3.0,[Mexican restaurant],0
...,...,...,...,...,...,...,...
54901,0x89c3b2b8e9c8c357:0xb1b07826ef85a598,Park View Tavern,40.653743,-74.234148,4.3,"[Bar, Italian restaurant, Pub, Restaurant]",3
54902,0x89c24d60acc3ffff:0xc8048546518281f6,The Brown Derby Cafe & Rental Hall,40.661250,-74.193971,4.6,"[Restaurant, Family restaurant]",3
54903,0x89c24d7a198a3541:0x3dcb8f429c053248,George's Lunch,40.654865,-74.177221,4.5,[American restaurant],3
54904,0x89c3b290dcc12d6d:0x34ad59938efed577,Las Brasas Sports Bar and Restaurant,40.661939,-74.248363,3.9,"[Latin American restaurant, Mexican restaurant...",3
