# ETL para la base de datos

In [2]:
import pandas as pd

In [3]:
# Load the pre-processed business tables of both sources (Yelp and Google).
yelp = pd.read_parquet(path='../datasets/processed/yelp/bussiness_yelp.parquet.gz')
google = pd.read_parquet(path='../datasets/processed/google/bussiness_google.parquet.gz')

# state

In [5]:
# Build the state lookup table: one row per distinct state name found in the
# Google dataset, keyed by a sequential integer id starting at 0.
#
# The ids are sized from the data itself. The previous hard-coded range(0, 4)
# raised a length-mismatch ValueError as soon as the dataset contained more or
# fewer than four distinct states.
#
# NOTE(review): only Google states are considered; the later inner merges
# against this table would drop Yelp rows whose state is absent here - confirm
# that both sources always cover the same states.
state_names = google['state'].unique()
states = pd.DataFrame({
    'state_id': range(len(state_names)),
    'state': state_names,
})
states

Unnamed: 0,state_id,state
0,0,Florida
1,1,California
2,2,Illinois
3,3,New Jersey


In [6]:
# Print a structural summary of the lookup table (index range, column dtypes
# and non-null counts) as a quick sanity check before exporting it.
states.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   state_id  4 non-null      int64 
 1   state     4 non-null      object
dtypes: int64(1), object(1)
memory usage: 196.0+ bytes


In [7]:
# Persist the state lookup table as a gzip-compressed parquet file.
states.to_parquet(
    path='../datasets/processed/bd/states.parquet.gz',
    compression='gzip',
)

# categories

Reunimos las categorías únicas que aparecen en cualquiera de los dos datasets (unión de Google y Yelp).

In [8]:
# Collect every distinct Google category, keeping first-seen order.
# dict.fromkeys de-duplicates in a single pass with constant-time lookups,
# replacing the linear `not in list` scan that ran for every category
# occurrence. The result is still a plain list so later cells can append to it.
listaCategorias = list(dict.fromkeys(
    categoria
    for categorias in google['categories']
    for categoria in categorias
))

In [9]:
# Append the Yelp categories that are not already in the list, preserving the
# existing order and mutating `listaCategorias` in place.
# A companion set gives constant-time membership checks instead of scanning the
# whole list for every category occurrence.
categorias_vistas = set(listaCategorias)
for categorias in yelp['categories']:
    for categoria in categorias:
        if categoria not in categorias_vistas:
            categorias_vistas.add(categoria)
            listaCategorias.append(categoria)

In [10]:
# Turn the list of distinct category names into a lookup table whose
# positional index becomes the surrogate key `categories_id`.
categories = (
    pd.DataFrame({'name': listaCategorias})
    .reset_index(names='categories_id')
)
categories

Unnamed: 0,categories_id,name
0,0,Restaurant
1,1,Buffet restaurant
2,2,Gas station
3,3,ATM
4,4,Convenience store
...,...,...
1655,1655,Botanical Gardens
1656,1656,Newspapers & Magazines
1657,1657,Parking
1658,1658,Georgian


In [11]:
# Print a structural summary of the category lookup table (row count, column
# dtypes and non-null counts) as a quick sanity check before exporting it.
categories.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1660 entries, 0 to 1659
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   categories_id  1660 non-null   int64 
 1   name           1660 non-null   object
dtypes: int64(1), object(1)
memory usage: 26.1+ KB


Exportamos la tabla de categorías a un archivo Parquet comprimido con gzip.

In [12]:
# Persist the category lookup table as a gzip-compressed parquet file.
categories.to_parquet(
    path='../datasets/processed/bd/categories.parquet.gz',
    compression='gzip',
)

# categories_google y categories_yelp

In [13]:
# Produce one row per (business key, category name) for each source by
# unpacking the list-valued `categories` column.
categories_yelp = yelp.loc[:, ['business_id', 'categories']].explode(column='categories')
categories_google = google.loc[:, ['gmap_id', 'categories']].explode(column='categories')

In [14]:
def _to_category_bridge(exploded, lookup):
    """Replace category names with their ids, producing a bridge table.

    Parameters
    ----------
    exploded : pd.DataFrame
        One row per (business key, category name); the category name is read
        from the 'categories' column.
    lookup : pd.DataFrame
        Category lookup table with the columns 'categories_id' and 'name'.

    Returns
    -------
    pd.DataFrame
        A new frame with the columns of `exploded` minus 'categories', plus
        the matching 'categories_id'. Rows whose category name has no match in
        `lookup` are dropped, because the join is an inner join.

    Raises
    ------
    pandas.errors.MergeError
        If a name appears more than once in `lookup`, which would otherwise
        silently multiply the rows of the bridge table.
    """
    return (
        exploded
        .merge(lookup, left_on='categories', right_on='name', validate='many_to_one')
        .drop(columns=['categories', 'name'])
    )


# Both sources go through the same name -> id mapping and are then exported.
# The results overwrite the exploded frames, so the explode cell has to be
# re-run before this cell can be executed again.
categories_google = _to_category_bridge(categories_google, categories)
categories_google.to_parquet('../datasets/processed/bd/categories_google.parquet.gz', compression='gzip')

categories_yelp = _to_category_bridge(categories_yelp, categories)
categories_yelp.to_parquet('../datasets/processed/bd/categories_yelp.parquet.gz', compression='gzip')

# yelp

In [115]:
# Reload the processed Yelp business table and preview its first five rows.
yelp = pd.read_parquet(path='../datasets/processed/yelp/bussiness_yelp.parquet.gz')
yelp.head(n=5)

Unnamed: 0,business_id,name,latitude,longitude,stars,categories,state
0,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,27.955269,-82.45632,4.0,"[Vietnamese, Food, Restaurants, Food Trucks]",Florida
1,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,27.916116,-82.760461,4.5,"[Food, Delis, Italian, Bakeries, Restaurants]",Florida
2,uI9XODGY_2_ieTE6xJ0myw,Roman Forum,28.046203,-82.505053,4.0,"[Restaurants, American (New), Italian]",Florida
3,JgpnXv_0XhV3SfbfB50nxw,Joe's Pizza,27.960514,-82.506127,4.0,"[Restaurants, Pizza]",Florida
4,pJfh3Ct8iL58NZa8ta-a5w,Top Shelf Sports Lounge,28.196252,-82.380615,4.5,"[Burgers, Sports Bars, Bars, Lounges, Restaura...",Florida


### states

In [117]:
# Swap the textual state for its numeric key from the `states` lookup table:
# join on the state name, then discard the name column.
yelp = yelp.merge(states, on='state').drop(columns='state')
yelp

Unnamed: 0,business_id,name,latitude,longitude,stars,categories,state_id
0,eEOYSgkmpB90uNA7lDOMRA,Vietnamese Food Truck,27.955269,-82.456320,4.0,"[Vietnamese, Food, Restaurants, Food Trucks]",0
1,0bPLkL0QhhPO5kt1_EXmNQ,Zio's Italian Market,27.916116,-82.760461,4.5,"[Food, Delis, Italian, Bakeries, Restaurants]",0
2,uI9XODGY_2_ieTE6xJ0myw,Roman Forum,28.046203,-82.505053,4.0,"[Restaurants, American (New), Italian]",0
3,JgpnXv_0XhV3SfbfB50nxw,Joe's Pizza,27.960514,-82.506127,4.0,"[Restaurants, Pizza]",0
4,pJfh3Ct8iL58NZa8ta-a5w,Top Shelf Sports Lounge,28.196252,-82.380615,4.5,"[Burgers, Sports Bars, Bars, Lounges, Restaura...",0
...,...,...,...,...,...,...,...
13914,qjtELTt9fdIwoi_xGNN21g,Chipotle Mexican Grill,39.943481,-74.964743,3.0,"[Restaurants, Mexican, Fast Food]",3
13915,BzPTw9daJW8ToqTc7QiHFw,Taco Bell,40.210771,-74.756588,2.5,"[Restaurants, Tex-Mex, Fast Food, Mexican, Tac...",3
13916,oNos27QR8hkBb4z-IK6uTA,Cafe Tango,39.847724,-74.977354,2.5,"[Indian, Restaurants, Tapas/Small Plates]",3
13917,UBQAksw81m0sMrAd8g-ECg,El Zarape Restaurant,39.718717,-74.971143,4.0,"[Restaurants, Mexican]",3


# google

In [118]:
# Reload the processed Google business table and preview its first five rows.
google = pd.read_parquet(path='../datasets/processed/google/bussiness_google.parquet.gz')
google.head(n=5)

Unnamed: 0,gmap_id,name,latitude,longitude,stars,categories,state
0,0x88db4147b1d9e6f3:0x943dbd10a92ba1b1,Cape Seafood Shack,26.641377,-81.940545,5.0,[Restaurant],Florida
1,0x88c2e4e34f1ed783:0x76c5da381c499d79,Fresh Point Country Buffet,27.867489,-82.702971,5.0,[Buffet restaurant],Florida
2,0x8890b9241e704667:0x3a1e565c17c00993,Hot Box,30.391411,-87.26722,4.2,[Restaurant],Florida
3,0x88e635378f43352f:0xa1b53c63436fa428,Shell,29.183272,-81.889965,1.8,"[Gas station, ATM, Convenience store, Restaurant]",Florida
4,0x88d9ab98b5baad79:0x1a2e3f0642a55246,Baby Food Grill & Bar,26.01025,-80.148659,3.0,[Mexican restaurant],Florida


In [120]:
# Swap the textual state for its numeric key from the `states` lookup table:
# join on the state name, then discard the name column.
google = google.merge(states, on='state').drop(columns='state')
google

Unnamed: 0,gmap_id,name,latitude,longitude,stars,categories,state_id
0,0x88db4147b1d9e6f3:0x943dbd10a92ba1b1,Cape Seafood Shack,26.641377,-81.940545,5.0,[Restaurant],0
1,0x88c2e4e34f1ed783:0x76c5da381c499d79,Fresh Point Country Buffet,27.867489,-82.702971,5.0,[Buffet restaurant],0
2,0x8890b9241e704667:0x3a1e565c17c00993,Hot Box,30.391411,-87.267220,4.2,[Restaurant],0
3,0x88e635378f43352f:0xa1b53c63436fa428,Shell,29.183272,-81.889965,1.8,"[Gas station, ATM, Convenience store, Restaurant]",0
4,0x88d9ab98b5baad79:0x1a2e3f0642a55246,Baby Food Grill & Bar,26.010250,-80.148659,3.0,[Mexican restaurant],0
...,...,...,...,...,...,...,...
54901,0x89c3b2b8e9c8c357:0xb1b07826ef85a598,Park View Tavern,40.653743,-74.234148,4.3,"[Bar, Italian restaurant, Pub, Restaurant]",3
54902,0x89c24d60acc3ffff:0xc8048546518281f6,The Brown Derby Cafe & Rental Hall,40.661250,-74.193971,4.6,"[Restaurant, Family restaurant]",3
54903,0x89c24d7a198a3541:0x3dcb8f429c053248,George's Lunch,40.654865,-74.177221,4.5,[American restaurant],3
54904,0x89c3b290dcc12d6d:0x34ad59938efed577,Las Brasas Sports Bar and Restaurant,40.661939,-74.248363,3.9,"[Latin American restaurant, Mexican restaurant...",3
