In [1]:
import getpass
import json
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine, inspect

In [18]:
# Load the business dataset into a DataFrame.
# Each line of the file is parsed as its own JSON object.
json_file = "../Resource/yelp_academic_dataset_business.json"

data = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding; errors='ignore' still drops undecodable bytes as before.
with open(json_file, encoding='utf-8', errors='ignore') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline), which json.loads rejects.
        if line.strip():
            data.append(json.loads(line))

business_df = pd.DataFrame(data)
business_df.head()

Unnamed: 0,address,attributes,business_id,categories,city,hours,is_open,latitude,longitude,name,postal_code,review_count,stars,state
0,2818 E Camino Acequia Drive,{'GoodForKids': 'False'},1SWheh84yJXfytovILXOAQ,"Golf, Active Life",Phoenix,,0,33.522143,-112.018481,Arizona Biltmore Golf Club,85016,5,3.0,AZ
1,30 Eglinton Avenue W,"{'RestaurantsReservations': 'True', 'GoodForMe...",QXAEGFB4oINsVuTFxEYKFQ,"Specialty Food, Restaurants, Dim Sum, Imported...",Mississauga,"{'Monday': '9:0-0:0', 'Tuesday': '9:0-0:0', 'W...",1,43.605499,-79.652289,Emerald Chinese Restaurant,L5R 3E7,128,2.5,ON
2,"10110 Johnston Rd, Ste 15","{'GoodForKids': 'True', 'NoiseLevel': 'u'avera...",gnKjwL_1w79qoiV3IC_xQQ,"Sushi Bars, Restaurants, Japanese",Charlotte,"{'Monday': '17:30-21:30', 'Wednesday': '17:30-...",1,35.092564,-80.859132,Musashi Japanese Restaurant,28210,170,4.0,NC
3,"15655 W Roosevelt St, Ste 237",,xvX2CttrVhyG2z1dFg_0xw,"Insurance, Financial Services",Goodyear,"{'Monday': '8:0-17:0', 'Tuesday': '8:0-17:0', ...",1,33.455613,-112.395596,Farmers Insurance - Paul Lorenz,85338,3,5.0,AZ
4,"4209 Stuart Andrew Blvd, Ste F","{'BusinessAcceptsBitcoin': 'False', 'ByAppoint...",HhyxOkGAM07SRYtlQ4wMFQ,"Plumbing, Shopping, Local Services, Home Servi...",Charlotte,"{'Monday': '7:0-23:0', 'Tuesday': '7:0-23:0', ...",1,35.190012,-80.887223,Queen City Plumbing,28217,4,4.0,NC


In [13]:
# Narrow the raw frame down to the columns the transformation step works with.
# .copy() makes the result an independent frame rather than a view of business_df.
new_business_df = business_df.loc[
    :,
    [
        'business_id',
        'name',
        'categories',
        'is_open',
        'city',
        'state',
        'review_count',
        'stars',
    ],
].copy()
new_business_df.head()

Unnamed: 0,business_id,name,categories,is_open,city,state,review_count,stars
0,1SWheh84yJXfytovILXOAQ,Arizona Biltmore Golf Club,"Golf, Active Life",0,Phoenix,AZ,5,3.0
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,"Specialty Food, Restaurants, Dim Sum, Imported...",1,Mississauga,ON,128,2.5
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"Sushi Bars, Restaurants, Japanese",1,Charlotte,NC,170,4.0
3,xvX2CttrVhyG2z1dFg_0xw,Farmers Insurance - Paul Lorenz,"Insurance, Financial Services",1,Goodyear,AZ,3,5.0
4,HhyxOkGAM07SRYtlQ4wMFQ,Queen City Plumbing,"Plumbing, Shopping, Local Services, Home Servi...",1,Charlotte,NC,4,4.0


In [21]:
# Filter all the open restaurants: is_open == 1 and "Restaurant" appears in
# the categories string.
#   na=False    -> rows with missing categories are explicit non-matches,
#                  instead of relying on NaN being coerced inside the `&`.
#   regex=False -> "Restaurant" is matched as a literal substring.
open_restaurants = new_business_df[
    (new_business_df["is_open"] == 1)
    & new_business_df["categories"].str.contains("Restaurant", na=False, regex=False)
]
open_restaurants.head()

Unnamed: 0,business_id,name,categories,is_open,city,state,review_count,stars
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,"Specialty Food, Restaurants, Dim Sum, Imported...",1,Mississauga,ON,128,2.5
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,"Sushi Bars, Restaurants, Japanese",1,Charlotte,NC,170,4.0
11,1Dfx3zM-rW4n-31KeC8sJg,Taco Bell,"Restaurants, Breakfast & Brunch, Mexican, Taco...",1,Phoenix,AZ,18,3.0
13,fweCYi8FmbJXHCqLnwuk8w,Marco's Pizza,"Italian, Restaurants, Pizza, Chicken Wings",1,Mentor-on-the-Lake,OH,16,4.0
23,1RHY4K3BD22FK7Cfftn8Mg,Marathon Diner,"Sandwiches, Salad, Restaurants, Burgers, Comfo...",1,Pittsburgh,PA,35,4.0


In [27]:
# Build the frame that will be written to the database: pick the columns to
# insert and rename 'name' to 'business_name'.
df = (
    open_restaurants[['business_id', 'name', 'city', 'state', 'review_count', 'stars']]
    .rename(columns={'name': 'business_name'})
)
df.head()

Unnamed: 0,business_id,business_name,city,state,review_count,stars
1,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,Mississauga,ON,128,2.5
2,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,Charlotte,NC,170,4.0
11,1Dfx3zM-rW4n-31KeC8sJg,Taco Bell,Phoenix,AZ,18,3.0
13,fweCYi8FmbJXHCqLnwuk8w,Marco's Pizza,Mentor-on-the-Lake,OH,16,4.0
23,1RHY4K3BD22FK7Cfftn8Mg,Marathon Diner,Pittsburgh,PA,35,4.0


In [28]:
# Connect to Postgres DB.
# Credentials come from the environment so that no password is stored in the
# notebook; if YELP_DB_PASSWORD is unset the user is prompted interactively.
# The other settings fall back to the local development values.
db_user = os.environ.get("YELP_DB_USER", "postgres")
db_password = os.environ.get("YELP_DB_PASSWORD") or getpass.getpass("Postgres password: ")
db_host = os.environ.get("YELP_DB_HOST", "localhost:5432")
db_name = os.environ.get("YELP_DB_NAME", "yelp_data")

# quote_plus escapes characters such as '@' or '/' in the password that would
# otherwise be misread as URL delimiters.
rds_connection_string = f"{db_user}:{quote_plus(db_password)}@{db_host}/{db_name}"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [29]:
# List the tables reachable through the engine.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is the supported replacement and also returns a list.
inspect(engine).get_table_names()

['business_info']

In [30]:
# Insert the prepared rows into the existing business_info table.
# if_exists='append' adds to the table rather than replacing it, and
# index=False keeps the DataFrame index out of the inserted columns.
# NOTE(review): with 'append', re-running this cell inserts the rows again.
df.to_sql(
    name='business_info',
    con=engine,
    if_exists='append',
    index=False,
)

In [32]:
# Preview a few rows of the database table.
# The LIMIT is applied by the database, so only five rows are transferred
# instead of loading the whole table just to display its head.
pd.read_sql_query('select * from business_info limit 5', con=engine)

Unnamed: 0,business_id,business_name,city,state,review_count,stars
0,QXAEGFB4oINsVuTFxEYKFQ,Emerald Chinese Restaurant,Mississauga,ON,128,2.5
1,gnKjwL_1w79qoiV3IC_xQQ,Musashi Japanese Restaurant,Charlotte,NC,170,4.0
2,1Dfx3zM-rW4n-31KeC8sJg,Taco Bell,Phoenix,AZ,18,3.0
3,fweCYi8FmbJXHCqLnwuk8w,Marco's Pizza,Mentor-on-the-Lake,OH,16,4.0
4,1RHY4K3BD22FK7Cfftn8Mg,Marathon Diner,Pittsburgh,PA,35,4.0
