# OpenRice Restaurant Web Scraping
This project scrapes information about a selected set of OpenRice restaurants (i.e. those located in Mong Kok that serve Japanese sushi/sashimi and are priced below $50).

In [None]:
# Import libraries
from bs4 import BeautifulSoup as bs
import requests
import csv

In [None]:
# Connect to website and pull in data
url = 'https://www.openrice.com/en/hongkong/restaurants?cuisineId=2009&dishId=1034&districtId=2010&priceRangeId=1'

# Browser-like HTTP headers sent with the request (values copied from httpbin.org/get)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# Without an explicit timeout, requests waits indefinitely on an unresponsive server
page = requests.get(url, headers=headers, timeout=30)

# Fail loudly on a 4xx/5xx response instead of silently parsing an error page
page.raise_for_status()

# Parse the raw response body, then re-parse its prettified form.
# The following cells read from soup2, so both steps are kept.
soup1 = bs(page.content, 'html.parser')
soup2 = bs(soup1.prettify(), 'html.parser')  # prettify re-serialises the tree as nicely formatted HTML

#print(soup2)

In [None]:
# Collect every <div class="content-cell-wrapper"> element in soup2;
# the following cells read each restaurant's details out of these blocks
blocks = soup2.find_all(name='div', attrs={'class': 'content-cell-wrapper'})

#print(blocks)

In [None]:
# Write the file (running this cell again overwrites the previous output)
header = ['Restaurant Name', 'Address', 'District', 'Price Range', 'Dish Type', 'Restaurant Type']


def _descend(tag, *names):
    """Follow first-child tag names from *tag*; return None if any step is missing."""
    for child_name in names:
        if tag is None:
            return None
        tag = tag.find(child_name)
    return tag


def _extract_row(block):
    """Build the output row for one listing block.

    Returns a list ordered like ``header``, or None when the listing should
    be skipped: it is marked '(Moved)', its category list does not end with
    'Sushi/Sashimi', or an expected element is missing from the markup.
    """
    wanted_dish = 'Sushi/Sashimi'

    title_link = _descend(block.find('h2', class_='title-name'), 'a')
    if title_link is None:
        return None
    name = title_link.text.strip()

    # Filter out relocated stores
    if '(Moved)' in name:
        return None

    # Look up each container once; the district link sits inside the address span
    address_span = _descend(block.find('div', class_='icon-info address'), 'span')
    district_link = _descend(address_span, 'a')
    price_span = _descend(block.find('div', class_='icon-info icon-info-food-price'), 'span')
    categories = block.find('ul', class_='pois-categoryui-list')
    # The first <li> of the category list carries the restaurant type
    type_link = _descend(categories, 'li', 'a')

    required = (address_span, district_link, price_span, categories, type_link)
    if any(node is None for node in required):
        # Incomplete markup: skip the listing rather than crash with AttributeError
        return None

    # Filter out sponsored blocks and other unwanted results. Keeping only
    # lists whose text ends with the wanted dish is equivalent to slicing the
    # last 13 characters and testing for 'Sushi/Sashimi' (which is 13 long).
    if not categories.text.strip().endswith(wanted_dish):
        return None

    return [
        name,
        address_span.text.strip(),
        district_link.text.strip(),
        price_span.text.strip(),
        wanted_dish,
        type_link.text.strip(),
    ]


# Open the output once and stream every kept row into it.
# The file is UTF-8; if Excel shows garbled Chinese characters, import it as
# UTF-8 text (method 2 in https://www.accompa.com/kb/answer.html?answer_id=264)
with open("OpenRice Restaurant Web Scraping.txt", 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(header)

    # Parse names, addresses, districts, price range, dish types, restaurant types
    for block in blocks:
        data = _extract_row(block)
        if data is not None:
            writer.writerow(data)