# Notebook: Collect HTML Documents

This notebook scrapes the pages that list the reviews of each restaurant in `restaurants.json`. As a first step, the review pages are downloaded and stored as .html files.

## Packages

In [1]:
import json
import os

import requests
from bs4 import BeautifulSoup

In [2]:
import urllib3
# Silence urllib3's InsecureRequestWarning for the whole kernel session.
# NOTE(review): no request in this notebook passes verify=False, so this may be unnecessary — confirm.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

## Constants

In [3]:
# Source: https://stackoverflow.com/questions/69946941/python-beautifulsoup-web-scraping-tripadvisor-view-a-review
# Headers sent with every page request so that it resembles a browser request.

# NOTE(review): Access-Control-Allow-* are CORS *response* headers; they are kept
# exactly as in the source answer and are presumably ignored by the server.
_CORS_HEADERS = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
}

# Content negotiation and browser identification.
_BROWSER_HEADERS = {
    'accept': '*/*',
    'accept-encoding': 'gzip, deflate',
    'accept-language': 'en,mr;q=0.9',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
}

# Merged in the same key order as before (CORS entries first, browser entries last).
REQUEST_HEADER = {**_CORS_HEADERS, **_BROWSER_HEADERS}

In [4]:
# Input: JSON file with the restaurant metadata that is loaded in the "Load Dataset" section.
RESTAURANT_URLS = "restaurants.json"
# Path prefix (note the trailing slash) under which the downloaded review pages are written.
PAGES_RESTAURANTS_PATH = "pages_restaurants_html/"
# Output: JSON file the metadata is dumped to at the end of the notebook.
RESTAURANT_METADATA_WITH_HIGHEST_PAGE_INDEX_PATH = "restaurant_metadata_with_highest_page_index.json"

## Code

### Load Dataset

In [5]:
# Load the restaurant metadata; the cells below read an "id" and a "url" from each entry.
# Decode explicitly as UTF-8 rather than with the platform's locale encoding, so that
# non-ASCII characters (e.g. umlauts in names) load identically on every OS.
with open(RESTAURANT_URLS, 'r', encoding='utf-8') as json_file:
    restaurant_metadata = json.load(json_file)

In [6]:
def get_html_by_url(url, timeout=30):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scraping loop
            indefinitely.

    Returns:
        The decoded response body (``response.text``). The HTTP status code is
        not checked, so an error page is returned like any other page.

    Raises:
        requests.exceptions.RequestException: If the request fails, including
            when the timeout expires.
    """
    response = requests.get(url, headers=REQUEST_HEADER, timeout=timeout)
    return response.text

### Save Pages

In [7]:
# Give every restaurant entry a "highest_page_index" key up front; it stays
# None until the scraping loop further down stores the real value.
for restaurant in restaurant_metadata:
    restaurant.update(highest_page_index=None)

In [8]:
from datetime import datetime

# German month names mapped to their month number (1-12).
month_mapping = {
    name: number
    for number, name in enumerate(
        [
            "Januar", "Februar", "März", "April", "Mai", "Juni",
            "Juli", "August", "September", "Oktober", "November", "Dezember",
        ],
        start=1,
    )
}

def convert_date(date_string):
    """Parse a German date such as "5. Oktober 2022" into a datetime.

    Args:
        date_string: Three whitespace-separated tokens: day (an optional "."
            is stripped), German month name, year.

    Returns:
        A datetime at midnight of the given day.

    Raises:
        ValueError: If the string does not split into exactly three tokens or
            the day/year tokens are not integers.
        KeyError: If the month name is not a key of ``month_mapping``.
    """
    day_token, month_token, year_token = date_string.split()
    month_number = month_mapping[month_token]
    day_digits = day_token.replace(".", "")
    return datetime(int(year_token), month_number, int(day_digits))

In [9]:
def has_review_before_october_2022(soup, cutoff=datetime(2022, 10, 1)):
    """Decide whether a review page is the last one that should be collected.

    Only pages with reviews published from 1 October 2022 onwards should be
    considered, in order to primarily collect reviews that do not address the
    COVID-19 pandemic and thus focus more on other aspects of the restaurant.
    Pages with reviews published before this date mark the end of the
    collection for a restaurant.

    Despite its name, the function also returns True when the page offers no
    usable "next" button, i.e. when there is no further page to fetch.

    Args:
        soup: Parsed review page (a BeautifulSoup document).
        cutoff: Reviews dated strictly before this datetime count as "old".
            Defaults to 1 October 2022.

    Returns:
        True if at least one review on the page is dated before ``cutoff``, or
        if the "next" button is disabled or missing; False otherwise.

    Raises:
        TypeError: If a review container has no ``ratingDate`` element.
        KeyError, ValueError: Propagated from ``convert_date`` (or a missing
            ``title`` attribute) when a date cannot be read.
    """
    reviews = soup.find_all("div", class_="review-container")

    # Generator + any(): stop parsing dates as soon as one old review is found.
    if any(
        convert_date(review.find(class_='ratingDate')['title']) < cutoff
        for review in reviews
    ):
        return True

    # A multi-word class_ string is compared against the exact class attribute,
    # so the enabled and the disabled button each need their own lookup.
    next_button = soup.find("a", class_="nav next ui_button primary")
    next_button_disabled = soup.find("a", class_="nav next ui_button primary disabled")

    # Identity checks instead of `== None`, which would go through the tag's own __eq__.
    return next_button_disabled is not None or next_button is None

In [10]:
# Download the review pages of every restaurant, one page at a time, until a
# page contains a review older than the cutoff or there is no next page.
# Every fetched page (including the final one) is stored as an .html file.

# Create the target directory up front so the first write cannot fail on a fresh checkout.
os.makedirs(PAGES_RESTAURANTS_PATH, exist_ok=True)

for restaurant in restaurant_metadata:
    page_index = 0
    while True:
        # Pages are addressed by an "-or<offset>-" URL segment; the offset grows by 10
        # per page (presumably the number of reviews shown per page).
        url = restaurant["url"].replace("-Reviews-", f"-Reviews-or{page_index * 10}-")
        html = get_html_by_url(url)
        soup = BeautifulSoup(html, 'html.parser')

        page_path = os.path.join(
            PAGES_RESTAURANTS_PATH,
            f'restaurant_{restaurant["id"]}_pageidx_{page_index}.html',
        )
        with open(page_path, "w", encoding="utf-8") as file:
            file.write(html)

        if has_review_before_october_2022(soup):
            # `restaurant` is the very dict stored in restaurant_metadata, so it can be
            # updated directly; no lookup by id is needed.
            restaurant["highest_page_index"] = page_index
            break
        page_index += 1

In [11]:
# Persist the metadata, which by now carries a "highest_page_index" per restaurant.
with open(RESTAURANT_METADATA_WITH_HIGHEST_PAGE_INDEX_PATH, mode='w') as json_file:
    json.dump(restaurant_metadata, fp=json_file)