# Notebook: Collect Reviews

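This notebook collects the individual restaurant reviews. For every previously saved listing page of a restaurant, it extracts the link to each review shown on that page, downloads the full review page, and stores it as an .html file. The collected review URLs are additionally written to a CSV file.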

## Packages

In [1]:
import json
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
import urllib3

# Silence urllib3's InsecureRequestWarning for the rest of the session.
# NOTE(review): none of the visible requests disables certificate
# verification, so this may be a leftover - confirm it is still needed.
urllib3.disable_warnings(category=urllib3.exceptions.InsecureRequestWarning)

## Constants

In [3]:
# HTTP headers passed to `requests.get` by `get_html_by_url`.
# Source: https://stackoverflow.com/questions/69946941/python-beautifulsoup-web-scraping-tripadvisor-view-a-review
REQUEST_HEADER = {
    # NOTE(review): Access-Control-* headers are normally set by servers on
    # responses; presumably they have no effect on a request - confirm before
    # removing them.
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET",
    "Access-Control-Allow-Headers": "Content-Type",
    # Content negotiation.
    "accept": "*/*",
    "accept-encoding": "gzip, deflate",
    "accept-language": "en,mr;q=0.9",
    # Browser identification string (desktop Chrome 95 on Windows 10).
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/95.0.4638.69 Safari/537.36"
    ),
}

In [4]:
# JSON file with the restaurant metadata; the entries are read for their
# "id" and "highest_page_index" fields.
RESTAURANT_URLS = "restaurant_metadata_with_highest_page_index.json"
# Folder into which the downloaded review pages are written.
REVIEWS_RESTAURANTS_PATH = "reviews_restaurants_html/"
# Folder from which the previously saved restaurant listing pages are read.
PAGES_RESTAURANTS_PATH = "pages_restaurants_html/"
# CSV file that receives one row per collected review URL.
REVIEWS_PATH = "reviews_dataset/reviews_urls.csv"

## Code

### Load Dataset

In [5]:
# Load the restaurant metadata from the JSON file.
# The encoding is given explicitly: without it `open` falls back to the
# platform's locale encoding, which can mis-decode non-ASCII characters.
# The other file accesses in this notebook use UTF-8 as well.
with open(RESTAURANT_URLS, 'r', encoding='utf-8') as json_file:
    restaurant_metadata = json.load(json_file)

### Save reviews

In [6]:
def get_html_by_url(url, timeout=30):
    """Download a page and return its body as text.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a single stalled connection would block the
        whole scraping run indefinitely. Defaults to 30.

    Returns
    -------
    str
        The decoded response body (``response.text``). The HTTP status code
        is not checked, so the body of an error page is returned as-is.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get``, e.g. on connection failure or when
        the timeout is exceeded.
    """
    response = requests.get(url, headers=REQUEST_HEADER, timeout=timeout)
    return response.text

In [7]:
# Collect one record per review link found on the saved listing pages and
# download the page behind each link.
data = []

# Make sure the output folder exists before the first download is written.
os.makedirs(REVIEWS_RESTAURANTS_PATH, exist_ok=True)

for restaurant in restaurant_metadata:
    restaurant_id = restaurant["id"]
    # Listing pages are numbered 0..highest_page_index (inclusive).
    for page_index in range(restaurant["highest_page_index"] + 1):
        page_path = PAGES_RESTAURANTS_PATH + "restaurant_" + str(restaurant_id) + "_pageidx_" + str(page_index) + ".html"
        with open(page_path, 'r', encoding='utf-8') as page_file:
            page_html = page_file.read()
        page_soup = BeautifulSoup(page_html, 'html.parser')

        for review in page_soup.find_all("div", class_="review-container"):
            # A container without a title link, without an href, or whose
            # link carries no "-r<digits>" id can neither be downloaded nor
            # named; skip it instead of failing on None.
            title_link = review.find('a', class_='title')
            href = title_link.get('href') if title_link is not None else None
            if not href:
                print(f"Skipping review without link in {page_path}")
                continue

            review_url = "https://www.tripadvisor.de" + href
            id_match = re.search(r'-r(\d+)', review_url)
            if id_match is None:
                print(f"Skipping review without id in URL: {review_url}")
                continue
            review_id = id_match.group(1)

            # The URL is recorded before the download is attempted.
            data.append({"review_id": review_id, "restaurant_id": restaurant_id, "page_index": page_index, "review_url": review_url})

            # Separate names for the review page keep it from shadowing the
            # listing page's HTML and file handle.
            review_html = get_html_by_url(review_url)
            review_path = REVIEWS_RESTAURANTS_PATH + f'restaurant_{restaurant_id}_review_{review_id}.html'
            with open(review_path, "w", encoding="utf-8") as review_file:
                review_file.write(review_html)

In [None]:
# Turn the collected review records into a table and write it to disk as
# CSV, leaving out the DataFrame's index column.
df = pd.DataFrame(data=data)
df.to_csv(path_or_buf=REVIEWS_PATH, index=False)