In [114]:
!pip install selenium


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [115]:
!pip install fake_useragent


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [118]:
import os
import shutil
import tempfile
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from tqdm.auto import tqdm

In [120]:
# CSV file that run_all() appends the scraped reviews to; the header row is
# written only when this file does not exist yet.
OUTPUT_FILE = "final_estonian_restaurant_reviews_dataset.csv"

In [121]:
def parse_restaurants_list_block(one_block, website):
    """Extract the name and absolute URL of one restaurant from a list-page block.

    Args:
        one_block: Parsed HTML element for a single restaurant entry. It is
            searched for an ``<h2 class="rest-title">`` that contains an ``<a>``.
        website: Website name, copied unchanged into the result.

    Returns:
        A dict with the keys ``website``, ``restaurant_name`` and
        ``restaurant_href``, or ``None`` (after printing a message) when the
        title heading, its link, or the link's ``href`` is missing.
    """
    restaurant_title_obj = one_block.find('h2', {'class': 'rest-title'})
    if not restaurant_title_obj:
        print("Restaurant title not found")
        return None

    # Look the link up once and guard against a heading without a usable <a>;
    # previously this case raised AttributeError / KeyError.
    link = restaurant_title_obj.find('a')
    restaurant_href = link.get('href') if link is not None else None
    if not restaurant_href:
        print("Restaurant link not found")
        return None

    return {
        'website': website,
        'restaurant_name': link.text.strip(),
        # urljoin resolves site-relative paths exactly like the old string
        # concatenation did, but leaves already-absolute hrefs intact.
        'restaurant_href': urljoin('https://dinnerbooking.com', restaurant_href),
    }

In [122]:
def _get_rating(review_block, label):
    """Return the rating shown next to `label` inside one review element.

    Looks for a ``div`` whose text contains `label`, then for a
    ``div.restaurant-starson`` below its following sibling, and converts that
    element's ``data-rating`` attribute to ``float``.

    Returns:
        The rating as a float, or ``None`` when the element is not found or
        the attribute cannot be converted.
    """
    try:
        element = review_block.find_element(By.XPATH, f".//div[contains(text(), '{label}')]/following-sibling::div//div[@class='restaurant-starson']")
        return float(element.get_attribute("data-rating"))
    except Exception:
        # A missing rating is expected for some reviews; report it as None.
        return None


def parse_one_restaurant(block):
    """Extract review details from a restaurant's webpage using Selenium.

    Args:
        block: dict with the keys ``website``, ``restaurant_name`` and
            ``restaurant_href``; the last one is the URL that gets opened.

    Returns:
        A list with one dict per review containing the three keys of `block`
        plus ``review_author``, ``review_text`` and six ``review_rating_*``
        values (``None`` when a rating is not found). Errors while loading or
        parsing the page are printed and whatever was collected so far is
        returned.

    Raises:
        Whatever ``webdriver.Chrome`` raises when the browser cannot be started.
    """
    # Give every browser instance its own profile directory so launches never
    # share (or trip over) a fixed /tmp path; it is removed again below.
    user_data_dir = tempfile.mkdtemp(prefix="chrome-user-data-")

    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Run in headless mode
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument(f"--user-data-dir={user_data_dir}")

    # Set up the WebDriver; start-up failures still propagate to the caller,
    # but the temporary profile directory must not be left behind.
    try:
        driver = webdriver.Chrome(options=chrome_options)
    except Exception:
        shutil.rmtree(user_data_dir, ignore_errors=True)
        raise

    restaurant_reviews = []

    try:
        print(f"Opening the web page of the restaurant: {block['restaurant_href']}")
        driver.get(block["restaurant_href"])  # Load the page
        time.sleep(2)  # Wait for elements to load

        # Click "More reviews" until the button disappears (it gets the "d-none" class)
        while True:
            try:
                more_reviews_button = driver.find_element(By.ID, "more-reviews")
                button_class = more_reviews_button.get_attribute("class")

                if "d-none" in button_class:
                    break  # The button is hidden, exit loop

                print("Clicking 'More reviews' button to load more reviews...")
                more_reviews_button.click()
                time.sleep(2)  # Wait for reviews to load
            except Exception:
                # No usable "More reviews" button. `except Exception` (rather
                # than a bare except) lets a kernel interrupt stop the scrape.
                break

        # Extract review blocks and process them
        review_blocks = driver.find_elements(By.CLASS_NAME, "customer-review-area")
        print(f"Number of reviews found: {len(review_blocks)}")

        for review_block in review_blocks:
            try:
                # Only the first whitespace-separated token of the heading is kept as author
                review_author = review_block.find_element(By.TAG_NAME, "h4").text.strip().split(' ')[0]
                review_text = review_block.find_element(By.CLASS_NAME, "customer-review-description").text.strip()

                review_data = {
                    "website": block["website"],
                    "restaurant_name": block["restaurant_name"],
                    "restaurant_href": block["restaurant_href"],
                    "review_author": review_author,
                    "review_text": review_text,
                    # Rating labels are matched as they appear on the page
                    "review_rating_total": _get_rating(review_block, "Kokku"),
                    "review_rating_food": _get_rating(review_block, "Toit"),
                    "review_rating_service": _get_rating(review_block, "Teenindus"),
                    "review_rating_atmosphere": _get_rating(review_block, "Atmosfäär"),
                    "review_rating_overall_impression": _get_rating(review_block, "Üldmulje"),
                    "review_rating_price_quality_ratio": _get_rating(review_block, "Hinna ja kvaliteedi suhe")
                }
                restaurant_reviews.append(review_data)
            except Exception as e:
                # Log errors encountered while parsing a review and move on to the next one
                print(f"Error processing a review: {e}")

    except Exception as e:
        print(f"Error processing {block['restaurant_href']}: {e}")

    finally:
        # Always shut the browser down, then remove its temporary profile
        try:
            driver.quit()
        finally:
            shutil.rmtree(user_data_dir, ignore_errors=True)

    return restaurant_reviews

In [123]:
def get_nth_page(page_number, timeout=30):
    """Fetch one restaurant list page and collect the reviews of every restaurant on it.

    Args:
        page_number: Value inserted into the ``page`` query parameter of the
            search URL.
        timeout: Seconds to wait for the list page before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        A flat list of review dicts, as produced by ``parse_one_restaurant``
        for each restaurant found on the page.

    Raises:
        requests.RequestException: If the page cannot be fetched or the server
            answers with an HTTP error status.
        AttributeError / TypeError: If the page has no ``div#logo`` containing
            an ``img`` (the website name is read from its ``alt`` attribute).
    """
    url = f'https://dinnerbooking.com/ee/et-EE/search/restaurants?page={page_number}'
    ua = UserAgent()

    # The context manager closes the session's connections once the page is read
    with requests.Session() as session:
        req = session.get(url, headers={'User-Agent': ua.random}, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of parsing an error page as if it were results
        req.raise_for_status()
        soup = BeautifulSoup(req.text, 'html.parser')

    # Extract website name from logo
    website = soup.find("div", {"id": "logo"}).find("img")["alt"]

    restaurant_blocks = soup.find_all("div", class_="restaurant-item-wrap row no-margins")
    print(f"Number of restaurants found on web page {page_number}: {len(restaurant_blocks)}")

    data = []
    for block in restaurant_blocks:
        try:
            # Extract restaurant details
            restaurant_data = parse_restaurants_list_block(block, website)
            if restaurant_data:
                # Extract review details for the restaurant
                restaurant_reviews = parse_one_restaurant(restaurant_data)
                data.extend(restaurant_reviews)
        except Exception as e:
            # One failing restaurant must not abort the page: record it and continue.
            # Explicit UTF-8 so non-ASCII error text is written the same on every platform.
            with open("errors.txt", "a", encoding="utf-8") as f:
                f.write(f"Error processing a restaurant list block: {str(e)}\n")

    return data

In [124]:
def run_all(n_pages):
    """Scrape list pages 1..n_pages and append each page's reviews to OUTPUT_FILE.

    Every page is written as soon as it has been processed, so an interrupted
    run keeps the pages finished so far. A failure on one page is printed and
    the loop continues with the next page.

    Args:
        n_pages: Number of list pages to process, starting at page 1.
    """
    for page in tqdm(range(1, n_pages + 1)):
        try:
            data = get_nth_page(page)
            if data:
                # Build the frame and drop rows without an author in one chained step
                df = pd.DataFrame(data).dropna(subset=["review_author"])
                # Write the header only when the file is being created; os.path.exists
                # replaces the undocumented internal helper pd.io.common.file_exists.
                write_header = not os.path.exists(OUTPUT_FILE)
                df.to_csv(OUTPUT_FILE, mode="a", index=False, header=write_header)
                print(f"Data from page {page} is saved, {len(df)} reviews in total.")
            else:
                print(f"Page {page}: no data to save.")
        except Exception as e:
            print(f"Error processing page {page}: {e}")

    print("Data collection completed!")

In [None]:
# Run extraction for list pages 1-8; reviews are appended to OUTPUT_FILE page by page
run_all(8)

  0%|          | 0/8 [00:00<?, ?it/s]

Number of restaurants found on web page 1: 10
Opening the web page of the restaurant: https://dinnerbooking.com/ee/et-EE/r3913/soo-restaurant
Number of reviews found: 10
Opening the web page of the restaurant: https://dinnerbooking.com/ee/et-EE/r3023/lahepere-villa
Clicking 'More reviews' button to load more reviews...
Clicking 'More reviews' button to load more reviews...
Clicking 'More reviews' button to load more reviews...
Clicking 'More reviews' button to load more reviews...
Clicking 'More reviews' button to load more reviews...
Clicking 'More reviews' button to load more reviews...
Clicking 'More reviews' button to load more reviews...
Clicking 'More reviews' button to load more reviews...
Clicking 'More reviews' button to load more reviews...
Clicking 'More reviews' button to load more reviews...
Clicking 'More reviews' button to load more reviews...
Clicking 'More reviews' button to load more reviews...
Clicking 'More reviews' button to load more reviews...
Clicking 'More revi