## 0. Imports

In [None]:
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import re
import time
import random


In [None]:
## 1. Web Scraping

In [None]:
def get_thomann_review_page(product: str, page=1, rating=0, order=0, reviewlang=1, timeout=30) -> requests.Response:
    """Fetch one page of the thomann.de review listing for a product.

    Args:
        product: Product slug interpolated into the URL path
            (``/de/{product}_reviews.htm``).
        page: Value of the ``page`` query parameter.
        rating: Value of the ``rating`` query parameter.
        order: Value of the ``order`` query parameter.
        reviewlang: Value of the ``reviewlang[]`` query parameter.
        timeout: Seconds passed to ``requests.get`` as its timeout. Pass
            ``None`` to wait indefinitely.

    Returns:
        The raw ``requests.Response``. The HTTP status is not checked here.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    print(f"Reading page {page} for product {product}...")
    url = f'https://www.thomann.de/de/{product}_reviews.htm?page={page}&order={order}&rating={rating}&reviewlang%5B%5D={reviewlang}'
    # Without an explicit timeout a stalled connection would block the
    # whole scrape forever.
    return requests.get(url, timeout=timeout)

def get_thomann_review_page_soup(product: str, page=1, rating=0, order=0, reviewlang=1) -> BeautifulSoup:
    """Download one review page and parse it with BeautifulSoup.

    All arguments are forwarded unchanged to ``get_thomann_review_page``;
    the response body is parsed with the built-in ``html.parser``.
    """
    query = dict(page=page, rating=rating, order=order, reviewlang=reviewlang)
    html = get_thomann_review_page(product, **query).text
    return BeautifulSoup(html, 'html.parser')

def get_thomann_review_page_text(product: str, page=1, rating=0, order=0, reviewlang=1, class_filter=".rs-prod.review"):
    """Return the elements of one review page that match ``class_filter``.

    The page is fetched and parsed via ``get_thomann_review_page_soup``;
    ``class_filter`` is a CSS selector handed to ``soup.select``.
    """
    return get_thomann_review_page_soup(
        product,
        page=page,
        rating=rating,
        order=order,
        reviewlang=reviewlang,
    ).select(class_filter)
    

In [18]:
def get_all_pages_for_thomann_review(product: str, start_page=1, rating=0, order=0, reviewlang=1, class_filter=".rs-prod.review"):
    """Collect the matching review elements from every page of a product.

    Pages are requested one after another, starting at ``start_page``,
    until a page yields no elements for ``class_filter``. After each
    non-empty page the function sleeps for a randomly chosen delay
    (a multiple of 0.1 s below 2 s) before requesting the next one.

    Returns:
        A flat list of all matched elements, in page order.
    """
    collected = []
    page_number = start_page

    while True:
        batch = get_thomann_review_page_text(
            product,
            page=page_number,
            rating=rating,
            order=order,
            reviewlang=reviewlang,
            class_filter=class_filter,
        )
        # An empty page marks the end of the listing.
        if not batch:
            break

        collected.extend(batch)
        page_number += 1

        # Random pause between requests.
        sleep_time = random.choice([step * 0.1 for step in range(20)])
        print(f'Sleeping for {sleep_time} seconds...')
        time.sleep(sleep_time)

    return collected
    

In [None]:
def strip_reviews(reviews: list):
    """Return the whitespace-trimmed ``.text`` of every item in ``reviews``."""
    stripped = []
    for review in reviews:
        stripped.append(review.text.strip())
    return stripped
    

In [None]:
# TODO: Add functionality for getting individual ratings
def get_stars_from_review(review: str, css_selector=".total-stars .overlay-wrapper"):
    """Extract the star rating of a single review as an integer.

    The first number found in the ``style`` attribute of the first element
    matching ``css_selector`` is read as a percentage and scaled to a
    0-5 value, truncated towards zero.

    Args:
        review: The review markup. It is converted with ``str()`` and
            re-parsed, so a parsed element works as well as an HTML string.
        css_selector: CSS selector locating the element whose ``style``
            attribute carries the percentage.

    Returns:
        ``int(5 * percentage / 100)``.

    Raises:
        IndexError: If nothing matches ``css_selector``.
    """
    # Name the parser explicitly (same one used for the full page) so the
    # result does not depend on which optional parsers are installed and
    # BeautifulSoup does not emit GuessedAtParserWarning.
    soup = BeautifulSoup(str(review), 'html.parser')
    selector = soup.select(css_selector)
    style = selector[0].get("style")
    percentage = float(re.search(r"(\d+(\.\d+)?)", style).group(1))
    return int(5 * (percentage / 100))

def get_text_from_review(review: str, css_selector=".inner.js-text-original"):
    """Extract the whitespace-trimmed text of a single review.

    Args:
        review: The review markup. It is converted with ``str()`` and
            re-parsed, so a parsed element works as well as an HTML string.
        css_selector: CSS selector locating the element that holds the
            review text.

    Returns:
        The stripped text of the first element matching ``css_selector``.

    Raises:
        IndexError: If nothing matches ``css_selector``.
    """
    # Name the parser explicitly (same one used for the full page) so the
    # result does not depend on which optional parsers are installed and
    # BeautifulSoup does not emit GuessedAtParserWarning.
    soup = BeautifulSoup(str(review), 'html.parser')
    selector = soup.select(css_selector)
    return selector[0].text.strip()
    

In [None]:
def get_review_data(product_name: str) -> list:
    """Scrape all reviews of one product into ``(product, text, stars)`` rows.

    Returns:
        A list with one ``(product_name, review_text, star_rating)`` tuple
        per scraped review, in the order the reviews were collected.
    """
    scraped = get_all_pages_for_thomann_review(product_name)
    ratings = list(map(get_stars_from_review, scraped))
    bodies = list(map(get_text_from_review, scraped))
    return [(product_name, body, rating) for body, rating in zip(bodies, ratings)]

def get_data_for_products(products: list) -> list:
    """Concatenate the review rows of several products into one flat list.

    Products are processed in the given order; each contributes the rows
    returned by ``get_review_data``.
    """
    return [row for product in products for row in get_review_data(product)]

def get_dataframe_for_products(products: list, column_names=('product_name', 'text', 'stars')):
    """Scrape the reviews of ``products`` and return them as a DataFrame.

    Args:
        products: Product slugs to scrape, processed in order.
        column_names: Labels for the three columns of each row
            (product name, review text, star rating). Any sequence of
            labels is accepted.

    Returns:
        A ``pandas.DataFrame`` with one row per scraped review.
    """
    data = get_data_for_products(products)
    # The default is an immutable tuple, not a shared mutable list; copy it
    # into a fresh list for pandas.
    return pd.DataFrame(data, columns=list(column_names))


In [19]:
# Product slugs; each one is interpolated into the review-page URL.
products = ["sennheiser_hd_25", "thomann_ctg10"]

# Scrape every review page of each product into a single DataFrame
# (default columns: product_name, text, stars).
data = get_dataframe_for_products(products)
# Bare expression so the notebook renders the DataFrame as the cell output.
data


Reading page 1 for product sennheiser_hd_25...
Sleeping for 0.9 seconds...
Reading page 2 for product sennheiser_hd_25...
Sleeping for 0.8 seconds...
Reading page 3 for product sennheiser_hd_25...
Sleeping for 0.1 seconds...
Reading page 4 for product sennheiser_hd_25...
Sleeping for 0.2 seconds...
Reading page 5 for product sennheiser_hd_25...
Sleeping for 0.0 seconds...
Reading page 6 for product sennheiser_hd_25...
Sleeping for 0.2 seconds...
Reading page 7 for product sennheiser_hd_25...
Reading page 1 for product thomann_ctg10...
Sleeping for 0.0 seconds...
Reading page 2 for product thomann_ctg10...
Sleeping for 1.1 seconds...
Reading page 3 for product thomann_ctg10...
Sleeping for 1.8 seconds...
Reading page 4 for product thomann_ctg10...
Sleeping for 0.2 seconds...
Reading page 5 for product thomann_ctg10...
Sleeping for 1.5 seconds...
Reading page 6 for product thomann_ctg10...
Sleeping for 0.0 seconds...
Reading page 7 for product thomann_ctg10...
Sleeping for 0.2 seconds...

Unnamed: 0,product_name,text,stars
0,sennheiser_hd_25,Den HD-25 habe ich Anfang der 2000er Jahre ken...,5
1,sennheiser_hd_25,Zum Kopfhörer selbst brauche ich wohl nichts m...,5
2,sennheiser_hd_25,"Ich brauchte einen DJ-Kopfhörer, der einen gee...",5
3,sennheiser_hd_25,Einsatz: Ich arbeite als Tonmeister am Filmset...,5
4,sennheiser_hd_25,"Der HD-25 ist ein Hörer, den ich für bestimmte...",5
...,...,...,...
2297,thomann_ctg10,normalerweise tune ich mit meinem Iphone. Dies...,5
2298,thomann_ctg10,Für den Preis ein absoluter Ladenmitnehmer ;o)...,4
2299,thomann_ctg10,"tolles Gerät, sehr einfach zu bedienen, auch f...",5
2300,thomann_ctg10,Stimmgerät funktioniert prima. Für den Preis i...,4
