# Notebook: Filter Reviews from Collected HTMLs

## Packages

In [1]:
from langdetect import detect
from bs4 import BeautifulSoup
import pandas as pd
import spacy
import json
import nltk
from nltk.tokenize import sent_tokenize

## Settings

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
%%capture
!python -m spacy download de_core_news_lg

In [4]:
nlp = spacy.load("de_core_news_lg")

## Constants

In [5]:
REVIEWS_PATH = "../datasets/reviews_urls.csv"
RANDOM_STATE = 43

## Code

### Load Dataset

In [6]:
reviews_df = pd.read_csv(REVIEWS_PATH)

### Load Reviews

In [7]:
columns = ['review_id', 'restaurant_id', 'page_index', 'title', 'date', 'author_name', 'author_location', 'text', 'rating', 'restaurant_name', 'language_code']
data_reviews = []

In [8]:
def load_review(review_soup):
    review = {}
    review["title"] = review_soup.find("div", attrs={"class": "quote"}).get_text()
    review["date"] = review_soup.find(class_='ratingDate')['title']
    review["author_name"] = review_soup.find(class_='scrname').get_text()
    user_location_element = review_soup.find(class_='userLocation')
    if user_location_element:
        user_location = user_location_element.get_text()
    else:
        user_location = None
    review["author_location"] = user_location
    review["text"] = review_soup.find(class_='partial_entry').get_text()
    review["rating"] = int(review_soup.find(class_='reviewItemInline').find('span', class_='ui_bubble_rating')['class'][1].split('_')[1]) / 10
    return review

In [9]:
for index, row in reviews_df.iterrows():
    path_review = "../datasets/reviews_restaurants_html/restaurant_" + str(row['restaurant_id']) + "_review_" + str(row["review_id"]) + ".html"
    with open(path_review, 'r', encoding='utf-8') as file:
        html_content = file.read()
    doc_soup = BeautifulSoup(html_content, 'html.parser')
    review_soup = doc_soup.find(id="review_"+str(row["review_id"]))
    review = load_review(review_soup)
    review["restaurant_name"] = doc_soup.find('a', class_='HEADING').get_text()
    review["language_code"] = doc_soup.find("div", class_="prw_reviews_user_links_hsx").span["data-language"]
    review["review_id"] = row["review_id"]
    review["restaurant_id"] = row["restaurant_id"]
    review["page_index"] = row["page_index"]
    data_reviews.append(review)

In [10]:
df_reviews = pd.DataFrame(data_reviews, columns=columns)

In [11]:
duplicate_rows = df_reviews[df_reviews.duplicated(subset=['review_id'], keep=False)]
duplicate_rows

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code


### Delete Examples without Data

There are rare cases where the text from the rating is not returned with the GET request to the page from the restaurant rating. These will now be excluded.

In [12]:
df_reviews = df_reviews.drop(df_reviews[(df_reviews['text'] == '') | (df_reviews['title'] == '')].index)

### Filter Languages

We are only considering reviews in german language.

In [13]:
df_reviews = df_reviews.drop(df_reviews[(df_reviews['language_code'] != 'de')].index)

Even if the language code = "de", we have observed that reviews in other languages sometimes have "de" as the language code. These are excluded with the help of Google's language-detection.

In [14]:
df_reviews['detected_language'] = df_reviews['text'].apply(lambda x: detect(x))
# Examples of reviews that were excluded
df_reviews[df_reviews['detected_language'] != 'de']

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language


In [15]:
df_reviews = df_reviews[df_reviews['detected_language'] == 'de']

### Check for Duplicates

### Remove Reviews Posted Before June 2022 

In [16]:
month_mapping = {
    "Januar": 1, "Februar": 2, "März": 3, "April": 4, "Mai": 5, "Juni": 6,
    "Juli": 7, "August": 8, "September": 9, "Oktober": 10, "November": 11, "Dezember": 12
}

def convert_date(date_string):
    day, month_name, year = date_string.split()
    day = day.replace(".", "")
    month = month_mapping[month_name]
    return pd.Timestamp(int(year), month, int(day))

df_reviews["date"] = df_reviews["date"].apply(convert_date)

In [17]:
df_reviews = df_reviews[df_reviews["date"] >= pd.Timestamp(2022, 7, 1)]

In [18]:
df_reviews

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language
0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Die schlechteste Pasta aglio olio ever!!! Schm...,1.0,Vapiano,de,de
10,906970320,8000775,0,Unverschämter Kellner … Blondi mit Goldkette,2023-07-31,MichaelA759,"Biberach, Deutschland",Wir sind eigentlich große Freunde der Osteria-...,1.0,L'Osteria München Bogenhausen,de,de
11,891912786,8000775,0,Gerne wieder!!!,2023-05-25,Mirjam H,,"Tolle Qualität der Speisen, gute Lage des Loka...",5.0,L'Osteria München Bogenhausen,de,de
12,888709323,8000775,0,Für Quick Lunch ganz ok,2023-05-02,thordym,"Niebüll, Deutschland",Wir hatten nicht ganz viel Zeit für unser Mitt...,3.0,L'Osteria München Bogenhausen,de,de
13,888554589,8000775,0,Lecker und gemütlich,2023-05-01,Angelika B,,Gemütliche Einrichtung und Atmosphäre. Leckere...,4.0,L'Osteria München Bogenhausen,de,de
14,886547453,8000775,0,Chaotische Zustände - Lange Wartezeiten - Kalt...,2023-04-17,Freedom10972899719,"München, Deutschland","Grundsätzlich muss ich sagen, dass ich mit der...",1.0,L'Osteria München Bogenhausen,de,de
15,885333225,8000775,0,Lecker - Schnell - Freundlich,2023-04-08,HannahMaria1892,"München, Deutschland",Wir waren mit der Familie an Ostersamstag in d...,5.0,L'Osteria München Bogenhausen,de,de
16,885147996,8000775,0,Lecker aber schlechter Service,2023-04-07,freddykrueger88,"Achim, Deutschland",Erstmal wurde man gekonnt an der Tür ignoriert...,1.0,L'Osteria München Bogenhausen,de,de
17,883265167,8000775,0,Mittagessen,2023-03-23,629christinaz,,Hat alles sehr gut gepasst ☺️ gerne kommen wir...,5.0,L'Osteria München Bogenhausen,de,de
18,882425450,8000775,0,Spontaner Besuch,2023-03-16,Anja B,,Haben kurzfristig noch einen Platz bekommen. A...,5.0,L'Osteria München Bogenhausen,de,de


### Anonymise

In [19]:
df_reviews["text_noanonymization"] = df_reviews["text"]

In [20]:
def anonymize_entities(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in ["LOC", "PERSON", "DATE"] and ent.label_ != "Essen":
            text = text.replace(ent.text, f"<{ent.label_}>")
    return text

df_reviews["text"] = df_reviews["text"].apply(anonymize_entities)

In [21]:
def anonymize_username(text, username):
    return text.replace(username, "<PERSON>")
df_reviews["text"] = df_reviews.apply(lambda row: anonymize_username(row["text"], row["author_name"]), axis=1)

### Filter Sentences

In [22]:
df_reviews_sentences = pd.DataFrame(columns=list(df_reviews.columns) + ['sentence_idx'])

for idx, row in df_reviews.iterrows():
    sentences = sent_tokenize(row['text'])
    sentence_index = 0
    for sentence in sentences:
        new_row = row.copy()  # Kopiere die gesamte Zeile
        new_row['text'] = sentence  # Setze 'text' auf den aktuellen Satz
        new_row['sentence_idx'] = sentence_index
        df_reviews_sentences.loc[len(df_reviews_sentences)] = new_row
        sentence_index += 1

In [23]:
df_reviews_sentences

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx
0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Die schlechteste Pasta aglio olio ever!!!,1.0,Vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,0
1,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Schmeckte absolut nach nichts.,1.0,Vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,1
2,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Unmotiviertes Personal.,1.0,Vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,2
3,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Das hat rein gar nichts mit italienischer Lebe...,1.0,Vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,3
4,906970320,8000775,0,Unverschämter Kellner … Blondi mit Goldkette,2023-07-31,MichaelA759,"Biberach, Deutschland",Wir sind eigentlich große Freunde der Osteria-...,1.0,L'Osteria München Bogenhausen,de,de,Wir sind eigentlich große Freunde der Osteria-...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,873912904,8000775,2,Günstig am Ring gelegen,2023-01-04,Holger M,"München, Deutschland",Kurz: es hat geschmeckt.,3.0,L'Osteria München Bogenhausen,de,de,"Bestellt, bekommen und gegessen habe ich ein V...",1
75,873912904,8000775,2,Günstig am Ring gelegen,2023-01-04,Holger M,"München, Deutschland","Die Thunfischsoße, große Kapern und einwandfre...",3.0,L'Osteria München Bogenhausen,de,de,"Bestellt, bekommen und gegessen habe ich ein V...",2
76,873912904,8000775,2,Günstig am Ring gelegen,2023-01-04,Holger M,"München, Deutschland","Die Umstände wie Service, Location waren voll ...",3.0,L'Osteria München Bogenhausen,de,de,"Bestellt, bekommen und gegessen habe ich ein V...",3
77,872623932,8000775,2,Auf Online Bestellung 50 Minuten gewartet :-( ...,2022-12-23,Q3877IUmarkusf,,Bloß nicht über das System bestellen - angebli...,1.0,L'Osteria München Bogenhausen,de,de,Bloß nicht über das System bestellen - angebli...,0


In [24]:
df_reviews_sentences[df_reviews_sentences["text"].str.contains("maske", case=False)]

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx


In [25]:
df_reviews_sentences["rating"].value_counts()

rating
5.0    35
1.0    34
3.0     5
2.0     3
4.0     2
Name: count, dtype: int64

### Balancing

In [26]:
df_balanced_reviews_sentences = df_reviews_sentences.groupby(['rating'], group_keys=False).apply(lambda x: x.sample(600, replace=False, random_state=RANDOM_STATE))
df_balanced_reviews_sentences.reset_index(drop=True, inplace=True)
df_balanced_reviews_sentences

ValueError: Cannot take a larger sample than population when 'replace=False'

### Randomisierung

In [None]:
df_balanced_reviews_sentences = df_balanced_reviews_sentences.sample(frac=1, random_state=RANDOM_STATE)  # 'frac=1' mischt den gesamten DataFrame
df_balanced_reviews_sentences.reset_index(drop=True, inplace=True)
df_balanced_reviews_sentences

In [None]:
df_balanced_reviews_sentences["rating"].value_counts()

In [None]:
df_balanced_reviews_sentences.groupby(["review_id", "sentence_idx"]).size().reset_index(name="count")

In [None]:
df_balanced_reviews_sentences["restaurant_id"].value_counts()

### Store as .csv 

In [None]:
df_reviews.to_csv("../datasets/reviews.csv")

In [None]:
df_reviews_sentences.to_csv("../datasets/reviews_sentences.csv")

In [None]:
df_balanced_reviews_sentences.to_csv("../datasets/balanced_reviews_sentences.csv")

In [None]:
docker run -d --name doccano -p 8000:8000 -v app_data:/app/data doccano/doccano