# Notebook: Filter Reviews from Collected HTMLs

## Packages

In [1]:
from langdetect import detect
from bs4 import BeautifulSoup
import pandas as pd
import spacy
import json
import nltk
from nltk.tokenize import sent_tokenize

## Settings

In [2]:
# Fetch the Punkt tokenizer data that sent_tokenize relies on.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
%%capture
!python -m spacy download de_core_news_lg

In [4]:
# Shared spaCy pipeline used for the entity-based anonymization further down.
nlp = spacy.load(name="de_core_news_lg")

## Constants

In [5]:
# CSV listing the collected reviews; the loading loop reads its
# restaurant_id, review_id and page_index columns.
REVIEWS_PATH = "../datasets/reviews_urls.csv"
# Seed passed as random_state to the balancing and shuffling steps.
RANDOM_STATE = 43

## Code

### Load Dataset

In [6]:
# One row per collected review page.
reviews_df = pd.read_csv(filepath_or_buffer=REVIEWS_PATH)

### Load Reviews

In [7]:
# Column order of the final review table.
columns = [
    'review_id',
    'restaurant_id',
    'page_index',
    'title',
    'date',
    'author_name',
    'author_location',
    'text',
    'rating',
    'restaurant_name',
    'language_code',
]
# Filled with one dict per review by the loading loop.
data_reviews = list()

In [8]:
def load_review(review_soup):
    """Extract the fields of a single review from its parsed HTML element.

    Args:
        review_soup: Parsed element of one review; it must support ``find``
            the way a BeautifulSoup tag does.

    Returns:
        dict with the keys ``title``, ``date``, ``author_name``,
        ``author_location`` (``None`` when the element is absent), ``text``
        and ``rating`` (the numeric suffix of the bubble class divided by 10).
    """
    def text_of(css_class):
        # Text content of the first descendant carrying the given class.
        return review_soup.find(class_=css_class).get_text()

    title = review_soup.find("div", attrs={"class": "quote"}).get_text()
    date = review_soup.find(class_='ratingDate')['title']
    author_name = text_of('scrname')

    # The author location is optional on a review.
    location_tag = review_soup.find(class_='userLocation')
    author_location = location_tag.get_text() if location_tag else None

    body = text_of('partial_entry')

    # The second CSS class of the bubble element carries the score after an
    # underscore (presumably of the form "bubble_40" -> 4.0; confirm on a page).
    bubble = review_soup.find(class_='reviewItemInline').find('span', class_='ui_bubble_rating')
    score_token = bubble['class'][1].split('_')[1]

    return {
        "title": title,
        "date": date,
        "author_name": author_name,
        "author_location": author_location,
        "text": body,
        "rating": int(score_token) / 10,
    }

In [9]:
# Parse every stored review page and collect one record per review.
for index, row in reviews_df.iterrows():
    # The file name encodes both the restaurant id and the review id.
    path_review = "".join([
        "../datasets/reviews_restaurants_html/restaurant_",
        str(row['restaurant_id']),
        "_review_",
        str(row["review_id"]),
        ".html",
    ])
    with open(path_review, mode='r', encoding='utf-8') as file:
        html_content = file.read()
    doc_soup = BeautifulSoup(html_content, features='html.parser')

    # Review-level fields come from the element whose id matches the review.
    review_soup = doc_soup.find(id="review_" + str(row["review_id"]))
    review = load_review(review_soup)

    # Page-level fields and the identifiers from the index table.
    review.update({
        "restaurant_name": doc_soup.find('a', class_='HEADING').get_text(),
        "language_code": doc_soup.find("div", class_="prw_reviews_user_links_hsx").span["data-language"],
        "review_id": row["review_id"],
        "restaurant_id": row["restaurant_id"],
        "page_index": row["page_index"],
    })
    data_reviews.append(review)

In [10]:
# Assemble the collected records into a table with a fixed column order.
df_reviews = pd.DataFrame(data=data_reviews, columns=columns)

### Delete Examples without Data

In rare cases the HTML returned by the GET request for a review page does not contain the review text or title. These reviews are excluded here.

In [11]:
# Remove reviews whose text or title came back empty.
df_reviews = df_reviews.drop(
    index=df_reviews.loc[df_reviews['text'].eq('') | df_reviews['title'].eq('')].index
)

### Filter Languages

We only consider reviews written in German.

In [12]:
# Remove every review whose page-level language code is not German.
df_reviews = df_reviews.drop(
    index=df_reviews.loc[df_reviews['language_code'].ne('de')].index
)

Even when the language code is "de", we observed that some reviews are actually written in another language. These are excluded using the `langdetect` package (a Python port of Google's language-detection library).

In [13]:
# langdetect is non-deterministic on short or ambiguous texts unless its
# detector is seeded, so fix the seed before the first detection to keep this
# filter reproducible across runs.
from langdetect import DetectorFactory
DetectorFactory.seed = RANDOM_STATE

# Detect the language from the review text itself.
df_reviews['detected_language'] = df_reviews['text'].apply(detect)
# Examples of reviews that are excluded by the filter in the next cell
df_reviews[df_reviews['detected_language'] != 'de']

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language
2803,892819899,715362,0,Worst hard rock,31. Mai 2023,Phil D,,After a wonderful few days in Berlin decided t...,1.0,Hard Rock Cafe Berlin,de,en


In [14]:
# Keep only the reviews whose detected language is German.
df_reviews = df_reviews.loc[df_reviews['detected_language'].eq('de')]

### Check for Duplicates

In [16]:
# keep=False flags every row of a repeated review_id, not only the repeats.
duplicate_rows = df_reviews.loc[df_reviews['review_id'].duplicated(keep=False)]
duplicate_rows

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language


### Anonymization

In [17]:
# Preserve the original review text before it is overwritten by the anonymized version.
df_reviews = df_reviews.assign(text_noanonymization=df_reviews["text"])

In [18]:
def anonymize_entities(text, nlp_model=None):
    """Replace location, person and date entities in ``text`` with placeholders.

    Args:
        text: The review text to anonymize.
        nlp_model: Optional callable that takes the text and returns an object
            with an ``ents`` iterable whose items expose ``text`` and
            ``label_``. Defaults to the notebook-level ``nlp`` pipeline.

    Returns:
        The text in which every occurrence of a matching entity string is
        replaced by ``<LOC>``, ``<PERSON>`` or ``<DATE>``.
    """
    if nlp_model is None:
        nlp_model = nlp

    # The German spaCy pipelines label people as "PER". "PERSON" and "DATE" are
    # kept so pipelines that emit those labels are still handled. People map
    # to "<PERSON>", the same token the username anonymization uses.
    placeholders = {
        "LOC": "<LOC>",
        "PER": "<PERSON>",
        "PERSON": "<PERSON>",
        "DATE": "<DATE>",
    }

    doc = nlp_model(text)
    for ent in doc.ents:
        placeholder = placeholders.get(ent.label_)
        # "Essen" (food) can be tagged as the city of the same name and must
        # stay readable. The check has to compare the entity text; a label is
        # never equal to "Essen", so the previous label comparison had no effect.
        if placeholder is None or ent.text == "Essen":
            continue
        # Replaces every occurrence of the entity string, tagged or not.
        text = text.replace(ent.text, placeholder)
    return text

# Overwrite the review text with its entity-anonymized version.
df_reviews["text"] = df_reviews["text"].map(anonymize_entities)

In [19]:
def anonymize_username(text, username):
    """Replace every occurrence of ``username`` in ``text`` with ``<PERSON>``.

    Args:
        text: The review text.
        username: The author's user name. Missing values (``None``/NaN), empty
            strings and whitespace-only strings leave the text unchanged.

    Returns:
        The text with the user name masked.
    """
    # str.replace("", ...) would insert the placeholder between every
    # character, and a non-string (e.g. NaN) would raise, so there is nothing
    # sensible to mask in those cases.
    if not isinstance(username, str) or not username.strip():
        return text
    return text.replace(username, "<PERSON>")
# Mask the author's own user name wherever it appears in their review text.
df_reviews["text"] = df_reviews.apply(
    lambda review_row: anonymize_username(
        text=review_row["text"],
        username=review_row["author_name"],
    ),
    axis=1,
)

### Filter Sentences

In [20]:
# Split every review into sentences: one output row per sentence, carrying all
# review columns plus the position of the sentence within its review.
# Records are collected in a list and the frame is built once; appending with
# .loc[len(df)] re-allocates the frame on every row and scales quadratically.
sentence_records = []

for idx, row in df_reviews.iterrows():
    sentences = sent_tokenize(row['text'])
    for sentence_index, sentence in enumerate(sentences):
        new_row = row.to_dict()  # copy the entire row
        new_row['text'] = sentence  # set 'text' to the current sentence
        new_row['sentence_idx'] = sentence_index
        sentence_records.append(new_row)

# Explicit columns keep the layout stable even when no sentence was produced.
df_reviews_sentences = pd.DataFrame(
    sentence_records,
    columns=list(df_reviews.columns) + ['sentence_idx'],
)

### Balancing

In [21]:
# Draw the same number of sentences (700, without replacement) per rating so
# that every rating is equally represented.
df_balanced_reviews_sentences = (
    df_reviews_sentences
    .groupby(['rating'], group_keys=False)
    .apply(lambda rating_group: rating_group.sample(n=700, replace=False, random_state=RANDOM_STATE))
    .reset_index(drop=True)
)
df_balanced_reviews_sentences

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx
0,320444037,1119896,13,Überforderung in höchstem Maße,20. Oktober 2015,nalujulia,"Wien, Österreich","Bestellung, Zubereitung der Pasta, Bringen des...",1.0,Vapiano,de,de,Zu zweit stellten wir uns an der Pasta Bar in ...,2
1,521160362,718834,7,"Schlechter Service, unprofessionelle ""Köche"" -...",4. September 2017,onlystopontop,"München, Deutschland","Es gibt nur eines, wofür ich absolut niemals V...",1.0,VAPIANO München,de,de,Bis man mich mal dazu motiviert hat ein Restau...,5
2,685411373,2271294,16,Nie wieder,30. Juni 2019,Peter K,"Wülfrath, Nordrhein-Westfalen, Germany",Nachdem wir am Eingang von sehr freundlichen D...,1.0,Hard Rock Cafe,de,de,Nachdem wir am Eingang von sehr freundlichen D...,0
3,571867416,778662,5,Nach dem Halb Marathon,8. April 2018,167christiang,"Brandenburg, Deutschland",auf meine Nudeln warten!,1.0,Vapiano,de,de,Trotz Leere in diesem Vapiano musste ich 33min...,2
4,613198287,1520135,10,Mit Durchfall Garantie!!!! NIE WIEDER!!!!!,2. September 2018,gigi-8787,"Tübingen, Deutschland",Einmal und nie wieder!!,1.0,L'Osteria München Künstlerhaus,de,de,Das schlechteste Restaurant bei dem ich jemals...,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,355501122,2395287,19,Leckeres Essen vor dem Musical,15. März 2016,Peter R,,Gut dass ich schon ein paar Tage vorher einen ...,5.0,BLOCK HOUSE Friedrichstraße,de,de,Wir waren letzten Samstag vor dem Musical Hair...,1
3496,406050576,1867249,15,Lecker und preiswert,16. August 2016,juleb767,,"Wir haben zu 4. ein Mittagsmenü gegessen, es g...",5.0,HANS IM GLÜCK Burgergrill & Bar,de,de,"Wir haben zu 4. ein Mittagsmenü gegessen, es g...",0
3497,356486328,1867249,19,Viele Varianten,18. März 2016,cycling-freak65,Luxemburg,mache häufig einen Besuch im Hans in Glück wen...,5.0,HANS IM GLÜCK Burgergrill & Bar,de,de,mache häufig einen Besuch im Hans in Glück wen...,0
3498,391439182,4432816,18,Lecker vegan,11. Juli 2016,Martine R,"Mistelbach, Österreich",Vegane Bürger sind Klasse,5.0,HANS IM GLÜCK - Burgergrill and Bar,de,de,Gemütliches restaurant mit freundliche Bedienu...,1


### Randomization

In [22]:
# frac=1 samples every row, i.e. shuffles the whole DataFrame; the index is
# renumbered afterwards.
df_balanced_reviews_sentences = (
    df_balanced_reviews_sentences
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)
df_balanced_reviews_sentences

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx
0,483808031,1520135,19,"Lage unschlagbar, Qualität entsprechend aller ...",13. Mai 2017,Irk247,"Augsburg, Deutschland","Das Restaurant ist zentral gelegen, trotzdem s...",4.0,L'Osteria München Künstlerhaus,de,de,"Das Restaurant ist zentral gelegen, trotzdem s...",0
1,635650543,7082848,11,Gelungen !,25. November 2018,luih201,,Keine lange Wartezeiten und Typisch Italienisch.,5.0,L'Osteria,de,de,"Waren schon öfter da, Essen immer super lecker...",2
2,809253853,2271294,6,Worst ever HRC,10. September 2021,andydrei,,"Wenn Sie allerdings trockene, lauwarme Burger ...",2.0,Hard Rock Cafe,de,de,Von anderen Hard Rock Cafés bin ich leckeres E...,2
3,567785421,1870837,10,Super lecker,20. März 2018,thomasrV8773VX,"Königswinter, Deutschland",Immer wenn ich in <LOC> bin komme ich nicht dr...,5.0,BLOCK HOUSE Jungfernstieg,de,de,Immer wenn ich in Hamburg bin komme ich nicht ...,0
4,817846970,9554533,0,Absolut top,7. November 2021,spstutorial,"Kulmbach, Deutschland",Wir hatten nicht reserviert und bekamen trotzd...,4.0,HANS IM GLÜCK Burgergrill & Bar,de,de,Wir hatten nicht reserviert und bekamen trotzd...,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3495,722535583,758281,2,"Rasche Bedienung, recht gute Speisen, Freundli...",30. Oktober 2019,Heinz G,"Zürich, Schweiz","Schade, ansonsten wäre das noch ein guter Ort,...",4.0,Vapiano,de,de,"Das Essen war ganz ok, Vapiano Styl eben. Aber...",3
3496,637091546,2429098,5,Riesige Pizza,2. Dezember 2018,steph70806,"Kornwestheim, Deutschland",Also das allerwichtigste zuerst: Die Pizza war...,4.0,L'Osteria,de,de,Also das allerwichtigste zuerst: Die Pizza war...,0
3497,483726529,1520135,19,Super Pizza,12. Mai 2017,sebastianbY3422VY,"München, Deutschland",Service kommt ganz auf den Kellner/in an.,4.0,L'Osteria München Künstlerhaus,de,de,Beste Pizza in der Stadt. Für ein Essen mit Fr...,3
3498,706342521,2271294,14,Einer meiner Lieblingsplätze in Hamburg,4. September 2019,Bernd P,"München, Deutschland","Das Essen ist einfach super, die Burger so zie...",5.0,Hard Rock Cafe,de,de,"Hamburg ist sicher nicht arm an tourist traps,...",2


In [23]:
# Sanity check: number of sampled sentences per rating.
df_balanced_reviews_sentences["rating"].value_counts()

rating
4.0    700
5.0    700
2.0    700
3.0    700
1.0    700
Name: count, dtype: int64

In [24]:
# Count rows per (review_id, sentence_idx) pair; a count above 1 would mean
# the same sentence appears more than once in the balanced set.
(
    df_balanced_reviews_sentences
    .groupby(by=["review_id", "sentence_idx"])
    .size()
    .reset_index(name="count")
)

Unnamed: 0,review_id,sentence_idx,count
0,103493884,12,1
1,106323330,0,1
2,108229054,1,1
3,108229054,5,1
4,114707299,1,1
...,...,...,...
3495,910013568,3,1
3496,910516999,4,1
3497,910516999,9,1
3498,910516999,11,1


In [29]:
# Number of sampled sentences per restaurant.
df_balanced_reviews_sentences["restaurant_id"].value_counts()

restaurant_id
1520135    233
2429098    220
758281     199
718834     197
1875739    193
1118814    186
2271294    186
1119896    185
715361     180
1867249    177
4432816    171
7082848    168
1027424    161
9554533    159
715362     157
778662     156
1870837    152
801344     146
1821839    143
2395287    131
Name: count, dtype: int64

### Store as .csv 

In [26]:
# Filtered and anonymized reviews (one row per review).
df_reviews.to_csv(path_or_buf="../datasets/reviews.csv")

In [27]:
# All sentences of the filtered reviews (one row per sentence).
df_reviews_sentences.to_csv(path_or_buf="../datasets/reviews_sentences.csv")

In [28]:
# Rating-balanced, shuffled sentence sample.
df_balanced_reviews_sentences.to_csv(path_or_buf="../datasets/balanced_reviews_sentences.csv")