# Notebook: Filter Reviews from Collected HTMLs

## Packages

In [1]:
import json

import nltk
import pandas as pd
import spacy
from bs4 import BeautifulSoup
from langdetect import DetectorFactory
from langdetect import detect
from nltk.tokenize import sent_tokenize

## Settings

In [2]:
# Fetch the Punkt tokenizer data; the cell output shows it is skipped when already up-to-date.
NLTK_TOKENIZER_PACKAGE = "punkt"
nltk.download(NLTK_TOKENIZER_PACKAGE)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
%%capture
#!python -m spacy download de_core_news_lg

In [4]:
# German spaCy pipeline; its named-entity recognizer is used by `anonymize_entities`.
SPACY_MODEL_NAME = "de_core_news_lg"
nlp = spacy.load(SPACY_MODEL_NAME)

## Constants

In [5]:
# CSV that lists the collected reviews (restaurant_id, review_id, page_index are read from it).
REVIEWS_PATH: str = "../datasets/reviews_urls.csv"
# Seed passed to every `DataFrame.sample` call in this notebook.
RANDOM_STATE: int = 43

## Code

### Load Dataset

In [6]:
# Index of collected reviews; each row points at one stored review HTML file.
reviews_df = pd.read_csv(filepath_or_buffer=REVIEWS_PATH)

### Load Reviews

In [7]:
# Column order of the reviews DataFrame built from `data_reviews`.
columns = [
    'review_id',
    'restaurant_id',
    'page_index',
    'title',
    'date',
    'author_name',
    'author_location',
    'text',
    'rating',
    'restaurant_name',
    'language_code',
]
# Accumulates one dict per parsed review.
data_reviews = list()

In [8]:
def load_review(review_soup):
    """Extract the fields of one review from its parsed HTML element.

    Args:
        review_soup: Parsed element of a single review; must offer ``find``.

    Returns:
        dict with the keys ``title``, ``date``, ``author_name``,
        ``author_location`` (``None`` when the element is absent), ``text``
        and ``rating`` (float, bubble-class number divided by 10).
    """
    title = review_soup.find("div", attrs={"class": "quote"}).get_text()
    date = review_soup.find(class_='ratingDate')['title']
    author_name = review_soup.find(class_='scrname').get_text()

    # The location is optional: not every review carries a `userLocation` element.
    location_tag = review_soup.find(class_='userLocation')
    author_location = location_tag.get_text() if location_tag else None

    text = review_soup.find(class_='partial_entry').get_text()

    # The second CSS class of the bubble span ends in "_<number>"; number / 10 is the rating.
    bubble_span = review_soup.find(class_='reviewItemInline').find('span', class_='ui_bubble_rating')
    bubble_class = bubble_span['class'][1]
    rating = int(bubble_class.split('_')[1]) / 10

    return {
        "title": title,
        "date": date,
        "author_name": author_name,
        "author_location": author_location,
        "text": text,
        "rating": rating,
    }

In [9]:
# Start from an empty list so that re-running this cell does not append every
# review a second time (which would create duplicate review_ids).
data_reviews = []

for index, row in reviews_df.iterrows():
    review_id = row["review_id"]
    restaurant_id = row["restaurant_id"]

    # Each review was stored as its own HTML file, keyed by restaurant and review id.
    path_review = f"../datasets/reviews_restaurants_html/restaurant_{restaurant_id}_review_{review_id}.html"
    with open(path_review, 'r', encoding='utf-8') as file:
        html_content = file.read()
    doc_soup = BeautifulSoup(html_content, 'html.parser')

    # Review-level fields come from the element whose id is "review_<review_id>".
    review_soup = doc_soup.find(id=f"review_{review_id}")
    review = load_review(review_soup)

    # Page-level fields: restaurant name and the language code of the page.
    review["restaurant_name"] = doc_soup.find('a', class_='HEADING').get_text()
    review["language_code"] = doc_soup.find("div", class_="prw_reviews_user_links_hsx").span["data-language"]

    # Identifiers carried over from the index CSV.
    review["review_id"] = review_id
    review["restaurant_id"] = restaurant_id
    review["page_index"] = row["page_index"]
    data_reviews.append(review)

In [10]:
# `columns` fixes the column order of the resulting frame.
df_reviews = pd.DataFrame(data=data_reviews, columns=columns)

In [11]:
# keep=False flags every member of a duplicated review_id, not only the repeats.
is_duplicated_review = df_reviews.duplicated(subset=['review_id'], keep=False)
duplicate_rows = df_reviews[is_duplicated_review]
duplicate_rows

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code


### Delete Examples without Data

In rare cases the text (or title) of a review is not returned by the GET request to the restaurant's review page. These reviews are excluded here.

In [12]:
# Reviews with an empty text or an empty title carry no usable content.
has_missing_content = df_reviews['text'].eq('') | df_reviews['title'].eq('')
df_reviews = df_reviews.drop(index=df_reviews[has_missing_content].index)

### Filter Languages

We only consider reviews written in German.

In [13]:
# Drop every review whose page-level language code is not "de".
has_foreign_code = df_reviews['language_code'].ne('de')
df_reviews = df_reviews.drop(index=df_reviews[has_foreign_code].index)

Even when the language code is "de", we observed that some reviews are actually written in another language. These are excluded with the help of `langdetect`, a Python port of Google's language-detection library.

In [14]:
# langdetect is non-deterministic by default, so borderline texts can be
# classified differently between runs; fixing the seed makes this filter reproducible.
DetectorFactory.seed = RANDOM_STATE
df_reviews['detected_language'] = df_reviews['text'].apply(detect)
# Examples of reviews that are excluded in the next step
df_reviews[df_reviews['detected_language'] != 'de']

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language
869,774382373,13295509,0,ES WAR nur akzeptable.It jast not very tasty.G...,15. Oktober 2020,EmmiQ,"Mexiko-Stadt, Mexiko",Dass Hamburger sind nicht außergewöhnlich. Mei...,4.0,HANS IM GLÜCK Burgergrill & Bar,de,en


In [15]:
# Keep only the reviews whose text was detected as German.
is_detected_german = df_reviews['detected_language'].eq('de')
df_reviews = df_reviews[is_detected_german]

### Check for Duplicates

### Remove Reviews Posted Before July 2022

In [16]:
# German month names in calendar order; the position (starting at 1) is the month number.
_GERMAN_MONTH_NAMES = (
    "Januar", "Februar", "März", "April", "Mai", "Juni",
    "Juli", "August", "September", "Oktober", "November", "Dezember",
)
month_mapping = {name: number for number, name in enumerate(_GERMAN_MONTH_NAMES, start=1)}


def convert_date(date_string):
    """Parse a German date such as ``"15. Oktober 2020"`` into a ``pd.Timestamp``.

    Args:
        date_string: Three whitespace-separated tokens: day (with a trailing
            dot), German month name, and year.

    Returns:
        pd.Timestamp for that calendar day.

    Raises:
        ValueError: if the string does not split into exactly three tokens or
            day/year are not integers.
        KeyError: if the month name is not in ``month_mapping``.
    """
    day_token, month_name, year_token = date_string.split()
    day_digits = day_token.replace(".", "")
    month_number = month_mapping[month_name]
    return pd.Timestamp(year=int(year_token), month=month_number, day=int(day_digits))

# Replace the German date strings by timestamps so they can be compared.
df_reviews["date"] = df_reviews["date"].map(convert_date)

In [17]:
# Keep reviews dated 1 July 2022 or later.
earliest_review_date = pd.Timestamp(year=2022, month=7, day=1)
is_recent_review = df_reviews["date"] >= earliest_review_date
df_reviews = df_reviews[is_recent_review]

In [18]:
# Inspect the reviews that remain after the language and date filters.
df_reviews

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language
0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Die schlechteste Pasta aglio olio ever!!! Schm...,1.0,Vapiano,de,de
10,907309442,1119896,0,Bestes Italienisches Restaurant Berlin,2023-08-02,Luca R,,"Ich lieeeebe Vapiano.Gutes Essen,super nette M...",5.0,Vapiano,de,de
11,904367426,1119896,0,Enttäuschung,2023-07-20,V6519ILannab,"Berlin, Deutschland",Ich war früher ein Vapiano Fan (also in 2012-2...,1.0,Vapiano,de,de
12,865243904,1119896,0,wir gehen gerne hier hin,2022-10-20,575klat,,Wir gehen gerne hier hin. Immer ok für das Pre...,5.0,Vapiano,de,de
13,863710312,1119896,0,"Gut gelegen, Essen naja",2022-10-08,DirkU42,"Bielefeld, Deutschland","Sehr gut gelegen am Potsdamer Platz, freundlic...",3.0,Vapiano,de,de
...,...,...,...,...,...,...,...,...,...,...,...,...
2170,879326522,9758291,1,"Top Service, top essen",2023-02-19,ubique23,,"Sind immer wieder mal hier, super Location, zu...",5.0,L'Osteria Oberhausen,de,de
2171,872123275,9758291,1,"Super Service, stylisches Ambiente, klasse Pre...",2022-12-18,SabineS964,Arnsberg,"Tolles, modernes, stylisches und mit Raumgefüh...",5.0,L'Osteria Oberhausen,de,de
2172,863022044,9758291,1,Riiiesige Pizza!,2022-10-03,wieselwaemser,"Stein am Rhein, Schweiz","Sehr gut besucht, freundliche Bedienung • gemi...",4.0,L'Osteria Oberhausen,de,de
2173,862698017,9758291,1,Ganz schlecht.,2022-10-01,1961mh2019,"Neuss, Deutschland",Wir waren eine größere Gruppe. Bis auf eine Pe...,1.0,L'Osteria Oberhausen,de,de


### Anonymise

In [19]:
# Preserve the original review text before `text` is overwritten by the anonymised version.
df_reviews = df_reviews.assign(text_noanonymization=df_reviews["text"])

In [20]:
def anonymize_entities(text):
    """Replace location, person and date entities in ``text`` with placeholders.

    The text is run through the module-level spaCy pipeline ``nlp``; every
    occurrence of a recognised entity's text is replaced by ``<LOC>``,
    ``<PERSON>`` or ``<DATE>``.

    Args:
        text: Review text to anonymise.

    Returns:
        The text with the matching entities replaced.
    """
    # German spaCy models label persons as "PER"; "PERSON" and "DATE" are kept
    # for pipelines that emit them. Persons always map to "<PERSON>", the same
    # placeholder that is used for the author's user name.
    placeholder_by_label = {
        "LOC": "<LOC>",
        "PER": "<PERSON>",
        "PERSON": "<PERSON>",
        "DATE": "<DATE>",
    }
    doc = nlp(text)
    for ent in doc.ents:
        placeholder = placeholder_by_label.get(ent.label_)
        if placeholder is None:
            continue
        # "Essen" (food) is easily tagged as the city of Essen; compare the
        # entity *text* (the original compared the label, which never matched).
        if ent.text == "Essen":
            continue
        text = text.replace(ent.text, placeholder)
    return text

# Overwrite the review text with its entity-anonymised version.
df_reviews["text"] = df_reviews["text"].map(anonymize_entities)

In [21]:
def anonymize_username(text, username):
    """Replace every occurrence of the author's user name in ``text`` with ``<PERSON>``.

    Args:
        text: Review text.
        username: User name of the review's author.

    Returns:
        The text with the user name replaced. If ``username`` is empty or not
        a string (e.g. a missing value), ``text`` is returned unchanged.
    """
    # str.replace with an empty search string would insert the placeholder
    # between all characters, and a non-string (NaN) would raise a TypeError.
    if not isinstance(username, str) or not username:
        return text
    return text.replace(username, "<PERSON>")
def _anonymize_author_in_row(review_row):
    """Mask the row's own author name inside the row's review text."""
    return anonymize_username(review_row["text"], review_row["author_name"])

df_reviews["text"] = df_reviews.apply(_anonymize_author_in_row, axis=1)

### Filter Sentences

In [72]:
# Collect one record per sentence and build the DataFrame once at the end;
# appending row by row with `.loc[len(df)]` copies the frame on every insert.
sentence_records = []

for idx, row in df_reviews.iterrows():
    # NOTE(review): sent_tokenize is called without `language=`, so NLTK's default
    # Punkt model is used although the reviews are German. Switching to
    # language="german" would change the sentence counts per rating — confirm before changing.
    sentences = sent_tokenize(row['text'])
    sentence_index = 0
    for sentence in sentences:
        # Skip fragments of a single character.
        if len(sentence) > 1:
            new_row = row.to_dict()  # copy all review-level fields
            new_row['text'] = sentence  # 'text' now holds the current sentence only
            new_row['sentence_idx'] = sentence_index  # position among the kept sentences
            sentence_records.append(new_row)
            sentence_index += 1

df_reviews_sentences = pd.DataFrame(
    sentence_records,
    columns=list(df_reviews.columns) + ['sentence_idx'],
)

In [73]:
# Inspect the sentence-level frame (one row per kept sentence).
df_reviews_sentences

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx
0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Die schlechteste Pasta aglio olio ever!!!,1.0,Vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,0
1,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Schmeckte absolut nach nichts.,1.0,Vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,1
2,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Unmotiviertes Personal.,1.0,Vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,2
3,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Das hat rein gar nichts mit italienischer Lebe...,1.0,Vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,3
4,907309442,1119896,0,Bestes Italienisches Restaurant Berlin,2023-08-02,Luca R,,"Ich lieeeebe Vapiano.Gutes Essen,super nette M...",5.0,Vapiano,de,de,"Ich lieeeebe Vapiano.Gutes Essen,super nette M...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5385,854856609,9758291,1,Super lecker,2022-08-18,J9246GNcaros,,Wir kommen wieder.,5.0,L'Osteria Oberhausen,de,de,Wir waren das erste mal hier.Die Pizza war mee...,3
5386,854856609,9758291,1,Super lecker,2022-08-18,J9246GNcaros,,Das Ambiente war toll.,5.0,L'Osteria Oberhausen,de,de,Wir waren das erste mal hier.Die Pizza war mee...,4
5387,854856609,9758291,1,Super lecker,2022-08-18,J9246GNcaros,,Draußen zu sitzen war wunderschön.,5.0,L'Osteria Oberhausen,de,de,Wir waren das erste mal hier.Die Pizza war mee...,5
5388,854856609,9758291,1,Super lecker,2022-08-18,J9246GNcaros,,Bedienung war sehr lieb.,5.0,L'Osteria Oberhausen,de,de,Wir waren das erste mal hier.Die Pizza war mee...,6


In [74]:
# Spot check: sentences that mention "maske" (case-insensitive).
mentions_maske = df_reviews_sentences["text"].str.contains("maske", case=False)
df_reviews_sentences[mentions_maske]

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx
1143,881602437,4432816,0,Hausrecht steht über Gastfreundlichkeit,2023-03-09,Wuni123,"Nürnberg, Deutschland",Trotz Reservierung keinen Platz bekommen weil ...,1.0,HANS IM GLÜCK - Burgergrill and Bar,de,de,Leider Reinfall. Trotz Reservierung keinen Pla...,1
1144,881602437,4432816,0,Hausrecht steht über Gastfreundlichkeit,2023-03-09,Wuni123,"Nürnberg, Deutschland",Ärztliches Attest zur Maskenbefreiung wurde ni...,1.0,HANS IM GLÜCK - Burgergrill and Bar,de,de,Leider Reinfall. Trotz Reservierung keinen Pla...,2


In [75]:
# Number of sentences per star rating before balancing.
sentence_rating_counts = df_reviews_sentences["rating"].value_counts()
sentence_rating_counts

rating
1.0    1692
5.0    1478
2.0     965
4.0     653
3.0     602
Name: count, dtype: int64

### Balancing

In [76]:
def _sample_per_rating(rating_group):
    """Draw 600 sentences without replacement from one rating group."""
    return rating_group.sample(600, replace=False, random_state=RANDOM_STATE)

# Same number of sentences for every rating; the index is renumbered afterwards.
df_balanced_reviews_sentences = (
    df_reviews_sentences
    .groupby(['rating'], group_keys=False)
    .apply(_sample_per_rating)
    .reset_index(drop=True)
)
df_balanced_reviews_sentences

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx
0,869007343,8000775,0,Lächerlich,2022-11-19,Ddttmmdd,,Von den 10 Nudeltaschen waren 7 nicht gefüllt.,1.0,L'Osteria Dortmund,de,de,Local war Mittags maximal zu 20% voll. Habe da...,4
1,867774773,7006685,5,Freches Personal,2022-11-09,934dani123,,Belehrungen einem Gast gegenüber sind fehl am ...,1.0,L'Osteria Leipzig,de,de,Wir waren gestern Abend hier essen. Die Vorspe...,3
2,849835128,11816828,1,Unbeschreiblich schlechter Service,2022-07-25,mirey_xx,"Berlin, Deutschland",So wollten wir einfach nur schnell dort raus u...,1.0,L'Osteria Dresden Prager Strasse,de,de,"Das Essen zählt nicht in die Wertung, denn die...",2
3,877881046,1007610,1,Eine Frechheit,2023-02-08,diewiwi,"München, Deutschland",Die Pasta sind sehr dürftig in der Portion.,1.0,L'Osteria Rosenheim,de,de,Wir haben für eine Sitzung Essen bestellt. Die...,5
4,885427871,3236000,0,Enttäuschend,2023-04-09,_7591z,"Frankfurt am Main, Deutschland",Heute ist das schlechteste Tankstellen Restaur...,1.0,VAPIANO Karlsruhe,de,de,Bestellten Spqhetti Aglio er Olio OHNE Chili. ...,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,873756991,1820035,0,Sehr gut,2023-01-03,edoh2023,,Ich mag es hier zu essen beste pinssa,5.0,VAPIANO Darmstadt,de,de,Es war sehr lecker und gute Mitarbeiter top em...,1
2996,891417564,3692250,3,Vapiano Wunderbar,2023-05-22,T L,,"Ein fantastisches Lokal, absolut freundliche M...",5.0,Vapiano,de,de,"Ein fantastisches Lokal, absolut freundliche M...",0
2997,852756726,5003254,0,Einfach klasse,2022-08-08,Jutta1608,"Plettenberg, Deutschland","Sie sind nicht nur unfassbar gut, sondern auch...",5.0,L'Osteria Rostock,de,de,Wir waren in Rostock an dem gefühlt einzigen T...,6
2998,849861918,8121708,1,Abendessen,2022-07-25,axelr887,,Mega 💪 💪 💪 💪 Tip kam vom Hotel und war echt 👍 ...,5.0,L'Osteria Münster,de,de,Mega 💪 💪 💪 💪 Tip kam vom Hotel und war echt 👍 ...,0


### Randomisation

In [77]:
# frac=1 samples every row, i.e. shuffles the whole DataFrame; then renumber the index.
df_balanced_reviews_sentences = (
    df_balanced_reviews_sentences
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)
df_balanced_reviews_sentences

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx
0,888500600,10454635,0,"Burger gut, Lokal eng und laut",2023-05-01,Karin L,,"Die Burger sind gut, die Pommes sehr gut.",3.0,HANS IM GLÜCK Burgergrill & Bar,de,de,"Sonst meide ich Restaurantketten, aber an eine...",1
1,887012563,1308296,0,Bloch House ist Block House,2023-04-20,RalphBausRod,,Was ich aber in anbetracht der Lage verstehen ...,3.0,BLOCK HOUSE Am Aegi,de,de,Block House ist Block House. Nach der Messe mi...,6
2,873522843,19042916,0,Ordentliches Steak und superfreundlicher Service,2023-01-01,JRSCH1962,"Hannover, Deutschland","Die Fleischqualität war gut, die Baked Potato ...",4.0,BLOCK HOUSE Erfurt,de,de,Das „Mr Rumpsteak“ wurde wie geordert „medium“...,1
3,894161363,11816828,1,Empfehlenswerter Ausflug,2023-06-09,Climber07002801964,,Die Pizzen sind riesig und einfach super lecker.,4.0,L'Osteria Dresden Prager Strasse,de,de,Die Pizzen sind riesig und einfach super lecke...,0
4,894863636,5510297,0,Für uns ein Klassiker...,2023-06-13,macrippchen,"Brunsbüttel, Deutschland","Sehr aufmerksamer Service, das Fleisch lecker,...",5.0,BLOCK HOUSE Pöseldorf,de,de,"Alles auf den Punkt, unfassbar guter Service, ...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,845764015,8489732,0,"Guter Standard, freundlicher Service, kleine V...",2022-07-02,petermO6801RX,"Rüdersdorf, Deutschland",mit dem Sauerrahm gemacht wird.,4.0,BLOCK HOUSE Harburg,de,de,"Ich gehe gerne in Block-House-Restaurants, wei...",7
2996,879321144,2015988,0,Zuverlässig und riesige Pizzen,2023-02-19,CasaColonia,"Troisdorf, Deutschland",Die <LOC> in <LOC> ist stehts sauber und biete...,4.0,L'Osteria Troisdorf,de,de,Die L’Osteria in Troisdorf ist stehts sauber u...,0
2997,849351269,10685715,0,Tiptop,2022-07-23,christianzW2917NM,"Hamburg, Deutschland",Fritten waren gut und wir haben uns eine große...,4.0,HANS IM GLÜCK Burgergrill & Bar,de,de,Essen wir man das erwartet. Stabiler Burger. P...,3
2998,883301760,5270577,1,Lecker schmecker,2023-03-23,Hannah M,,Es war etwas viel los aber trotzdem sehr freun...,4.0,L'Osteria,de,de,Das Essen war sehr lecker und frisch zubereite...,1


In [78]:
# Number of sentences per star rating after balancing.
balanced_rating_counts = df_balanced_reviews_sentences["rating"].value_counts()
balanced_rating_counts

rating
3.0    600
4.0    600
5.0    600
2.0    600
1.0    600
Name: count, dtype: int64

In [79]:
# Count rows per (review_id, sentence_idx) pair in the balanced sample.
sentence_key_counts = (
    df_balanced_reviews_sentences
    .groupby(["review_id", "sentence_idx"])
    .size()
    .reset_index(name="count")
)
sentence_key_counts

Unnamed: 0,review_id,sentence_idx,count
0,845622217,1,1
1,845622217,4,1
2,845622217,6,1
3,845655286,0,1
4,845655286,1,1
...,...,...,...
2995,912098351,0,1
2996,912098351,1,1
2997,912098351,2,1
2998,912098351,3,1


In [80]:
# Number of sampled sentences per restaurant.
restaurant_sentence_counts = df_balanced_reviews_sentences["restaurant_id"].value_counts()
restaurant_sentence_counts

restaurant_id
3692250     128
7006685     106
1520135      72
7082848      70
10463354     67
           ... 
7599521       2
5510297       2
10454645      2
3680501       1
5826611       1
Name: count, Length: 145, dtype: int64

### Store as .csv 

In [81]:
# Export the filtered, anonymised reviews (one row per review).
df_reviews.to_csv(path_or_buf="../datasets/reviews.csv")

In [82]:
# Export all sentences (one row per sentence, before balancing).
df_reviews_sentences.to_csv(path_or_buf="../datasets/reviews_sentences.csv")

In [83]:
# Export the balanced and shuffled sentence sample.
df_balanced_reviews_sentences.to_csv(path_or_buf="../datasets/balanced_reviews_sentences.csv")