# Notebook: Filter Reviews from Collected HTMLs

## Packages

In [1]:
from langdetect import detect
from bs4 import BeautifulSoup
import pandas as pd
import spacy
import json
import nltk
from nltk.tokenize import sent_tokenize

## Settings

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
%%capture
!python -m spacy download de_core_news_lg

In [4]:
nlp = spacy.load("de_core_news_lg")

## Constants

In [5]:
REVIEWS_PATH = "../datasets/reviews_urls.csv"
RANDOM_STATE = 43

## Code

### Load Dataset

In [6]:
reviews_df = pd.read_csv(REVIEWS_PATH)

### Load Reviews

In [7]:
columns = ['review_id', 'restaurant_id', 'page_index', 'title', 'date', 'author_name', 'author_location', 'text', 'rating', 'restaurant_name', 'language_code']
data_reviews = []

In [8]:
def load_review(review_soup):
    review = {}
    review["title"] = review_soup.find("div", attrs={"class": "quote"}).get_text()
    review["date"] = review_soup.find(class_='ratingDate')['title']
    review["author_name"] = review_soup.find(class_='scrname').get_text()
    user_location_element = review_soup.find(class_='userLocation')
    if user_location_element:
        user_location = user_location_element.get_text()
    else:
        user_location = None
    review["author_location"] = user_location
    review["text"] = review_soup.find(class_='partial_entry').get_text()
    review["rating"] = int(review_soup.find(class_='reviewItemInline').find('span', class_='ui_bubble_rating')['class'][1].split('_')[1]) / 10
    return review

In [9]:
for index, row in reviews_df.iterrows():
    path_review = "../datasets/reviews_restaurants_html/restaurant_" + str(row['restaurant_id']) + "_review_" + str(row["review_id"]) + ".html"
    with open(path_review, 'r', encoding='utf-8') as file:
        html_content = file.read()
    doc_soup = BeautifulSoup(html_content, 'html.parser')
    review_soup = doc_soup.find(id="review_"+str(row["review_id"]))
    review = load_review(review_soup)
    review["restaurant_name"] = doc_soup.find('a', class_='HEADING').get_text()
    review["language_code"] = doc_soup.find("div", class_="prw_reviews_user_links_hsx").span["data-language"]
    review["review_id"] = row["review_id"]
    review["restaurant_id"] = row["restaurant_id"]
    review["page_index"] = row["page_index"]
    data_reviews.append(review)

In [10]:
df_reviews = pd.DataFrame(data_reviews, columns=columns)

In [15]:
duplicate_rows = df_reviews[df_reviews.duplicated(subset=['review_id'], keep=False)]
duplicate_rows

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language


### Delete Examples without Data

There are rare cases where the text from the rating is not returned with the GET request to the page from the restaurant rating. These will now be excluded.

In [11]:
df_reviews = df_reviews.drop(df_reviews[(df_reviews['text'] == '') | (df_reviews['title'] == '')].index)

### Filter Languages

We are only considering reviews in german language.

In [12]:
df_reviews = df_reviews.drop(df_reviews[(df_reviews['language_code'] != 'de')].index)

Even if the language code = "de", we have observed that reviews in other languages sometimes have "de" as the language code. These are excluded with the help of Google's language-detection.

In [13]:
df_reviews['detected_language'] = df_reviews['text'].apply(lambda x: detect(x))
# Examples of reviews that were excluded
df_reviews[df_reviews['detected_language'] != 'de']

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language


In [14]:
df_reviews = df_reviews[df_reviews['detected_language'] == 'de']

### Check for Duplicates

### Remove Reviews Posted Before June 2022 

In [16]:
month_mapping = {
    "Januar": 1, "Februar": 2, "März": 3, "April": 4, "Mai": 5, "Juni": 6,
    "Juli": 7, "August": 8, "September": 9, "Oktober": 10, "November": 11, "Dezember": 12
}

def convert_date(date_string):
    day, month_name, year = date_string.split()
    day = day.replace(".", "")
    month = month_mapping[month_name]
    return pd.Timestamp(int(year), month, int(day))

df_reviews["date"] = df_reviews["date"].apply(convert_date)

In [17]:
df_reviews = df_reviews[df_reviews["date"] >= pd.Timestamp(2022, 7, 1)]

In [18]:
df_reviews

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language
0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Die schlechteste Pasta aglio olio ever!!! Schm...,1.0,Vapiano,de,de
200,907309442,1119896,0,Bestes Italienisches Restaurant Berlin,2023-08-02,Luca R,,"Ich lieeeebe Vapiano.Gutes Essen,super nette M...",5.0,Vapiano,de,de
201,904367426,1119896,0,Enttäuschung,2023-07-20,V6519ILannab,"Berlin, Deutschland",Ich war früher ein Vapiano Fan (also in 2012-2...,1.0,Vapiano,de,de
202,865243904,1119896,0,wir gehen gerne hier hin,2022-10-20,575klat,,Wir gehen gerne hier hin. Immer ok für das Pre...,5.0,Vapiano,de,de
203,863710312,1119896,0,"Gut gelegen, Essen naja",2022-10-08,DirkU42,"Bielefeld, Deutschland","Sehr gut gelegen am Potsdamer Platz, freundlic...",3.0,Vapiano,de,de
...,...,...,...,...,...,...,...,...,...,...,...,...
3815,869075102,801344,1,Achtung Riesenpizza,2022-11-20,hobar05,"Mainz, Deutschland",Wer auf viel zu große und entsprechend teure P...,4.0,L'Osteria Regensburg,de,de
3816,868856950,801344,1,Bleibend gut!,2022-11-18,PoxdorferJung,"Franken, Deutschland",Wieder hier in der L´Osteria und wieder können...,5.0,L'Osteria Regensburg,de,de
3817,862909957,801344,1,"Lecker, aber zu laut",2022-10-02,AceOfSpades68,"Troisdorf, Deutschland",Essen war lecker . Das passt eigentlich in jed...,3.0,L'Osteria Regensburg,de,de
3818,859771038,801344,1,Zum Abgewöhnen,2022-09-12,Claudius_753,,"Ja, also, was soll ich sagen… wir waren recht ...",1.0,L'Osteria Regensburg,de,de


### Anonymise

In [19]:
df_reviews["text_noanonymization"] = df_reviews["text"]

In [20]:
def anonymize_entities(text):
    doc = nlp(text)
    for ent in doc.ents:
        if ent.label_ in ["LOC", "PERSON", "DATE"] and ent.label_ != "Essen":
            text = text.replace(ent.text, f"<{ent.label_}>")
    return text

df_reviews["text"] = df_reviews["text"].apply(anonymize_entities)

In [21]:
def anonymize_username(text, username):
    return text.replace(username, "<PERSON>")
df_reviews["text"] = df_reviews.apply(lambda row: anonymize_username(row["text"], row["author_name"]), axis=1)

### Filter Sentences

In [22]:
df_reviews_sentences = pd.DataFrame(columns=list(df_reviews.columns) + ['sentence_idx'])

for idx, row in df_reviews.iterrows():
    sentences = sent_tokenize(row['text'])
    sentence_index = 0
    for sentence in sentences:
        new_row = row.copy()  # Kopiere die gesamte Zeile
        new_row['text'] = sentence  # Setze 'text' auf den aktuellen Satz
        new_row['sentence_idx'] = sentence_index
        df_reviews_sentences.loc[len(df_reviews_sentences)] = new_row
        sentence_index += 1

In [25]:
df_reviews_sentences

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx
0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Die schlechteste Pasta aglio olio ever!!!,1.0,Vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,0
1,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Schmeckte absolut nach nichts.,1.0,Vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,1
2,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Unmotiviertes Personal.,1.0,Vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,2
3,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Das hat rein gar nichts mit italienischer Lebe...,1.0,Vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,3
4,907309442,1119896,0,Bestes Italienisches Restaurant Berlin,2023-08-02,Luca R,,"Ich lieeeebe Vapiano.Gutes Essen,super nette M...",5.0,Vapiano,de,de,"Ich lieeeebe Vapiano.Gutes Essen,super nette M...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1127,859361589,801344,1,"Gutes Essen, Tische sehr eng beinander und ein...",2022-09-10,CydoniaOblonga,"Regensburg, Deutschland","Die Auswahl sind 10, 15 oder 20%.",2.0,L'Osteria Regensburg,de,de,Zugegeben die Pizzen und Pasta Gerichte sind s...,8
1128,859361589,801344,1,"Gutes Essen, Tische sehr eng beinander und ein...",2022-09-10,CydoniaOblonga,"Regensburg, Deutschland","Welche Frechheit, in einer Pizzeria 20% Trinkg...",2.0,L'Osteria Regensburg,de,de,Zugegeben die Pizzen und Pasta Gerichte sind s...,9
1129,859361589,801344,1,"Gutes Essen, Tische sehr eng beinander und ein...",2022-09-10,CydoniaOblonga,"Regensburg, Deutschland","!Eine 5% Auswahl, was in meinen Augen angemess...",2.0,L'Osteria Regensburg,de,de,Zugegeben die Pizzen und Pasta Gerichte sind s...,10
1130,859361589,801344,1,"Gutes Essen, Tische sehr eng beinander und ein...",2022-09-10,CydoniaOblonga,"Regensburg, Deutschland",Gemütlich ist das Lokal leider überhaupt nicht...,2.0,L'Osteria Regensburg,de,de,Zugegeben die Pizzen und Pasta Gerichte sind s...,11


In [28]:
df_reviews_sentences[df_reviews_sentences["text"].str.contains("maske", case=False)]

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx
173,881602437,4432816,0,Hausrecht steht über Gastfreundlichkeit,2023-03-09,Wuni123,"Nürnberg, Deutschland",Trotz Reservierung keinen Platz bekommen weil ...,1.0,HANS IM GLÜCK - Burgergrill and Bar,de,de,Leider Reinfall. Trotz Reservierung keinen Pla...,1
174,881602437,4432816,0,Hausrecht steht über Gastfreundlichkeit,2023-03-09,Wuni123,"Nürnberg, Deutschland",Ärztliches Attest zur Maskenbefreiung wurde ni...,1.0,HANS IM GLÜCK - Burgergrill and Bar,de,de,Leider Reinfall. Trotz Reservierung keinen Pla...,2


In [34]:
df_reviews_sentences["rating"].value_counts()

rating
1.0    357
5.0    223
4.0    200
2.0    189
3.0    163
Name: count, dtype: int64

### Balancing

In [33]:
df_balanced_reviews_sentences = df_reviews_sentences.groupby(['rating'], group_keys=False).apply(lambda x: x.sample(150, replace=False, random_state=RANDOM_STATE))
df_balanced_reviews_sentences.reset_index(drop=True, inplace=True)
df_balanced_reviews_sentences

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx
0,909563365,7082848,0,Nicht zu empfehlen.,2023-08-11,Mercii G,,hat EWIG gedauert.,1.0,L'Osteria,de,de,"Katastrophe. Essen war nicht gut, Service 0/10...",4
1,849473883,2271294,3,Leider nicht mehr wie früher,2022-07-23,OlliK1979,"Achern, Deutschland",Anschließend gab es Diskussionen und wir wurde...,1.0,Hard Rock Cafe,de,de,"Wir waren schon öfter in Hard Rock Cafés, auch...",5
2,846703493,758281,0,Vollkommen überteuert und unfreundlicher Inhaber,2022-07-07,hubertbF8343KP,Welt,Das hat auch die Anzahl der Gäste bestätigt.Vi...,1.0,Vapiano,de,de,Vapiano war mal sehr gut und wir sind wirklich...,16
3,859283836,1875739,0,Schlechtes Servicepersonal,2022-09-09,Cruiser55172487005,"München, Deutschland",Schlechter Umgang mit Kunden.,1.0,HANS IM GLÜCK Burgergrill & Bar,de,de,Unfreundliches Service Personal. Schlechter Um...,1
4,846489637,715361,0,Ganz mieser Service,2022-07-06,frankfurterjungs2022,,!,1.0,Hard Rock Cafe,de,de,"Waren eine JGA-Gruppe von 13 Mann, es war heiß...",6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745,884693551,715362,0,Leckere Burger und freundlicher Service,2023-04-03,Andreas_aus_B,"Berlin, Deutschland",Wir haben hier einen wirklich gelungenen Abend...,5.0,Hard Rock Cafe Berlin,de,de,Wir haben hier einen wirklich gelungenen Abend...,0
746,873553033,4432816,0,Lieblingsburger,2023-01-01,KathiB189,,Freundlicher und sehr schneller Service.,5.0,HANS IM GLÜCK - Burgergrill and Bar,de,de,Freundlicher und sehr schneller Service. Lecke...,0
747,884693551,715362,0,Leckere Burger und freundlicher Service,2023-04-03,Andreas_aus_B,"Berlin, Deutschland",Der Service war dazu sehr herzlich.,5.0,Hard Rock Cafe Berlin,de,de,Wir haben hier einen wirklich gelungenen Abend...,2
748,885452727,1520135,1,Pizza im Herzen der Stadt,2023-04-09,Clara E,,"Preis-Leistung-Verhältnis ist wirklich top, be...",5.0,L'Osteria München Künstlerhaus,de,de,Das Personal empfängt dich willkommen. Der Ein...,2


### Randomisierung

In [None]:
df_balanced_reviews_sentences = df_balanced_reviews_sentences.sample(frac=1, random_state=RANDOM_STATE)  # 'frac=1' mischt den gesamten DataFrame
df_balanced_reviews_sentences.reset_index(drop=True, inplace=True)
df_balanced_reviews_sentences

In [None]:
df_balanced_reviews_sentences["rating"].value_counts()

In [None]:
df_balanced_reviews_sentences.groupby(["review_id", "sentence_idx"]).size().reset_index(name="count")

In [None]:
df_balanced_reviews_sentences["restaurant_id"].value_counts()

### Store as .csv 

In [None]:
df_reviews.to_csv("../datasets/reviews.csv")

In [None]:
df_reviews_sentences.to_csv("../datasets/reviews_sentences.csv")

In [None]:
df_balanced_reviews_sentences.to_csv("../datasets/balanced_reviews_sentences.csv")

In [None]:
docker run -d --name doccano -p 8000:8000 -v app_data:/app/data doccano/doccano