# Notebook: Filter Reviews from Collected HTMLs

## Packages

In [1]:
import json
import re

import nltk
import pandas as pd
import spacy
from bs4 import BeautifulSoup
from langdetect import detect
from langdetect import DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
from nltk.tokenize import sent_tokenize

## Settings

In [2]:
# Fetch NLTK's 'punkt' tokenizer data (reported as already up to date in the recorded run).
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
%%capture
#!python -m spacy download de_core_news_lg

In [4]:
# Load the large German spaCy pipeline; anonymize_entities() further down runs it to find named entities.
nlp = spacy.load("de_core_news_lg")

## Constants

In [5]:
# JSON file with restaurant metadata; read further down to map restaurant ids to restaurant names.
RESTAURANT_URLS = "restaurant_metadata_with_highest_page_index.json"
# CSV listing the collected reviews; restaurant_id, review_id and page_index are read from it.
REVIEWS_PATH = "reviews_urls.csv"
# NOTE(review): presumably a seed for stochastic steps — confirm where it is consumed.
RANDOM_STATE = 43

## Code

### Load Dataset

In [6]:
# Index of collected reviews: each row names one saved HTML file via restaurant_id and review_id.
reviews_df = pd.read_csv(REVIEWS_PATH)

### Load Reviews

In [7]:
# Column order of the review table that is built from the parsed reviews.
columns = [
    'review_id',
    'restaurant_id',
    'page_index',
    'title',
    'date',
    'author_name',
    'author_location',
    'text',
    'rating',
    'restaurant_name',
    'language_code',
]
# Collects one dict per parsed review.
data_reviews = list()

In [8]:
def load_review(review_soup):
    """Extract the fields of a single review from its parsed HTML element.

    Args:
        review_soup: Parsed element of one review; it must support ``find``
            the way a BeautifulSoup tag does.

    Returns:
        dict with the keys ``title``, ``date``, ``author_name``,
        ``author_location``, ``text`` and ``rating``. ``date`` is the raw
        ``title`` attribute of the date element, ``author_location`` is
        ``None`` when the review has no location element, and ``rating`` is
        the number embedded in the second CSS class of the rating element
        divided by 10 (a class such as ``<prefix>_40`` gives 4.0).
    """
    def text_of(css_class):
        # Text content of the first element carrying the given CSS class.
        return review_soup.find(class_=css_class).get_text()

    headline = review_soup.find("div", attrs={"class": "quote"}).get_text()
    posted_on = review_soup.find(class_='ratingDate')['title']
    author = text_of('scrname')

    # The location is optional on a review.
    location_tag = review_soup.find(class_='userLocation')
    location = location_tag.get_text() if location_tag else None

    body = text_of('partial_entry')

    bubble = review_soup.find(class_='reviewItemInline').find('span', class_='ui_bubble_rating')
    bubble_code = bubble['class'][1].split('_')[1]

    return {
        "title": headline,
        "date": posted_on,
        "author_name": author,
        "author_location": location,
        "text": body,
        "rating": int(bubble_code) / 10,
    }

In [9]:
# Start from an empty list so that re-running this cell does not append every
# review a second time.
data_reviews.clear()

for _, row in reviews_df.iterrows():
    review_id = row["review_id"]
    restaurant_id = row["restaurant_id"]

    # Each review was saved as its own HTML file, named after both ids.
    path_review = f"reviews_restaurants_html/restaurant_{restaurant_id}_review_{review_id}.html"
    with open(path_review, 'r', encoding='utf-8') as file:
        html_content = file.read()

    doc_soup = BeautifulSoup(html_content, 'html.parser')
    review_soup = doc_soup.find(id=f"review_{review_id}")
    review = load_review(review_soup)

    # The language code is read from the first matching element of the page.
    # Only the lookup failures are caught (missing div -> AttributeError,
    # missing span -> TypeError, missing attribute -> KeyError); a bare
    # `except:` would also swallow KeyboardInterrupt and real bugs.
    try:
        review["language_code"] = doc_soup.find("div", class_="prw_reviews_user_links_hsx").span["data-language"]
    except (AttributeError, TypeError, KeyError):
        review["language_code"] = "not defined"

    review["review_id"] = review_id
    review["restaurant_id"] = restaurant_id
    review["page_index"] = row["page_index"]
    data_reviews.append(review)

In [10]:
# Build the table in the fixed `columns` order. The loading loop never sets
# 'restaurant_name', so that column is empty until it is mapped in below.
df_reviews = pd.DataFrame(data_reviews, columns=columns)

### Add restaurant name

In [11]:
# Read the metadata explicitly as UTF-8 (as the review HTML files are), so
# restaurant names with umlauts do not depend on the platform's default encoding.
with open(RESTAURANT_URLS, 'r', encoding='utf-8') as json_file:
    restaurant_metadata = json.load(json_file)

In [12]:
# Restaurant id -> restaurant name, exactly as stored in the metadata file.
restaurant_dict = dict(
    (entry['id'], entry['restaurant']) for entry in restaurant_metadata
)
# Same mapping with the ids converted to int so they can be matched against
# the 'restaurant_id' column.
restaurant_dict_str = dict(
    (int(restaurant_id), name) for restaurant_id, name in restaurant_dict.items()
)
df_reviews['restaurant_name'] = df_reviews['restaurant_id'].map(restaurant_dict_str)

### Check for Duplicates

In [13]:
# All rows whose review_id occurs more than once (keep=False flags every copy);
# an empty result means there are no duplicates.
duplicate_rows = df_reviews[df_reviews['review_id'].duplicated(keep=False)]
duplicate_rows

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code


### Delete Examples without Data

In rare cases, the GET request for a review page does not return the review's text or title. These reviews are excluded.

In [14]:
# Keep only reviews that have both a non-empty text and a non-empty title.
df_reviews = df_reviews[df_reviews['text'].ne('') & df_reviews['title'].ne('')].copy()

### Filter Languages

We only consider reviews written in German.

In [15]:
# Keep only reviews whose page-level language code is 'de'.
df_reviews = df_reviews[df_reviews['language_code'].eq('de')].copy()

Even when the language code is "de", we have observed that some reviews are actually written in another language. These are excluded with the help of `langdetect`, a Python port of Google's language-detection library.

In [16]:
# langdetect is non-deterministic on short or ambiguous texts unless it is
# seeded before the first detection; seed it so the filter is reproducible.
DetectorFactory.seed = RANDOM_STATE


def _detect_language(text):
    """Return langdetect's language code for ``text``.

    Returns "unknown" when langdetect cannot extract any features from the
    text (for example text made up only of digits, emoji or punctuation), so
    a single odd review does not abort the whole run.
    """
    try:
        return detect(text)
    except LangDetectException:
        return "unknown"


df_reviews['detected_language'] = df_reviews['text'].apply(_detect_language)
# Reviews that the next cell excludes (detected language is not German)
df_reviews[df_reviews['detected_language'] != 'de']

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language
967,774382373,13295509,0,ES WAR nur akzeptable.It jast not very tasty.G...,15. Oktober 2020,EmmiQ,"Mexiko-Stadt, Mexiko",Dass Hamburger sind nicht außergewöhnlich. Mei...,4.0,hans im glück,de,en
1892,861674842,6415648,0,Nett aber laaaangsam / Nice but sloooowly,24. September 2022,davidmonba,"Valencia, Spanien","Der Ort ist nett, das Essen und die Getränke s...",4.0,l'osteria,de,en
3105,851895150,5930727,0,Entäuscht,4. August 2022,AbhiS_Hamburg_75,,I would advise against the falafel dishes beca...,1.0,dean&david,de,en


In [17]:
# Keep only reviews whose text was detected as German. The explicit copy makes
# df_reviews an independent frame, so the column assignments in later cells do
# not trigger pandas' SettingWithCopyWarning.
df_reviews = df_reviews[df_reviews['detected_language'] == 'de'].copy()

### Remove Reviews Posted Before July 2022

In [18]:
# German month name -> month number (1-12).
month_mapping = {
    name: number
    for number, name in enumerate(
        [
            "Januar", "Februar", "März", "April", "Mai", "Juni",
            "Juli", "August", "September", "Oktober", "November", "Dezember",
        ],
        start=1,
    )
}


def convert_date(date_string):
    """Turn a German date such as "15. Oktober 2020" into a ``pd.Timestamp``.

    Args:
        date_string: Three whitespace-separated tokens: day (with an optional
            dot), German month name, and year.

    Returns:
        pd.Timestamp for that calendar day.

    Raises:
        ValueError: if the string does not consist of exactly three tokens or
            day/year are not integers.
        KeyError: if the month name is not in ``month_mapping``.
    """
    day_token, month_token, year_token = date_string.split()
    day_digits = day_token.replace(".", "")
    month_number = month_mapping[month_token]
    return pd.Timestamp(year=int(year_token), month=month_number, day=int(day_digits))

# Replace the German date strings by timestamps, element by element.
df_reviews["date"] = df_reviews["date"].map(convert_date)

In [19]:
# Keep reviews dated 1 July 2022 or later.
df_reviews = df_reviews.loc[df_reviews["date"].ge(pd.Timestamp(year=2022, month=7, day=1))]

In [20]:
# Renumber the remaining rows 0..n-1. Re-assigning instead of using
# inplace=True yields a fresh frame rather than mutating the filtered slice,
# so the column assignments that follow do not warn about writing to a copy.
df_reviews = df_reviews.reset_index(drop=True)

### Anonymise

In [21]:
# Keep the untouched review text in its own column before the anonymisation steps overwrite "text".
df_reviews["text_noanonymization"] = df_reviews["text"]

In [22]:
def anonymize_entities(text, pipeline=None):
    """Replace location, person and date entities in ``text`` by placeholders.

    Args:
        text: Review text to anonymise.
        pipeline: Callable that returns an object with an ``ents`` iterable
            whose items expose ``text`` and ``label_``. Defaults to the
            module-level spaCy pipeline ``nlp``.

    Returns:
        The text with every occurrence of a recognised entity string replaced
        by ``<LOC>``, ``<PERSON>`` or ``<DATE>``.
    """
    # German spaCy pipelines label persons "PER"; "PERSON" and "DATE" are kept
    # for pipelines that use those labels. Both person labels share the
    # "<PERSON>" placeholder that is also used for the author name.
    placeholders = {
        "LOC": "<LOC>",
        "PER": "<PERSON>",
        "PERSON": "<PERSON>",
        "DATE": "<DATE>",
    }
    if pipeline is None:
        pipeline = nlp
    doc = pipeline(text)
    for ent in doc.ents:
        # "Essen" is a city name but in reviews it almost always means "food",
        # so it is never anonymised. The check has to look at the entity text;
        # comparing the label to "Essen" could never be false.
        if ent.text == "Essen":
            continue
        placeholder = placeholders.get(ent.label_)
        if placeholder is not None:
            text = text.replace(ent.text, placeholder)
    return text

# Run the entity anonymisation over every review text.
df_reviews["text"] = df_reviews["text"].map(anonymize_entities)

In [23]:
def anonymize_restaurant_name(text):
    """Replace known restaurant names (and common misspellings) in ``text``.

    Every listed spelling is matched case-insensitively as a whole word and
    replaced by ``<RESTAURANT_NAME>``. The spellings are applied one after
    another in the order listed.

    Args:
        text: Review text.

    Returns:
        The text with all matched spellings replaced.
    """
    known_spellings = (
        "vapiano",
        "hans im glück",
        "hans ins glück",
        "dean&david",
        "dean und david",
        "dean & david",
        "dean and david",
        "losteria",
        "l osteria",
        "l'osteria",
        "l‘osteria",
        "l´osteria",
        "Llosteria",
        "L’Osteria",
        "la osteria",
        "L`Osteria",
        "L’Hosteria",
        "blockhouse",
        "block house",
        "block hause",
        "blockhaus",
        "blockouse",
        "Block Houses",
        "vapianos",
    )
    for spelling in known_spellings:
        # Word boundaries keep a spelling from matching inside a longer word.
        matcher = re.compile(r'\b' + re.escape(spelling) + r'\b', re.IGNORECASE)
        text = matcher.sub("<RESTAURANT_NAME>", text)
    return text

# Mask restaurant names in every review text.
df_reviews["text"] = df_reviews["text"].map(anonymize_restaurant_name)

In [24]:
def anonymize_username(text, username):
    """Replace every occurrence of the author's username in ``text``.

    Args:
        text: Review text.
        username: Display name of the review author.

    Returns:
        The text with the username replaced by ``<PERSON>``. The text is
        returned unchanged when the username is empty or not a string: an
        empty search string would make ``str.replace`` insert the placeholder
        between all characters, and a non-string would raise.
    """
    if not isinstance(username, str) or not username:
        return text
    return text.replace(username, "<PERSON>")
# Pair every text with its author directly instead of using a row-wise
# DataFrame.apply: this avoids building a Series per row and also works when
# the frame is empty (apply(axis=1) then returns a DataFrame, not a column).
df_reviews["text"] = [
    anonymize_username(review_text, author)
    for review_text, author in zip(df_reviews["text"], df_reviews["author_name"])
]

### Store as .csv 

In [25]:
# Write the filtered reviews to disk (the row index is written as the first column).
# NOTE(review): the exported frame still contains author_name, author_location and
# text_noanonymization — confirm this file is not shared in this form.
df_reviews.to_csv("reviews_dataset/reviews.csv")

In [26]:
# Show the final frame.
df_reviews

Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization
0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Die schlechteste Pasta aglio olio ever!!! Schm...,1.0,vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...
1,907309442,1119896,0,Bestes Italienisches Restaurant Berlin,2023-08-02,Luca R,,"Ich lieeeebe <RESTAURANT_NAME>.Gutes Essen,sup...",5.0,vapiano,de,de,"Ich lieeeebe Vapiano.Gutes Essen,super nette M..."
2,904367426,1119896,0,Enttäuschung,2023-07-20,V6519ILannab,"Berlin, Deutschland",Ich war früher ein <RESTAURANT_NAME> Fan (also...,1.0,vapiano,de,de,Ich war früher ein Vapiano Fan (also in 2012-2...
3,865243904,1119896,0,wir gehen gerne hier hin,2022-10-20,575klat,,Wir gehen gerne hier hin. Immer ok für das Pre...,5.0,vapiano,de,de,Wir gehen gerne hier hin. Immer ok für das Pre...
4,863710312,1119896,0,"Gut gelegen, Essen naja",2022-10-08,DirkU42,"Bielefeld, Deutschland","Sehr gut gelegen am <LOC>, freundliche Bedienu...",3.0,vapiano,de,de,"Sehr gut gelegen am Potsdamer Platz, freundlic..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1653,868756286,25149391,0,Vielen Dank,2022-11-17,christiansL1481LW,,"Ausgezeichneter Service, komme gerne wieder. H...",5.0,dean&david,de,de,"Ausgezeichneter Service, komme gerne wieder. H..."
1654,868742251,25149391,0,Top,2022-11-17,626miguelw,,"Sehr lecker, schneller und guter Service, nett...",5.0,dean&david,de,de,"Sehr lecker, schneller und guter Service, nett..."
1655,868661042,25149391,0,Ausgesprochen schenelles gutes Essen und gesund!,2022-11-16,A6205ZAadrianav,,"Essen war sehr lecker Besonders positiv ist, d...",5.0,dean&david,de,de,"Essen war sehr lecker Besonders positiv ist, d..."
1656,868462723,21174965,0,Eine Empfehlung kann ich gerne geben,2022-11-14,Andreas G,"Sinzig, Deutschland","Essen war lecker, Bedienung war in Ordnung.Pre...",4.0,dean&david,de,de,"Essen war lecker, Bedienung war in Ordnung.Pre..."
