# Filter Reviews from Collected HTML Files

## Packages

In [140]:
from bs4 import BeautifulSoup
import pandas as pd
import json

## Constants

In [141]:
# Relative path to the CSV that lists the reviews to process; it is loaded into
# `reviews_df` in the "Load Dataset" section.
REVIEWS_PATH = "../datasets/reviews_urls.csv"

## Code

### Load Dataset

In [142]:
# Load the review index. The extraction loop further down reads the
# `restaurant_id` and `review_id` columns of each row to locate the saved HTML file.
reviews_df = pd.read_csv(REVIEWS_PATH)

### Load Reviews

In [143]:
# Filter for German-language reviews

In [144]:
# Column order of the final reviews table.
columns = [
    'title',
    'date',
    'author_name',
    'author_location',
    'entry',
    'rating',
    'restaurant_name',
    'language_review',
]

# Collects one dict per parsed review.
data_reviews = list()

In [145]:
def load_review(review_soup):
    """Extract the fields of a single review from its HTML element.

    Parameters
    ----------
    review_soup
        BeautifulSoup element that wraps one review. It must support
        ``find(...)`` lookups by tag name and CSS class.

    Returns
    -------
    dict
        ``title``            text of the ``div`` with class ``quote``
        ``date``             ``title`` attribute of the ``ratingDate`` element
        ``author_name``      text of the ``scrname`` element
        ``author_location``  text of the ``userLocation`` element, or ``None``
                             when the review has no such element
        ``entry``            text of the ``partial_entry`` element
        ``rating``           numeric part of the ``bubble_NN`` class divided
                             by 10 (e.g. ``bubble_30`` -> ``3.0``)

    Raises
    ------
    AttributeError, TypeError
        If a mandatory element is missing (``find`` returned ``None``).
    ValueError
        If the rating element carries no ``bubble_NN`` class token.
    """
    review = {}
    review["title"] = review_soup.find("div", attrs={"class": "quote"}).get_text()
    review["date"] = review_soup.find(class_='ratingDate')['title']
    review["author_name"] = review_soup.find(class_='scrname').get_text()

    # The location is optional on a profile, so a missing element is not an error.
    user_location_element = review_soup.find(class_='userLocation')
    review["author_location"] = (
        user_location_element.get_text() if user_location_element else None
    )

    review["entry"] = review_soup.find(class_='partial_entry').get_text()

    rating_element = review_soup.find(class_='reviewItemInline').find('span', class_='ui_bubble_rating')
    # Pick the `bubble_NN` token by its prefix instead of by position, so the
    # result does not depend on the order of the classes in the attribute.
    bubble_token = next(
        (token for token in rating_element['class'] if token.startswith('bubble_')),
        None,
    )
    if bubble_token is None:
        raise ValueError(
            "rating element has no 'bubble_NN' class: " + repr(rating_element['class'])
        )
    review["rating"] = int(bubble_token.split('_')[1]) / 10
    return review

In [146]:
# Parse every review listed in `reviews_df` from its saved HTML page.
# The list is reset here so that re-running this cell does not append the same
# reviews a second time.
data_reviews = []
for _, row in reviews_df.iterrows():
    restaurant_id = row["restaurant_id"]
    review_id = row["review_id"]
    path_review = f"../datasets/reviews_restaurants_html/restaurant_{restaurant_id}_review_{review_id}.html"
    with open(path_review, 'r', encoding='utf-8') as file:
        html_content = file.read()
    doc_soup = BeautifulSoup(html_content, 'html.parser')

    # The saved page may hold more than the target review; select it by element id.
    review_soup = doc_soup.find(id=f"review_{review_id}")
    if review_soup is None:
        raise ValueError(f"No element with id 'review_{review_id}' found in {path_review}")

    review = load_review(review_soup)
    # Restaurant name and review language are looked up on the whole page,
    # not inside the review element.
    review["restaurant_name"] = doc_soup.find('a', class_='HEADING').get_text()
    review["language_review"] = doc_soup.find("div", class_="prw_reviews_user_links_hsx").span["data-language"]
    data_reviews.append(review)

In [147]:
# Assemble the collected review dicts into one table whose columns follow the
# order given in `columns`, then display it.
df_reviews = pd.DataFrame(data=data_reviews, columns=columns)
df_reviews

Unnamed: 0,title,date,author_name,author_location,entry,rating,restaurant_name,language_review
0,"Gute Pasta, Risotto auch gut, Kellner unaufmer...",13. August 2023,Alfred G,"München, Deutschland","l Osteria ist Systemgastronomie, das sollte ma...",3.0,L'Osteria Regensburg,de
1,Unprofessional und Frechheit,12. August 2023,Paola C,,"Eine totale Enttäuschung, ich und mein Freund ...",1.0,L'Osteria Regensburg,de
2,"Einzigartige Antipast, supernettes Personal!!!",5. August 2023,Daydream19240815995,,"Sehr freundliches Personal. Hat uns empfohlen,...",5.0,L'Osteria Regensburg,de
3,Sehr schlecht,24. Juni 2023,Julia S,,Die Pizza war komplett kalt und mager. Wir hat...,1.0,L'Osteria Regensburg,de
4,A richdiga Idaljena,22. Juni 2023,seppal51,"Bad Tölz, Deutschland","sehr freundliches Personal, das Essen war gut,...",5.0,L'Osteria Regensburg,de
5,Sehr unhöfliche belegschaft. Schlechtes Ambiente,10. Juni 2023,Markus W,,Sehr unhöfliche belegschaft sogar mit Polizei ...,1.0,L'Osteria Regensburg,de
6,Essen gut Personal unfreundlich,30. April 2023,Sunshine59717948580,"Wismar, Deutschland",Das Essen grundsätzlich sehr lecker und schnel...,3.0,L'Osteria Regensburg,de
7,Gut,30. April 2023,Günther M,,"Guter Service, hervorragende Pizzen., etwas zu...",4.0,L'Osteria Regensburg,de
8,Besuch in Regensburg,8. April 2023,Companion34963673275,"Willich, Deutschland",Wir hatten zuvor einen Tisch reserviert. Auf j...,2.0,L'Osteria Regensburg,de
9,Kleine Enttäuschung,20. März 2023,C2231XKelisabeths,"Regensburg, Deutschland","Sonntag gegen 14:00 in die L'Osteria, weil nic...",2.0,L'Osteria Regensburg,de
