# Notebook: Filter Sentences from Reviews to create Dataset for Annotation

## Packages

In [1]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import KMeans
from nltk.corpus import stopwords
from bertopic import BERTopic
from langdetect import detect
from bs4 import BeautifulSoup
from hdbscan import HDBSCAN
from umap import UMAP
import pandas as pd
import spacy
import json
import nltk
from nltk.tokenize import sent_tokenize
import re

  @numba.jit()
  @numba.jit()
  @numba.jit()
  from .autonotebook import tqdm as notebook_tqdm
  @numba.jit()


## Settings

In [2]:
# Fetch the NLTK resources this notebook relies on: the 'punkt' sentence
# tokenizer data (used via sent_tokenize) and the stopword lists (used via
# stopwords.words). Already-present packages are reported as up-to-date.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
%%capture
# One-time setup: uncomment the line below to download a German spaCy model.
# NOTE(review): this would fetch de_core_news_lg, but the notebook later loads
# de_core_news_sm — confirm which model is intended.
#!python -m spacy download de_core_news_lg

## Constants

In [4]:
# CSV file containing the restaurant reviews (path relative to the working directory).
REVIEWS_PATH = "reviews_dataset/reviews.csv"
# Seed shared by UMAP and the pandas sampling steps so results are repeatable.
RANDOM_STATE = 43
# German stopwords from NLTK, stored as a set for fast membership tests.
STOPWORDS = set(stopwords.words('german'))
# Passed to BERTopic as `nr_topics`.
N_TOPICS = 6

In [5]:
# spaCy German pipeline; used for lemmatization in lemmatize_remove_stopwords_text.
nlp = spacy.load('de_core_news_sm')

## Code

### Load Dataset

In [6]:
# Read the reviews CSV into a DataFrame (one row per review).
df_reviews = pd.read_csv(REVIEWS_PATH)

In [7]:
# Preview the loaded reviews.
# NOTE(review): the rendered table includes author_name and author_location —
# consider clearing outputs before sharing the notebook.
df_reviews

Unnamed: 0.1,Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization
0,0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Die schlechteste Pasta aglio olio ever!!! Schm...,1.0,vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...
1,1,907309442,1119896,0,Bestes Italienisches Restaurant Berlin,2023-08-02,Luca R,,"Ich lieeeebe <RESTAURANT_NAME>.Gutes Essen,sup...",5.0,vapiano,de,de,"Ich lieeeebe Vapiano.Gutes Essen,super nette M..."
2,2,904367426,1119896,0,Enttäuschung,2023-07-20,V6519ILannab,"Berlin, Deutschland",Ich war früher ein <RESTAURANT_NAME> Fan (also...,1.0,vapiano,de,de,Ich war früher ein Vapiano Fan (also in 2012-2...
3,3,865243904,1119896,0,wir gehen gerne hier hin,2022-10-20,575klat,,Wir gehen gerne hier hin. Immer ok für das Pre...,5.0,vapiano,de,de,Wir gehen gerne hier hin. Immer ok für das Pre...
4,4,863710312,1119896,0,"Gut gelegen, Essen naja",2022-10-08,DirkU42,"Bielefeld, Deutschland","Sehr gut gelegen am <LOC>, freundliche Bedienu...",3.0,vapiano,de,de,"Sehr gut gelegen am Potsdamer Platz, freundlic..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1653,1653,868756286,25149391,0,Vielen Dank,2022-11-17,christiansL1481LW,,"Ausgezeichneter Service, komme gerne wieder. H...",5.0,dean&david,de,de,"Ausgezeichneter Service, komme gerne wieder. H..."
1654,1654,868742251,25149391,0,Top,2022-11-17,626miguelw,,"Sehr lecker, schneller und guter Service, nett...",5.0,dean&david,de,de,"Sehr lecker, schneller und guter Service, nett..."
1655,1655,868661042,25149391,0,Ausgesprochen schenelles gutes Essen und gesund!,2022-11-16,A6205ZAadrianav,,"Essen war sehr lecker Besonders positiv ist, d...",5.0,dean&david,de,de,"Essen war sehr lecker Besonders positiv ist, d..."
1656,1656,868462723,21174965,0,Eine Empfehlung kann ich gerne geben,2022-11-14,Andreas G,"Sinzig, Deutschland","Essen war lecker, Bedienung war in Ordnung.Pre...",4.0,dean&david,de,de,"Essen war lecker, Bedienung war in Ordnung.Pre..."


### Filter Sentences

In [8]:
# Explode every review into one row per sentence.
#
# Each output row carries all columns of its source review, with 'text'
# replaced by a single sentence and 'sentence_idx' holding the sentence's
# 0-based position among the *kept* sentences of that review.
#
# Rows are collected in a list and the DataFrame is built once at the end:
# appending with `df.loc[len(df)] = row` copies the whole frame on every
# insert (quadratic runtime) and leaves every column with object dtype.
sentence_records = []

for _, review in df_reviews.iterrows():
    review_text = review['text']
    if not isinstance(review_text, str):
        # Missing text (NaN) cannot be tokenized; skip instead of crashing.
        continue

    # Discard one-character fragments (e.g. stray punctuation) before numbering.
    kept_sentences = [
        sentence
        for sentence in sent_tokenize(review_text, language="german")
        if len(sentence) > 1
    ]
    for sentence_index, sentence in enumerate(kept_sentences):
        record = review.to_dict()
        record['text'] = sentence
        record['sentence_idx'] = sentence_index
        sentence_records.append(record)

# Passing `columns` fixes the column order and keeps the frame well-formed
# even when no sentence was produced at all.
df_reviews_sentences = pd.DataFrame(
    sentence_records,
    columns=list(df_reviews.columns) + ['sentence_idx'],
)

In [9]:
# Preview the sentence-level frame: one row per sentence, numbered by sentence_idx.
df_reviews_sentences

Unnamed: 0.1,Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx
0,0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Die schlechteste Pasta aglio olio ever!!!,1.0,vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,0
1,0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Schmeckte absolut nach nichts.,1.0,vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,1
2,0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Unmotiviertes Personal.,1.0,vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,2
3,0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Das hat rein gar nichts mit italienischer Lebe...,1.0,vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,3
4,1,907309442,1119896,0,Bestes Italienisches Restaurant Berlin,2023-08-02,Luca R,,"Ich lieeeebe <RESTAURANT_NAME>.Gutes Essen,sup...",5.0,vapiano,de,de,"Ich lieeeebe Vapiano.Gutes Essen,super nette M...",0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7427,1656,868462723,21174965,0,Eine Empfehlung kann ich gerne geben,2022-11-14,Andreas G,"Sinzig, Deutschland",Ansonsten sauber und aufgeräumt.,4.0,dean&david,de,de,"Essen war lecker, Bedienung war in Ordnung.Pre...",1
7428,1656,868462723,21174965,0,Eine Empfehlung kann ich gerne geben,2022-11-14,Andreas G,"Sinzig, Deutschland",Ein bisschen gehobener Imbisscharakter.,4.0,dean&david,de,de,"Essen war lecker, Bedienung war in Ordnung.Pre...",2
7429,1657,896044837,25173833,0,Empfehlenswert!,2023-06-20,dr_anke_schaefer,"Rostock, Deutschland",Immer wieder die beste Alternative für das Bus...,5.0,dean&david,de,de,Immer wieder die beste Alternative für das Bus...,0
7430,1657,896044837,25173833,0,Empfehlenswert!,2023-06-20,dr_anke_schaefer,"Rostock, Deutschland","Die Crunchy Chicken Bowl ist nur zu empfehlen,...",5.0,dean&david,de,de,Immer wieder die beste Alternative für das Bus...,1


### Setup BERTopic

In [10]:
# Source: https://data-dive.com/binary-text-classification-predict-ratings-part3-transformer-neural-network-bert/
def clean_text(text):
    """Reduce ``text`` to letters separated by single spaces.

    The substitutions run in this order:
      1. remove ``@mentions``,
      2. replace every character that is neither a letter (A-Z, a-z, À-ž)
         nor a space with a space,
      3. replace stand-alone single letters with a space,
      4. collapse whitespace runs into one space.

    Returns the result with leading and trailing whitespace stripped.
    """
    substitutions = (
        (re.compile(r'@\w+'), ''),
        (re.compile(r"[^A-Za-zÀ-ž ]", re.IGNORECASE), " "),
        (re.compile(r"\b[A-Za-zÀ-ž]\b", re.IGNORECASE), " "),
        (re.compile(r"\s+", re.IGNORECASE), " "),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text.strip()

def lemmatize_remove_stopwords_text(text):
    """Return the lower-cased lemmas of ``text`` with stopwords removed.

    ``text`` is run through the module-level spaCy pipeline ``nlp``. A token
    is dropped when its lower-cased surface form is in ``STOPWORDS``; the
    lemmas of the remaining tokens are lower-cased and joined by single spaces.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.text.lower() in STOPWORDS:
            continue
        kept_lemmas.append(token.lemma_.lower())
    return ' '.join(kept_lemmas)

# Normalise each sentence, then lemmatise it and drop German stopwords.
df_reviews_sentences["cleaned_text"] = (
    df_reviews_sentences["text"]
    .apply(clean_text)
    .apply(lemmatize_remove_stopwords_text)
)
sentences = df_reviews_sentences["cleaned_text"].tolist()

# Seeded UMAP so the dimensionality reduction is repeatable across runs.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=RANDOM_STATE,
)
model = BERTopic(
    language="german",
    nr_topics=N_TOPICS,
    umap_model=umap_model,
    top_n_words=20,
    embedding_model="paraphrase-multilingual-MiniLM-L12-v2",
)
topics, probabilities = model.fit_transform(sentences)

# Re-assign outlier documents using the "distributions" strategy, then refresh
# the topic representations so they reflect the new assignments.
topics = model.reduce_outliers(sentences, topics, strategy="distributions")
model.update_topics(sentences, topics, top_n_words=20)

In [11]:
# Attach each sentence's topic name (BERTopic's "Name" column) to the frame.
document_info = model.get_document_info(sentences)
df_reviews_sentences["BERTopic_topic"] = document_info["Name"]
df_reviews_sentences

Unnamed: 0.1,Unnamed: 0,review_id,restaurant_id,page_index,title,date,author_name,author_location,text,rating,restaurant_name,language_code,detected_language,text_noanonymization,sentence_idx,cleaned_text,BERTopic_topic
0,0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Die schlechteste Pasta aglio olio ever!!!,1.0,vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,0,schlecht pasta aglio olio ev,1_pizza_pizze_pasta_gut
1,0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Schmeckte absolut nach nichts.,1.0,vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,1,schmeckt absolut,0_essen_loc_restaurant_gut
2,0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Unmotiviertes Personal.,1.0,vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,2,unmotiviertes personal,0_essen_loc_restaurant_gut
3,0,867460923,778662,0,Absolut enttäuschend,2022-11-06,BackPacker563486,"Berlin, Deutschland",Das hat rein gar nichts mit italienischer Lebe...,1.0,vapiano,de,de,Die schlechteste Pasta aglio olio ever!!! Schm...,3,rein gar italienisch lebensart kochkunst tun,0_essen_loc_restaurant_gut
4,1,907309442,1119896,0,Bestes Italienisches Restaurant Berlin,2023-08-02,Luca R,,"Ich lieeeebe <RESTAURANT_NAME>.Gutes Essen,sup...",5.0,vapiano,de,de,"Ich lieeeebe Vapiano.Gutes Essen,super nette M...",0,lieeeeben restaurant name gut essen super nett...,0_essen_loc_restaurant_gut
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7427,1656,868462723,21174965,0,Eine Empfehlung kann ich gerne geben,2022-11-14,Andreas G,"Sinzig, Deutschland",Ansonsten sauber und aufgeräumt.,4.0,dean&david,de,de,"Essen war lecker, Bedienung war in Ordnung.Pre...",1,ansonsten sauber aufräumen,0_essen_loc_restaurant_gut
7428,1656,868462723,21174965,0,Eine Empfehlung kann ich gerne geben,2022-11-14,Andreas G,"Sinzig, Deutschland",Ein bisschen gehobener Imbisscharakter.,4.0,dean&david,de,de,"Essen war lecker, Bedienung war in Ordnung.Pre...",2,bissch gehoben imbisscharakter,0_essen_loc_restaurant_gut
7429,1657,896044837,25173833,0,Empfehlenswert!,2023-06-20,dr_anke_schaefer,"Rostock, Deutschland",Immer wieder die beste Alternative für das Bus...,5.0,dean&david,de,de,Immer wieder die beste Alternative für das Bus...,0,immer gut alternative businesslunch,0_essen_loc_restaurant_gut
7430,1657,896044837,25173833,0,Empfehlenswert!,2023-06-20,dr_anke_schaefer,"Rostock, Deutschland","Die Crunchy Chicken Bowl ist nur zu empfehlen,...",5.0,dean&david,de,de,Immer wieder die beste Alternative für das Bus...,1,crunchy chicken bowl empfehlen gesund lecker f...,0_essen_loc_restaurant_gut


In [12]:
# Number of sentences assigned to each topic name.
df_reviews_sentences["BERTopic_topic"].value_counts()

BERTopic_topic
0_essen_loc_restaurant_gut                          6135
1_pizza_pizze_pasta_gut                              802
2_steak_burger_gut_punkt                             266
4_vegan_vegetarisch_geben_auswahl                    109
3_pommes_burger_pomme_kalt                            94
-1_anschreien_weitermachen_ausdrücken_beruhigend      26
Name: count, dtype: int64

### Balancing

In [13]:
# Draw the same number of sentences for every rating so the annotation set is
# balanced across ratings.
SENTENCES_PER_RATING = 600

# Each rating group is sampled independently with the same seed, and the groups
# are concatenated in ascending rating order. This selects the same rows as
# `groupby(...).apply(lambda x: x.sample(...))`, but avoids DataFrameGroupBy.apply
# operating on the grouping column — deprecated in recent pandas and slated to
# drop the 'rating' column from the result in future versions.
# `sample(..., replace=False)` raises ValueError if a rating has fewer than
# SENTENCES_PER_RATING sentences.
df_balanced_reviews_sentences = pd.concat(
    [
        rating_group.sample(SENTENCES_PER_RATING, replace=False, random_state=RANDOM_STATE)
        for _, rating_group in df_reviews_sentences.groupby('rating')
    ]
).reset_index(drop=True)

### Randomization

In [14]:
# Shuffle all rows ('frac=1' samples the entire DataFrame) with a fixed seed,
# then renumber the index 0..n-1.
df_balanced_reviews_sentences = (
    df_balanced_reviews_sentences
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

### Check If Sentences are Unique

In [15]:
# Count how often each (review_id, sentence_idx) pair occurs in the balanced sample.
sentence_key_counts = (
    df_balanced_reviews_sentences
    .groupby(["review_id", "sentence_idx"])
    .size()
    .reset_index(name="count")
)

# The rendered table is truncated, so eyeballing it cannot prove uniqueness;
# fail loudly if any sentence was drawn more than once.
assert (sentence_key_counts["count"] == 1).all(), \
    "Duplicate (review_id, sentence_idx) pairs found in the balanced sample"

sentence_key_counts

Unnamed: 0,review_id,sentence_idx,count
0,845622217,5,1
1,845764015,0,1
2,845764015,1,1
3,845764015,3,1
4,845764015,4,1
...,...,...,...
2995,912772807,5,1
2996,912772807,6,1
2997,912773256,3,1
2998,912776146,0,1


### Store as .csv 

In [16]:
# Persist all sentences (with cleaned text and topic name) as CSV.
# The DataFrame index is written as an unnamed first column (to_csv default).
df_reviews_sentences.to_csv("reviews_dataset/reviews_sentences.csv")

In [17]:
# Persist the balanced, shuffled sample as CSV.
# The DataFrame index is written as an unnamed first column (to_csv default).
df_balanced_reviews_sentences.to_csv("reviews_dataset/balanced_reviews_sentences.csv")