# Notebook: Convert Raw Table Sheet Annotations to txt

## Packages

In [61]:
import pandas as pd
import re

## Code

In [None]:
# Input CSV to convert -- change this path to switch to another annotation file.
path_input = 'csv/test.csv'
# Output path without extension; ".txt" is appended when the file is written.
path_output = 'gerest/test'
# Number of leading rows to keep: 1602 -> train_dev, 555 -> test.
ROW_LIMIT = 555
df = pd.read_csv(path_input).head(ROW_LIMIT)

In [63]:
# Preview of the loaded annotation table (long frames are displayed truncated).
df

Unnamed: 0,text,Kommentar,tag_1,tag_2,tag_3,tag_4,tag_5,tag_6,tag_7,tag_8,tag_9
0,Diese sind sehr schmackhaft und die Portionen ...,,"NULL, foodq, pos, schmackhaft","Portionen, foods, pos, großzügig",,,,,,,
1,"Unsere Bedienung, Susanna, war sehr freundlich...",,"Susanna, serv, pos, freundlich","Susanna, serv, pos, zuvorkommend",,,,,,,
2,Wie kann die Küche einen steinharten doppelt g...,,"Almhütten-Burger, foodq, neg, steinharten","Almhütten-Burger, foodq, neg, doppelt garen",,,,,,,
3,"Kleine Portionen und 1,5 std Wartezeit von Bes...",,"Wartezeit von Bestellung, serv, neg, 1,5 std","Portionen, foods, neg, Kleine",,,,,,,
4,"Leider etwas zäh und nicht warm genug, der Spe...",,"NULL, foodq, neg, zäh","Speckkloß, foodq, neg, lauwarm","Speckkloß, foodp, neg, 29€","Speckkloß, foodp, neg, nicht warm genug",,,,,
...,...,...,...,...,...,...,...,...,...,...,...
550,Immer wieder gerne.,,"NULL, restaurantg, pos, wieder gerne",,,,,,,,
551,Hier verbringt man seinen Abend gerne.,,"NULL, restaurantg, pos, gerne",,,,,,,,
552,"Ich fand die Bewirtung klasse, super freundlic...",,"Bewirtung, serv, pos, klasse","Patron, serv, pos, freundlicher",,,,,,,
553,Wir waren zum wiederholten Male im Augustiner ...,,"Augustiner, restaurantg, pos, gut",,,,,,,,


In [64]:

# Full aspect-category labels that can appear in the converted output:
#     'location general', 'food prices', 'food quality', 'food general',
#     'ambience general', 'service general', 'restaurant prices',
#     'drinks prices', 'restaurant miscellaneous', 'drinks quality',
#     'drinks style_options', 'restaurant general', 'food style_options'

# Short category code used in the annotation sheet -> full category label.
# "loca" and "location" are both accepted for the same label.
AC = dict(
    loca="location general",
    location="location general",
    serv="service general",
    foodq="food quality",
    foodg="food general",
    amb="ambience general",
    drinkq="drinks quality",
    restaurantp="restaurant prices",
    foodp="food prices",
    drinkp="drinks prices",
    restaurantm="restaurant miscellaneous",
    drinks="drinks style_options",
    foods="food style_options",
    restaurantg="restaurant general",
)

# Short polarity code used in the annotation sheet -> full polarity label.
POL = dict(
    pos="positive",
    neg="negative",
    neu="neutral",
)


def convert_aspects(aspects_raw, text):
    """Validate raw annotation entries and expand their short codes.

    Each entry of ``aspects_raw`` is read by position as
    ``[aspect term, category code, polarity code, opinion term]``. Every
    problem detected is reported with ``print``; afterwards the entry is
    converted through the module-level ``AC`` and ``POL`` tables.

    :param aspects_raw: iterable of sequences with (at least) four strings each
    :param text: the review text the annotations belong to
    :return: list of ``[aspect term, category label, polarity label, opinion term]``
    :raises KeyError: if a category or polarity code is missing from ``AC`` /
        ``POL`` (the matching error line is printed before the lookup fails)
    """
    aspects = []

    for aspect in aspects_raw:
        aspect_term = aspect[0]
        category = aspect[1]
        polarity = aspect[2]
        opinion_term = aspect[3]

        # The aspect term must occur in the text unless it is the implicit
        # "NULL" aspect. This is a plain substring test, not a word-boundary match.
        if aspect_term != "NULL" and aspect_term not in text:
            print(
                f"##### ERROR: Aspect term '{aspect_term}' not found as a whole phrase in text:", text)

        # The category code must be a known key of AC.
        if category not in AC:
            print(f"##### ERROR: Aspect category {category} not found:", text)

        # The polarity code must be a known key of POL.
        if polarity not in POL:
            print(
                f"##### ERROR: Aspect sentiment {polarity} not found in:", text)

        # Terms shorter than two characters are treated as annotation mistakes.
        if len(opinion_term) < 2:
            print(
                f"##### ERROR: Opinion term '{opinion_term}' too short in:", text)
        if len(aspect_term) < 2:
            # Fixed: this message used to print the opinion term and claimed
            # the aspect term was "too long".
            print(
                f"##### ERROR: Aspect term '{aspect_term}' too short in:", text)

        # The opinion term must occur in the text (plain substring test).
        if opinion_term not in text:
            print(
                f"##### ERROR: Opinion term '{opinion_term}' not found as a whole phrase in text:", text)

        aspects.append([aspect_term, AC[category], POL[polarity], opinion_term])

    return aspects

In [65]:
# Accumulated output text; the conversion loop appends one line per valid example.
data = ""

# Tallies of skipped examples, one per exclusion reason.
# (total_more_than_one is initialised here but not updated in the visible code.)
total_empty = total_anonym = total_more_than_one = 0
total_more_than_one_sentence = total_not_english = 0
total_can_t_be_understood = 0

# Overall row count and number of examples that end up in the output.
total_examples = total_valid_examples = 0


def process_aspect_text(aspect_text):
    """
    Split one raw annotation cell into at most four stripped fields.

    The text is split on commas. The first three fields are kept as they are;
    if more than four fields result, everything from the fourth field onwards
    is merged back into one comma-joined field, so commas inside the last
    element (e.g. "1,5 std") survive.

    :param aspect_text: str, e.g. "Portionen, foods, pos, großzügig"
    :return: list of whitespace-stripped strings (at most four entries)
    :raises TypeError: if ``aspect_text`` is not a string
    """
    if not isinstance(aspect_text, str):
        # Keeps the failure mode of re.split() for non-string arguments.
        raise TypeError("expected string or bytes-like object")

    # maxsplit=3 leaves any further commas inside the fourth field.
    return [field.strip() for field in aspect_text.split(',', 3)]


# Convert every row of `df` into one output line, or count why it was skipped.
# Row layout by position: 0 = review text, 2..16 = annotation cells.
for index, row in df.iterrows():
    rows = row.tolist()
    entry_text = rows[0]

    # Parse the annotation cells. Unfilled cells are not strings (pandas
    # typically yields NaN for them), so they are skipped explicitly instead
    # of swallowing every exception with a bare `except`.
    all_raw_lists = [
        process_aspect_text(aspect_text)
        for aspect_text in rows[2:17]
        if isinstance(aspect_text, str)
    ]

    # 1. Count all examples without aspects
    no_aspects = len(all_raw_lists) == 0
    if no_aspects:
        total_empty += 1

    # 2. Count all examples carrying the "anonym" marker
    anonym_found = any("anonym" in sublist for sublist in all_raw_lists)
    if anonym_found:
        total_anonym += 1

    # 3. Count all examples marked as "not english"
    not_english_found = any(
        "not english" in sublist for sublist in all_raw_lists)
    if not_english_found:
        total_not_english += 1

    # 4. Count all examples marked as "more than 1 sentence"
    more_than_one_sentence_found = any(
        "more than 1 sentence" in sublist for sublist in all_raw_lists)
    if more_than_one_sentence_found:
        total_more_than_one_sentence += 1

    # 5. Count all examples marked as "can't be understood"
    cant_be_understood_found = any(
        "can't be understood" in sublist for sublist in all_raw_lists)
    if cant_be_understood_found:
        total_can_t_be_understood += 1

    excluded = (no_aspects or anonym_found or more_than_one_sentence_found
                or not_english_found or cant_be_understood_found)

    if not excluded:
        # 6. Check for examples with < 4 sentiment elements; every offending
        #    annotation is reported, and the whole example is then left out.
        has_four_elements = True
        for sublist in all_raw_lists:
            if len(sublist) < 4:
                print("##### ERROR:", entry_text, sublist)
                has_four_elements = False

        if has_four_elements:
            aspects = convert_aspects(all_raw_lists, entry_text)
            data += entry_text + "####" + str(aspects) + "\n"
            total_valid_examples += 1

    total_examples += 1

print("Empty:", total_empty)
print("Anonymized:", total_anonym)
print("More than 1 sentence:", total_more_than_one_sentence)
print("Valid examples:", total_valid_examples)
print("Not English:", total_not_english)
print("Can't be understood", total_can_t_be_understood)
print("-----")
print(total_examples)

Empty: 7
Anonymized: 2
More than 1 sentence: 2
Valid examples: 544
Not English: 0
Can't be understood 0
-----
555


In [66]:
# Write the collected examples to "<path_output>.txt" as UTF-8 text.
output_file = f"{path_output}.txt"
with open(output_file, mode="w", encoding="utf-8") as handle:
    handle.write(data)