# Notebook: Convert Raw Table Sheet Annotations to txt

## Packages

In [None]:
import pandas as pd
import re

## Code

In [None]:
# Input/output locations. Change `path_input` to whichever CSV file should be
# converted.
path_output = './'
path_input = 'input.csv'
df = pd.read_csv(filepath_or_buffer=path_input)

In [None]:
# Bare expression: when run as a notebook cell this renders the loaded
# DataFrame as the cell output for a quick visual check.
df

In [None]:

# Lookup tables that expand the shorthand labels used in the annotation sheet
# into the full label names written to the output file.
#
# Full aspect-category names in use:
#     'location general', 'food prices', 'food quality', 'food general',
#     'ambience general', 'service general', 'restaurant prices',
#     'drinks prices', 'restaurant miscellaneous', 'drinks quality',
#     'drinks style_options', 'restaurant general', 'food style_options'

# Aspect-category shorthand -> full category name.
# Note that both "loca" and "location" map to the same category.
AC = dict(
    loca="location general",
    location="location general",
    serv="service general",
    foodq="food quality",
    foodg="food general",
    amb="ambience general",
    drinkq="drinks quality",
    restaurantp="restaurant prices",
    foodp="food prices",
    drinkp="drinks prices",
    restaurantm="restaurant miscellaneous",
    drinks="drinks style_options",
    foods="food style_options",
    restaurantg="restaurant general",
)

# Sentiment-polarity shorthand -> full polarity name.
POL = dict(pos="positive", neg="negative", neu="neutral")


def convert_aspects(aspects_raw, text):
    """Expand shorthand annotation tuples into fully labelled tuples.

    Every entry of ``aspects_raw`` is indexed as
    ``[aspect term, category shorthand, polarity shorthand, opinion term]``.
    The category and polarity shorthands are expanded through the
    module-level ``AC`` and ``POL`` tables.

    Problems are reported on stdout with a ``##### ERROR:`` prefix:

    * aspect term not contained in ``text`` (the literal ``"NULL"`` is
      accepted without a check) -- reported, tuple is still emitted;
    * opinion term not contained in ``text`` -- reported, tuple is still
      emitted;
    * unknown category or polarity shorthand -- reported, and the tuple is
      left out of the result because it cannot be expanded.

    :param aspects_raw: iterable of sequences with at least four elements
    :param text: the annotated text the terms are checked against
    :return: list of ``[aspect term, category, polarity, opinion term]`` lists
    """
    aspects = []
    for aspect in aspects_raw:
        labels_known = True

        if aspect[0] not in text and aspect[0] != "NULL":
            print(
                f"##### ERROR: Aspect term '{aspect[0]}' not found in text:", text)
        if aspect[1] not in AC:
            print(f"##### ERROR: Aspect category {aspect} not found:", text)
            labels_known = False
        if aspect[2] not in POL:
            print(
                f"##### ERROR: Aspect sentiment {aspect[2]} not found in:", text)
            labels_known = False
        if aspect[3] not in text:
            print(
                f"##### ERROR: Opinion term '{aspect[3]}' not found in text:", text)

        # Indexing AC/POL with an unknown shorthand would raise KeyError and
        # abort the whole conversion right after the problem was reported.
        # Skip the tuple instead so the remaining problems are reported too.
        if not labels_known:
            continue

        aspects.append([aspect[0], AC[aspect[1]], POL[aspect[2]], aspect[3]])
    return aspects

In [None]:
# Accumulated output text that is written to the result file afterwards.
data = ""

# Per-run statistics, all starting at zero.
total_examples = total_valid_examples = 0
total_empty = total_anonym = total_not_english = 0
total_more_than_one_sentence = total_can_t_be_understood = 0
# Declared for completeness; nothing in the visible code updates this counter.
total_more_than_one = 0


def process_aspect_text(aspect_text):
    """
    Split one annotation cell on commas into at most four stripped fields.

    Only the first three commas act as separators; any further commas stay
    inside the fourth field, so the last field may itself contain commas.

    :param aspect_text: str, cell content, e.g. "Aspect1, Aspect2, Aspect3, Aspect4, Aspect5"
    :return: list of at most four strings with surrounding whitespace removed
    """
    # maxsplit=3 leaves everything after the third comma untouched in the
    # final field.
    fields = re.split(r',', aspect_text, maxsplit=3)
    return [field.strip() for field in fields]


# Walk over every row of the sheet, classify it, and collect one output line
# ("<text>####<aspects>\n") for each row that passes all checks.
output_lines = []

for _index, row in df.iterrows():
    rows = row.tolist()
    entry_text = rows[0]

    # Columns 2..15 hold the aspect annotations. Only string cells are
    # annotations; anything else (e.g. the NaN pandas uses for empty cells)
    # is skipped explicitly rather than by swallowing every exception.
    all_raw_lists = []
    for aspect_text in rows[2:16]:
        if not isinstance(aspect_text, str):
            continue
        all_raw_lists.append(process_aspect_text(aspect_text))

    # 1. Count all examples without aspects
    no_aspects = not all_raw_lists
    if no_aspects:
        total_empty += 1

    # 2. Count all examples carrying the "anonym" marker
    anonym_found = any("anonym" in sublist for sublist in all_raw_lists)
    if anonym_found:
        total_anonym += 1

    # 3. Count all examples marked as "not english"
    not_english_found = any(
        "not english" in sublist for sublist in all_raw_lists)
    if not_english_found:
        total_not_english += 1

    # 4. Count all examples marked as "more than 1 sentence"
    more_than_one_sentence_found = any(
        "more than 1 sentence" in sublist for sublist in all_raw_lists)
    if more_than_one_sentence_found:
        total_more_than_one_sentence += 1

    # 5. Count all examples marked as "can't be understood"
    cant_be_understood_found = any(
        "can't be understood" in sublist for sublist in all_raw_lists)
    if cant_be_understood_found:
        total_can_t_be_understood += 1

    excluded = (no_aspects or anonym_found or more_than_one_sentence_found
                or not_english_found or cant_be_understood_found)
    if not excluded:
        # 6. Report every annotation with fewer than 4 sentiment elements;
        #    a single short annotation invalidates the whole example.
        has_four_elements = True
        for sublist in all_raw_lists:
            if len(sublist) < 4:
                print("##### ERROR:", entry_text, sublist)
                has_four_elements = False

        if has_four_elements:
            aspects = convert_aspects(all_raw_lists, entry_text)
            output_lines.append(entry_text + "####" + str(aspects) + "\n")
            total_valid_examples += 1

    total_examples += 1

# Join once at the end instead of growing the string row by row.
data += "".join(output_lines)

print("Empty:", total_empty)
print("Anonymized:", total_anonym)
print("More than 1 sentence:", total_more_than_one_sentence)
print("Valid examples:", total_valid_examples)
print("Not English:", total_not_english)
print("Can't be understood", total_can_t_be_understood)
print("-----")
print(total_examples)

In [None]:
# Persist the collected examples. The target name is `path_output` with
# ".txt" appended, so `path_output` acts as a path prefix, not a directory.
output_file = path_output + ".txt"
with open(output_file, mode='w', encoding='utf-8') as handle:
    handle.write(data)