# Notebook: Merge/Filter Annotations

## Packages

In [68]:
import pandas as pd
import ast
import os

## Parameters

In [69]:
# Directory containing the annotation CSV files that this notebook reads.
PATH_ANNOTATIONS = "annotations/"

## Code

### 1. Check if every review got annotated

In [70]:
# Count how many annotation rows each annotator contributed across all CSV
# files in PATH_ANNOTATIONS (`os` and `pd` come from the imports cell at the top).
annotator_counts = {}

# os.listdir returns entries in arbitrary order; sorting keeps the report
# order stable between runs and machines.
for filename in sorted(os.listdir(PATH_ANNOTATIONS)):
    if not filename.endswith('.csv'):
        continue
    df = pd.read_csv(os.path.join(PATH_ANNOTATIONS, filename))
    for annotator in df['annotator']:
        annotator_counts[annotator] = annotator_counts.get(annotator, 0) + 1

for annotator, count in annotator_counts.items():
    print(f'{annotator}: {count} times')

nils-constantin.hellwig@stud.uni-regensburg.de: 18 times


### 2. Merge Annotations

In [71]:
# Lift the column-width limit so long cell values (review texts, label lists) are displayed in full.
pd.set_option('display.max_colwidth', None)

In [72]:
def convert_label_string_to_dict(label_str):
    """Parse the textual form of a Python literal back into a Python object.

    Args:
        label_str: Text containing a Python literal (as stored in the
            label columns of the annotation CSV files).

    Returns:
        The evaluated literal, or an empty dict ``{}`` when the value cannot
        be evaluated (``ast.literal_eval`` raised ``SyntaxError`` or
        ``ValueError``).
    """
    try:
        parsed = ast.literal_eval(label_str)
    except (SyntaxError, ValueError):
        # Unparseable values are treated as "no annotation".
        parsed = {}
    return parsed
    
# Load both annotation files and turn the stringified label columns back
# into Python objects.
df1 = pd.read_csv(PATH_ANNOTATIONS + "annotation_1.csv")
df2 = pd.read_csv(PATH_ANNOTATIONS + "annotation_2.csv")

for frame in (df1, df2):
    for label_column in ('label_implicit', 'label_explicit'):
        frame[label_column] = frame[label_column].apply(convert_label_string_to_dict)

For implicit aspects, the position of the annotation in the text is irrelevant, so we drop the span information (`start`, `end`, `text`) and keep only the remaining fields.

In [73]:
def remove_useless_keys_from_object(obj):
    """Return copies of the dicts in ``obj`` without their span information.

    Args:
        obj: Iterable of dicts.

    Returns:
        A new list with one dict per input item, holding every key/value
        pair except those keyed "end", "text" or "start". The input dicts
        are not modified.
    """
    span_keys = ("end", "text", "start")
    cleaned = []
    for item in obj:
        kept = {}
        for key, value in item.items():
            if key not in span_keys:
                kept[key] = value
        cleaned.append(kept)
    return cleaned

# Drop the span information from the implicit labels of both annotators.
for frame in (df1, df2):
    frame['label_implicit'] = frame['label_implicit'].apply(remove_useless_keys_from_object)

In [74]:
def find_common_and_count_unique(a1, a2):
    """Match the objects of two annotation lists against each other.

    Every object of ``a1`` is paired with at most one equal object of ``a2``
    (each ``a2`` entry can be used for a single match only).

    Args:
        a1: Iterable of objects (compared with ``==``).
        a2: Iterable of objects to match against. It is NOT modified; the
            matching works on an internal copy.

    Returns:
        A tuple ``(common_objects, count_unique)`` where ``common_objects``
        is the list of ``a1`` objects that found a partner in ``a2`` and
        ``count_unique`` is the number of objects without a partner.
    """
    # Work on a copy: removing matches from the caller's list would silently
    # change the caller's data and make repeated calls return different results.
    remaining = list(a2)
    common_objects = []
    count_unique = 0

    for obj1 in a1:
        if obj1 in remaining:
            common_objects.append(obj1)
            # Consume the partner so it cannot be matched a second time.
            remaining.remove(obj1)
        else:
            count_unique += 1

    # NOTE(review): leftover a2 entries that equal an already matched object are
    # not counted, whereas surplus duplicates in a1 are counted above, so the
    # count depends on argument order. Kept as-is; confirm this is intended.
    count_unique += sum(1 for obj2 in remaining if obj2 not in common_objects)

    return common_objects, count_unique

In [78]:
# Sanity check on two lists without any shared entry: nothing is common,
# so all five objects count as unique and the result is ([], 5).
a1 = [{"name": "max"}, {"name": "thomas"}, {"name": "jo"}]
a2= [{"name": "nils"}, {"name": "alex"}]

find_common_and_count_unique(a1, a2)

[{'name': 'max'}, {'name': 'thomas'}, {'name': 'jo'}] [{'name': 'nils'}, {'name': 'alex'}]
-- {'name': 'max'}
-- {'name': 'thomas'}
-- {'name': 'jo'}


([], 5)

Remove duplicate annotations. It might happen that an annotator annotates the same aspect-phrase twice with the same label and polarity.

In [75]:
# Keep only the first occurrence of every explicit annotation (as a fresh
# dict copy); later entries equal to an earlier one are dropped.
for frame in (df1, df2):
    frame['label_explicit'] = frame['label_explicit'].apply(
        lambda annotations: [
            dict(entry)
            for position, entry in enumerate(annotations)
            if entry not in annotations[:position]
        ]
    )

Note: remove duplicates in `df2` as well – this may occasionally occur.

In [76]:
# Merge the labels of both annotators review by review, keeping only the
# annotations they agree on.
# NOTE(review): rows are paired purely by position, which assumes df1 and df2
# list the same reviews in the same order. Confirm against the annotation exports.
if len(df1) != len(df2):
    raise ValueError(
        f"Annotation files differ in length ({len(df1)} vs. {len(df2)} rows); "
        "rows cannot be paired by position."
    )

merged_rows = []

for idx in range(len(df1)):
    explicit_1 = df1['label_explicit'].iloc[idx]
    explicit_2 = df2['label_explicit'].iloc[idx]
    # An explicit annotation is agreed on only if the complete dicts are equal.
    equal_explicit_annotation = [
        item1 for item1 in explicit_1 for item2 in explicit_2 if item1 == item2
    ]

    # Hand over a copy of the second list so the annotations stored in df2 can
    # never be altered by the matching and re-running this cell gives the same result.
    equal_implicit_annotation = find_common_and_count_unique(
        df1['label_implicit'].iloc[idx],
        list(df2['label_implicit'].iloc[idx]),
    )

    merged_rows.append({
        'label_explicit': str(equal_explicit_annotation),
        'label_implicit': str(equal_implicit_annotation[0]),
        'unique_implicit_count': equal_implicit_annotation[1],
        'id': df1['id'].iloc[idx],
        'restaurant_id': df1['restaurant_id'].iloc[idx],
        'review_id': df1['review_id'].iloc[idx],
        'text': df1['text'].iloc[idx],
    })

# Build the frame once from all collected rows; concatenating inside the loop
# copies the growing frame on every iteration (quadratic).
annoated_dataset_total = pd.DataFrame(merged_rows, columns=[
    'label_explicit',
    'label_implicit',
    'unique_implicit_count',
    'id',
    'restaurant_id',
    'review_id',
    'text',
])

annoated_dataset_total

[] []
[] []
[] []
[{'labels': ['AMBIENT-INTERIOR-NEGATIVE-no-phrase']}, {'labels': ['AMBIENT-INTERIOR-NEGATIVE-no-phrase']}] [{'labels': ['AMBIENT-INTERIOR-NEGATIVE-no-phrase']}]
-- {'labels': ['AMBIENT-INTERIOR-NEGATIVE-no-phrase']}
[] []
[] []
[] []
[] []
[] []


Unnamed: 0,label_explicit,label_implicit,unique_implicit_count,id,restaurant_id,review_id,text
0,"[{'end': 15, 'text': 'Wartezeit', 'start': 6, 'labels': ['SERVICE-NEGATIVE']}, {'end': 47, 'text': 'Restaurant', 'start': 37, 'labels': ['AMBIENT-INTERIOR-NEGATIVE']}]",[],0,68133812,1870837,870392222,"Lange Wartezeit auf einen Tisch, das Restaurant war sehr voll und somit auch sehr laut."
1,"[{'end': 45, 'text': 'Kartoffeln', 'start': 35, 'labels': ['FOOD-POSITIVE']}, {'end': 59, 'text': 'Salat', 'start': 54, 'labels': ['FOOD-POSITIVE']}]",[],0,68133813,1308296,887012563,Die Steaks waren sehr gut auch die Kartoffeln und der Salat.
2,"[{'end': 7, 'text': 'Fritten', 'start': 0, 'labels': ['FOOD-POSITIVE']}, {'end': 54, 'text': 'Portion', 'start': 47, 'labels': ['FOOD-POSITIVE']}]",[],0,68133814,10685715,849351269,Fritten waren gut und wir haben uns eine große Portion mit 3 Personen geteilt.
3,"[{'end': 25, 'text': 'Toiletten', 'start': 16, 'labels': ['AMBIENT-INTERIOR-NEGATIVE']}]",[{'labels': ['AMBIENT-INTERIOR-NEGATIVE-no-phrase']}],1,68133815,11895711,881137672,"Leider sind die Toiletten eine Etage höher und wohl für Behinderte schwer zu erreichen, aber sauber und ordentlich."
4,"[{'end': 12, 'text': 'Essen', 'start': 7, 'labels': ['FOOD-POSITIVE']}]",[],0,68133816,5967363,871925315,Tolles Essen!
5,[],[],0,68133817,10130718,882916333,Die Hälfte habe ich übrig gelassen!
6,"[{'end': 21, 'text': 'Pizza', 'start': 16, 'labels': ['FOOD-NEUTRAL']}, {'end': 76, 'text': 'Gewürze', 'start': 69, 'labels': ['FOOD-NEGATIVE']}]",[],0,68133818,801344,885335081,"Wir hatten eine Pizza bestellt die vom Belag ok war , jedoch fehlten Gewürze."
7,"[{'end': 9, 'text': 'Tisch', 'start': 4, 'labels': ['AMBIENT-INTERIOR-NEGATIVE']}]",[],0,68133819,6949511,875208633,"Der Tisch, der uns dann, wiederum mit einem Fingerzeig (wir hatten eine Reservierung) angeboten wurde, war eine Frechheit."
8,"[{'end': 104, 'text': 'Kasse', 'start': 99, 'labels': ['AMBIENT-INTERIOR-NEGATIVE']}]",[],0,68133820,6914832,861065137,"Das Preis-Leistungs-Verhältnis stimmt nicht mehr finden wir.Am ärgerlichsten war, dass sich an der Kasse beim Gehen eine Schlange bildete, weil wohl kein Angestellter kassieren wollte/konnte."


### To-do: Filter annotations with mixed label 

### To-do: Filter annotations with no label

### To-do: Dataframe

In [77]:
# Persist the merged annotations; with the default settings the row index is
# written as an additional first column without a header name.
annoated_dataset_total.to_csv("annoated_dataset_total.csv")