# Notebook: Comparison Between GERestaurant and SemEval

## Packages

In [87]:
import xml.etree.ElementTree as ET
from collections import Counter
import json
import uuid

## Settings / Constants

In [88]:
ASPECT_CATEGORIES = ["GENERAL-IMPRESSION", "FOOD", "SERVICE", "AMBIENCE", "PRICE"]

## Code

### Load Dataset

In [89]:
# SemEval entities that are folded into a differently named aspect category.
# Entities not listed here (e.g. FOOD, SERVICE, AMBIENCE) are used as the label unchanged.
_SEMEVAL_ENTITY_TO_LABEL = {
    "RESTAURANT": "GENERAL-IMPRESSION",
    "LOCATION": "AMBIENCE",
    "DRINKS": "FOOD",
}


def _map_semeval_category(category):
    """Translate a SemEval ``ENTITY#ATTRIBUTE`` category into a single aspect label.

    The ``PRICES`` attribute takes precedence over the entity, so every
    ``*#PRICES`` category becomes ``"PRICE"``. Otherwise the entity decides.
    A category without ``#`` is treated as an entity with no attribute
    instead of raising ``IndexError``.
    """
    parts = category.split("#")
    entity = parts[0]
    attribute = parts[1] if len(parts) > 1 else ""
    if attribute == "PRICES":
        return "PRICE"
    return _SEMEVAL_ENTITY_TO_LABEL.get(entity, entity)


def convert_xml_to_json(xml_file_path):
    """Parse a SemEval ABSA XML file into a list of annotated sentence dicts.

    Args:
        xml_file_path: Path (or file object) handed to ``ET.parse``.

    Returns:
        A list with one dict per ``<sentence>`` that has an ``<Opinions>``
        child. Each dict has the keys ``"tags"``, ``"text"`` and ``"id"``.
        Every tag carries ``start``, ``end``, ``text``, ``label``,
        ``polarity`` (upper-cased), ``tag_with_polarity``,
        ``tag_with_polarity_and_type`` and ``type``.

    Raises:
        KeyError: If an ``<Opinion>`` lacks one of the attributes
            ``category``, ``from``, ``to``, ``target`` or ``polarity``.
    """
    root = ET.parse(xml_file_path).getroot()

    reviews = []
    for review in root.findall(".//Review"):
        for sentence in review.findall(".//sentence"):
            opinions = sentence.find("Opinions")
            if opinions is None:
                # Sentences without an <Opinions> element are not part of the result.
                continue

            tags = []
            for opinion in opinions.findall("Opinion"):
                attributes = opinion.attrib
                tag = {
                    "start": int(attributes["from"]),
                    "end": int(attributes["to"]),
                    "text": attributes["target"],
                    "label": _map_semeval_category(attributes["category"]),
                    "polarity": attributes["polarity"].upper(),
                }
                tag["tag_with_polarity"] = f"{tag['label']}-{tag['polarity']}"
                # A target of "NULL" marks an opinion without an explicit phrase in the text.
                if tag["text"] == "NULL":
                    tag["tag_with_polarity_and_type"] = f"{tag['label']}-{tag['polarity']}-no-phrase-implicit"
                    tag["type"] = "label-implicit"
                else:
                    tag["tag_with_polarity_and_type"] = f"{tag['label']}-{tag['polarity']}-explicit"
                    tag["type"] = "label-explicit"
                tags.append(tag)

            # Key order (tags, text, id) matches what a JSON dump of the result has always produced.
            reviews.append(
                {
                    "tags": tags,
                    "text": sentence.find("text").text,
                    "id": sentence.attrib["id"],
                }
            )

    return reviews

Download the SemEval datasets from these pages:

* SemEval-2015-Restaurant Train-Set: [Download](http://metashare.ilsp.gr:8080/repository/browse/semeval-2015-absa-restaurant-reviews-train-data/b2ac9c0c198511e4a109842b2b6a04d751e6725f2ab847df88b19ea22cb5cc4a/)
* SemEval-2015-Restaurant Test-Set: [Download](http://metashare.ilsp.gr:8080/repository/browse/semeval-2015-absa-restaurants-reviews-test-data-gold-annotations/d32aeb3e9ca011e4a350842b2b6a04d737ee004f7cdc428bbf1ad4bd67977d22/)
* SemEval-2016-Restaurant Train-Set: [Download](http://metashare.ilsp.gr:8080/repository/browse/semeval-2016-absa-restaurant-reviews-english-train-data-subtask-1/cd28e738562f11e59e2c842b2b6a04d703f9dae461bb4816a5d4320019407d23/)
* SemEval-2016-Restaurant Test-Set: [Download](http://metashare.ilsp.gr:8080/repository/browse/semeval-2016-absa-restaurant-reviews-english-test-data-gold-subtask-1/42bd97c6d17511e59dbe842b2b6a04d721d1933085814d9daed8fbcbe54c0615/)
* Place the downloaded files (`.xml` and `.xml.gold`) in the subdirectory `SemEval/` next to this notebook

In [90]:
# Paths to the SemEval-2015 restaurant train and test files, relative to this notebook.
rest15path_train = "SemEval/ABSA-15_Restaurants_Train_Final.xml"
rest15path_test = "SemEval/ABSA15_Restaurants_Test.xml"

# Paths to the SemEval-2016 restaurant train and test files.
# NOTE(review): the train file name contains "_v2 2" (with a space), which looks like a
# duplicate-download suffix — confirm the file on disk is named exactly like this.
rest16path_train = "SemEval/ABSA16_Restaurants_Train_SB1_v2 2.xml"
rest16path_test = "SemEval/EN_REST_SB1_TEST.xml.gold"

In [91]:
# Merge the train and test split of each SemEval edition into one flat list (train first).
reviews_rest15 = [
    *convert_xml_to_json(rest15path_train),
    *convert_xml_to_json(rest15path_test),
]
reviews_rest16 = [
    *convert_xml_to_json(rest16path_train),
    *convert_xml_to_json(rest16path_test),
]

# Displayed as the cell output: number of annotated sentences per edition.
len(reviews_rest15), len(reviews_rest16)

(1702, 2384)

In [93]:
# Load the GERestaurant dataset; the cells below iterate it as a sequence of review dicts.
with open("../data/dataset_filtered.json", "r", encoding="utf-8") as json_file:
    # Parse the JSON file into Python objects
    reviews_gerestaurant = json.load(json_file)
len(reviews_gerestaurant)

3078

In [94]:
# Display name -> list of annotated sentences. The insertion order determines
# the row order of the tables printed in the analysis cells.
datasets = dict(
    [
        ("GERestaurant", reviews_gerestaurant),
        ("SemEval-2015 (Restaurant)", reviews_rest15),
        ("SemEval-2016 (Restaurant)", reviews_rest16),
    ]
)

### Analysis

#### Category Count

In [95]:
def get_category_counts(dataset):
    """Count how often each aspect category (``tag["label"]``) occurs in ``dataset``.

    Items without a ``"tags"`` key contribute nothing.

    Returns:
        A ``Counter`` mapping label -> number of tags with that label.
    """
    counts = Counter()
    for item in dataset:
        counts.update(tag["label"] for tag in item.get("tags", []))
    return counts

In [96]:
# Print one LaTeX table row per dataset: the percentage of tags in each aspect
# category, in the column order given by ASPECT_CATEGORIES.
for dataset_name, dataset_reviews in datasets.items():
    ac_counts = get_category_counts(dataset_reviews)
    n_aspects = sum(ac_counts.values())
    # The raw string keeps the LaTeX "\,\%" suffix without relying on invalid escape
    # sequences; join() places " & " only between cells, so nothing has to be trimmed.
    ac_print = " & ".join(
        str(round(ac_counts[ac] * 100 / n_aspects, 1)) + r"\,\%"
        for ac in ASPECT_CATEGORIES
    )

    print(dataset_name, "&", ac_print, "\\\\")

GERestaurant & 18.0\,\% & 39.7\,\% & 25.2\,\% & 11.2\,\% & 5.9\,\%  \\
SemEval-2015 (Restaurant) & 20.6\,\% & 42.6\,\% & 17.7\,\% & 11.5\,\% & 7.5\,\%  \\
SemEval-2016 (Restaurant) & 20.6\,\% & 43.6\,\% & 17.9\,\% & 10.8\,\% & 7.1\,\%  \\


#### Polarity Count

In [97]:
def get_polarity_counts(dataset):
    """Count how often each polarity (``tag["polarity"]``) occurs in ``dataset``.

    Items without a ``"tags"`` key contribute nothing.

    Returns:
        A ``Counter`` mapping polarity -> number of tags with that polarity.
    """
    polarity_counts = Counter()
    for item in dataset:
        for tag in item.get("tags", []):
            polarity_counts[tag["polarity"]] += 1
    return polarity_counts

In [98]:
# Print one LaTeX table row per dataset: the percentage of tags per polarity,
# in the column order POSITIVE, NEGATIVE, NEUTRAL.
for dataset_name, dataset_reviews in datasets.items():
    pol_counts = get_polarity_counts(dataset_reviews)
    n_aspects = sum(pol_counts.values())
    # The raw string keeps the LaTeX "\%" suffix without relying on an invalid escape
    # sequence; join() places " & " only between cells, so nothing has to be trimmed.
    pol_print = " & ".join(
        str(round(pol_counts[pol] * 100 / n_aspects, 1)) + r"\%"
        for pol in ["POSITIVE", "NEGATIVE", "NEUTRAL"]
    )

    print(dataset_name, "&", pol_print, "\\\\")

GERestaurant & 54.2\% & 41.6\% & 4.2\%  \\
SemEval-2015 (Restaurant) & 66.1\% & 30.0\% & 3.9\%  \\
SemEval-2016 (Restaurant) & 67.4\% & 28.3\% & 4.3\%  \\


#### Implicit vs Explicit Count

In [101]:
def get_phrase_type_counts(dataset):
    """Count how often each phrase type (``tag["type"]``) occurs in ``dataset``.

    Items without a ``"tags"`` key contribute nothing.

    Returns:
        A ``Counter`` mapping type -> number of tags of that type.
    """

    def iter_types():
        # Flatten the per-item tag lists into one stream of type values.
        for item in dataset:
            for tag in item.get("tags", []):
                yield tag["type"]

    return Counter(iter_types())

In [103]:
# Print one LaTeX table row per dataset: the percentage of implicit vs. explicit
# tags, in that column order.
for dataset_name, dataset_reviews in datasets.items():
    phrase_type_counts = get_phrase_type_counts(dataset_reviews)
    n_phrase_types = sum(phrase_type_counts.values())
    # The raw string keeps the LaTeX "\%" suffix without relying on an invalid escape
    # sequence; join() places " & " only between cells, so nothing has to be trimmed.
    phrase_type_print = " & ".join(
        str(round(phrase_type_counts[ptype] * 100 / n_phrase_types, 1)) + r"\%"
        for ptype in ["label-implicit", "label-explicit"]
    )

    print(dataset_name, "&", phrase_type_print, "\\\\")

GERestaurant & 27.0\% & 73.0\%  \\
SemEval-2015 (Restaurant) & 24.9\% & 75.1\%  \\
SemEval-2016 (Restaurant) & 24.8\% & 75.2\%  \\


#### Documents Occurring in Both SemEval-2015 and SemEval-2016

In [99]:
# Collect the "id" of every entry of the SemEval-2015 (A) and SemEval-2016 (B) lists.
idsA = [review["id"] for review in reviews_rest15]
idsB = [review["id"] for review in reviews_rest16]

In [100]:
# Assumes idsA and idsB (the ids collected from Rest15 and Rest16) are already defined

# Number of ids in each list (duplicates, if any, are counted)
anzahl_texte_A = len(idsA)
anzahl_texte_B = len(idsB)

# Number of distinct ids that occur in both lists
anzahl_gemeinsame_texte = len(set(idsA).intersection(idsB))

print("Anzahl der Texte in Rest15:", anzahl_texte_A)
print("Anzahl der Texte in Rest16:", anzahl_texte_B)
print("Anzahl der Texte, die sowohl in Rest15 als auch in Rest16 vorkommen:", anzahl_gemeinsame_texte)


Anzahl der Texte in Rest15: 1702
Anzahl der Texte in Rest16: 2384
Anzahl der Texte, die sowohl in Rest15 als auch in Rest16 vorkommen: 1700
