# Notebook: Split Dataset in folds

## Packages

In [1]:
import nltk
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import pandas as pd
import json

In [2]:
%%capture
# Download NLTK's 'punkt' package; word_tokenize is used further down for the
# word-count statistics and needs tokenizer data to be installed locally.
# NOTE(review): recent NLTK releases appear to look up 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Parameters

In [3]:
# Location of the JSON dataset that the "Load Data" cell reads.
DATASET_PATH = "dataset_total/filtered_dataset.json"

# Number of folds — presumably consumed by the fold-splitting step; it is not
# referenced in the statistics cells of this section (TODO confirm usage).
N_FOLDS = 5

# Seed value — presumably for any shuffling done when splitting; it is not
# referenced in the statistics cells of this section (TODO confirm usage).
RANDOM_STATE = 43

## Code

### Load Data

In [4]:
# Read the UTF-8 encoded file at DATASET_PATH and parse its JSON content into
# `dataset`; the following cells iterate over it as a sequence of entries.
with open(DATASET_PATH, mode='r', encoding='utf-8') as json_file:
    dataset = json.loads(json_file.read())

### Show most frequent Aspect Terms

In [5]:
# Collect the 'text' of every tag whose 'type' is 'label-explicit', across all entries.
explicit_terms = [
    tag['text']
    for entry in dataset
    for tag in entry['tags']
    if tag['type'] == 'label-explicit'
]
tag_count = Counter(explicit_terms)

# Show the ten most frequent terms together with their counts.
dict(tag_count.most_common(10))

{'Kellner': 334,
 'Personal': 333,
 'Service': 167,
 'Suppe': 167,
 'Uerige': 167,
 'Kuchen': 167,
 'Tortenauswahl': 167,
 'Interior Design': 167,
 'Ambiente': 167,
 'Getränke': 167}

In [6]:
# Maps each 'tag_with_polarity' value to a Counter of the term texts seen with it.
tag_most_common_terms = defaultdict(Counter)

for entry in dataset:
    for tag in entry['tags']:
        # Only tags whose 'type' is 'label-explicit' contribute a term.
        if tag['type'] != 'label-explicit':
            continue
        term = tag['text']
        tag_in_polarity = tag['tag_with_polarity']
        tag_most_common_terms[tag_in_polarity][term] += 1

# Flatten into one record per (aspect, term) pair, keeping at most the five
# most frequent terms of each aspect.
top_terms_data = []
for aspect, term_counts in tag_most_common_terms.items():
    for top_term, frequency in term_counts.most_common(5):
        top_terms_data.append({'Aspect': aspect, 'Term': top_term, 'Frequency': frequency})

pd.DataFrame(top_terms_data)

Unnamed: 0,Aspect,Term,Frequency
0,SERVICE-NEGATIVE,Kellner,334
1,SERVICE-NEGATIVE,Service,167
2,FOOD-POSITIVE,Suppe,167
3,FOOD-POSITIVE,Kuchen,167
4,FOOD-POSITIVE,Essen,166
5,SERVICE-NEUTRAL,Personal,167
6,AMBIENCE-NEGATIVE,Uerige,167
7,AMBIENCE-POSITIVE,Tortenauswahl,167
8,AMBIENCE-POSITIVE,Interior Design,167
9,AMBIENCE-POSITIVE,Ambiente,167


### Show most frequent Classes

In [19]:
# Tally the 'label' field of every tag, entry by entry.
tag_count = Counter()
for entry in dataset:
    tag_count.update(tag['label'] for tag in entry['tags'])

# Display the counts ordered by label name.
dict(sorted(tag_count.items()))

{'AMBIENCE': 668,
 'FOOD': 834,
 'GENERAL-IMPRESSION': 832,
 'PRICE': 166,
 'SERVICE': 1001}

In [18]:
# Gather the 'tag_with_polarity' value of every tag in the dataset, then count them.
polarity_tags = [tag['tag_with_polarity'] for entry in dataset for tag in entry['tags']]
tag_count = Counter(polarity_tags)

# Display the counts in ascending key order.
{name: tag_count[name] for name in sorted(tag_count)}

{'AMBIENCE-NEGATIVE': 167,
 'AMBIENCE-POSITIVE': 501,
 'FOOD-NEGATIVE': 167,
 'FOOD-NEUTRAL': 167,
 'FOOD-POSITIVE': 500,
 'GENERAL-IMPRESSION-NEGATIVE': 333,
 'GENERAL-IMPRESSION-POSITIVE': 499,
 'PRICE-NEGATIVE': 166,
 'SERVICE-NEGATIVE': 668,
 'SERVICE-NEUTRAL': 167,
 'SERVICE-POSITIVE': 166}

In [17]:
# Count how many tags carry each 'tag_with_polarity_and_type' value.
tag_count = Counter()
for entry in dataset:
    for tag in entry['tags']:
        tag_count[tag['tag_with_polarity_and_type']] += 1

# Display the counts ordered by key (keys are unique, so sorting on the key
# alone gives the same order as sorting the (key, count) pairs).
dict(sorted(tag_count.items(), key=lambda item: item[0]))

{'AMBIENCE-NEGATIVE-explicit': 167,
 'AMBIENCE-POSITIVE-explicit': 501,
 'FOOD-NEGATIVE-explicit': 167,
 'FOOD-NEUTRAL-explicit': 167,
 'FOOD-POSITIVE-explicit': 500,
 'GENERAL-IMPRESSION-NEGATIVE-no-phrase-implicit': 333,
 'GENERAL-IMPRESSION-POSITIVE-explicit': 166,
 'GENERAL-IMPRESSION-POSITIVE-no-phrase-implicit': 333,
 'PRICE-NEGATIVE-explicit': 166,
 'SERVICE-NEGATIVE-explicit': 501,
 'SERVICE-NEGATIVE-no-phrase-implicit': 167,
 'SERVICE-NEUTRAL-explicit': 167,
 'SERVICE-POSITIVE-explicit': 166}

### Most frequent polarity

In [10]:
# Flatten all tags of all entries, then count their 'polarity' values.
all_tags = [tag for entry in dataset for tag in entry['tags']]
tag_count = Counter(tag['polarity'] for tag in all_tags)

# Display the counts in ascending polarity-name order.
{polarity: tag_count[polarity] for polarity in sorted(tag_count)}

{'NEGATIVE': 1501, 'NEUTRAL': 334, 'POSITIVE': 1666}

In [21]:
# Count 'polarity' values, restricted to tags whose 'type' is 'label-explicit'.
tag_count = Counter()
for entry in dataset:
    for tag in entry['tags']:
        if tag['type'] != 'label-explicit':
            continue
        tag_count[tag['polarity']] += 1

# Display the counts ordered by polarity name.
dict(sorted(tag_count.items()))

{'NEGATIVE': 1001, 'NEUTRAL': 334, 'POSITIVE': 1333}

In [23]:
# Collect the 'polarity' of every tag whose 'type' is 'label-implicit', then count.
implicit_polarities = [
    tag['polarity']
    for entry in dataset
    for tag in entry['tags']
    if tag['type'] == 'label-implicit'
]
tag_count = Counter(implicit_polarities)

# Display the counts in ascending polarity-name order.
{polarity: tag_count[polarity] for polarity in sorted(tag_count)}

{'NEGATIVE': 500, 'POSITIVE': 333}

### Average number of tags per example

In [24]:
# Mean number of tags per entry: total tag count divided by the number of entries.
total_tags = sum(len(entry['tags']) for entry in dataset)
total_entries = len(dataset)
total_tags / total_entries

1.4004

In [25]:
# Collect the 'type' of every tag once, then derive both per-entry averages from it.
tag_types = [tag['type'] for entry in dataset for tag in entry['tags']]
entry_total = len(dataset)

# Each average is the number of tags of that type divided by the number of entries.
average_explicit_tags = tag_types.count('label-explicit') / entry_total
average_implicit_tags = tag_types.count('label-implicit') / entry_total

print(f"AVG label-explicit: {average_explicit_tags}")
print(f"AVG label-implicit: {average_implicit_tags}")

AVG label-explicit: 1.0672
AVG label-implicit: 0.3332


### Word count per example (mean and standard deviation)

In [26]:
# Token count of each entry's text, using NLTK's German tokenizer settings.
# NOTE(review): word_tokenize presumably emits punctuation as separate tokens,
# so these counts may include punctuation — confirm if pure word counts are wanted.
word_counts = []
for entry in dataset:
    tokens = word_tokenize(entry['text'], language='german')
    word_counts.append(len(tokens))

# np.std is called with its defaults here (no ddof argument is passed).
average_word_count = np.mean(word_counts)
std_word_count = np.std(word_counts)

print(f"MEAN: {average_word_count:.2f}")
print(f"SD: {std_word_count:.2f}")

MEAN: 13.48
SD: 8.87
