# Notebook: Split Dataset in folds

## Packages

In [49]:
import nltk
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import pandas as pd
import json

In [50]:
%%capture
# Download NLTK's 'punkt' package; word_tokenize (imported above) presumably
# relies on it for tokenization — TODO confirm for the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Parameters

In [51]:
# Seed value; not used in the visible part of the notebook — presumably keeps
# the fold split reproducible (TODO confirm).
RANDOM_STATE = 43
# Relative path of the JSON file that the "Load Data" cell reads the dataset from.
DATASET_PATH = "dataset_total/filtered_dataset.json"
# Presumably the number of folds the dataset is split into — TODO confirm;
# not used in the visible part of the notebook.
N_FOLDS = 5

## Code

### Load Data

In [52]:
# Read the dataset file as UTF-8 text and parse the JSON document into `dataset`.
with open(DATASET_PATH, encoding='utf-8') as dataset_file:
    dataset = json.loads(dataset_file.read())

### Show most frequent Terms

In [53]:
# Tally how often each term text occurs among the tags of type
# 'label-explicit', then display the ten most frequent terms.
tag_count = Counter()
for entry in dataset:
    for tag in entry['tags']:
        if tag['type'] == 'label-explicit':
            tag_count[tag['text']] += 1
dict(tag_count.most_common(10))

{'Fleisch': 416,
 'Sauce': 209,
 'Steinpilze': 209,
 'Service': 209,
 'Speisen': 209,
 'Preis-Leistungsverhältnis': 209,
 'Verkauf': 209,
 'Preis': 208,
 'Fisch': 208,
 'Meeresfrüchten': 208}

In [54]:
# Per 'tag_with_polarity' class: a Counter of the term texts of its explicit tags.
tag_most_common_terms = defaultdict(Counter)

for entry in dataset:
    explicit_tags = [tag for tag in entry['tags'] if tag['type'] == 'label-explicit']
    for tag in explicit_tags:
        term_text = tag['text']
        tag_most_common_terms[tag['tag_with_polarity']][term_text] += 1

# Flatten into one record per (class, term), keeping at most the five most
# frequent terms of every class.
top_terms_data = []
for aspect, term_counts in tag_most_common_terms.items():
    for term, count in term_counts.most_common(5):
        top_terms_data.append({'Aspect': aspect, 'Term': term, 'Frequency': count})

pd.DataFrame(top_terms_data)

Unnamed: 0,Aspect,Term,Frequency
0,FOOD-NEGATIVE,Sauce,209
1,FOOD-NEGATIVE,Fleisch,208
2,FOOD-NEGATIVE,Nudeln,208
3,FOOD-NEUTRAL,Steinpilze,209
4,FOOD-NEUTRAL,Kaffee,208
5,FOOD-NEUTRAL,Dessert,208
6,FOOD-NEUTRAL,Essen,208
7,FOOD-NEUTRAL,Pizza,208
8,SERVICE-POSITIVE,Service,209
9,FOOD-POSITIVE,Speisen,209


### Show most frequent Classes

In [55]:
# Count the tags per 'label' value and display the counts ordered by label.
tag_count = Counter()
for entry in dataset:
    tag_count.update(tag['label'] for tag in entry['tags'])
{label: tag_count[label] for label in sorted(tag_count)}

{'AMBIENCE': 208,
 'FOOD': 2707,
 'GENERAL-IMPRESSION': 416,
 'PRICE': 1042,
 'SERVICE': 209}

In [56]:
# Count the tags per 'tag_with_polarity' value and display the counts
# ordered by that value.
tag_count = Counter()
for entry in dataset:
    for tag in entry['tags']:
        tag_count[tag['tag_with_polarity']] += 1
dict(sorted(tag_count.items()))

{'AMBIENCE-NEGATIVE': 208,
 'FOOD-NEGATIVE': 625,
 'FOOD-NEUTRAL': 1041,
 'FOOD-POSITIVE': 1041,
 'GENERAL-IMPRESSION-NEGATIVE': 208,
 'GENERAL-IMPRESSION-POSITIVE': 208,
 'PRICE-NEGATIVE': 834,
 'PRICE-NEUTRAL': 208,
 'SERVICE-POSITIVE': 209}

In [57]:
# Count the tags per 'tag_with_polarity_and_type' value and display the
# counts ordered by that value.
tag_count = Counter()
for entry in dataset:
    tag_count.update(tag['tag_with_polarity_and_type'] for tag in entry['tags'])
sorted_counts = sorted(tag_count.items())
dict(sorted_counts)

{'AMBIENCE-NEGATIVE-explicit': 208,
 'FOOD-NEGATIVE-explicit': 625,
 'FOOD-NEUTRAL-explicit': 1041,
 'FOOD-POSITIVE-explicit': 1041,
 'GENERAL-IMPRESSION-NEGATIVE-no-phrase-implicit': 208,
 'GENERAL-IMPRESSION-POSITIVE-explicit': 208,
 'PRICE-NEGATIVE-explicit': 834,
 'PRICE-NEUTRAL-no-phrase-implicit': 208,
 'SERVICE-POSITIVE-explicit': 209}

### Most frequent polarity

In [58]:
# Count the tags per 'polarity' value and display the counts ordered by polarity.
tag_count = Counter()
for entry in dataset:
    for tag in entry['tags']:
        tag_count[tag['polarity']] += 1
{polarity: tag_count[polarity] for polarity in sorted(tag_count)}

{'NEGATIVE': 1875, 'NEUTRAL': 1249, 'POSITIVE': 1458}

### Average number of tags per entry

In [59]:
# Mean number of tags per dataset entry: total tag count divided by entry count.
total_tag_count = sum(map(len, (entry['tags'] for entry in dataset)))
total_tag_count / len(dataset)

1.8328

In [60]:
# Collect the 'type' of every tag once, then relate the number of explicit and
# implicit tags to the number of dataset entries.
tag_types = [tag['type'] for entry in dataset for tag in entry['tags']]
average_explicit_tags = tag_types.count('label-explicit') / len(dataset)
average_implicit_tags = tag_types.count('label-implicit') / len(dataset)

print(f"AVG label-explicit: {average_explicit_tags}")
print(f"AVG label-implicit: {average_implicit_tags}")

AVG label-explicit: 1.6664
AVG label-implicit: 0.1664


### Average number of words per entry

In [61]:
# Token count of every entry's text, tokenized with the German tokenizer settings.
word_counts = []
for entry in dataset:
    tokens = word_tokenize(entry['text'], language='german')
    word_counts.append(len(tokens))

# Mean and standard deviation (numpy default, i.e. ddof=0) of the token counts.
average_word_count = np.mean(word_counts)
std_word_count = np.std(word_counts)

print(f"MEAN: {average_word_count:.2f}")
print(f"SD: {std_word_count:.2f}")

MEAN: 14.25
SD: 7.24
