# Notebook: Split Dataset into Folds

## Packages

In [37]:
import nltk
from collections import defaultdict, Counter
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
import pandas as pd
import json

In [38]:
%%capture
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Parameters

In [39]:
# Notebook-wide configuration constants.
# presumably the seed for the fold split; not used in the cells shown here — TODO confirm
RANDOM_STATE = 43
# path of the JSON file that is loaded into `dataset` in the "Load Data" cell
DATASET_PATH = "dataset_total/filtered_dataset.json"
# presumably the number of folds to create (see notebook title) — TODO confirm
N_FOLDS = 5

## Code

### Load Data

In [40]:
# Read the complete annotated dataset from disk (UTF-8 encoded JSON).
with open(DATASET_PATH, mode='r', encoding='utf-8') as dataset_file:
    dataset = json.loads(dataset_file.read())

In [41]:
# Show the first record to inspect the fields available per entry.
# NOTE(review): the rendered record includes author_name and author_location —
# consider clearing this output before sharing the notebook.
dataset[:1]

[{'tags': [{'end': 12,
    'start': 4,
    'tag_with_polarity': 'SERVICE-POSITIVE',
    'tag_with_polarity_and_type': 'SERVICE-POSITIVE-explicit',
    'text': 'Personal',
    'type': 'label-explicit',
    'label': 'SERVICE',
    'polarity': 'POSITIVE'}],
  'text': 'Das Personal freundlich.',
  'aspect_available_without_judgement': False,
  'two_or_more_sentences': False,
  'city': 'leipzig',
  'date': '2022-11-07',
  'title': 'Schönes Ambiente',
  'rating': 4.0,
  'review_id': 867609212,
  'page_index': 1,
  'author_name': 'schneiderrebecca',
  'sentence_idx': 4,
  'language_code': 'de',
  'restaurant_id': 743219,
  'author_location': 'Würzburg, Deutschland',
  'restaurant_name': 'Cafe Riquet',
  'detected_language': 'de',
  'text_noanonymization': 'Ich war gegen 17:30 Uhr in dem Kaffeehaus. Die Auswahl der Kuchen & Torten war dementsprechend nicht mehr sehr groß. Dennoch waren der Amerikano und der Stachelbeere-Baiser Kuchen gut. Die Räumlichkeit ist sehr schön. Das Personal freundlic

### Show most frequent Terms

In [42]:
# Count how often each annotated surface form (tag text) occurs in the dataset
# and display the ten most frequent ones.
tag_count = Counter()
for entry in dataset:
    tag_count.update(tag['text'] for tag in entry['tags'])
dict(tag_count.most_common(10))

{'Service': 294,
 'Essen': 294,
 'Personal': 148,
 'Speisen': 147,
 'Bier': 147,
 'Gang': 147,
 'Restaurant': 147,
 'Ausblick auf der Terrasse': 147,
 'Küche': 147,
 'Preis': 147}

In [43]:
# Term frequencies grouped by the 'tag_with_polarity' value of each tag.
tag_most_common_terms = defaultdict(Counter)

for entry in dataset:
    for tag in entry['tags']:
        surface_form, aspect_polarity = tag['text'], tag['tag_with_polarity']
        tag_most_common_terms[aspect_polarity][surface_form] += 1

# One table row per (aspect-polarity, term) pair, limited to the five most
# frequent terms of every aspect-polarity group.
top_terms_data = []
for aspect_polarity, term_counts in tag_most_common_terms.items():
    for term, count in term_counts.most_common(5):
        top_terms_data.append({'Aspect': aspect_polarity, 'Term': term, 'Frequency': count})

pd.DataFrame(top_terms_data)

Unnamed: 0,Aspect,Term,Frequency
0,SERVICE-POSITIVE,Service,294
1,SERVICE-POSITIVE,Personal,148
2,FOOD-POSITIVE,Speisen,147
3,FOOD-POSITIVE,Bier,147
4,FOOD-POSITIVE,Gang,147
5,FOOD-POSITIVE,Küche,147
6,FOOD-POSITIVE,Rose,147
7,GENERAL-IMPRESSION-POSITIVE,Restaurant,147
8,AMBIENT-INTERIOR-POSITIVE,Ausblick auf der Terrasse,147
9,AMBIENT-INTERIOR-POSITIVE,<LOC>,147


### Show most frequent Classes

In [44]:
# Number of tags per aspect label, displayed in alphabetical label order.
tag_count = Counter()
for entry in dataset:
    tag_count.update(tag['label'] for tag in entry['tags'])
{label: tag_count[label] for label in sorted(tag_count)}

{'AMBIENT-INTERIOR': 588,
 'FOOD': 1764,
 'GENERAL-IMPRESSION': 441,
 'PRICE': 147,
 'SERVICE': 442}

In [45]:
# Number of tags per aspect-polarity combination, displayed in alphabetical order.
tag_count = Counter()
for entry in dataset:
    tag_count.update(tag['tag_with_polarity'] for tag in entry['tags'])
{label: tag_count[label] for label in sorted(tag_count)}

{'AMBIENT-INTERIOR-NEGATIVE': 147,
 'AMBIENT-INTERIOR-NEUTRAL': 147,
 'AMBIENT-INTERIOR-POSITIVE': 294,
 'FOOD-NEGATIVE': 882,
 'FOOD-POSITIVE': 882,
 'GENERAL-IMPRESSION-NEGATIVE': 147,
 'GENERAL-IMPRESSION-NEUTRAL': 147,
 'GENERAL-IMPRESSION-POSITIVE': 147,
 'PRICE-NEGATIVE': 147,
 'SERVICE-POSITIVE': 442}

In [46]:
# Number of tags per aspect-polarity-type combination, displayed in alphabetical order.
tag_count = Counter()
for entry in dataset:
    tag_count.update(tag['tag_with_polarity_and_type'] for tag in entry['tags'])
{label: tag_count[label] for label in sorted(tag_count)}

{'AMBIENT-INTERIOR-NEGATIVE-explicit': 147,
 'AMBIENT-INTERIOR-NEUTRAL-explicit': 147,
 'AMBIENT-INTERIOR-POSITIVE-explicit': 294,
 'FOOD-NEGATIVE-explicit': 882,
 'FOOD-POSITIVE-explicit': 882,
 'GENERAL-IMPRESSION-NEGATIVE-no-phrase-implicit': 147,
 'GENERAL-IMPRESSION-NEUTRAL-explicit': 147,
 'GENERAL-IMPRESSION-POSITIVE-explicit': 147,
 'PRICE-NEGATIVE-explicit': 147,
 'SERVICE-POSITIVE-explicit': 442}

### Most frequent polarity

In [47]:
# Number of tags per polarity value, displayed in alphabetical order.
tag_count = Counter()
for entry in dataset:
    tag_count.update(tag['polarity'] for tag in entry['tags'])
{polarity: tag_count[polarity] for polarity in sorted(tag_count)}

{'NEGATIVE': 1323, 'NEUTRAL': 294, 'POSITIVE': 1765}

### Average number of tags

In [48]:
# Mean number of tags per dataset entry (total tags / number of entries).
tags_per_entry = [len(entry['tags']) for entry in dataset]
sum(tags_per_entry) / len(dataset)

1.3528

In [49]:
# Collect the 'type' of every tag once, then derive the per-entry averages for
# the two tag types from that single list.
tag_types = [tag['type'] for entry in dataset for tag in entry['tags']]
average_explicit_tags = tag_types.count('label-explicit') / len(dataset)
average_implicit_tags = tag_types.count('label-implicit') / len(dataset)

print(f"AVG label-explicit: {average_explicit_tags}")
print(f"AVG label-implicit: {average_implicit_tags}")

AVG label-explicit: 1.294
AVG label-implicit: 0.0588


### Average number of words

In [50]:
# Token count of every entry text, using NLTK's German tokenizer settings.
# NOTE(review): word_tokenize emits punctuation marks as separate tokens, so
# these counts presumably include punctuation — confirm this is intended.
word_counts = []
for entry in dataset:
    tokens = word_tokenize(entry['text'], language='german')
    word_counts.append(len(tokens))

# np.std defaults to ddof=0, i.e. the population standard deviation.
average_word_count = np.mean(word_counts)
std_word_count = np.std(word_counts)

print(f"MEAN: {average_word_count:.2f}")
print(f"SD: {std_word_count:.2f}")

MEAN: 11.06
SD: 8.53
