# Notebook: Create 5 Splits

## Packages

In [41]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from collections import Counter
import numpy as np
import random
import json

## Parameters

In [42]:
# Relative path of the JSON file holding the full (filtered) dataset to be split.
DATASET_PATH = "dataset_total/filtered_dataset.json"
# Number of folds (split files) to create.
N_FOLDS = 5

In [43]:
# Key of the per-tag field whose value is used as the stratification label.
CRITERIA_RS = "tag_with_polarity_and_type"

# Building blocks of the label space.
ASPECTS = ["SERVICE", "FOOD", "GENERAL-IMPRESSION", "AMBIENCE", "PRICE"]
POLARITIES = ["POSITIVE", "NEGATIVE", "NEUTRAL"]
MENTIONING_TYPE = ["implicit", "explicit"]

# Every "<aspect>-<polarity>-<mentioning type>" label, ordered aspect-major,
# then polarity, then mentioning type. The position of a label in this list
# is its column index in the one-hot vectors built from it.
COMBINATIONS = [
    "-".join((aspect_name, polarity_name, mention_kind))
    for aspect_name in ASPECTS
    for polarity_name in POLARITIES
    for mention_kind in MENTIONING_TYPE
]
COMBINATIONS

['SERVICE-POSITIVE-implicit',
 'SERVICE-POSITIVE-explicit',
 'SERVICE-NEGATIVE-implicit',
 'SERVICE-NEGATIVE-explicit',
 'SERVICE-NEUTRAL-implicit',
 'SERVICE-NEUTRAL-explicit',
 'FOOD-POSITIVE-implicit',
 'FOOD-POSITIVE-explicit',
 'FOOD-NEGATIVE-implicit',
 'FOOD-NEGATIVE-explicit',
 'FOOD-NEUTRAL-implicit',
 'FOOD-NEUTRAL-explicit',
 'GENERAL-IMPRESSION-POSITIVE-implicit',
 'GENERAL-IMPRESSION-POSITIVE-explicit',
 'GENERAL-IMPRESSION-NEGATIVE-implicit',
 'GENERAL-IMPRESSION-NEGATIVE-explicit',
 'GENERAL-IMPRESSION-NEUTRAL-implicit',
 'GENERAL-IMPRESSION-NEUTRAL-explicit',
 'AMBIENCE-POSITIVE-implicit',
 'AMBIENCE-POSITIVE-explicit',
 'AMBIENCE-NEGATIVE-implicit',
 'AMBIENCE-NEGATIVE-explicit',
 'AMBIENCE-NEUTRAL-implicit',
 'AMBIENCE-NEUTRAL-explicit',
 'PRICE-POSITIVE-implicit',
 'PRICE-POSITIVE-explicit',
 'PRICE-NEGATIVE-implicit',
 'PRICE-NEGATIVE-explicit',
 'PRICE-NEUTRAL-implicit',
 'PRICE-NEUTRAL-explicit']

## Code

### Load Data

In [44]:
# Read the whole dataset file and parse it as JSON into `dataset`.
with open(DATASET_PATH, mode='r', encoding='utf-8') as dataset_file:
    dataset = json.loads(dataset_file.read())

In [45]:
# Sanity check: show the stratification labels attached to the second example.
[tag_entry[CRITERIA_RS] for tag_entry in dataset[1]["tags"]]

['SERVICE-NEGATIVE-explicit']

In [46]:
# Build one multi-hot vector per example: position k is 1 when the example
# carries at least one tag whose CRITERIA_RS value equals COMBINATIONS[k].
# Tag values that do not appear in COMBINATIONS contribute nothing, so an
# example can end up with an all-zero vector.
labels_one_hot = []
for example in dataset:
    # A set collapses duplicate tags and gives O(1) membership tests below.
    tags_in_example = {tag[CRITERIA_RS] for tag in example["tags"]}
    one_hot_encoded_combination = np.array(
        [1 if combination in tags_in_example else 0 for combination in COMBINATIONS]
    )
    labels_one_hot.append(one_hot_encoded_combination)

In [47]:
# Preview the encoded label vectors of the first four examples.
labels_one_hot[:4]

[array([0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])]

### Split

In [48]:
# Search for a shuffle seed whose multilabel-stratified folds are all the same
# size (or differ by at most one example when len(dataset) is not divisible by
# N_FOLDS), then write each fold of that seed to its own JSON file.
SPLIT_DIR = "../07 train classifier/real"
MAX_SEED_ATTEMPTS = 1000  # safety cap so an unsatisfiable balance check cannot loop forever

random_state = 0
found_balanced_split = False

while not found_balanced_split:
    if random_state >= MAX_SEED_ATTEMPTS:
        raise RuntimeError(
            f"No balanced {N_FOLDS}-fold split found for seeds 0..{MAX_SEED_ATTEMPTS - 1}"
        )

    mskf = MultilabelStratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=random_state)

    # Materialise every fold first; nothing is printed or written to disk
    # until the seed has been accepted, so rejected seeds leave no files behind.
    folds = [
        [dataset[i] for i in test_index]
        for _, test_index in mskf.split(dataset, labels_one_hot)
    ]
    split_sizes = [len(fold) for fold in folds]

    # Since the fold sizes always sum to len(dataset), "max - min <= 1" means
    # "all folds equal" whenever the dataset size is divisible by N_FOLDS.
    if max(split_sizes) - min(split_sizes) <= 1:
        for idx, test_dataset in enumerate(folds):
            # Report fold size and how often each tag label occurs in the fold.
            print(len(test_dataset), Counter(
                [tag["label"] for example in test_dataset for tag in example["tags"]]))
            with open(f"{SPLIT_DIR}/split_{idx}.json", 'w', encoding='utf-8') as split_file:
                json.dump(test_dataset, split_file, ensure_ascii=False)
        # The second printed value is the seed that produced the written files.
        print(split_sizes, random_state)
        found_balanced_split = True
    random_state += 1

500 Counter({'SERVICE': 200, 'FOOD': 167, 'GENERAL-IMPRESSION': 164, 'AMBIENCE': 131, 'PRICE': 33})
500 Counter({'SERVICE': 203, 'FOOD': 167, 'GENERAL-IMPRESSION': 166, 'AMBIENCE': 139, 'PRICE': 34})
500 Counter({'SERVICE': 198, 'GENERAL-IMPRESSION': 174, 'FOOD': 168, 'AMBIENCE': 134, 'PRICE': 33})
500 Counter({'SERVICE': 203, 'FOOD': 166, 'GENERAL-IMPRESSION': 160, 'AMBIENCE': 128, 'PRICE': 33})
500 Counter({'SERVICE': 197, 'GENERAL-IMPRESSION': 168, 'FOOD': 166, 'AMBIENCE': 136, 'PRICE': 33})
[500, 500, 500, 500, 500] 0
