# Notebook: Create 5 Splits

## Packages

In [1]:
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from collections import Counter
import numpy as np
import random
import json

## Parameters

In [2]:
DATASET_PATH = "dataset_total/total_dataset.json"
N_FOLDS = 5

In [3]:
CRITERIA_RS = "tag_with_polarity_and_type"
ASPECTS = ["SERVICE", "FOOD", "GENERAL-IMPRESSION", "AMBIENCE", "PRICE"]
POLARITIES = ["POSITIVE", "NEGATIVE", "NEUTRAL"]
MENTIONING_TYPE = ["implicit", "explicit"]
COMBINATIONS = [f"{aspect}-{polarity}" for aspect in ASPECTS for polarity in POLARITIES]
COMBINATIONS

['SERVICE-POSITIVE',
 'SERVICE-NEGATIVE',
 'SERVICE-NEUTRAL',
 'FOOD-POSITIVE',
 'FOOD-NEGATIVE',
 'FOOD-NEUTRAL',
 'GENERAL-IMPRESSION-POSITIVE',
 'GENERAL-IMPRESSION-NEGATIVE',
 'GENERAL-IMPRESSION-NEUTRAL',
 'AMBIENCE-POSITIVE',
 'AMBIENCE-NEGATIVE',
 'AMBIENCE-NEUTRAL',
 'PRICE-POSITIVE',
 'PRICE-NEGATIVE',
 'PRICE-NEUTRAL']

## Code

### Load Data

In [4]:
with open(DATASET_PATH, 'r', encoding='utf-8') as json_file:
    dataset = json.load(json_file)

In [5]:
[tag[CRITERIA_RS] for tag in dataset[1]["tags"]]

['FOOD-NEGATIVE-explicit']

In [6]:
labels_one_hot = []
for i in range(len(dataset)):
    tags_in_example = list(set([tag[CRITERIA_RS] for tag in dataset[i]["tags"]]))
    #print(tags_in_example)
    one_hot_encoded_combination = np.array([1 if tag in tags_in_example else 0 for tag in COMBINATIONS])
    #print(one_hot_encoded_combination)
    labels_one_hot.append(one_hot_encoded_combination)

In [7]:
labels_one_hot[:4]

[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])]

### Split

In [8]:
random_state = 0
found_balanced_split = False

while found_balanced_split == False:
    mskf = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    split_sizes = []
    idx = 0
    for train_index, test_index in mskf.split(dataset, labels_one_hot):
        test_dataset = [dataset[i] for i in test_index]
        print(len(test_dataset), Counter(
            [tag["label"] for example in test_dataset for tag in example["tags"]]))
        split_sizes.append(len(test_dataset))
        with open(f"../07 train classifier/real/split_{idx}.json", 'w', encoding='utf-8') as split_file:
            json.dump(test_dataset, split_file, ensure_ascii=False)
        idx += 1

    if any(item != 500 for item in split_sizes) == False:
        print(split_sizes, random_state)
        found_balanced_split = True
    random_state += 1

500 Counter({'FOOD': 274, 'SERVICE': 169, 'GENERAL-IMPRESSION': 137, 'AMBIENCE': 80, 'PRICE': 60})
500 Counter({'FOOD': 299, 'SERVICE': 194, 'GENERAL-IMPRESSION': 113, 'AMBIENCE': 85, 'PRICE': 40})
500 Counter({'FOOD': 276, 'SERVICE': 173, 'GENERAL-IMPRESSION': 121, 'AMBIENCE': 75, 'PRICE': 29})
500 Counter({'FOOD': 302, 'SERVICE': 157, 'GENERAL-IMPRESSION': 132, 'AMBIENCE': 81, 'PRICE': 43})
500 Counter({'FOOD': 273, 'SERVICE': 162, 'GENERAL-IMPRESSION': 133, 'AMBIENCE': 74, 'PRICE': 43})
[500, 500, 500, 500, 500] 0
