# Notebook: Split Dataset in folds

## Packages

In [1]:
from sklearn.model_selection import StratifiedKFold
from collections import Counter
import numpy as np
import random
import json

## Parameters

In [2]:
RANDOM_STATE = 43
DATASET_PATH = "dataset_total/filtered_dataset.json"
N_FOLDS = 5

In [3]:
random.seed(RANDOM_STATE)

In [4]:
ASPECTS = ["SERVICE", "FOOD", "GENERAL-IMPRESSION", "AMBIENCE", "PRICE"]
POLARITIES = ["POSITIVE", "NEGATIVE", "NEUTRAL"]
COMBINATIONS = [aspect for aspect in ASPECTS]
COMBINATIONS

['SERVICE', 'FOOD', 'GENERAL-IMPRESSION', 'AMBIENCE', 'PRICE']

## Code

### Load Data

In [5]:
with open(DATASET_PATH, 'r', encoding='utf-8') as json_file:
    dataset = json.load(json_file)

In [6]:
labels_one_hot = []
for i in range(len(dataset)):
    tags_in_example = list(set([tag["label"] for tag in dataset[i]["tags"]]))
    #print(tags_in_example)
    one_hot_encoded_combination = np.array([1 if tag in tags_in_example else 0 for tag in COMBINATIONS])
    #print(one_hot_encoded_combination)
    labels_one_hot.append(one_hot_encoded_combination)

In [7]:
labels_one_hot[:40]

[array([1, 1, 0, 0, 0]),
 array([1, 0, 0, 0, 0]),
 array([1, 0, 0, 1, 0]),
 array([0, 0, 1, 0, 0]),
 array([0, 1, 0, 1, 0]),
 array([0, 0, 1, 0, 0]),
 array([1, 0, 0, 0, 0]),
 array([0, 0, 0, 1, 0]),
 array([1, 1, 0, 0, 0]),
 array([0, 1, 0, 0, 0]),
 array([0, 0, 1, 0, 0]),
 array([0, 0, 0, 0, 1]),
 array([1, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0]),
 array([0, 0, 1, 0, 0]),
 array([1, 1, 0, 0, 0]),
 array([1, 0, 0, 0, 0]),
 array([1, 0, 0, 1, 0]),
 array([0, 0, 1, 0, 0]),
 array([0, 1, 0, 1, 0]),
 array([0, 0, 1, 0, 0]),
 array([1, 0, 0, 0, 0]),
 array([0, 0, 0, 1, 0]),
 array([1, 1, 0, 0, 0]),
 array([0, 1, 0, 0, 0]),
 array([0, 0, 1, 0, 0]),
 array([0, 0, 0, 0, 1]),
 array([1, 0, 0, 0, 0]),
 array([0, 1, 0, 0, 0]),
 array([0, 0, 1, 0, 0]),
 array([1, 1, 0, 0, 0]),
 array([1, 0, 0, 0, 0]),
 array([1, 0, 0, 1, 0]),
 array([0, 0, 1, 0, 0]),
 array([0, 1, 0, 1, 0]),
 array([0, 0, 1, 0, 0]),
 array([1, 0, 0, 0, 0]),
 array([0, 0, 0, 1, 0]),
 array([1, 1, 0, 0, 0]),
 array([0, 1, 0, 0, 0])]

In [7]:
unique_combinations = [''.join(map(str, row)) for row in labels_one_hot]
string_to_number = {string: i for i, string in enumerate(set(unique_combinations))}
labels_as_numbers = [string_to_number[string] for string in unique_combinations]

In [9]:
labels_as_numbers[:20]

[5, 7, 4, 3, 0, 3, 7, 1, 5, 6, 3, 2, 7, 6, 3, 5, 7, 4, 3, 0]

### Split

In [18]:
skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=RANDOM_STATE)

In [19]:
idx = 0
for train_index, test_index in skf.split(dataset, labels_as_numbers):
    test_dataset = [dataset[i] for i in test_index]

    print(len(test_dataset), Counter([tag["label"] for example in test_dataset for tag in example["tags"]]))
    
    with open(f"../07 train classifier/real/split_{idx}.json", 'w', encoding='utf-8') as split_file:
        json.dump(test_dataset, split_file, ensure_ascii=False)
        
    idx+=1

500 Counter({'SERVICE': 201, 'FOOD': 166, 'GENERAL-IMPRESSION': 164, 'AMBIENCE': 134, 'PRICE': 33})
500 Counter({'SERVICE': 200, 'GENERAL-IMPRESSION': 171, 'FOOD': 167, 'AMBIENCE': 132, 'PRICE': 33})
500 Counter({'SERVICE': 200, 'FOOD': 168, 'GENERAL-IMPRESSION': 157, 'AMBIENCE': 134, 'PRICE': 33})
500 Counter({'SERVICE': 200, 'FOOD': 168, 'GENERAL-IMPRESSION': 165, 'AMBIENCE': 134, 'PRICE': 33})
500 Counter({'SERVICE': 200, 'GENERAL-IMPRESSION': 175, 'FOOD': 165, 'AMBIENCE': 134, 'PRICE': 34})
