In [1]:
import json
from collections import Counter, defaultdict
import random
import numpy as np
from utils import *

In [2]:
def create_word_dictionaries():
    """Build the vocabulary lookups and word frequencies for the dataset.

    Reads DATASET_PATH line by line, parsing each line as a JSON object whose
    'review_sentences' entry is iterated as pairs; the second element of each
    pair is the sentence text. Every whitespace-separated token is normalised
    with clean_word() and then registered in the vocabulary.

    The three resulting structures are written with save_pickle() to
    "word_to_id.pickle", "id_to_word.pickle" and "word_count.pickle".

    Returns:
        tuple: (word_to_id, id_to_word, word_count) where
            word_to_id maps cleaned word -> integer id,
            id_to_word is the inverse mapping,
            word_count is a Counter of cleaned-word occurrences.
    """
    word_to_id = {}
    id_to_word = {}
    word_count = Counter()
    with open(DATASET_PATH) as json_file:
        for line in json_file:
            data_dict = json.loads(line)
            for _, sentence in data_dict['review_sentences']:
                for raw_word in sentence.split():
                    # Clean BEFORE the membership test: several raw tokens can
                    # clean to the same word, and testing the raw token would
                    # hand that word a second id (orphaning the first one in
                    # id_to_word) and split its count across raw/clean keys.
                    word = clean_word(raw_word)
                    if word not in word_to_id:
                        # Ids are handed out contiguously, so the next free id
                        # is always the current vocabulary size.
                        new_id = len(word_to_id)
                        word_to_id[word] = new_id
                        id_to_word[new_id] = word
                    word_count[word] += 1
    # Append the special token after all corpus words. Skip it if the corpus
    # already produced that exact token, so the two mappings stay inverses.
    if SPECIAL_CHARACTER not in word_to_id:
        special_id = len(word_to_id)
        word_to_id[SPECIAL_CHARACTER] = special_id
        id_to_word[special_id] = SPECIAL_CHARACTER
    save_pickle(word_to_id, "word_to_id.pickle")
    save_pickle(id_to_word, "id_to_word.pickle")
    save_pickle(word_count, "word_count.pickle")
    return word_to_id, id_to_word, word_count

In [3]:
def fetch_samples():
    """Load every record from DATASET_PATH, partitioned by spoiler flag.

    Each line of the file is parsed as a JSON object. Records whose
    'has_spoiler' value is truthy go to the first list, all others to the
    second. Both lists are shuffled in place before being returned.

    Returns:
        tuple: (true_samples, false_samples), each a shuffled list of dicts.
    """
    # Keyed by the truthiness of 'has_spoiler'.
    by_label = {True: [], False: []}
    with open(DATASET_PATH) as json_file:
        for raw_line in json_file:
            record = json.loads(raw_line)
            by_label[bool(record['has_spoiler'])].append(record)

    spoilers = by_label[True]
    non_spoilers = by_label[False]
    # Shuffle the positive class first, then the negative class.
    for group in (spoilers, non_spoilers):
        random.shuffle(group)
    return spoilers, non_spoilers


def split_samples(samples, rat1, rat2):
    """Cut a sequence into three consecutive chunks by ratio.

    Args:
        samples: sliceable sequence to split (order is preserved).
        rat1: fraction of len(samples) that goes into the first chunk.
        rat2: fraction of len(samples) that goes into the second chunk.

    Returns:
        tuple: (first, second, rest). Chunk sizes are truncated toward zero,
        so any rounding remainder ends up in ``rest``.
    """
    # Pure slicing only: the dataset is not re-read here, since the caller
    # already supplies the samples to split.
    total = len(samples)
    first_chunk = int(rat1 * total)
    second_chunk = int(rat2 * total)
    second_end = first_chunk + second_chunk
    return samples[:first_chunk], samples[first_chunk:second_end], samples[second_end:]


def train_valid_test_split(test_ratio, valid_ratio, class_ratio=1.0):
    """Create shuffled train/validation/test sets and pickle them.

    Samples are fetched per class with fetch_samples(). Each class is capped
    at ``int(min(class sizes) / class_ratio)`` samples, so class_ratio = 1
    means the same number of samples for each class. Each class is then split
    with split_samples() into test, validation and train portions (in that
    order), and the per-class portions are concatenated and shuffled.

    The results are written with save_pickle() to "train.pickle",
    "valid.pickle" and "test.pickle".

    Args:
        test_ratio: fraction of each class used for the test set.
        valid_ratio: fraction of each class used for the validation set.
        class_ratio: divisor applied to the smaller class size to get the
            per-class cap.

    Returns:
        tuple: (train, valid, test) lists of samples.
    """
    positives, negatives = fetch_samples()
    smaller_class = min(len(positives), len(negatives))
    size = int(smaller_class / class_ratio)

    pos_test, pos_valid, pos_train = split_samples(positives[:size], test_ratio, valid_ratio)
    neg_test, neg_valid, neg_train = split_samples(negatives[:size], test_ratio, valid_ratio)

    train = pos_train + neg_train
    valid = pos_valid + neg_valid
    test = pos_test + neg_test

    # Mix the two classes within every subset before persisting.
    for subset in (train, valid, test):
        random.shuffle(subset)

    outputs = (
        (train, "train.pickle"),
        (valid, "valid.pickle"),
        (test, "test.pickle"),
    )
    for subset, filename in outputs:
        save_pickle(subset, filename)
    return train, valid, test

In [None]:
# Build the vocabulary. The call itself pickles word_to_id, id_to_word and
# word_count, so the returned values are deliberately discarded here.
_, _, _ = create_word_dictionaries()
# Build and pickle the train/valid/test splits; return values are discarded
# for the same reason.
# NOTE(review): TEST_RATIO and VALID_RATIO are not defined in this notebook —
# presumably they come from `from utils import *`; confirm.
_, _, _ = train_valid_test_split(TEST_RATIO, VALID_RATIO)