In [1]:
import json
import random
import numpy as np
from collections import Counter, defaultdict
from functools import partial
from utils import *

In [2]:
def create_small_subset(dataset_path, size=1000000):
    """Copy the leading lines of the file at ``DATASET_PATH`` into a new file.

    Args:
        dataset_path: Destination path; opened in ``'w'`` mode, so any
            existing content is overwritten.
        size: Maximum number of lines to copy. Fewer lines are written when
            the source file is shorter.

    Note:
        ``DATASET_PATH`` is not defined in this notebook cell; presumably it
        comes from ``utils`` -- confirm.
    """
    with open(DATASET_PATH) as source, open(dataset_path, 'w') as target:
        # zip() stops as soon as either the line budget or the file runs out.
        capped_lines = (line for _, line in zip(range(size), source))
        target.writelines(capped_lines)

In [3]:
def cut_vocabulary(word_count, size):
    """Build id<->word lookup tables for the ``size`` most frequent words.

    Args:
        word_count: ``Counter`` mapping each word to its frequency.
        size: Requested vocabulary size; clamped to the number of distinct
            words in ``word_count``.

    Returns:
        Tuple ``(id_to_word, word_to_id)``:
        * ``id_to_word`` maps ids ``0..size-1`` to words in
          ``most_common`` order and id ``size`` to ``SPECIAL_CHARACTER``
          (used to parse the sentences).
        * ``word_to_id`` is a ``defaultdict`` mapping each kept word to its id
          (used for loading embeddings). Missing keys are produced by
          ``map_to_not_in_vocabulary`` bound to ``size`` -- presumably that
          yields the out-of-vocabulary id; confirm in ``utils``.
    """
    size = min(size, len(word_count))
    top_words = [word for word, _ in word_count.most_common(size)]

    id_to_word = dict(enumerate(top_words))
    word_to_id = defaultdict(partial(map_to_not_in_vocabulary, size))
    word_to_id.update((word, idx) for idx, word in enumerate(top_words))

    # The slot right after the last real word is reserved for the placeholder.
    id_to_word[size] = SPECIAL_CHARACTER
    return id_to_word, word_to_id


def save_dictionaries(dataset_path):
    """Scan a JSON-lines review file and pickle the derived lookup tables.

    Each line is parsed as JSON and must provide ``book_id``, ``user_id`` and
    ``review_sentences`` (an iterable of pairs whose second element is the
    sentence text). The following objects are built and written with
    ``save_pickle``:

    * ``word_to_id`` / ``id_to_word`` -- vocabulary cut to ``VOCAB_SIZE``.
    * ``word_count`` -- frequency of every cleaned word.
    * ``book_to_id`` / ``user_to_id`` -- dense indices in order of first
      appearance.
    * ``book_reviews_count`` -- number of reviews per dense book id.
    * ``word_per_book_count`` -- for each word, a ``Counter`` of occurrences
      keyed by dense book id.

    Args:
        dataset_path: Path of the JSON-lines file to read.
    """
    word_count = Counter()
    book_to_id = {}
    user_to_id = {}
    book_reviews_count = defaultdict(int)
    word_per_book_count = defaultdict(Counter)

    with open(dataset_path) as json_file:
        for record in map(json.loads, json_file):
            original_book_id = record['book_id']
            original_user_id = record['user_id']
            # setdefault hands out the next free index on first sight only.
            book_id = book_to_id.setdefault(original_book_id, len(book_to_id))
            user_to_id.setdefault(original_user_id, len(user_to_id))
            review_sentences = record['review_sentences']
            book_reviews_count[book_id] += 1
            for _, sentence in review_sentences:
                for token in map(clean_word, sentence.split()):
                    word_count[token] += 1
                    word_per_book_count[token][book_id] += 1

    id_to_word, word_to_id = cut_vocabulary(word_count, size=VOCAB_SIZE)
    # Reports the number of distinct words seen, i.e. before the cut.
    print(f"Vocab size: {len(word_count)}")

    artifacts = (
        (word_to_id, "word_to_id.pickle"),
        (id_to_word, "id_to_word.pickle"),
        (word_count, "word_count.pickle"),
        (book_to_id, "book_to_id.pickle"),
        (user_to_id, "user_to_id.pickle"),
        (book_reviews_count, "book_reviews_count.pickle"),
        (word_per_book_count, "word_per_book_count.pickle"),
    )
    for payload, filename in artifacts:
        save_pickle(payload, filename)

In [4]:
def fetch_samples(dataset_path):
    """Load a JSON-lines file and partition its records by ``has_spoiler``.

    Args:
        dataset_path: Path of the JSON-lines file; every line must decode to
            a mapping with a ``has_spoiler`` entry.

    Returns:
        Tuple ``(true_samples, false_samples)``: records whose
        ``has_spoiler`` value is truthy, and records where it is falsy. Each
        list is shuffled in place with the global ``random`` generator.
    """
    buckets = {True: [], False: []}
    with open(dataset_path) as json_file:
        for raw_line in json_file:
            record = json.loads(raw_line)
            buckets[bool(record['has_spoiler'])].append(record)

    spoilers, non_spoilers = buckets[True], buckets[False]
    # Shuffle the positive class first so RNG consumption order is stable.
    random.shuffle(spoilers)
    random.shuffle(non_spoilers)
    return spoilers, non_spoilers


def split_samples(samples, rat1, rat2):
    """Cut a sequence into three consecutive pieces sized by two ratios.

    Args:
        samples: Sliceable sequence to split.
        rat1: Fraction of ``len(samples)`` for the leading piece.
        rat2: Fraction of ``len(samples)`` for the piece that follows it.

    Returns:
        Tuple ``(remainder, first, second)``: ``first`` is the leading
        ``int(rat1 * len(samples))`` items, ``second`` the next
        ``int(rat2 * len(samples))`` items, and ``remainder`` everything
        after them. Note that the remainder comes first in the tuple.
    """
    total = len(samples)
    first_end = int(rat1 * total)
    second_end = first_end + int(rat2 * total)

    first = samples[:first_end]
    second = samples[first_end:second_end]
    remainder = samples[second_end:]
    return remainder, first, second


def train_valid_test_split(dataset_path, test_ratio, valid_ratio, class_ratio=1.0):
    """Split a JSON-lines dataset into train/valid/test lists and pickle them.

    Args:
        dataset_path: Path of the JSON-lines file passed to ``fetch_samples``.
        test_ratio: Fraction of the samples assigned to the test split.
        valid_ratio: Fraction of the samples assigned to the validation split.
        class_ratio: When ``None``, all samples are pooled, shuffled and split
            without balancing. Otherwise each class is truncated to
            ``int(min(n_true, n_false) / class_ratio)`` samples before being
            split separately; ``1`` therefore gives the same number of
            samples for each class.

    Returns:
        Tuple ``(train, valid, test)`` of shuffled sample lists. The same
        lists are written to ``train.pickle``, ``valid.pickle`` and
        ``test.pickle`` via ``save_pickle``.

    Raises:
        ZeroDivisionError: If ``class_ratio`` is ``0``.
    """
    true_samples, false_samples = fetch_samples(dataset_path)
    # split_samples returns (remainder, first chunk, second chunk). Passing
    # valid_ratio first makes the chunk unpacked as `valid` use valid_ratio
    # and the one unpacked as `test` use test_ratio.
    if class_ratio is None:
        pooled = false_samples + true_samples
        # Mix the classes before slicing; otherwise valid/test would be cut
        # from the leading run of false samples only.
        random.shuffle(pooled)
        train, valid, test = split_samples(pooled, valid_ratio, test_ratio)
    else:
        size = int(min(len(true_samples), len(false_samples)) / class_ratio)
        print(f"size: {size}")
        true_train, true_valid, true_test = split_samples(
            true_samples[:size], valid_ratio, test_ratio)
        false_train, false_valid, false_test = split_samples(
            false_samples[:size], valid_ratio, test_ratio)
        train = true_train + false_train
        valid = true_valid + false_valid
        test = true_test + false_test
    # Interleave the classes inside each split.
    random.shuffle(train)
    random.shuffle(valid)
    random.shuffle(test)
    print(f"Train size: {len(train)}")
    print(f"Valid size: {len(valid)}")
    print(f"Test size: {len(test)}")
    save_pickle(train, "train.pickle")
    save_pickle(valid, "valid.pickle")
    save_pickle(test, "test.pickle")
    return train, valid, test

In [5]:
# Copy up to the default 1,000,000 leading lines of the file at DATASET_PATH
# into SAMPLE_DATASET_PATH; the later cells read only this subset.
create_small_subset(SAMPLE_DATASET_PATH)

In [6]:
# Build the vocabulary, id mappings and count tables from the subset and
# pickle them; prints the number of distinct words seen.
save_dictionaries(SAMPLE_DATASET_PATH)

Vocab size: 712166


In [7]:
# class_ratio=1 truncates both classes to the size of the smaller one.
# The returned lists are discarded because the function also pickles them.
_, _, _ = train_valid_test_split(SAMPLE_DATASET_PATH, TEST_RATIO, VALID_RATIO, class_ratio=1)

size: size
Train size: 77622
Valid size: 25874
Test size: 25874
