In [1]:
#Imports
from scripts import bookdatafunctions as bdf
from scripts import corpusMLfunctions as cmf
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#Constants
AGE_SHEET = "ISBN_MAPS/ISBN2AGE.xlsx"
AUTH_SHEET = "ISBN_MAPS/ISBN2AUTH.xlsx"
CONLLUS_FOLDER = "Conllus"
SNIPPET_LENS = [5,10,25,50,75,100]

In [3]:
#Load corpus
corpus = bdf.mapGroup2Age(bdf.maskPropn(bdf.initBooksFromConllus(CONLLUS_FOLDER)), AGE_SHEET)

In [4]:
#Generate train-test-eval split for keys
author_level_split = cmf.splitOnAuthorLevel(list(corpus.keys()), AUTH_SHEET)
train_target, test_target, eval_target = cmf.generateAgeStratificationAmounts(corpus, 0.7)
#Genre targets for train for TCBC v1.0
train_target[1] = 158
train_target[2] = 32
train_target[3] = 20
#Genre targets for test for TCBC v1.0
test_target[1] = 34
test_target[2] = 7
test_target[3] = 5

print(sum(list(train_target.values()))-210)

203


In [5]:
def checkDuplicates(trainkeys: list, testkeys:list):
    for key in trainkeys:
        if key in testkeys:
            return False
    return True

def checkGenres(trainkeys: list, correct_amounts: dict):
    amounts = {1:0, 2:0, 3:0}
    for key in trainkeys:
        genre = int(key[-1])
        amounts[genre] = amounts[genre] + 1
    for key in correct_amounts:
        if abs(correct_amounts[key]-amounts[key]) > 2:
            return False
    return True

def good_books_in_train(trainkeys, goods):
    for g in goods:
        if g in trainkeys:
            return True
    return False

def no_bad_books_in_train(trainkeys, bads):
    for b in bads:
        if b in trainkeys:
            return False
    return True

In [6]:
import random
def doTrainTestEvalSplitSeriesLevel(author_level_split: dict[str,list[str]], train_target_amounts: dict[int,int], test_target_amounts: dict[int,int], eval_target_amounts: dict[int,int], good_train_books=None, bad_train_books=None):
    
    """
    Function which splits a corpus into (roughly) stratified datasets for training, evaluation, and testing
    """
    train_keys = good_train_books
    test_keys = []
    eval_keys = []
    authors = list(author_level_split.keys())
    #Shuffle the authors so we don't always end up with the same sets
    random.shuffle(authors)
    for author in authors:
        keys_to_add = author_level_split[author]
        #Don't do dupes!
        keys_to_add = [x for x in keys_to_add if x not in good_train_books]
        #Get dicts for age:number of entries
        to_add = cmf.getNumOfEntriesPerAge(keys_to_add)
        current_train = cmf.getNumOfEntriesPerAge(train_keys)
        current_test = cmf.getNumOfEntriesPerAge(test_keys)
        current_eval = cmf.getNumOfEntriesPerAge(eval_keys)

        amounts = {1:0, 2:0, 3:0}
        for key in train_keys:
            genre = int(key[-1])
            amounts[genre] = amounts[genre] + 1
        keys_to_add_amounts = {1:0, 2:0, 3:0}
        for key in keys_to_add:
            genre = int(key[-1])
            keys_to_add_amounts[genre] = keys_to_add_amounts[genre] + 1

        #Use a flag to determine whether to return true or false
        toset = 'train'
        for age in to_add:
            #If age not yet present in the train set, then immediately return True as we want to add the batch to Train 
            if not age in list(current_train.keys()) and no_bad_books_in_train(keys_to_add, bad_train_books):
                toset = 'train'
                break
            if amounts[3]+keys_to_add_amounts[3]>20:
                toset = 'test'
                break 
            #If the target age has not yet been met, then continue to check rest of the ages before making any conclusions
            if current_train[age] < train_target_amounts[age]:
                continue
            #If the target amount has been reached in Train, then if an age is not present in Eval, immediately add the batch there
            if not age in list(current_eval.keys()):
                toset = 'eval'
                break
            #If the target amount has been reached in Train and Eval, then if an age is not present in Test, immediately add the batch there
            if not age in list(current_test.keys()):
                toset = 'test'
                break
            #If all previous checks pass and Eval is underpopulated, then tentatively add the batch to TE, but see the remaining ages if stronger conditions are met
            if current_eval[age] < eval_target_amounts[age]:
                toset = 'eval'
            #If all previous checks pass and Test is underpopulated, then tentatively add the batch to TE, but see the remaining ages if stronger conditions are met
            elif current_test[age] < test_target_amounts[age]:
                toset = 'test'
            elif len(train_keys)+len(keys_to_add)>210:
                toset = 'test'
                break
            #If all else passes, then just tentatively add the batch to train
            else:
                toset = 'train'
        
        if toset == 'train':
            train_keys += keys_to_add
        elif toset == 'eval':
            eval_keys += keys_to_add
        else:
            test_keys += keys_to_add
    return train_keys, test_keys, eval_keys

In [7]:
import random
def doTrainTestEvalSplitNoSeriesLevel(book_ids, train_target_amounts: dict[int,int], test_target_amounts: dict[int,int]):
    
    """
    Function which splits a corpus into (roughly) stratified datasets for training, evaluation, and testing
    """
    train_keys = []
    test_keys = []
    eval_keys = []

    current_train = {x:0 for x in train_target_amounts.keys()}
    current_test = {x:0 for x in test_target_amounts.keys()}
    #Shuffle the authors so we don't always end up with the same sets
    random.shuffle(book_ids)
    for book in book_ids:
        #Append based on age and genre
        age = int(bdf.findAgeFromID(book))
        genre = int(book[-1])
        if current_train.get(age, 0)+1<=train_target_amounts[age] and current_train[genre]+1<=train_target_amounts[genre]:
            train_keys.append(book)
            current_train[age] = current_train[age]+1
            current_train[genre] = current_train[genre]+1
        elif current_test.get(age, 0)+1<=test_target_amounts[age] and current_test[genre]+1<=test_target_amounts[genre]:
            test_keys.append(book)
            current_test[age] = current_test[age]+1
            current_test[genre] = current_test[genre]+1
        else:
            eval_keys.append(book)
    return train_keys, test_keys, eval_keys

In [None]:
from pprint import pprint
trainkeys_straps = []
testkeys_straps = []
evalkeys_straps = []

#After getting further into this rabbit hole, I want to test if we can generate straps that include/exclude these books:

good_train_books = ['9789527337905_15_2', '9789510350843_13_3', '9789526309330_14_3', '9789526308616_13_3', '9789526308623_14_3', '9789526310053_15_3', '9789526308630_15_3']
bad_train_books = ['9789523560772_14_1', '9789523563841_14_1', '9789523564954_13_1', '9789523990234_12_1', '9789511349938_12_1', '9789511349945_12_1', '9789511252825_13_3', '9789511307143_14_3', '9789511347552_7_3', '9789511383604_8_3', '9789510427927_7_1']

only_novels = [x for x in list(corpus.keys()) if x[-1]=='1']

while len(trainkeys_straps) != 100:
    train_keys, test_keys, eval_keys = doTrainTestEvalSplitNoSeriesLevel(only_novels, train_target, test_target)
    if checkDuplicates(train_keys, test_keys+eval_keys):
        trainkeys_straps.append(train_keys)
        testkeys_straps.append(test_keys)
        evalkeys_straps.append(eval_keys)
        print(len(trainkeys_straps)+1)
    else:
        amounts = {1:0, 2:0, 3:0}
        for key in train_keys:
            genre = int(key[-1])
            amounts[genre] = amounts[genre] + 1
        pprint(amounts)

2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101


In [10]:
counter = 0
for x in trainkeys_straps:
    for y in trainkeys_straps:
        if x==y:
            counter += 1
print(counter)

100


In [None]:
temp_dict = []
for i in range(len(trainkeys_straps)):
    temp_dict.append({'id':i, 'train_keys':trainkeys_straps[i], 'eval_keys':evalkeys_straps[i], 'test_keys':testkeys_straps[i]})
with open("NewKeylists_only_novels.jsonl", 'w') as f:
    f.write('\n'.join(map(json.dumps, temp_dict)))