In [52]:
#Imports
from scripts import bookdatafunctions as bdf
from scripts import corpusMLfunctions as cmf
import os
import numpy as np
import shutil
import pandas as pd
import random
from pprint import pprint

In [53]:
#Constants
#Spreadsheet handed to bdf.mapGroup2Age when the corpus is loaded
AGE_SHEET = "ISBN_MAPS/ISBN2AGE.xlsx"
#NOTE(review): not referenced in the cells visible here - presumably an ISBN -> author mapping; confirm before removing
AUTH_SHEET = "ISBN_MAPS/ISBN2AUTH.xlsx"
#Folder handed to bdf.initBooksFromConllus when the corpus is loaded
CONLLUS_FOLDER = "Conllus"
#NOTE(review): not referenced in the cells visible here - presumably the available snippet lengths; confirm
SNIPPET_LENS = [5,10,25,50,75,100]

In [54]:
#Load corpus
#Pipeline (inner call first): initBooksFromConllus(CONLLUS_FOLDER) -> maskPropn -> mapGroup2Age(..., AGE_SHEET)
#NOTE(review): presumably maskPropn masks proper nouns and mapGroup2Age attaches ages to book ids - confirm in bookdatafunctions
corpus = bdf.mapGroup2Age(bdf.maskPropn(bdf.initBooksFromConllus(CONLLUS_FOLDER)), AGE_SHEET)

## Test with all snippets of a book kept in a single dataset split

In [55]:
def generateAgeStratificationAmounts(corpus_with_ages: dict[str,pd.DataFrame], train_size: float) -> tuple[dict[int,int],dict[int,int],dict[int,int]]:
    """
    Work out, per age, how many books should go to the train, test and eval sets.

    Args:
        corpus_with_ages: corpus dict whose keys are book ids that bdf.findAgeFromID can read
        train_size: fraction (0-1) of each age's books that should go to training

    Returns:
        Three dicts (train, test, eval), each mapping an age from bdf.getAvailableAges
        to a number of books. For every age the three amounts add up to the number of
        books with that age.
    """
    train = {}
    test = {}
    eval_amounts = {}
    ages = bdf.getAvailableAges(corpus_with_ages)
    #Resolve the age of every book once instead of once per age
    book_ages = [bdf.findAgeFromID(book_id) for book_id in corpus_with_ages.keys()]
    for age in ages:
        raw_amount = book_ages.count(str(age))
        train[age] = int(raw_amount*train_size)
        test[age] = int((raw_amount-train[age])/2)
        #Eval takes whatever is left, so an odd remainder is not silently dropped
        eval_amounts[age] = raw_amount-train[age]-test[age]
    return train, test, eval_amounts

In [56]:
def checkGenreBalance(keylist: list[str], train_size: float, key_to_add: str):
    """
    Tell whether `keylist` still has room for another key of the same genre as `key_to_add`.

    The genre is the last character of a key. Each known genre code has a fixed
    total which is scaled by `train_size` to give the limit for this list.

    Returns:
        True if fewer keys of that genre than the scaled limit are in `keylist`,
        False otherwise (also for genre codes other than '1', '2', '3').
    """
    #Total number of keys per genre code: '1' fiction, '2' nonfiction, '3' textbook
    genre_totals = {'1': 225, '2': 45, '3': 30}

    genre_code = key_to_add[-1]
    already_added = sum(1 for existing in keylist if existing[-1] == genre_code)

    if genre_code not in genre_totals:
        return False
    return already_added < genre_totals[genre_code]*train_size

In [57]:
def doTrainTestEvalSplitWithGenres(keys: list[str], train_target_amounts: dict[int,int], test_target_amounts: dict[int,int], eval_target_amounts: dict[int,int], seed=None):
    """
    Function which splits a corpus into (roughly) stratified datasets for training, evaluation, and testing

    Keys are visited in random order. A key goes to train if its age is still below
    the train target and checkGenreBalance allows it (share 0.7), otherwise to test
    under the same two checks (share 0.15), otherwise to eval.

    Args:
        keys: book ids to distribute; the list itself is left untouched
        train_target_amounts: age -> wanted number of training keys
        test_target_amounts: age -> wanted number of test keys
        eval_target_amounts: accepted for symmetry; eval simply receives every key that
            fits neither train nor test
        seed: optional seed for a reproducible shuffle; None uses the global random state

    Returns:
        (train_keys, test_keys, eval_keys)
    """
    train_keys = []
    test_keys = []
    eval_keys = []
    #Shuffle a copy so the caller's list is not reordered as a side effect
    shuffled_keys = list(keys)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled_keys)
    for key in shuffled_keys:
        #Get dicts for age:number of entries
        current_train = cmf.getNumOfEntriesPerAge(train_keys)
        current_test = cmf.getNumOfEntriesPerAge(test_keys)
        #Check which list to add the key to
        age = int(bdf.findAgeFromID(key))
        #An age not seen yet has 0 entries (a -1 default would let a target of 0 still accept a key)
        if current_train.get(age, 0) < train_target_amounts[age] and checkGenreBalance(train_keys, 0.7, key):
            train_keys.append(key)
        elif current_test.get(age, 0) < test_target_amounts[age] and checkGenreBalance(test_keys, 0.15, key):
            test_keys.append(key)
        else:
            eval_keys.append(key)

    return train_keys, test_keys, eval_keys

In [79]:
#Generate keylist with all snippets from one book being in one dataset
#The split works on book ids (the corpus keys), so a book lands in exactly one of the three lists
keys = list(corpus.keys())
#0.7 = fraction of each age's books targeted for training
train_target, test_target, eval_target = generateAgeStratificationAmounts(corpus, 0.7)
#The split shuffles the keys randomly and no seed is set here, so membership differs between runs
train_keys, test_keys, eval_keys = doTrainTestEvalSplitWithGenres(keys, train_target, test_target, eval_target)

In [80]:
#Number of books in the train, test and eval key lists (printed in that order)
print(len(train_keys))
print(len(test_keys))
print(len(eval_keys))

203
44
53


## Test with train-test-eval splitting done only by Hugging Face `datasets` (snippets of one book can end up in any set)

## Run TFIDF test

In [86]:
#Imports
from scripts import bookdatafunctions as bdf
from scripts import corpusMLfunctions as cmf
import pandas as pd
import numpy as np
from datasets import Dataset, disable_progress_bars
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import f1_score
import optuna
import json
import multiprocessing as mp
import shutil
import warnings
from tqdm import tqdm

#Constants
#Age labels as strings; the driver cell passes them to generateIntervals
AGES = ['5','6','7','8','9','10','11','12','13','14','15']
#Folder passed as `folder=` to cmf.combineSnippedBooksToDS
BASE_BEG = "SnippetDatasets/"
#NOTE(review): BASE_MID and BASE_END are not referenced in the cells visible here - confirm before removing
BASE_MID = "sniplen_"
BASE_END = ".jsonl"
#JSON-lines file: every line is one JSON object with 'train_keys', 'eval_keys' and 'test_keys'
KEYLISTS = "Keylists.jsonl"
keylists = []
with open(KEYLISTS, 'r', encoding='utf-8') as f:
    for line in f:
        #Skip blank lines (e.g. a trailing newline) instead of failing in json.loads
        if not line.strip():
            continue
        keylists.append(json.loads(line))

#Helper functions
def do_nothing(ex):
    """
    Preprocessor used for the TF-IDF vectorizer.

    Despite the name the text is not returned untouched: it is lower-cased.
    """
    lowered = ex.lower()
    return lowered

def conllu_tokenizer(ex):
    """
    Split a string on every tab, newline and '|' character.

    Empty strings between adjacent separators are kept in the result.
    """
    #Turn the other two separators into tabs, then split on tabs only
    to_tabs = str.maketrans({"\n": "\t", "|": "\t"})
    return ex.translate(to_tabs).split("\t")

def whitespace_tokenizer(ex):
    """
    Split a string on single space characters only.

    Tabs and newlines are not separators, and consecutive spaces produce empty tokens.
    """
    tokens = ex.split(sep=" ")
    return tokens


def generateIntervals(ages: list[str]):
    """
    List every way of cutting `ages` into consecutive groups of at least two ages.

    All 2-group splits come first, then all 3-group splits, then all 4-group splits.
    Within one group count the splits are ordered by their cut positions (earliest
    first cut first, then earliest second cut, ...).

    Returns:
        A list of tuples; each tuple holds the slices of `ages` forming one split.
    """
    total = len(ages)
    min_group = 2 #smallest number of ages allowed in a group

    def cutPoints(start, groups_left):
        #Yield tuples of cut indices that divide ages[start:] into `groups_left` groups
        if groups_left == 1:
            yield ()
            return
        #Leave enough ages behind the cut for all the groups that still follow
        last_cut = total - min_group*(groups_left-1)
        for cut in range(start+min_group, last_cut+1):
            for later_cuts in cutPoints(cut, groups_left-1):
                yield (cut,) + later_cuts

    intervals = []
    for group_count in (2, 3, 4):
        for cuts in cutPoints(0, group_count):
            edges = (0,) + cuts + (total,)
            intervals.append(tuple(ages[lo:hi] for lo, hi in zip(edges, edges[1:])))
    return intervals

def reMapLabels(ex, intervals):
    """
    Write the age-group label for one example into ex['label'] and return the example.

    ex['age'] is read as a number; anything above 14 is treated as 15. The label is
    '<first>-<last>' of the group in `intervals` that contains the age as a string.
    If no group contains it, the example is returned without touching 'label'.
    """
    raw_age = ex['age']
    #Everything older than 14 is folded into '15'
    age_key = str(15 if raw_age > 14 else raw_age)
    matching = [group[0]+'-'+group[-1] for group in intervals if age_key in group]
    if matching:
        #Should several groups contain the age, the last of them decides the label
        ex['label'] = matching[-1]
    return ex

def initDatasets():
    """
    Build the train, eval and test datasets from the first keylist in `keylists`.

    The cache folder "cache_dir/temp/" is wiped and recreated on every call, then
    cmf.combineSnippedBooksToDS is called once per split with snippet length '50'.

    Returns:
        (train_ds, eval_ds, test_ds)
    """
    split_keys = keylists[0]
    #Also makes it easier to clean cache files and use space more efficiently
    cache_dir = "cache_dir/temp/"
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)
    #makedirs instead of mkdir: the parent "cache_dir" may not exist yet on a fresh checkout
    os.makedirs(cache_dir)

    def buildSplit(split_name):
        #One dataset per split, each with its own cache file inside cache_dir
        return cmf.combineSnippedBooksToDS(split_keys[split_name+'_keys'], '50', cache_dir, cache_file=cache_dir+"0_50_"+split_name+".jsonl", folder=BASE_BEG, inc_raw_text=True, inc_hpfv=True)

    train_ds = buildSplit('train')
    eval_ds = buildSplit('eval')
    test_ds = buildSplit('test')
    return train_ds, eval_ds, test_ds

def initDatasetsNoBookLevelSplitting(seed=None):
    """
    Build train, eval and test datasets by splitting snippets, ignoring book boundaries.

    All keys of the first keylist are pooled into one dataset, which is then split
    70 % / 15 % / 15 % (train / eval / test), stratified by the 'age' column. Snippets
    of one book can therefore appear in several splits.

    Args:
        seed: optional seed forwarded to both train_test_split calls for a reproducible split

    Returns:
        (train_ds, eval_ds, test_ds)
    """
    all_keys = keylists[0]['train_keys'] + keylists[0]['eval_keys'] + keylists[0]['test_keys']
    #Also makes it easier to clean cache files and use space more efficiently
    cache_dir = "cache_dir/temp/"
    if os.path.exists(cache_dir):
        shutil.rmtree(cache_dir)
    #makedirs instead of mkdir: the parent "cache_dir" may not exist yet on a fresh checkout
    os.makedirs(cache_dir)
    full_ds = cmf.combineSnippedBooksToDS(all_keys, '50', cache_dir, cache_file=cache_dir+"0_50_train.jsonl", folder=BASE_BEG, inc_raw_text=True, inc_hpfv=True)
    #Stratified splitting needs a ClassLabel column
    #NOTE(review): this re-encodes 'age', while reMapLabels compares ex['age'] with 14 and looks up str(age) -
    #confirm the encoded values are still what reMapLabels expects before using these datasets with evaluateGroups
    full_ds = full_ds.class_encode_column("age")
    pprint(full_ds.features)
    #test_size is the share that is held OUT: 0.3 keeps 70 % for training (0.7 here would keep only 30 %)
    train_rest = full_ds.train_test_split(test_size=0.3, stratify_by_column='age', seed=seed)
    #Halve the held-out part into eval and test
    eval_test = train_rest['test'].train_test_split(test_size=0.5, stratify_by_column='age', seed=seed)

    return train_rest['train'], eval_test['train'], eval_test['test']

#Code to run in parallel
def evaluateGroups(train_ds: Dataset, eval_ds: Dataset, test_ds: Dataset, intervals):
    """
    Train and score a TF-IDF + LinearSVC classifier for one grouping of ages.

    Steps: relabel all three datasets with reMapLabels, fit a TfidfVectorizer on the
    training texts, run 25 Optuna trials that tune C, penalty and tol against macro-F1
    on the eval set, then refit with the best trial's parameters and score on the test set.

    Args:
        train_ds, eval_ds, test_ds: datasets with a 'raw_text' column and the 'age'
            column that reMapLabels reads
        intervals: groups of age strings, e.g. one element of generateIntervals(AGES)

    Returns:
        dict with the keys 'f1' (macro-F1 on test), 'id' (group ranges joined by '_'),
        'labels' (classifier classes), 'conf_matrix' (rows/columns in the order of
        'labels'), 'c', 'tol' and 'penalty' (best hyperparameters).
    """
    #Map labels to match the intervals given (aka re-assign labels)
    train_ds = train_ds.map(reMapLabels, fn_kwargs={"intervals":intervals})
    eval_ds = eval_ds.map(reMapLabels, fn_kwargs={"intervals":intervals})
    test_ds = test_ds.map(reMapLabels, fn_kwargs={"intervals":intervals})

    #Read each column once; they do not change between trials
    train_texts = train_ds['raw_text']
    train_labels = train_ds['label']
    eval_labels = eval_ds['label']
    test_labels = test_ds['label']

    #Initialize and fit our vectorizer
    vectorizer = TfidfVectorizer(norm='l2', tokenizer=whitespace_tokenizer, preprocessor=do_nothing, max_features=2000).fit(train_texts)
    #Vectorize datasets
    vectorized_train = vectorizer.transform(train_texts)
    vectorized_eval = vectorizer.transform(eval_ds['raw_text'])
    vectorized_test = vectorizer.transform(test_ds['raw_text'])

    returnable = {}
    c_eval_pairs= []

    #Very quick hyperparam optimization as we have computational resources
    def objective(trial):
        #Defining hyperparameters to tune
        c = trial.suggest_float('c', 1e-10, 1e+0, log=True)
        pen = trial.suggest_categorical('pen', ['l1', 'l2'])
        tol = trial.suggest_float('tol', 1e-10, 1e-3, log=True)
        #NOTE(review): penalty='l1' depends on LinearSVC resolving `dual` by itself; presumably fine on the
        #installed scikit-learn, but versions that default to dual=True reject this combination - confirm
        clf = LinearSVC(
            random_state=42,
            C=c,
            tol=tol,
            penalty=pen
        )
        clf.fit(vectorized_train, train_labels)
        predicted = clf.predict(vectorized_eval)
        f1 = f1_score(eval_labels, predicted, average="macro")
        c_eval_pairs.append([c, f1])
        return f1

    #Silence Optuna before the study is created so its creation message is suppressed too
    optuna.logging.disable_default_handler()
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=25)
    best_params = study.best_trial.params

    #Run with best params
    clf = LinearSVC(
        penalty=best_params['pen'],
        random_state=42,
        C=best_params['c'],
        tol=best_params['tol'],
    )
    clf.fit(vectorized_train, train_labels)
    test_predict = clf.predict(vectorized_test)

    #Generate unique id
    run_id = "_".join([x[0]+"-"+x[-1] for x in intervals])

    #Assign returnable values
    returnable['f1'] = f1_score(test_labels, test_predict, average="macro")
    returnable['id'] = run_id
    returnable['labels'] = clf.classes_.tolist()
    #Pin the label order so the matrix rows/columns always line up with returnable['labels'],
    #even if some class never shows up in the test data or the predictions
    returnable['conf_matrix'] = metrics.confusion_matrix(test_labels, test_predict, labels=clf.classes_).tolist()
    returnable['c'] = best_params['c']
    returnable['tol'] = best_params['tol']
    returnable['penalty'] = best_params['pen']
    #returnable['c_eval_scores'] = c_eval_pairs

    #Write JSON-file
    #with open("TestResults/DifferentAgeGroups_tfidf/"+run_id+".json", 'w') as f:
    #    f.write(json.dumps(returnable))

    return returnable

In [87]:
#Silence all Python warnings in this process; the environment variable is set as well
#(presumably so that child processes inherit the setting - confirm)
warnings.filterwarnings('ignore') 
os.environ['PYTHONWARNINGS']='ignore'
#Turn off the progress bars of the `datasets` library; only the tqdm bar below is shown
disable_progress_bars()
train_ds_base, eval_ds_base, test_ds_base = initDatasets()
#[21:22] keeps only the split at index 21 of generateIntervals(AGES); drop the slice to evaluate every split
interval_splits = generateIntervals(AGES)[21:22]
#One result dict per evaluated split, in the order of interval_splits
dicts = []
with tqdm(total=len(interval_splits)) as pbar:
    for i in interval_splits:
        #Splits are evaluated one after another; the same base datasets are passed to every call
        dicts.append(evaluateGroups(train_ds_base, eval_ds_base, test_ds_base, i))
        pbar.update(1)
print("Waiting done!")

100%|██████████| 1/1 [01:25<00:00, 85.56s/it]

Waiting done!





In [89]:
#Show the result dict of every evaluated split (pprint prints dict keys in sorted order)
pprint(dicts)

[{'c': 0.12994152259537403,
  'conf_matrix': [[893, 88, 342], [118, 393, 220], [470, 268, 247]],
  'f1': 0.4811302060604299,
  'id': '5-8_9-12_13-15',
  'labels': ['13-15', '5-8', '9-12'],
  'penalty': 'l2',
  'tol': 2.4568823492943698e-08}]
