In [1]:
#Imports
from scripts import corpusMLfunctions as cmf
from datasets import logging, disable_progress_bars
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
import json
import sys
import multiprocessing as mp
import os
import numpy as np
import shutil
#Constants
BASE_BEG = "SnippetDatasets/"
BASE_MID = "sniplen_"
BASE_END = ".jsonl"
KEYLISTS = "Keylists.jsonl"
SNIPPET_LENS = ['5','10','25','50','75','100']
CHOSEN_PARAMS = [{'c':18.677891780920422, 'tol':1e-05}]
#Set logging to not be as annoying
logging.set_verbosity(40)

#Defining functions for the program
def do_nothing(ex):
    return ex

def whitespace_tokenizer(ex):
    return ex.split(" ")

#Version for only using TfIdfVectorizer with raw text as input
def manualStudy(params, SNIPPET_LENS, keylists, i, k, cache_dir, overwrite: bool=True):
    disable_progress_bars()
    filename = "TestResults/ParamOptim_List_"+str(i)+"_SnipLen_"+str(SNIPPET_LENS[k])+"_Results.jsonl"
    cache_file_train = cache_dir+str(i)+"_text_"+str(SNIPPET_LENS[k])+"_train.jsonl"
    cache_file_test = cache_dir+str(i)+"_text_"+str(SNIPPET_LENS[k])+"_test.jsonl"
    if overwrite or not os.path.exists(filename):
        hf_cache_dir = cache_dir+str(i)+"_text_"+str(SNIPPET_LENS[k])+"_ds"
        train_keys = keylists[i]['train_keys']
        #Temporary edit to test with combining eval+test as we are not param optimizing
        eval_keys = keylists[i]['eval_keys']+keylists[i]['test_keys']
        train_dss = cmf.combineSnippedBooksToDS(train_keys, SNIPPET_LENS[k], hf_cache_dir, cache_file_train, inc_raw_text=True, folder=BASE_BEG)
        eval_dss = cmf.combineSnippedBooksToDS(eval_keys, SNIPPET_LENS[k], hf_cache_dir,  cache_file_test, inc_raw_text=True, folder=BASE_BEG)
        #Empty cache after we don't need it
        os.remove(cache_file_train)
        os.remove(cache_file_test)
        #with open(cache_file, 'w') as writer:
        #    writer.write("")
        #Continue on
        vectorizer = TfidfVectorizer(norm='l2', tokenizer=whitespace_tokenizer, preprocessor=do_nothing, max_features=2000).fit(train_dss['raw_text'])
        #print("Worker for length ",SNIPPET_LENS[k]," and keylist ",i," activated!")
        returnable = []
        for pair in params:
            #Train a new classifier for each set of params
            
                clf = LinearSVC(
                    loss='squared_hinge', penalty='l2',
                    random_state=42,
                    C=pair['c'],
                    tol=pair['tol']
                )
                clf.fit(vectorizer.transform(train_dss['raw_text']), train_dss['label'])
                predicted = clf.predict(vectorizer.transform(eval_dss['raw_text']))
                f1 = f1_score(eval_dss['label'], predicted, average="macro")
                #Reverse the dictionary
                index2feature = {}
                for feature,idx in vectorizer.vocabulary_.items():
                    assert idx not in index2feature #This really should hold
                    index2feature[idx]=feature
                #Now we can query index2feature to get the feature names as we need
                high_prio = {}
                # make a list of (weight, index), sort it
                for j in clf.classes_:
                    lst=[]
                    for idx,weight in enumerate(clf.coef_[list(clf.classes_).index(j)]):
                        lst.append((weight,idx))
                    lst.sort() #sort

                    #Print first few and last few
                    #for weight,idx in lst[:20]: #first 30 (ie lowest weight)
                    #    print(index2feature[idx])
                    #print("----------------------------------------------------")
                    #Take the last 30 (lst[-30:]) but these now come from weakest to strongest
                    #so reverse the list using [::-1]
                    highest_prio = []
                    for weight,idx in lst[-100:][::-1]:
                        highest_prio.append(index2feature[idx])
                    high_prio[j] = highest_prio
                returnable.append({'keylist_id':i, 'sniplen':SNIPPET_LENS[k], 'c':pair['c'], 'tol':pair['tol'], 'f1':f1, 'important_feats_7-8':high_prio['7-8'], 'important_feats_9-12':high_prio['9-12'], 'important_feats_13+':high_prio['13+']})
        with open(filename, 'w') as f:
            f.write('\n'.join(map(json.dumps, returnable)))
        #Clear hf cache to manage space
        shutil.rmtree(hf_cache_dir)

def testParamResults(permutations: int, keylists: list):
    #For local machines
    pool = mp.Pool(mp.cpu_count()-2)
    #For CSC environments
    #pool = mp.Pool(len(os.sched_getaffinity(0)))
    pbar = tqdm(total=permutations*len(SNIPPET_LENS))
    def update(*a):
     pbar.update(1)
    #Generate temporary cache dir to manage memory
    cache_dir = "cache_dir/"
    if not os.path.exists(cache_dir):
        os.mkdir(cache_dir)
    #Add to list the test results of our 'manual' study
    for k in range(len(SNIPPET_LENS)):
        pool.apply_async(manualStudy, [CHOSEN_PARAMS, SNIPPET_LENS, keylists, 93, k, cache_dir, True], callback=update)
    #print("All running!")
    pool.close()
    #print("Pool closed!")
    pool.join()
    #print("Waiting done!")
    
    

#Main function
def main(cmd_args):
    #Fetch keylists
    keylists = []
    with open(KEYLISTS, 'r') as f:
        for line in f:
            keylists.append(json.loads(line))
    #testParamResults(1, keylists)
#Pass cmd args to main function
if __name__ == "__main__":
    main(sys.argv[1:])


  from .autonotebook import tqdm as notebook_tqdm


In [40]:
import optuna
from sklearn import metrics
from scripts import corpusMLfunctions as cmf
from datasets import logging, disable_progress_bars
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
import json
import sys
import multiprocessing as mp
import os
import numpy as np
import shutil
keylist_num = 93
k = 2

In [41]:
keylists = []
with open(KEYLISTS, 'r') as f:
    for line in f:
        keylists.append(json.loads(line))

In [42]:
#Defining functions for the program
def do_nothing(ex):
    return ex.lower()

In [43]:
def maskPropnWithMask(example):
    df = cmf.snippetConllu2DF(example['conllu'])
    df.loc[df['upos'] == 'PROPN', 'lemma'] = "[MASK]"
    df.loc[df['upos'] == 'PROPN', 'text'] = "[MASK]"
    example['masked_text'] = ' '.join(df['text'].to_numpy('str'))
    return example

In [44]:
from datasets import Dataset
from scripts import bookdatafunctions as bdf
def combineSnippedBooksToDS(keys: list[str], snip_len: str, cache_dir: str, cache_file:str, inc_raw_text: bool=False, inc_conllu: bool=False, inc_hpfv: bool=False, folder:str=None):
    #logging.set_verbosity(40)
    #Helper function to parse json-lines
    def jsonlReader(key: str):
        with open(folder+key+"/sniplen_"+snip_len+".jsonl") as reader:
            with open(cache_file, 'a') as tt:
                 #Only include the information we need for our specific purposes to save up on cache space
                 for line in reader:
                        if not inc_raw_text:
                            line = line[:line.find(",\"raw_text\":")] + line[line.find(",\"conllu\":"):]
                        if not inc_conllu:
                            line = line[:line.find(",\"conllu\":")] + line[line.find(",\"hp_fv\":"):]
                        if not inc_hpfv:
                            line = line[:line.find(",\"hp_fv\":")] + line[line.find("}\n"):]
                        tt.write(line)
    #Generate list of dicts, where each dict is a json-line
    for k in range(len(keys)):
        if int(bdf.findAgeFromID(keys[k])) < 9 or int(bdf.findAgeFromID(keys[k])) > 12:
            jsonlReader(keys[k])
    #Return a shuffled dataset
    return Dataset.from_json(cache_file, cache_dir=cache_dir).shuffle()

In [45]:
#Also makes it easier to clean cache files and use space more efficiently
shutil.rmtree("cache_dir/temp/")
os.mkdir("cache_dir/temp/")
cache_dir = "cache_dir/temp/"
cache_file_train = cache_dir+str(keylist_num)+"_"+str(SNIPPET_LENS[k])+"_train.jsonl"
cache_file_eval = cache_dir+str(keylist_num)+"_"+str(SNIPPET_LENS[k])+"_eval.jsonl"
cache_file_test = cache_dir+str(keylist_num)+"_"+str(SNIPPET_LENS[k])+"_test.jsonl"

In [46]:
train_keys = keylists[keylist_num]['train_keys']
#Temporary edit to test with combining eval+test as we are not param optimizing
eval_keys = keylists[keylist_num]['eval_keys']
test_keys = keylists[keylist_num]['test_keys']
train_dss = combineSnippedBooksToDS(train_keys, SNIPPET_LENS[k], cache_dir, cache_file_train, inc_raw_text=True, folder=BASE_BEG)
eval_dss = combineSnippedBooksToDS(eval_keys, SNIPPET_LENS[k], cache_dir,  cache_file_eval, inc_raw_text=True, folder=BASE_BEG)
test_dss = combineSnippedBooksToDS(test_keys, SNIPPET_LENS[k], cache_dir,  cache_file_test, inc_raw_text=True, folder=BASE_BEG)

#train_dss = train_dss.map(maskPropnWithMask)
#eval_dss = eval_dss.map(maskPropnWithMask)
#test_dss = test_dss.map(maskPropnWithMask)

#train_dss = train_dss.filter(lambda x: x['label'] != '9-12')
#eval_dss = eval_dss.filter(lambda x: x['label'] != '9-12')
#test_dss = test_dss.filter(lambda x: x['label'] != '9-12')
#Empty cache after we don't need it
os.remove(cache_file_train)
os.remove(cache_file_eval)
os.remove(cache_file_test)
#with open(cache_file, 'w') as writer:
#    writer.write("")
#Continue on
vectorizer = TfidfVectorizer(norm='l2', tokenizer=whitespace_tokenizer, preprocessor=do_nothing, max_features=2000).fit(train_dss['raw_text'])

Generating train split: 19938 examples [00:00, 309301.05 examples/s]
Generating train split: 6196 examples [00:00, 339480.45 examples/s]
Generating train split: 4034 examples [00:00, 386464.96 examples/s]


In [47]:
from pprint import pprint
from collections import Counter
import pandas as pd

pprint(train_dss)

#"Equal" the number based on age groups
min_nums_train = []
min_nums_eval = []
min_nums_test = []

for i in range(1):
    min_nums_train.append(np.min(list(Counter(train_dss['label']).values())))
    min_nums_eval.append(np.min(list(Counter(eval_dss['label']).values())))
    min_nums_test.append(np.min(list(Counter(test_dss['label']).values())))

for i in range(1):
    train_df = train_dss.to_pandas()
    #sampled = pd.concat([train_df[train_df['label'] == '7-8'].sample(min_nums_train[i], replace=False), train_df[train_df['label'] == '9-12'].sample(min_nums_train[i], replace=False), train_df[train_df['label'] == '13+'].sample(min_nums_train[i], replace=False)])
    sampled = pd.concat([train_df[train_df['label'] == '7-8'].sample(min_nums_train[i], replace=False), train_df[train_df['label'] == '13+'].sample(min_nums_train[i], replace=False)])
    sampled = sampled[sampled['book_id'].apply(lambda x: x[-1] == '1')]
    train_dss = Dataset.from_pandas(sampled)
    eval_df = eval_dss.to_pandas()
    #sampled = pd.concat([eval_df[eval_df['label'] == '7-8'].sample(min_nums_eval[i], replace=False), eval_df[eval_df['label'] == '9-12'].sample(min_nums_eval[i], replace=False), eval_df[eval_df['label'] == '13+'].sample(min_nums_eval[i], replace=False)])
    sampled = eval_df[eval_df['book_id'].apply(lambda x: x[-1] == '1')]
    eval_dss = Dataset.from_pandas(sampled)
    test_df = test_dss.to_pandas()
    #sampled = pd.concat([test_df[test_df['label'] == '7-8'].sample(min_nums_test[i], replace=False), test_df[test_df['label'] == '9-12'].sample(min_nums_test[i], replace=False), test_df[test_df['label'] == '13+'].sample(min_nums_test[i], replace=False)])
    sampled = test_df[test_df['book_id'].apply(lambda x: x[-1] == '1')]
    test_dss = Dataset.from_pandas(sampled)


Dataset({
    features: ['book_id', 'age', 'label', 'raw_text'],
    num_rows: 19938
})


In [48]:
def objective(trial):
    #Defining hyperparameters to tune
    c = trial.suggest_float('c', 1e-5, 1e+1, log=True)
    tol = trial.suggest_categorical('tol', [1e-6, 1e-5, 1e-4])
    clf = LinearSVC(
        loss='squared_hinge', penalty='l2',
        random_state=42,
        C=c,
        tol=tol
    )
    clf.fit(vectorizer.transform(train_dss['raw_text']), train_dss['label'])
    predicted = clf.predict(vectorizer.transform(eval_dss['raw_text']))
    return f1_score(eval_dss['label'], predicted, average="macro")

In [49]:
# Your code for hyperparameter optimization here
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[I 2025-08-22 11:01:27,635] A new study created in memory with name: no-name-ad0d1ed8-7726-4da7-a33e-e35393f815a7
[I 2025-08-22 11:01:28,955] Trial 0 finished with value: 0.7623857306318145 and parameters: {'c': 6.6205772990868486, 'tol': 1e-06}. Best is trial 0 with value: 0.7623857306318145.
[I 2025-08-22 11:01:30,040] Trial 1 finished with value: 0.506185018090783 and parameters: {'c': 0.0003707799527256626, 'tol': 0.0001}. Best is trial 0 with value: 0.7623857306318145.
[I 2025-08-22 11:01:31,103] Trial 2 finished with value: 0.49718619477634807 and parameters: {'c': 0.0002569563847504035, 'tol': 1e-06}. Best is trial 0 with value: 0.7623857306318145.
[I 2025-08-22 11:01:32,305] Trial 3 finished with value: 0.7777848464844872 and parameters: {'c': 1.208654402777989, 'tol': 1e-06}. Best is trial 3 with value: 0.7777848464844872.
[I 2025-08-22 11:01:33,442] Trial 4 finished with value: 0.7727917024072777 and parameters: {'c': 1.7442047619639292, 'tol': 0.0001}. Best is trial 3 with v

In [50]:
clf = LinearSVC(
        loss='squared_hinge', penalty='l2',
        random_state=42,
        C=study.best_params['c'],
        tol=study.best_params['tol']
)
clf.fit(vectorizer.transform(train_dss['raw_text']), train_dss['label'])
predicted = clf.predict(vectorizer.transform(test_dss['raw_text']))
print(metrics.classification_report(predicted, test_dss['label']))

              precision    recall  f1-score   support

         13+       0.77      0.95      0.85      2586
         7-8       0.78      0.36      0.50      1172

    accuracy                           0.77      3758
   macro avg       0.77      0.66      0.67      3758
weighted avg       0.77      0.77      0.74      3758

