In [12]:
#Imports
from scripts import bookdatafunctions as bdf
from scripts import corpusMLfunctions as cmf
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, logging
from tqdm import tqdm
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics
from sklearn.metrics import f1_score
import optuna
import json
import multiprocessing as mp

In [13]:
BASE_BEG = "SnippetDatasets/"
BASE_MID = "sniplen_"
BASE_END = ".jsonl"
KEYLISTS = "Keylists.jsonl"
SNIPPET_LENS = ['5','10','25','50','75','100']
CHOSEN_PARAMS = [{'c':0.15, 'tol':1e-6}, {'c':0.4, 'tol':1e-4}, {'c':5, 'tol':1e-4}, {'c':15, 'tol':1e-3}, {'c':60, 'tol':1e-3}, {'c':120, 'tol':1e-3}]
logging.set_verbosity(40)

In [14]:
keylists = []
with open(KEYLISTS, 'r') as f:
    for line in f:
        keylists.append(json.loads(line))

In [15]:
def do_nothing(ex):
    return ex

def conllu_tokenizer(ex):
    return ex.replace("\n", "\t").replace("|", "\t").split("\t")

In [None]:
def manualStudy(params, SNIPPET_LENS, keylists, i, k, overwrite: bool=True):
    filename = "TestResults/ParamOptim_List_"+str(i)+"_SnipLen_"+str(SNIPPET_LENS[k])+"_Results.jsonl"
    if overwrite or not os.path.exists(filename):
        train_keys = keylists[i]['train_keys']
        eval_keys = keylists[i]['eval_keys']
        train_dss = cmf.combineSnippedBooksToDS(train_keys, SNIPPET_LENS[k], BASE_BEG)
        eval_dss = cmf.combineSnippedBooksToDS(eval_keys, SNIPPET_LENS[k], BASE_BEG)
        vectorizer = TfidfVectorizer(norm='l2', tokenizer=conllu_tokenizer, preprocessor=do_nothing, max_features=2000).fit(train_dss['conllu'])

        vecd_train_data = vectorizer.transform(train_dss['conllu'])
        vecd_eval_data = vectorizer.transform(eval_dss['conllu'])
        #print("Worker for length ",SNIPPET_LENS[k]," and keylist ",i," activated!")
        returnable = []
        for pair in params:
            #Train a new classifier for each set of params
            
                clf = LinearSVC(
                    random_state=42,
                    C=pair['c'],
                    tol=pair['tol']
                )
                clf.fit(vecd_train_data, train_dss['label'])
                predicted = clf.predict(vecd_eval_data)
                f1 = f1_score(eval_dss['label'], predicted, average="macro")
                #Reverse the dictionary
                index2feature = {}
                for feature,idx in vectorizer.vocabulary_.items():
                    assert idx not in index2feature #This really should hold
                    index2feature[idx]=feature
                #Now we can query index2feature to get the feature names as we need
                high_prio = {}
                # make a list of (weight, index), sort it
                for j in range(3):
                    lst=[]
                    for idx,weight in enumerate(clf.coef_[j]):
                        lst.append((weight,idx))
                    lst.sort() #sort

                    #Print first few and last few
                    #for weight,idx in lst[:20]: #first 30 (ie lowest weight)
                    #    print(index2feature[idx])
                    #print("----------------------------------------------------")
                    #Take the last 30 (lst[-30:]) but these now come from weakest to strongest
                    #so reverse the list using [::-1]
                    highest_prio = []
                    for weight,idx in lst[-100:][::-1]:
                        highest_prio.append(index2feature[idx])
                    high_prio[j] = highest_prio
                returnable.append({'keylist_id':i, 'sniplen':SNIPPET_LENS[k], 'c':pair['c'], 'tol':pair['tol'], 'f1':f1, 'important_feats_7-8':high_prio[0], 'important_feats_9-12':high_prio[1], 'important_feats_13+':high_prio[2]})
        with open(filename, 'w') as f:
            f.write('\n'.join(map(json.dumps, returnable)))

In [17]:
def testParamResults(permutations: int, keylists: list):
    pool = mp.Pool(mp.cpu_count())
    pbar = tqdm(total=permutations)
    def update(*a):
     pbar.update()
    for i in range(permutations):
        #Add to list the test results of our 'manual' study
        for k in range(len(SNIPPET_LENS)):
            pool.apply_async(manualStudy, [CHOSEN_PARAMS, SNIPPET_LENS, keylists, i, k], callback=update)
    #print("All running!")
    pool.close()
    #print("Pool closed!")
    pool.join()
    #print("Waiting done!")
    

In [18]:
testParamResults(1, keylists)

6it [05:21, 53.63s/it]


In [None]:
def testForAllSets():
    test_results = []
    with tqdm(range(100), desc="Iterating through permutations...") as pbar:
        for i in range(2):
            train_keys = keylists[i]['train_keys']
            eval_keys = keylists[i]['eval_keys']
            test_keys = keylists[i]['train_keys']
            train_dss = [cmf.combineSnippedBooksToDS(train_keys, x, BASE_BEG) for x in SNIPPET_LENS]
            eval_dss = [cmf.combineSnippedBooksToDS(eval_keys, x, BASE_BEG) for x in SNIPPET_LENS]
            test_dss = [cmf.combineSnippedBooksToDS(test_keys, x, BASE_BEG) for x in SNIPPET_LENS]
            vectorizers = [TfidfVectorizer(norm='l2', tokenizer=conllu_tokenizer, preprocessor=do_nothing, max_features=2000).fit(x['conllu']) for x in train_dss]

            vecd_train_datas = [vectorizers[i].transform(train_dss[i]['conllu']) for i in range(len(SNIPPET_LENS))]
            vecd_eval_datas = [vectorizers[i].transform(eval_dss[i]['conllu']) for i in range(len(SNIPPET_LENS))]
            vecd_test_datas = [vectorizers[i].transform(test_dss[i]['conllu']) for i in range(len(SNIPPET_LENS))]

            classifiers = {}
            f_scores = []
            for j in range(len(SNIPPET_LENS)):
                clf = LinearSVC(
                loss='squared_hinge', penalty='l2',
                                random_state=42,
                                C=62.42249746377182,
                                tol=0.001)
                clf.fit(vecd_train_datas[j], train_dss[j]['label'])
                classifiers[SNIPPET_LENS[j]] = clf
                test_predict = clf.predict(vecd_test_datas[j])
                f_scores.append(f1_score(test_dss[j]['label'], test_predict, average="macro"))

            #Reverse the dictionary
            index2features = {}
            for i in range(len(SNIPPET_LENS)):
                index2feature = {}
                vectorizer = vectorizers[i]
                for feature,idx in vectorizer.vocabulary_.items():
                    assert idx not in index2feature #This really should hold
                    index2feature[idx]=feature
                index2features[SNIPPET_LENS[i]] = index2feature
            #Now we can query index2feature to get the feature names as we need
            highest_prios = {}
            for i in SNIPPET_LENS:
                high_prio = {}
                classifier = classifiers[i]
                index2feature = index2features[i]
                # make a list of (weight, index), sort it
                for j in range(3):
                    lst=[]
                    for idx,weight in enumerate(classifier.coef_[j]):
                        lst.append((weight,idx))
                    lst.sort() #sort

                    #Print first few and last few
                    #for weight,idx in lst[:20]: #first 30 (ie lowest weight)
                    #    print(index2feature[idx])
                    #print("----------------------------------------------------")
                    #Take the last 30 (lst[-30:]) but these now come from weakest to strongest
                    #so reverse the list using [::-1]
                    highest_prio = []
                    for weight,idx in lst[-100:][::-1]:
                        highest_prio.append(index2feature[idx])
                    high_prio[j] = high_prio
                highest_prios[i] = high_prio
            test_results.append({'id':i, 'f1s':f_scores, 'feats':highest_prios})
            pbar.update(1)
    return test_results

        


In [12]:
results = testForAllSets()

Iterating through permutations...:   2%|▏         | 2/100 [12:09<9:55:37, 364.66s/it] 


In [13]:
print(results)

[{'id': '100', 'f1s': [0.5699851227037915, 0.6552567555699155, 0.7699173971144445, 0.856666833668649, 0.8923223430926203, 0.9031749401682863], 'feats': {'5': {0: {...}, 1: {...}, 2: {...}}, '10': {0: {...}, 1: {...}, 2: {...}}, '25': {0: {...}, 1: {...}, 2: {...}}, '50': {0: {...}, 1: {...}, 2: {...}}, '75': {0: {...}, 1: {...}, 2: {...}}, '100': {0: {...}, 1: {...}, 2: {...}}}}, {'id': '100', 'f1s': [0.5776534375819785, 0.657753819893878, 0.768711852225635, 0.8520056629480202, 0.8906756306062326, 0.9006938491005654], 'feats': {'5': {0: {...}, 1: {...}, 2: {...}}, '10': {0: {...}, 1: {...}, 2: {...}}, '25': {0: {...}, 1: {...}, 2: {...}}, '50': {0: {...}, 1: {...}, 2: {...}}, '75': {0: {...}, 1: {...}, 2: {...}}, '100': {0: {...}, 1: {...}, 2: {...}}}}]


In [None]:
train_keys = keylists[0]['train_keys']
eval_keys = keylists[0]['eval_keys']
test_keys = keylists[0]['train_keys']

In [None]:
train_dss = [cmf.combineSnippedBooksToDS(train_keys, x, BASE_BEG) for x in SNIPPET_LENS]
eval_dss = [cmf.combineSnippedBooksToDS(eval_keys, x, BASE_BEG) for x in SNIPPET_LENS]
test_dss = [cmf.combineSnippedBooksToDS(test_keys, x, BASE_BEG) for x in SNIPPET_LENS]

In [8]:
vectorizers = [TfidfVectorizer(norm='l2', tokenizer=conllu_tokenizer, preprocessor=do_nothing, max_features=2000).fit(x['conllu']) for x in train_dss]



In [9]:
vecd_train_datas = [vectorizers[i].transform(train_dss[i]['conllu']) for i in range(len(SNIPPET_LENS))]
vecd_eval_datas = [vectorizers[i].transform(eval_dss[i]['conllu']) for i in range(len(SNIPPET_LENS))]
vecd_test_datas = [vectorizers[i].transform(test_dss[i]['conllu']) for i in range(len(SNIPPET_LENS))]

In [None]:
from scipy.stats import hmean
# Your code to train the machine learning model on the training set and evaluate the performance on the validation set here
def objective(trial):
    #Defining hyperparameters to tune
    c = trial.suggest_float('c', 1e-1, 1e+2, log=True)
    tol = trial.suggest_categorical('tol', [1e-6, 1e-5, 1e-4, 1e-3])

    #Scaling the c-param by num_of_samples * sqrt of train size
    
    #c_scaler = 1

    f1s = []
    for i in range(len(SNIPPET_LENS)):
        #c_scaler = float(len(train_dss[i]['conllu']) * np.sqrt(0.7)) 
        clf = LinearSVC(
            random_state=42,
            C=c,
            tol=tol
        )
        clf.fit(vecd_train_datas[i], train_dss[i]['label'])
        predicted = clf.predict(vecd_eval_datas[i])
        f1s.append(f1_score(eval_dss[i]['label'], predicted, average="macro"))
    return np.min(f1s)

In [17]:
"""
from sklearn.linear_model import LogisticRegression

# Your code to train the machine learning model on the training set and evaluate the performance on the validation set here
def objective(trial):
    #Defining hyperparameters to tune
    c = trial.suggest_float('c', 1e-5, 1e+1, log=True)
    tol = trial.suggest_categorical('tol', [1e-6, 1e-5, 1e-4, 1e-3])

    #Scaling the c-param by num_of_samples * sqrt of train size
    #c_scaler = float(len(train_ds['conllu']) * np.sqrt(0.7)) 
    c_scaler = 1

    f1s = []
    for i in range(len(SNIPPET_LENS)):
        clf = LogisticRegression(
            penalty='l1',
            solver='liblinear',
            random_state=42,
            C=c,
            tol=tol
        )
        clf.fit(vecd_train_datas[i], train_dss[i]['age'])
        predicted = clf.predict(vecd_eval_datas[i])
        f1s.append(f1_score(eval_dss[i]['age'], predicted, average="macro"))
    return hmean(f1s)
"""

'\nfrom sklearn.linear_model import LogisticRegression\n\n# Your code to train the machine learning model on the training set and evaluate the performance on the validation set here\ndef objective(trial):\n    #Defining hyperparameters to tune\n    c = trial.suggest_float(\'c\', 1e-5, 1e+1, log=True)\n    tol = trial.suggest_categorical(\'tol\', [1e-6, 1e-5, 1e-4, 1e-3])\n\n    #Scaling the c-param by num_of_samples * sqrt of train size\n    #c_scaler = float(len(train_ds[\'conllu\']) * np.sqrt(0.7)) \n    c_scaler = 1\n\n    f1s = []\n    for i in range(len(SNIPPET_LENS)):\n        clf = LogisticRegression(\n            penalty=\'l1\',\n            solver=\'liblinear\',\n            random_state=42,\n            C=c,\n            tol=tol\n        )\n        clf.fit(vecd_train_datas[i], train_dss[i][\'age\'])\n        predicted = clf.predict(vecd_eval_datas[i])\n        f1s.append(f1_score(eval_dss[i][\'age\'], predicted, average="macro"))\n    return hmean(f1s)\n'

In [None]:
# Your code for hyperparameter optimization here
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

[I 2025-06-19 09:21:14,301] A new study created in memory with name: no-name-ae9a8335-e3de-4ef1-9cc6-771782f02866
[I 2025-06-19 09:21:27,434] Trial 0 finished with value: 0.19579512827637305 and parameters: {'c': 0.007783548769941197, 'tol': 1e-05}. Best is trial 0 with value: 0.19579512827637305.
[I 2025-06-19 09:21:45,665] Trial 1 finished with value: 0.26692117161017953 and parameters: {'c': 0.0583549361905685, 'tol': 0.0001}. Best is trial 1 with value: 0.26692117161017953.
[I 2025-06-19 09:21:50,760] Trial 2 finished with value: 0.19579512827637305 and parameters: {'c': 0.00022525489149142507, 'tol': 0.0001}. Best is trial 1 with value: 0.26692117161017953.
[I 2025-06-19 09:23:49,597] Trial 3 finished with value: 0.3788184883974371 and parameters: {'c': 25.54808299451608, 'tol': 1e-05}. Best is trial 3 with value: 0.3788184883974371.
[I 2025-06-19 09:23:53,822] Trial 4 finished with value: 0.19579512827637305 and parameters: {'c': 3.87742306223791e-05, 'tol': 0.0001}. Best is tria

In [19]:
# Print the best trial's hyperparameters and objective value
best_trial = study.best_trial
print(f"Best trial (number {best_trial.number}):")
print(f"  Value: {best_trial.value}")
print(f"  Params: {best_trial.params}")

Best trial (number 15):
  Value: 0.3793497189025488
  Params: {'c': 62.42249746377182, 'tol': 0.001}


In [20]:
classifiers = {}

In [27]:
c_scalers = []
for i in range(len(SNIPPET_LENS)):
    c_scalers.append(best_trial.params['c']*(float(len(train_dss[i]['conllu']) * np.sqrt(0.7)) ))

In [28]:
print(c_scalers)

[11005566.586019961, 5504193.406036358, 2203014.3584691826, 1102708.3866274317, 736183.4525859752, 552607.6271149407]


In [33]:
for i in range(len(SNIPPET_LENS)):
    clf = LinearSVC(
    loss='squared_hinge', penalty='l2',
                    random_state=42,
                    C=62.42249746377182,
                    tol=best_trial.params['tol'])
    clf.fit(vecd_train_datas[i], train_dss[i]['label'])
    classifiers[SNIPPET_LENS[i]] = clf
    test_predict = clf.predict(vecd_test_datas[i])
    print("Results for snippet length: ",SNIPPET_LENS[i])
    print(metrics.classification_report(test_predict, test_dss[i]['label']))

Results for snippet length:  5
              precision    recall  f1-score   support

         13+       0.79      0.64      0.70    116490
         7-8       0.35      0.57      0.44     24643
        9-12       0.55      0.60      0.57     69595

    accuracy                           0.62    210728
   macro avg       0.56      0.60      0.57    210728
weighted avg       0.66      0.62      0.63    210728

Results for snippet length:  10
              precision    recall  f1-score   support

         13+       0.82      0.71      0.76     54520
         7-8       0.50      0.65      0.56     15524
        9-12       0.62      0.67      0.64     35347

    accuracy                           0.69    105391
   macro avg       0.65      0.67      0.65    105391
weighted avg       0.70      0.69      0.69    105391

Results for snippet length:  25
              precision    recall  f1-score   support

         13+       0.87      0.80      0.84     20555
         7-8       0.68      0.76 

# Most important features comparison

In [34]:
#Reverse the dictionary
index2features = {}
for i in range(len(SNIPPET_LENS)):
    index2feature = {}
    vectorizer = vectorizers[i]
    for feature,idx in vectorizer.vocabulary_.items():
        assert idx not in index2feature #This really should hold
        index2feature[idx]=feature
    index2features[SNIPPET_LENS[i]] = index2feature
#Now we can query index2feature to get the feature names as we need

In [35]:
highest_prios = {}
for i in SNIPPET_LENS:
    classifier = classifiers[i]
    index2feature = index2features[i]
    # make a list of (weight, index), sort it
    lst=[]
    for idx,weight in enumerate(classifier.coef_[2]):
        lst.append((weight,idx))
    lst.sort() #sort

    #Print first few and last few
    #for weight,idx in lst[:20]: #first 30 (ie lowest weight)
    #    print(index2feature[idx])
    #print("----------------------------------------------------")
    #Take the last 30 (lst[-30:]) but these now come from weakest to strongest
    #so reverse the list using [::-1]
    highest_prio = []
    for weight,idx in lst[-20:][::-1]:
        highest_prio.append(index2feature[idx])
    highest_prios[i] = highest_prio

In [36]:
from pprint import pprint
pprint(highest_prios)

{'10': ['Orvokki',
        'orvokki',
        'leiri',
        'soturi',
        'klaani',
        'astella',
        'hevonen',
        'jumala',
        'pentu',
        'ratsastaa',
        'hotelli',
        'lehti',
        'mutisi',
        'tohtori',
        'vastaa',
        'sanoi',
        'päällikkö',
        'C',
        'huudahtaa',
        'Sitten'],
 '100': ['leiri',
         'orvokki',
         'Orvokki',
         'jumala',
         'soturi',
         'pian',
         'hevonen',
         'hotelli',
         'luultavasti',
         'luokse',
         'klaani',
         'huudahtaa',
         'sanoi',
         'ratsastaa',
         'Sitten',
         'sisko',
         'tosiaan',
         'teksti',
         '!',
         'vastaa'],
 '25': ['Orvokki',
        'orvokki',
        'leiri',
        'soturi',
        'klaani',
        'astella',
        'jumala',
        'hevonen',
        'vastaa',
        'hotelli',
        'huudahtaa',
        'Pian',
        'sanoi',
        