# Optimize token similarity matrices

In this notebook, we will optimize the hyperparameters of token similarity matrices for our best run from the previous notebook.

In [1]:
# Print the name of the machine on which this notebook is executed.
! hostname

mir


In [2]:
%%capture
# Install the package in the current directory together with its `system` extra;
# %%capture keeps pip's output out of the notebook.
# NOTE(review): `%pip install` targets the running kernel's environment, whereas
# `! pip` uses whatever `pip` is first on PATH — confirm they are the same here.
! pip install .[system]

In [3]:
import json
import re

from pandas import DataFrame

def _load_optimized_parameters(basename: str, suffix: str, unoptimized_keys: tuple, system: str) -> dict:
    """Load `submission/{basename}.{suffix}` as JSON and check that it holds optimized values.

    Raises `ValueError` when any of `unoptimized_keys` is present in the loaded
    object, because such a file is treated as not yet optimized.
    """
    with open(f'submission/{basename}.{suffix}', 'rt') as f:
        parameters = json.load(f)
    if any(key in parameters for key in unoptimized_keys):
        raise ValueError(f'{system} from run {basename} is not yet optimized')
    return parameters


def evaluate_interpolated_run(basename: str) -> DataFrame:
    """Summarize the hyperparameters and the NDCG' score of an interpolated run.

    The token similarity matrix settings are parsed from the end of `basename`,
    which is expected to finish with `<True|False>-<True|False>-<digits>`
    (symmetric, dominant, nonzero_limit). When `basename` has no such ending,
    fixed default descriptions are reported instead.

    Parameters
    ----------
    basename : str
        Name of the run; the files `submission/{basename}.first_alpha_and_gamma`,
        `.second_alpha_and_gamma`, `.beta`, and `.ndcg_score` are read.

    Returns
    -------
    DataFrame
        A single row indexed by 'ARQMath-3' with the columns
        α₁, γ₁, α₂, γ₂, β, symmetric, dominant, nonzero_limit, and NDCG'.
        The three matrix settings are kept as strings; NDCG' is a float.

    Raises
    ------
    ValueError
        If any of the three JSON files still contains the un-optimized keys
        ('alpha' / 'gamma' or 'beta').
    """
    match = re.search('(?P<symmetric>(True|False))-(?P<dominant>(True|False))-(?P<nonzero_limit>([0-9]+))$', basename)
    if not match:
        # No settings encoded in the run name: report the fixed defaults.
        symmetric = 'True'
        dominant = 'True for Levenshtein, False for word embeddings'
        nonzero_limit = '100'
    else:
        symmetric = match.group('symmetric')
        dominant = match.group('dominant')
        nonzero_limit = match.group('nonzero_limit')

    first_alpha_and_gamma = _load_optimized_parameters(
        basename, 'first_alpha_and_gamma', ('alpha', 'gamma'), 'First system')
    first_alpha = first_alpha_and_gamma['best_alpha']
    first_gamma = first_alpha_and_gamma['best_gamma']

    second_alpha_and_gamma = _load_optimized_parameters(
        basename, 'second_alpha_and_gamma', ('alpha', 'gamma'), 'Second system')
    second_alpha = second_alpha_and_gamma['best_alpha']
    second_gamma = second_alpha_and_gamma['best_gamma']

    beta = _load_optimized_parameters(basename, 'beta', ('beta',), 'Interpolated system')['best_beta']

    # Only the first ', '-separated field of the score file is used as NDCG'.
    with open(f'submission/{basename}.ndcg_score', 'rt') as f:
        ndcg = float(f.read().split(', ')[0])

    rows = 'ARQMath-3',
    columns = 'α₁', 'γ₁', 'α₂', 'γ₂', 'β', 'symmetric', 'dominant', 'nonzero_limit', "NDCG'"
    data = [[first_alpha, first_gamma, second_alpha, second_gamma, beta, symmetric, dominant, nonzero_limit, ndcg]]

    return DataFrame(data, columns=columns, index=rows)

## Results with hand-picked hyperparameters

First, we show the results with hand-picked hyperparameters of token similarity matrices.

In [4]:
# This run name does not end in `<True|False>-<True|False>-<digits>`, so the
# default token similarity matrix settings are reported in the table.
evaluate_interpolated_run('SCM-task1-interpolated_positional_word2vec_text+tangentl-both-auto-P')

Unnamed: 0,α₁,γ₁,α₂,γ₂,β,symmetric,dominant,nonzero_limit,NDCG'
ARQMath-3,0.7,2,0.0,5,0.7,True,"True for Levenshtein, False for word embeddings",100,0.355


In [5]:
from pathlib import Path

def get_best_hyperparameters(basename: str) -> str:
    """Return the name of the optimized run with the highest `best_ndcg`.

    Every file matching `submission/{basename}-*.beta` is read as JSON and
    compared by its `best_ndcg` value. Candidates are visited in sorted order,
    so ties are always resolved in favour of the lexicographically first path
    instead of depending on the order in which the filesystem lists the files.

    Parameters
    ----------
    basename : str
        Common prefix of the candidate runs (without the hyperparameter suffix).

    Returns
    -------
    str
        File name of the winning `.beta` file with the `.beta` suffix removed.

    Raises
    ------
    AssertionError
        If no candidate produced a usable best run (e.g. no file matched).
    """
    best_ndcg, best_basename = float('-inf'), None
    # Directory listing order is arbitrary; sort it to make the choice reproducible.
    for pathname in sorted(Path('submission').glob(f'{basename}-*.beta')):
        with pathname.open('rt') as f:
            results = json.load(f)
        ndcg = results['best_ndcg']
        if ndcg > best_ndcg:
            best_ndcg = ndcg
            best_basename = pathname.stem
    assert best_basename is not None, f'No optimized runs found for {basename}'
    return best_basename

## Results with optimized hyperparameters

Next, we show the results with optimized hyperparameters of token similarity matrices.

In [6]:
%%capture
# Run the `optimized-best-runs` make target; %%capture hides its output.
# NOTE(review): presumably this produces the `submission/*.beta` files that the
# following cells read — confirm against the Makefile.
! make optimized-best-runs

In [7]:
# Among the runs sharing this prefix, keep the one whose `.beta` file has the highest `best_ndcg`.
basename = get_best_hyperparameters('SCM-task1-interpolated_positional_word2vec_text+tangentl-both-auto-P')

In [8]:
# Show the hyperparameters and NDCG' of the selected run.
evaluate_interpolated_run(basename)

Unnamed: 0,α₁,γ₁,α₂,γ₂,β,symmetric,dominant,nonzero_limit,NDCG'
ARQMath-3,0.8,2,0.0,5,0.7,False,False,50,0.355


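NDCG' is 0.355 with both the hand-picked and the optimized hyperparameters, which shows that the soft vector space model is robust to variations in the parameters of the token similarity matrices.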