In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from pathlib import Path
import numpy as np
import plotly.express as px
import plotly.io as pio
# Plotting configuration for the whole notebook.
# Make 'iframe' the default Plotly renderer, i.e. the one used by `fig.show()` below.
pio.renderers.default = 'iframe'
# Apply the 'ggplot' style sheet to Matplotlib figures.
# NOTE(review): no Matplotlib plot appears in the cells visible here — confirm this is still needed.
plt.style.use('ggplot')

In [2]:
# Load the benchmark results: one JSON object per non-blank line of the JSONL file.
# - UTF-8 is requested explicitly so decoding does not depend on the platform's
#   locale default encoding.
# - `.strip()` (rather than `.strip("\n")`) also skips whitespace-only lines, which
#   would otherwise reach `json.loads` and raise a JSONDecodeError.
with Path('scandeval_benchmark_results.jsonl').open(encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]

# Displayed as the cell output: the number of benchmark records loaded.
len(records)

3310

In [3]:
# Peek at the first record's keys to see which fields a benchmark record carries.
records[0].keys()

dict_keys(['dataset', 'task', 'dataset_languages', 'model', 'results', 'num_model_parameters', 'max_sequence_length', 'vocabulary_size', 'few_shot'])

In [4]:
# How well do the two Danish NER datasets (DaNE and DANSK) agree on model quality?
# Collect the test micro-F1 of every model evaluated on both datasets, report the
# correlation coefficient of the two score lists, and draw a scatter plot with an
# OLS trendline.

# Index the first record of each model per dataset in ONE pass over `records`,
# instead of re-scanning the whole list for every model id. `setdefault` keeps the
# first record seen, which is the record the previous `[...][0]` lookup selected.
dane_records = {}
dansk_records = {}
for record in records:
    if record['dataset'] == 'dane':
        dane_records.setdefault(record['model'], record)
    elif record['dataset'] == 'dansk':
        dansk_records.setdefault(record['model'], record)

dane_model_ids = set(dane_records)
dansk_model_ids = set(dansk_records)

# Only models benchmarked on both datasets can be compared. Sorting makes the point
# order (and therefore the saved figure) reproducible across kernel restarts; a bare
# `list(set)` of strings can come out in a different order on every run.
relevant_model_ids = sorted(dansk_model_ids.intersection(dane_model_ids))

# Scores are aligned by position with `relevant_model_ids`.
dane_scores = [
    dane_records[model_id]['results']['total']['test_micro_f1']
    for model_id in relevant_model_ids
]
dansk_scores = [
    dansk_records[model_id]['results']['total']['test_micro_f1']
    for model_id in relevant_model_ids
]

# `corrcoef` returns a 2x2 matrix; the off-diagonal entry is the DaNE/DANSK correlation.
corr = np.corrcoef(x=dane_scores, y=dansk_scores)[0, 1]
print(f"Correlation: {corr:.2%}")

fig = px.scatter(
    x=dane_scores,
    y=dansk_scores,
    title='Correlation between Danish NER datasets',
    trendline='ols',
    hover_name=relevant_model_ids,
    labels=dict(x='DaNE Micro-F1 score', y='DANSK Micro-F1 score'),
)
# NOTE(review): absolute, machine-specific output path — this line fails on any machine
# without /Users/dan/Downloads. Consider a configurable output directory.
fig.write_html('/Users/dan/Downloads/dane-dansk-correlation.html')
fig.show()

Correlation: 63.36%


In [5]:
# How well do the two Norwegian QA datasets (ScandiQA-no and NorQuAD) agree on model
# quality? Collect the test exact-match score of every model evaluated on both
# datasets, report the correlation coefficient of the two score lists, and draw a
# scatter plot with an OLS trendline.

# Index the first record of each model per dataset in ONE pass over `records`,
# instead of re-scanning the whole list for every model id. `setdefault` keeps the
# first record seen, which is the record the previous `[...][0]` lookup selected.
scandiqa_no_records = {}
norquad_records = {}
for record in records:
    if record['dataset'] == 'scandiqa-no':
        scandiqa_no_records.setdefault(record['model'], record)
    elif record['dataset'] == 'norquad':
        norquad_records.setdefault(record['model'], record)

scandiqa_no_model_ids = set(scandiqa_no_records)
norquad_model_ids = set(norquad_records)

# Only models benchmarked on both datasets can be compared. Sorting makes the point
# order (and therefore the saved figure) reproducible across kernel restarts; a bare
# `list(set)` of strings can come out in a different order on every run.
relevant_model_ids = sorted(norquad_model_ids.intersection(scandiqa_no_model_ids))

# Scores are aligned by position with `relevant_model_ids`.
scandiqa_no_scores = [
    scandiqa_no_records[model_id]['results']['total']['test_em']
    for model_id in relevant_model_ids
]
norquad_scores = [
    norquad_records[model_id]['results']['total']['test_em']
    for model_id in relevant_model_ids
]

# `corrcoef` returns a 2x2 matrix; the off-diagonal entry is the cross-dataset correlation.
corr = np.corrcoef(x=scandiqa_no_scores, y=norquad_scores)[0, 1]
print(f"Correlation: {corr:.2%}")

fig = px.scatter(
    x=scandiqa_no_scores,
    y=norquad_scores,
    title='Correlation between Norwegian QA datasets',
    trendline='ols',
    hover_name=relevant_model_ids,
    labels=dict(x='ScandiQA-no exact match score', y='NorQuAD exact match score'),
)
# NOTE(review): absolute, machine-specific output path — this line fails on any machine
# without /Users/dan/Downloads. Consider a configurable output directory.
fig.write_html('/Users/dan/Downloads/scandiqa-no-norquad-correlation.html')
fig.show()

Correlation: 79.02%
