In [1]:
import os
import sys
sys.path.append(os.path.abspath("../src"))  # zodat Python src als modulepad herkent
from preprocessing import normalize_text
from analysis import extract_pos_tag_scores
from analysis import load_file, compare_bleu_buckets, calculate_wer_per_sentence, get_knn_gain_outliers, wer_summary
from better_by import (
    load_pickle, classify_sentences,
    extract_pos, extract_entities, plot_distribution
)
import pandas as pd
import matplotlib.pyplot as plt 
from plotting import plot_wer_distribution, plot_knn_gain_scatter

In [2]:
def load_wer_scores(path):
    """Read one WER score per line from a UTF-8 text file.

    Lines that are empty or contain only whitespace are skipped; every other
    line is stripped and converted with ``float``.

    Args:
        path: Path of the text file to read.

    Returns:
        list[float]: The parsed scores, in file order.

    Raises:
        ValueError: If a non-blank line cannot be parsed as a float.
    """
    scores = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                # Blank lines carry no score.
                continue
            scores.append(float(text))
    return scores

In [3]:
# WER bucket boundaries in steps of 10: [0, 10, 20, ..., 100].
bucket_edges = [10 * step for step in range(11)]
# One "low-high" label per pair of neighbouring edges, e.g. "0-10", ..., "90-100".
bucket_labels = [f"{low}-{high}" for low, high in zip(bucket_edges, bucket_edges[1:])]

In [4]:
def count_in_buckets(wers, bucket_edges, labels):
    """Count how many scores fall into each bucket.

    Buckets are half-open, ``[edge_i, edge_i+1)``, except the last one, which
    also includes its upper edge. Without that, ``pd.cut(..., right=False)``
    maps a score equal to ``bucket_edges[-1]`` to NaN and it silently
    disappears from the counts.

    Args:
        wers: Iterable of numeric scores.
        bucket_edges: Increasing bin edges, passed to ``pd.cut`` as ``bins``.
        labels: One label per bucket (``len(bucket_edges) - 1`` entries).

    Returns:
        pandas.Series: Number of scores per bucket, indexed by bucket label in
        bucket order. Buckets without any score are reported with count 0.

    Warns:
        UserWarning: If some scores could not be assigned to any bucket
            (outside the edge range, or NaN) and are therefore not counted.
    """
    import warnings

    scores = pd.Series(wers, dtype="float64")
    bucketed = pd.cut(scores, bins=bucket_edges, labels=labels, right=False)

    # right=False leaves the final edge outside every interval; fold scores
    # sitting exactly on it into the last bucket.
    bucketed.loc[scores == bucket_edges[-1]] = bucketed.cat.categories[-1]

    uncounted = int(bucketed.isna().sum())
    if uncounted:
        # Surface the loss instead of returning totals that quietly differ
        # from the number of input scores.
        warnings.warn(
            f"{uncounted} score(s) fall outside "
            f"[{bucket_edges[0]}, {bucket_edges[-1]}] and are not counted",
            stacklevel=2,
        )

    return bucketed.value_counts().sort_index()


# CommonVoice: read the per-sentence WER scores of both systems (kNN first,
# then vanilla) ...
wers_knn_com, wers_vanilla_com = map(
    load_wer_scores,
    ("results/wer_scores_knn_com.txt", "results/wer_scores_van_com.txt"),
)

# ... and count the sentences per WER bucket for each system.
count_knn_com, count_vanilla_com = (
    count_in_buckets(scores, bucket_edges, bucket_labels)
    for scores in (wers_knn_com, wers_vanilla_com)
)

In [6]:
# Per-bucket sentence counts for both systems side by side, plus the
# kNN-minus-vanilla difference per bucket.
df_com = (
    pd.DataFrame({"WER-bucket": bucket_labels})
    .assign(**{
        "Aantal zinnen (kNN)": count_knn_com.values,
        "Aantal zinnen (Vanilla)": count_vanilla_com.values,
    })
    .assign(**{
        "Verschil (kNN - Vanilla)": lambda table: (
            table["Aantal zinnen (kNN)"] - table["Aantal zinnen (Vanilla)"]
        ),
    })
)

df_com

Unnamed: 0,WER-bucket,Aantal zinnen (kNN),Aantal zinnen (Vanilla),Verschil (kNN - Vanilla)
0,0-10,8685,8597,88
1,10-20,1497,1547,-50
2,20-30,619,608,11
3,30-40,205,246,-41
4,40-50,116,125,-9
5,50-60,75,74,1
6,60-70,42,37,5
7,70-80,19,18,1
8,80-90,15,20,-5
9,90-100,0,0,0


In [7]:
# Libri: read the per-sentence WER scores of both systems (kNN first, then vanilla).
wers_knn_libri, wers_vanilla_libri = map(
    load_wer_scores,
    ("results/wer_scores_knn_libri.txt", "results/wer_scores_van_libri.txt"),
)

# Count the sentences per WER bucket for each system.
count_knn_libri, count_vanilla_libri = (
    count_in_buckets(scores, bucket_edges, bucket_labels)
    for scores in (wers_knn_libri, wers_vanilla_libri)
)

# Side-by-side counts plus the kNN-minus-vanilla difference per bucket.
df_libri = (
    pd.DataFrame({"WER-bucket": bucket_labels})
    .assign(**{
        "Aantal zinnen (kNN)": count_knn_libri.values,
        "Aantal zinnen (Vanilla)": count_vanilla_libri.values,
    })
    .assign(**{
        "Verschil (kNN - Vanilla)": lambda table: (
            table["Aantal zinnen (kNN)"] - table["Aantal zinnen (Vanilla)"]
        ),
    })
)

df_libri

Unnamed: 0,WER-bucket,Aantal zinnen (kNN),Aantal zinnen (Vanilla),Verschil (kNN - Vanilla)
0,0-10,2370,2304,66
1,10-20,326,360,-34
2,20-30,132,150,-18
3,30-40,44,53,-9
4,40-50,17,21,-4
5,50-60,25,29,-4
6,60-70,11,7,4
7,70-80,7,6,1
8,80-90,1,3,-2
9,90-100,0,0,0


In [8]:
# Vox: read the per-sentence WER scores of both systems (kNN first, then vanilla).
wers_knn_vox, wers_vanilla_vox = map(
    load_wer_scores,
    ("results/wer_scores_knn_vox.txt", "results/wer_scores_van_vox.txt"),
)

# Count the sentences per WER bucket for each system.
count_knn_vox, count_vanilla_vox = (
    count_in_buckets(scores, bucket_edges, bucket_labels)
    for scores in (wers_knn_vox, wers_vanilla_vox)
)

# Side-by-side counts plus the kNN-minus-vanilla difference per bucket.
df_vox = (
    pd.DataFrame({"WER-bucket": bucket_labels})
    .assign(**{
        "Aantal zinnen (kNN)": count_knn_vox.values,
        "Aantal zinnen (Vanilla)": count_vanilla_vox.values,
    })
    .assign(**{
        "Verschil (kNN - Vanilla)": lambda table: (
            table["Aantal zinnen (kNN)"] - table["Aantal zinnen (Vanilla)"]
        ),
    })
)

df_vox

Unnamed: 0,WER-bucket,Aantal zinnen (kNN),Aantal zinnen (Vanilla),Verschil (kNN - Vanilla)
0,0-10,1004,883,121
1,10-20,520,559,-39
2,20-30,214,258,-44
3,30-40,59,79,-20
4,40-50,17,22,-5
5,50-60,13,18,-5
6,60-70,2,3,-1
7,70-80,0,1,-1
8,80-90,0,0,0
9,90-100,3,1,2


In [10]:
# "com_num" run: read the per-sentence WER scores of both systems (kNN first,
# then vanilla).
# NOTE(review): in the output displayed for this cell the Vanilla counts are
# identical to the Vanilla counts shown for results/wer_scores_van_com.txt --
# confirm that wers_scores_van_com_num.txt is the intended baseline and not a
# copy of the non-num file.
wers_knn_com_num, wers_vanilla_com_num = map(
    load_wer_scores,
    ("results/wer_scores_knn_com_num.txt", "results/wer_scores_van_com_num.txt"),
)

# Count the sentences per WER bucket for each system.
count_knn_com_num, count_vanilla_com_num = (
    count_in_buckets(scores, bucket_edges, bucket_labels)
    for scores in (wers_knn_com_num, wers_vanilla_com_num)
)

# Side-by-side counts plus the kNN-minus-vanilla difference per bucket.
df_num = (
    pd.DataFrame({"WER-bucket": bucket_labels})
    .assign(**{
        "Aantal zinnen (kNN)": count_knn_com_num.values,
        "Aantal zinnen (Vanilla)": count_vanilla_com_num.values,
    })
    .assign(**{
        "Verschil (kNN - Vanilla)": lambda table: (
            table["Aantal zinnen (kNN)"] - table["Aantal zinnen (Vanilla)"]
        ),
    })
)

df_num

Unnamed: 0,WER-bucket,Aantal zinnen (kNN),Aantal zinnen (Vanilla),Verschil (kNN - Vanilla)
0,0-10,2131,8597,-6466
1,10-20,5541,1547,3994
2,20-30,2316,608,1708
3,30-40,660,246,414
4,40-50,265,125,140
5,50-60,209,74,135
6,60-70,89,37,52
7,70-80,22,18,4
8,80-90,35,20,15
9,90-100,0,0,0
