Edit Distance / Normalized Edit Distance / Micro-Precision, -Recall, -F1 score for Morfessor Segmentation (boundary level)

In [None]:
# edit distance
def edit_distance(seq1, seq2):
    """Return the Levenshtein distance between two sequences.

    Insertions, deletions and substitutions each cost 1. Elements are
    compared with ``==``, so the inputs may be strings or lists of tokens.
    """
    rows, cols = len(seq1) + 1, len(seq2) + 1

    # table[r][c] = distance between the first r items of seq1 and the
    # first c items of seq2. Row 0 and column 0 hold the cost of building
    # a prefix from (or reducing it to) the empty sequence.
    table = [[0] * cols for _ in range(rows)]
    for r in range(rows):
        table[r][0] = r
    table[0] = list(range(cols))

    for r, left in enumerate(seq1, start=1):
        above = table[r - 1]
        current = table[r]
        for c, right in enumerate(seq2, start=1):
            substitution = above[c - 1] + (0 if left == right else 1)
            deletion = above[c] + 1
            insertion = current[c - 1] + 1
            current[c] = min(deletion, insertion, substitution)

    return table[-1][-1]

In [None]:
# used for calculating boundary level accuracy
import re

def get_boundaries(segmentation):
    """Return the boundary offsets of a segmented string.

    The stripped string is split on runs of hyphens and/or whitespace.
    Each returned offset is the cumulative length of the tokens seen so
    far (separators are not counted); the end of the last token is not
    reported as a boundary.
    """
    pieces = re.split(r"[-\s]+", segmentation.strip())
    offset = 0
    # Running sum of token lengths, one entry per token except the last.
    return [offset := offset + len(piece) for piece in pieces[:-1]]

In [None]:
def calc_f1_prec_rec(pred, gold):
    """Return boundary-level (precision, recall, f1) for one segmentation pair.

    Both strings are converted to sets of boundary offsets with
    ``get_boundaries``. Any ratio whose denominator is zero is reported
    as 0.0.
    """
    predicted = set(get_boundaries(pred))
    reference = set(get_boundaries(gold))
    hits = len(predicted & reference)

    precision = 0.0
    if len(predicted) > 0:
        precision = hits / len(predicted)

    recall = 0.0
    if len(reference) > 0:
        recall = hits / len(reference)

    if (precision + recall) > 0:
        f1 = 2 * precision * recall / (precision + recall)
    else:
        f1 = 0.0

    return precision, recall, f1

In [None]:
# change language, short and file path
language = 'Lezgi'
short = 'lez'

# Gold segmentations: every line starting with "\m", stripped, with the
# first three characters (the tier marker and its separator) removed.
with open (f'{language}/{short}-train-track2-uncovered', "r", encoding="utf-8") as src:
    gold = [row.strip()[3:] for row in src if row.startswith("\\m")]


# Self-trained segmentations, read the same way from the prediction file.
with open (f'{language}/{short}-train-track1-covered-selftrained-train', "r", encoding="utf-8") as src:
    selftrained = [row.strip()[3:] for row in src if row.startswith("\\m")]

In [None]:
# Per-sentence results.
distances = []        # token-level edit distance
norm_distances = []   # edit distance divided by the longer token count
pr = []               # (precision, recall) per sentence
f = []                # f1 per sentence

# Boundary counts pooled over all sentences (used for micro-averaging).
total_correct = 0
total_pred = 0
total_gold = 0

# The loop targets must not be called `gold`: `for pred, gold in
# zip(selftrained, gold)` rebinds the `gold` list to its last string, so
# re-running this cell would iterate over the characters of one sentence.
for pred_line, gold_line in zip(selftrained, gold):

    # Hyphens and whitespace both separate tokens for the edit distance.
    pred_tokens = pred_line.replace("-", " ").split()
    gold_tokens = gold_line.replace("-", " ").split()

    dist = edit_distance(pred_tokens, gold_tokens)
    p, r, f1 = calc_f1_prec_rec(pred_line, gold_line)

    pred_bounds = set(get_boundaries(pred_line))
    gold_bounds = set(get_boundaries(gold_line))
    total_correct += len(pred_bounds & gold_bounds)
    total_pred += len(pred_bounds)
    total_gold += len(gold_bounds)

    distances.append(dist)
    # Two empty lines have distance 0; avoid dividing by a zero length.
    longest = max(len(pred_tokens), len(gold_tokens))
    norm_distances.append(dist / longest if longest > 0 else 0.0)
    pr.append((p, r))
    f.append(f1)

In [None]:
# average edit distance per sentence (0.0 when no sentence pairs were scored)
avg = sum(distances) / len(distances) if distances else 0.0
print(avg)

# average NED per sentence
norm_avg = sum(norm_distances) / len(norm_distances) if norm_distances else 0.0
print(norm_avg)

# micro-averaged precision, recall and f1: computed from the boundary
# counts pooled over all sentences rather than from per-sentence scores
micro_p = total_correct / total_pred if total_pred > 0 else 0
micro_r = total_correct / total_gold if total_gold > 0 else 0
micro_f1 = 2 * micro_p * micro_r / (micro_p + micro_r) if (micro_p + micro_r) > 0 else 0

# Show the scores; previously they were computed but never displayed.
print(f'Micro precision: {micro_p}, micro recall: {micro_r}, micro F1: {micro_f1}')

Morpheme-level F1, recall, and precision for Morfessor segmentation

In [None]:
# change language, short and file path
language = 'Gitksan'
short = 'git'

# Gold morphemes per sentence: each "\m" line is stripped, its first three
# characters are dropped, hyphens become spaces, and the result is split
# on whitespace.
with open (f'{language}/{short}-train-track2-uncovered', "r", encoding="utf-8") as src:
    gold = [
        row.strip()[3:].replace('-', ' ').split()
        for row in src
        if row.startswith("\\m")
    ]


# Self-trained morphemes per sentence, tokenised the same way.
with open (f'{language}/{short}-train-track1-covered-selftrained-train', "r", encoding="utf-8") as src:
    selftrained = [
        row.strip()[3:].replace('-', ' ').split()
        for row in src
        if row.startswith("\\m")
    ]

print(gold)
print(selftrained)

In [None]:
from typing import List, Tuple

def morpheme_pre_rec_f1(pred_list: List[List[str]], gold_list: List[List[str]]) -> Tuple[float, float, float]:
    """Return micro-averaged morpheme-level (precision, recall, f1).

    Each sentence is treated as a bag of morphemes: a predicted morpheme
    counts as correct if the gold sentence contains it, and each gold
    occurrence can be matched at most once. Position is ignored. Counts
    are pooled over all sentences before the ratios are taken.

    Args:
        pred_list: predicted morphemes, one list per sentence.
        gold_list: gold morphemes, one list per sentence, aligned with
            ``pred_list``.

    Returns:
        ``(precision, recall, f1)``; any ratio with a zero denominator is 0.0.

    Raises:
        AssertionError: if the two lists differ in length.
    """
    # Local import so the function works regardless of cell execution order.
    from collections import Counter

    assert len(pred_list) == len(gold_list), "length unequal"

    total_correct = 0
    total_pred = 0
    total_gold = 0

    for pred_morphs, gold_morphs in zip(pred_list, gold_list):

        total_pred += len(pred_morphs)
        total_gold += len(gold_morphs)

        # Multiset intersection: for every morpheme, the smaller of its
        # predicted and gold counts. This equals matching-and-removing one
        # gold occurrence per hit, without copying the gold list each time.
        overlap = Counter(pred_morphs) & Counter(gold_morphs)
        total_correct += sum(overlap.values())

    precision = total_correct / total_pred if total_pred > 0 else 0.0
    recall = total_correct / total_gold if total_gold > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

    return precision, recall, f1

Impact of Word Frequency on Gloss Prediction

In [None]:
from collections import Counter

# change number to select the language

num = 5

# Directory name and file-name prefix for each selectable language.
_LANGUAGES = {
    1: ('Gitksan', 'git'),
    2: ('Lezgi', 'lez'),
    3: ('Natugu', 'ntu'),
    4: ('Tsez', 'ddo'),
    5: ('Savosavo', 'savo'),
    6: ('Yali', 'apah'),
}

# Fail loudly on an unknown number instead of silently reusing whatever
# `language` / `short` an earlier cell left behind.
if num not in _LANGUAGES:
    raise ValueError(f"num must be one of {sorted(_LANGUAGES)}, got {num!r}")
language, short = _LANGUAGES[num]

# All training words (flat list) from the "\t" lines.
word_list = []
with open (f'{language}/{short}-train-track1-uncovered', "r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("\\t"):
            word_list.extend(line.strip()[3:].split())

# Test words and gold glosses, one list per sentence, collected in a
# single pass over the test file.
test_words = []
gold_gloss = []
with open (f'{language}/{short}-test-track1-uncovered', "r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("\\t"):
            test_words.append(line.strip()[3:].split())
        elif line.startswith("\\g"):
            gold_gloss.append(line.strip()[3:].split())

# Predicted glosses, one list per sentence.
# NOTE(review): this cell reads '{short}-output-preds' while the
# morpheme-count cell further down reads '{short}_output_preds' -- confirm
# which file name actually exists.
selfpredicted_gloss = []
with open (f'{language}/{short}-output-preds', "r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("\\g"):
            selfpredicted_gloss.append(line.strip()[3:].split())

#frequency of all words in training dataset
word_freq = Counter(word_list)

In [None]:
# for each test word, record whether its gloss was predicted correctly:
# score = 1 if correct, else 0

from collections import defaultdict

wordScores = defaultdict(list)
# zip stops at the shortest of the three lists, so a prediction or gloss
# file with fewer sentences than the test set cannot raise IndexError.
for sent_words, sent_gold, sent_pred in zip(test_words, gold_gloss, selfpredicted_gloss):

    # If too few glosses were predicted, pad with 'UNK'. The padding is done
    # on a new list so the loaded predictions are not modified.
    padded_pred = sent_pred + ['UNK'] * (len(sent_gold) - len(sent_pred))

    for w, g, guess in zip(sent_words, sent_gold, padded_pred):
        score = 1 if guess.strip() == g.strip() else 0
        wordScores[w].append(score)

In [None]:
buckets = {"OOV": [], "low": [], "medium": [], "high": []}

# Put each word's mean score into the bucket that matches its training
# frequency: 0 -> OOV, 1-2 -> low, 3-10 -> medium, above 10 -> high.
for token, token_scores in wordScores.items():
    train_count = word_freq.get(token, 0)
    if train_count > 10:
        label = "high"
    elif train_count > 2:
        label = "medium"
    elif train_count != 0:
        label = "low"
    else:
        label = "OOV"
    buckets[label].append(sum(token_scores) / len(token_scores))

# Replace each bucket's list by its mean; an empty bucket becomes 0.0.
buckets = {
    label: (sum(values) / len(values) if values else 0.0)
    for label, values in buckets.items()
}

print(buckets)

Impact of the Number of Morphemes on Gloss Prediction

In [None]:
# change number to select the language
num = 6

# Directory name and file-name prefix for each selectable language.
_LANGUAGES = {
    1: ('Gitksan', 'git'),
    2: ('Lezgi', 'lez'),
    3: ('Natugu', 'ntu'),
    4: ('Tsez', 'ddo'),
    5: ('Savosavo', 'savo'),
    6: ('Yali', 'apah'),
}

# Fail loudly on an unknown number instead of silently reusing whatever
# `language` / `short` an earlier cell left behind.
if num not in _LANGUAGES:
    raise ValueError(f"num must be one of {sorted(_LANGUAGES)}, got {num!r}")
language, short = _LANGUAGES[num]

# Gold glosses, one list of whitespace-separated glosses per "\g" line.
gold_gloss = []
with open (f'{language}/{short}-test-track1-uncovered', "r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("\\g"):
            gold_gloss.append(line.strip()[3:].split())

# Predicted glosses, read the same way.
# NOTE(review): this cell reads '{short}_output_preds' while the
# word-frequency cell above reads '{short}-output-preds' -- confirm which
# file name actually exists.
selfpredicted_gloss = []
with open (f'{language}/{short}_output_preds', "r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("\\g"):
            selfpredicted_gloss.append(line.strip()[3:].split())

In [None]:
from collections import defaultdict
# All four tables are keyed by n, the number of hyphen-separated parts in
# the gold gloss of a word.
# number of words with n parts
group_counts = defaultdict(int)
# words whose whole (unsplit) gloss was predicted correctly
word_correct = defaultdict(int)
# number of compared parts after splitting on "-"
morpheme_total = defaultdict(int)
# parts predicted correctly at the same position
morpheme_correct = defaultdict(int)

# zip stops at the shorter list, matching the original min(len, len) range.
for gold_sent, pred_sent in zip(gold_gloss, selfpredicted_gloss):

    # Pad missing predictions with 'UNK' on a new list so the loaded
    # predictions are not modified.
    padded_pred = pred_sent + ['UNK'] * (len(gold_sent) - len(pred_sent))

    for gg, guess in zip(gold_sent, padded_pred):

        gold_parts = gg.split("-")
        pred_parts = guess.split("-")
        n = len(gold_parts)

        group_counts[n] += 1

        if gg == guess:
            word_correct[n] += 1

        # Position-wise comparison of the parts.
        for g, p in zip(gold_parts, pred_parts):
            if g == p:
                morpheme_correct[n] += 1

        # Missing or surplus predicted parts count as errors.
        morpheme_total[n] += max(len(gold_parts), len(pred_parts))

# Report the accuracies per group; previously they were computed and discarded.
for n in sorted(group_counts.keys()):
    word_acc = word_correct[n] / group_counts[n]
    morph_acc = morpheme_correct[n] / morpheme_total[n]
    print(f'{n} morpheme(s): {group_counts[n]} words, word accuracy {word_acc}, morpheme accuracy {morph_acc}')

TTR and OOV

In [None]:
import re

language = 'Yali'
short = 'apah'

# Words of the training set (flat list) from the "\t" lines.
train_words = []
with open (f'{language}/{short}-train-track1-uncovered', "r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("\\t"):
            train_words.extend(line.strip()[3:].split())

# Words of the test set (flat list).
test_words = []
with open (f'{language}/{short}-test-track1-uncovered', "r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("\\t"):
            test_words.extend(line.strip()[3:].split())

# Morphemes of the training set: "\m" lines split on hyphens/whitespace.
# re.split returns '' for an empty line or a leading/trailing separator;
# those empty strings are dropped so they are not counted as morphemes.
train_morph = []
with open (f'{language}/{short}-train-track2-uncovered', "r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("\\m"):
            content = line.strip()[3:]
            train_morph.extend(tok for tok in re.split(r'[-\s]+', content) if tok)

# Morphemes of the test set, tokenised the same way.
test_morph = []
with open (f'{language}/{short}-test-track2-uncovered', "r", encoding="utf-8") as f:
    for line in f:
        if line.startswith("\\m"):
            content = line.strip()[3:]
            test_morph.extend(tok for tok in re.split(r'[-\s]+', content) if tok)

In [None]:
# TTR (type-token ratio): distinct words divided by total words

unique_tok_train = set(train_words)
unique_tok_test = set(test_words)

ttr_train = len(unique_tok_train) / len(train_words)
ttr_test = len(unique_tok_test) / len(test_words)

print(f'TTR train: {ttr_train}, TTR test: {ttr_test}')

# OOV: share of test tokens that never occur in the training data.
# Membership is tested against sets; testing against the raw lists gives
# the same answer but scans the whole training list for every test token.
oov_num = sum(1 for word in test_words if word not in unique_tok_train)
oov = oov_num / len(test_words)

unique_morph_train = set(train_morph)
oov_num2 = sum(1 for morph in test_morph if morph not in unique_morph_train)
oov_morph = oov_num2 / len(test_morph)

print(f'OOV words: {oov}')
print(f'OOV morph: {oov_morph}')

Number of sentences in dataset

In [None]:
num = 6

# Directory name and file-name prefix for each selectable language.
_LANGUAGES = {
    1: ('Gitksan', 'git'),
    2: ('Lezgi', 'lez'),
    3: ('Natugu', 'ntu'),
    4: ('Tsez', 'ddo'),
    5: ('Savosavo', 'savo'),
    6: ('Yali', 'apah'),
}

# Fail loudly on an unknown number instead of silently reusing whatever
# `language` / `short` an earlier cell left behind.
if num not in _LANGUAGES:
    raise ValueError(f"num must be one of {sorted(_LANGUAGES)}, got {num!r}")
language, short = _LANGUAGES[num]

# A sentence is counted once per line starting with "\t".
# t = test, d = dev, tr = train.
with open (f'{language}/{short}-test-track1-uncovered', "r", encoding="utf-8") as f:
    t = sum(1 for line in f if line.startswith("\\t"))

with open (f'{language}/{short}-dev-track1-uncovered', "r", encoding="utf-8") as f:
    d = sum(1 for line in f if line.startswith("\\t"))

with open (f'{language}/{short}-train-track1-uncovered', "r", encoding="utf-8") as f:
    tr = sum(1 for line in f if line.startswith("\\t"))

print(f"Language {language} has {t} sentences in test set, {d} sentences for dev and {tr} sentences for training set")