# Make mappings between different spellings of the same words in Bokmål

In [1]:
import pandas as pd
import re

In [2]:
# Load the Norsk ordbank full-form list into a DataFrame
# (read_table parses the file as tab-separated by default).

ORDBANK_PATH = "fullformsliste.txt"
ordbank_df = pd.read_table(ORDBANK_PATH)

In [3]:
# Collapse the masculine/feminine distinction in the tags so that alternations
# like "betydningen"/"betydninga" end up with the same tag.
# The vectorised .str accessor with regex=False performs a literal substring
# replacement and propagates missing TAG values as NaN instead of raising,
# which the element-wise str.replace call would do on a non-string value.

ordbank_df["TAG"] = ordbank_df["TAG"].str.replace(
    "subst fem appell", "subst mask appell", regex=False
)

In [4]:
# Flag every row whose (LEMMA_ID, TAG) combination occurs more than once,
# i.e. forms that are variants of the same word with the same grammatical tags.
# keep=False marks all members of such a group, not only the repeats.

variant_key = ["LEMMA_ID", "TAG"]
has_variant = ordbank_df.duplicated(subset=variant_key, keep=False)
ordbank_df["duplicated"] = has_variant

In [5]:
# Get mappings between alternate forms

def get_mappings(gb):
    """Pair each alternate form in a group with the group's first form.

    Parameters
    ----------
    gb : object with an iterable ``OPPSLAG`` attribute
        Here, one group produced by grouping the word list.

    Returns
    -------
    list of tuple
        ``(form, first_form)`` for every form after the first that differs
        from the first form, in their original order. Empty when there is
        no such form.
    """
    forms = list(gb.OPPSLAG)
    canonical = forms[0]
    pairs = []
    for form in forms[1:]:
        # Repeats of the first form carry no information, so skip them.
        if form != canonical:
            pairs.append((form, canonical))
    return pairs

# Restrict to the flagged rows, drop rows with missing values, and compute the
# pair list of every (LEMMA_ID, TAG) group.
variant_rows = ordbank_df.query("duplicated").dropna()
pairs_per_group = variant_rows.groupby(["LEMMA_ID", "TAG"]).apply(get_mappings).to_list()

# Flatten the per-group lists into one list of pairs; groups that produced an
# empty list contribute nothing.
dups = [pair for group_pairs in pairs_per_group for pair in group_pairs]

In [6]:
# Make a mapping dict

import re

# Build the lookup {variant form: first form of its group} from the
# (variant, first form) pairs in `dups`.
mappings = {}
for variant, canonical in dups:
    # Filter out pairs whose target form starts with "-".
    # Only the target side of the pair is checked.
    if canonical.startswith("-"):
        continue
    # The first mapping recorded for a variant wins; later pairs never overwrite it.
    if variant in mappings:
        continue
    # Avoid circular mappings: skip a -> b when b -> a is already recorded.
    if mappings.get(canonical) == variant:
        continue
    mappings[variant] = canonical

In [20]:
# Produce a json file

import json
# ensure_ascii=False writes non-ASCII letters as literal characters, so the
# file encoding is pinned to UTF-8 instead of being left to the platform default.
with open("bokmal.json", "w", encoding="utf-8") as outf:
    json.dump(mappings, outf, ensure_ascii=False)