In [49]:
import re


def parse_strict_korean_vocab(filepath):
    words = []
    sentences = []
    word_id = 1

    with open(filepath, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f if line.strip()]

    i = 0
    while i < len(lines):
        # Skip junk lines
        if re.match(r"(?i)^page|\blingo mastery\b", lines[i]):
            i += 1
            continue

        # Vocabulary line
        line = lines[i]
        match = re.match(
            r"^(\d+)[–\-\.]?\s+([가-힣]+)\s*/\s*([\w\-]+)\s*\[([a-zA-Z\. ]+)\]\s+(.+)",
            line,
        )
        if not match:
            i += 1
            continue

        _, word, romanization, pos_raw, meaning_text = match.groups()
        pos = pos_raw.strip().lower().rstrip(".")
        meanings = [m.strip() for m in meaning_text.split(",")]

        # Example sentences
        example_kr = lines[i + 1] if i + 1 < len(lines) else ""
        example_en = lines[i + 2] if i + 2 < len(lines) else ""

        if pos == "n":
            pos == "noun"
        elif pos == "num":
            pos = "number"
        elif pos == "pron":
            pos = "pronoun"
        elif pos == "v":
            pos = "verb"
        elif pos == "a":
            pos = "adjective"
        elif pos == "adv":
            pos = "adverb"
        elif pos == "determiner":
            pos = "determiner"
        elif pos == "assistant v":
            pos = "assistant verb"
        elif pos == "adj":
            pos = "adjective"
        elif pos == "determine":
            pos = "determiner"
        elif pos == "p":
            pos = "pronoun"

        for meaning in meanings:
            words.append(
                {
                    "id": word_id,
                    "word": word,
                    "romanization": romanization,
                    "pos": pos,
                    "meaning": meaning,
                }
            )

            if example_kr and example_en:
                sentences.append(
                    {
                        "word_id": word_id,
                        "example_kr": example_kr,
                        "example_en": example_en,
                    }
                )

            word_id += 1

        i += 3  # move to next vocab block

    return words, sentences

In [50]:
words, sentences = parse_strict_korean_vocab(
    "txt_path=../../../assets/data/raw/words/2000 Most Common Korean Words i - Lingo Mastery.txt"
)
len(words)

5394

In [51]:
len(sentences)

5394

In [52]:
unique_pos = {word["pos"] for word in words}
unique_pos

{'',
 'adjective',
 'adverb',
 'assistant verb',
 'counter',
 'determiner',
 'interjection',
 'n',
 'number',
 'pronoun',
 's',
 'verb'}

In [31]:
len(words)

5394

In [32]:
words[:50]

[{'id': 1,
  'word': '것',
  'romanization': 'geot',
  'pos': 'n',
  'meaning': 'a thing'},
 {'id': 2,
  'word': '것',
  'romanization': 'geot',
  'pos': 'n',
  'meaning': 'an object'},
 {'id': 3,
  'word': '하다',
  'romanization': 'hada',
  'pos': 'v',
  'meaning': 'to do'},
 {'id': 4,
  'word': '하다',
  'romanization': 'hada',
  'pos': 'v',
  'meaning': 'to make'},
 {'id': 5,
  'word': '있다',
  'romanization': 'itda',
  'pos': 'v',
  'meaning': 'to be (in a place)'},
 {'id': 6,
  'word': '있다',
  'romanization': 'itda',
  'pos': 'v',
  'meaning': 'exist'},
 {'id': 7,
  'word': '있다',
  'romanization': 'itda',
  'pos': 'v',
  'meaning': 'have'},
 {'id': 8,
  'word': '있다',
  'romanization': 'itda',
  'pos': 'assistant v',
  'meaning': 'to be (in a state)'},
 {'id': 9,
  'word': '되다',
  'romanization': 'doeda',
  'pos': 'v',
  'meaning': 'to become'},
 {'id': 10, 'word': '수', 'romanization': 'su', 'pos': 'n', 'meaning': 'way'},
 {'id': 11, 'word': '수', 'romanization': 'su', 'pos': 'n', 'meanin

In [None]:
import json

# Save words to a JSON file
with open(
    "../assets/data/processed/korean_words_2000.json", "w", encoding="utf-8"
) as words_file:
    json.dump(words, words_file, ensure_ascii=False, indent=4)

# Save sentences to a JSON file
with open(
    "../assets/data/processed/korean_sentences_2000.json",
    "w",
    encoding="utf-8",
) as sentences_file:
    json.dump(sentences, sentences_file, ensure_ascii=False, indent=4)