# Open a wiki file and learn something


Download raw data from:
https://kaikki.org/dictionary/French/index.html


## Record structure

### Word: `word`

It's the word.

### Part of speech: `pos`

We keep only the records whose part of speech belongs to a curated set (`good_pos`, defined below).

### Meaning of the word: `senses`

Can be more than one (it's a list).

* `glosses`: The meaning of the word, also a list.
* `raw_glosses`: A more informative definition.
* `examples`: Examples.

Each `sense` has a `categories` list,
that can be useful for clustering words
or to train by topic.


## Frequent words

1. Get the big corpus.
1. For each word, get the non-inflected version.
1. Compute the frequencies, and you are done.


## Constants


In [None]:
import json
import re
import sys
from copy import deepcopy
from itertools import pairwise
from pathlib import Path
from pprint import pprint
from random import randint

from loguru import logger as lg


In [None]:
# Reconfigure the loguru logger: drop the handlers registered so far, then log
# to stderr showing only the message text, from the DEBUG level upwards.
# NOTE(review): this cell needs `sys` to be imported — confirm the import cell provides it.
lg.remove()
lg.add(sys.stderr, format="{message}", level="DEBUG")


In [None]:
# The dumps live in a `dataset` folder that is a sibling of the current
# working directory.
dataset_fol = Path.cwd().parent / "dataset"

# File names of the dumps that can be loaded.
wiki_fn_word_forms = "kaikki.org-dictionary-French-words.json"
wiki_fn_non_inflected_senses = "kaikki.org-dictionary-French-all-no-wFNY2q.json"

# Dump used by the rest of the notebook; uncomment the other line to switch.
# wiki_fp = dataset_fol / wiki_fn_word_forms
wiki_fp = dataset_fol.joinpath(wiki_fn_non_inflected_senses)


In [None]:
# Set of single characters (accented vowels, the oe ligature and the cedilla,
# each in lower and upper case) used later to spot words containing any of them.
accent = set("àÀâÂéÉèÈêÊëËîÎïÏœŒôÔùÙûÛüÜçÇ")
print(accent)


In [None]:
# Parts of speech that a record must have to be kept by the filtering step.
# Only the uncommented entries are active; the commented-out ones are left in
# place so they can be re-enabled by uncommenting them.
# NOTE(review): the commented entries look like the other `pos` values found
# in the dump (several are clearly malformed) — confirm against the data.
good_pos = {
    # "",
    # '<i class="Jpan mention" lang="ja">かみかぜ</i> (kamikaze, “suicide flyer”, literally “divine wind”)',
    # 'Modern French <i class="Latn mention" lang="fr">chair</i>',
    # "a",
    # "a commune in Normandy, France",
    # "a restoration of the Latin 3rd-person-singular -t",
    # "abbrev",
    # "ablative",
    # "accusative plural",
    "adj",
    # "adjectival suffix",
    "adjective",
    # "adjective-forming suffix",
    "adv",
    "adverb",
    # "affirmative particle",
    # "affix",
    # "an anchovy-based condiment",
    # "an apocopic form of la, la before a vowel",
    "article",
    # "augmentative suffix",
    # "character",
    # "conj",
    # "det",
    # "dialectal",
    # "diminutive ending",
    # "diminutive suffix",
    # "first-person plural present indicative ending",
    # 'from an Illyrian word probably from Proto-Indo-European <i class="Latinx mention" lang="ine-pro">*sab-</i> (“taste”)',
    # "infix",
    # "instrumental suffix",
    # "intensifier",
    # "interfix",
    # "interjection used in deer-hunting",
    # "intj",
    # "n",
    "name",
    # "name of a Celtic tribe in Southern Germany, which later emigrated to Gaul",
    # "nominal suffix",
    "noun",
    # "noun suffix",
    "nouns",
    "num",
    # "onomatopoeia of the lowing of cattle",
    # "particle",
    # "past participle of dire (“to say”)",
    # "past passive participle",
    "phrase",
    # "postp",
    # "prefix",
    # "prep",
    # "prep_phrase",
    "pron",
    # "pronounced /le‿ʁital(jɛ̃)/",
    "proverb",
    # "punct",
    # "reflexive pronoun",
    # "second-person singular",
    # "stem libr-",
    # "suffix",
    # "suffix added to noun stems to form adjectives",
    # "suffix added to verbal stems forming neuter nouns denoting the result of, a particular instance of, or the object of an action",
    # "suffix denoting occupation",
    # "suffix forming adjectives from nouns",
    # "suffix forming adjectives meaning ‘belonging to, relating to’",
    # "suffix forming augmentatives",
    # "suffix forming diminutives",
    # "suffix forming infinitives of first-conjugation verbs",
    # "suffix forming nouns usually denoting diseased conditions",
    # "suffix meaning ‘of or pertaining to’",
    # "suffix with the sense ‘relating’ to forming adjectives",
    # "surname",
    # "symbol",
    # "v",
    "verb",
    "verb and noun",
}
# cell output: number of parts of speech currently enabled
len(good_pos)


## Funcs


In [None]:
def walk_object(obj, level, seen_keys, seen_pos):
    """Collect every dict key, and every value stored under `pos`, inside `obj`.

    Dicts and lists are traversed recursively; any other value is a leaf
    and contributes nothing.

    Args:
        obj: the nested structure to scan.
        level: depth of `obj`; only forwarded (incremented) to the recursion.
        seen_keys: set updated in place with each dict key met.
        seen_pos: set updated in place with each value found under a
            key named "pos".
    """
    # lists: scan each item one level deeper
    if isinstance(obj, list):
        for item in obj:
            walk_object(item, level + 1, seen_keys, seen_pos)
        return

    # leaves: nothing to collect
    if not isinstance(obj, dict):
        return

    # dicts: record the key, the pos value if any, then scan the value
    for name, value in obj.items():
        seen_keys.add(name)
        if name == "pos":
            seen_pos.add(value)
        walk_object(value, level + 1, seen_keys, seen_pos)


In [None]:
def build_structure(obj, level, struct, curr_tag):
    """Record in `struct` one tag per path found inside a nested object.

    A tag is a string grown while descending: "#" is appended when a dict
    is entered, followed by the key name, and ">" is appended when a list
    is entered. A non-empty list holding only strings (or only ints) gets
    a closing "str" (or "int") marker and is not descended into.

    Args:
        obj: the value to analyze. Dicts and lists are traversed, any
            other value is a leaf and adds no tag.
        level: depth of `obj`; only forwarded (incremented) to the
            recursion.
        struct: dict updated in place; each tag becomes a key mapped to 0.
        curr_tag: tag of the path that leads to `obj` ("" for the root).
    """
    if isinstance(obj, dict):
        # mark it as an object
        curr_tag += "#"
        struct[curr_tag] = 0

        for key, value in obj.items():
            # the tag for this key is the object tag followed by the key
            new_tag = f"{curr_tag}{key}"
            struct[new_tag] = 0
            if "#" in key or ">" in key:
                lg.warning(f"Separator character in key {key}")
                # TODO escape the separator? by doubling it?

            build_structure(value, level + 1, struct, new_tag)

    elif isinstance(obj, list):
        # mark that there is a list here
        curr_tag += ">"
        struct[curr_tag] = 0

        # lists of base types are summarized by a type marker
        # (bools pass the int check too, as bool is a subclass of int)
        if obj and all(isinstance(el, str) for el in obj):
            struct[curr_tag + "str"] = 0
        elif obj and all(isinstance(el, int) for el in obj):
            struct[curr_tag + "int"] = 0
        else:
            # we assume all the objects are homogeneous, at most they can
            # have some missing keys: every element shares the list tag
            for el in obj:
                build_structure(el, level + 1, struct, curr_tag)


In [None]:
# turn the struct into a list of strings


def structure_to_strlist(struct):

    struct_str_all: list[str] = sorted(struct.keys())

    # if you are the prefix of something
    # you are the prefix of the next in the sorted list
    struct_str = []
    for first, second in pairwise(struct_str_all):
        if second.startswith(first):
            continue
        struct_str.append(first)
    struct_str.append(struct_str_all[-1])

    return struct_str


In [None]:
def split_str_to_path(string):
    """Split a tag string on the "#" and ">" separators, dropping empty parts."""
    return [x for x in re.split(r"[#>]", string) if x]


def match_str_to_path(string, path) -> tuple[bool, list]:
    """Check whether the tag `string` starts with the keys listed in `path`.

    Args:
        string: a tag string, e.g. "#senses>#coordinate_terms>#english".
        path: the list of keys expected at the beginning of `string`.

    Returns:
        A `(matches, rest)` tuple. When `string` starts with `path`,
        `matches` is True and `rest` holds the keys of `string` that follow
        the path (possibly none). Otherwise `(False, [])`. A path longer
        than the string does not match.
    """
    search_path = split_str_to_path(string)
    # compare slices rather than zipping: zip stops at the shortest input,
    # so a path longer than the string would wrongly be accepted
    if search_path[: len(path)] == list(path):
        return True, search_path[len(path) :]
    return False, []


In [None]:
def filter_keys(dict_, keys, which_filter):
    """Delete entries of `dict_` in place, according to `keys`.

    Args:
        dict_: the dict to trim; it is modified in place.
        keys: container of key names, tested with `in`.
        which_filter: "keep" deletes every key that is not in `keys`,
            "remove" deletes every key that is in `keys`.

    Raises:
        ValueError: if `which_filter` is neither "keep" nor "remove";
            `dict_` is left untouched in that case.
    """
    # pick the keys to delete first, so the dict is not resized while iterated
    if which_filter == "keep":
        to_delete = [key for key in dict_ if key not in keys]
    elif which_filter == "remove":
        to_delete = [key for key in dict_ if key in keys]
    else:
        raise ValueError(
            f"which_filter must be 'keep' or 'remove', got {which_filter!r}"
        )

    for key in to_delete:
        del dict_[key]


In [None]:
# access the requested path, if it exists
def access_by_path(obj, path):
    """Return the first value reached by following `path` inside `obj`.

    Each element of `path` is a dict key. Lists met along the way are
    transparent: the remaining path is searched in every element, in
    order, and the first result that is not None wins.

    Args:
        obj: nested structure made of dicts, lists and leaf values.
        path: sequence of keys to follow; an empty path selects `obj`.

    Returns:
        The value found, or None when the path cannot be followed.
        None is also a valid stored value, so a missing path and a stored
        None cannot be told apart (raising KeyError would be stricter, but
        a key missing from one list element may still be in the next).
    """
    # if we have no more path, we have found the value!
    if not path:
        return obj

    # if it is a dict search for the rest of the path in the value
    if isinstance(obj, dict):
        key = path[0]
        if key not in obj:
            return None
        return access_by_path(obj[key], path[1:])

    # if it is a list search for the same path in each element; recursing on
    # the element itself copes with dicts, nested lists and leaf values alike
    if isinstance(obj, list):
        for el in obj:
            maybe_value = access_by_path(el, path)
            # if we find something in this element, return that
            if maybe_value is not None:
                return maybe_value
        # if all the elements failed return None
        return None

    # a leaf value reached while some path is left: nothing to find
    return None


## Load the data



One record per line


In [None]:
# Load the dump: one JSON record per line.
# The encoding is passed explicitly: the records hold accented text and the
# default encoding of `open` depends on the platform.
# NOTE(review): UTF-8 is assumed for the kaikki.org dump — confirm.
words_data_orig = []
with wiki_fp.open(encoding="utf-8") as wf:
    for line in wf:
        # tolerate blank lines (e.g. a trailing newline at the end of the file)
        if not line.strip():
            continue
        word_data = json.loads(line)
        words_data_orig.append(word_data)


## Get the struct of the original records


In [None]:
# run the thing on the whole dataset
# run the thing on the whole dataset
# (the slice caps the scan at the first 1,000,000 records)
struct_all = {}
for word_data in words_data_orig[:1000000]:
    build_structure(word_data, 0, struct_all, "")

# reduce the collected tags to a list of strings and show it
struct_all_str = structure_to_strlist(struct_all)
pprint(struct_all_str)


## Keep only interesting keys


In [None]:
# reset the word records
# work on a deep copy, so that deleting keys from these records in place
# leaves the records in `words_data_orig` as loaded
words_data_all = deepcopy(words_data_orig)


In [None]:
# Keys to keep at each level of a record; every other key is deleted.
word_keep_keys = [
    "categories",
    "form_of",
    "pos",
    "senses",
    "word",
]

cat_keep_keys = ["name"]

examples_keep_keys = ["english", "text"]

sense_keep_keys = [
    "categories",
    "examples",
    "glosses",
    "raw_glosses",
    "tags",
    "topics",
]

# keep only good pos here
# NOTE: the records are trimmed in place, so the dicts held by
# `words_data_all` are modified as well
words_data = []

for word_data in words_data_all:

    # only keep words with good pos (a record with no pos is skipped too)
    if word_data.get("pos") not in good_pos:
        continue

    # keep only some keys in the word record
    filter_keys(word_data, word_keep_keys, "keep")

    # remove redundant info in the word categories, if any
    for cat in word_data.get("categories", []):
        filter_keys(cat, cat_keep_keys, "keep")

    for sense in word_data.get("senses", []):

        # remove unwanted sense keys
        filter_keys(sense, sense_keep_keys, "keep")

        # remove redundant info in the sense categories, if any
        for cat in sense.get("categories", []):
            filter_keys(cat, cat_keep_keys, "keep")

        # trim the examples, if any
        for example in sense.get("examples", []):
            filter_keys(example, examples_keep_keys, "keep")

    words_data.append(word_data)

# sort(ish) the words
# one word can have more than one pos
words_data_sort = sorted(words_data, key=lambda x: x["word"])

print(f"{len(words_data_all)=} {len(words_data_sort)=}")


## Save filtered data


In [None]:
# # output file
# wiki_out_fn = "kaikki.org-dictionary-French-all-no-filter.json"
# wiki_out_fp = dataset_fol / wiki_out_fn
# print(f"{wiki_out_fp}")
# # build all the records
# out_str = []
# for word_data in words_data_sort:
#     word_str = json.dumps(word_data)
#     out_str.append(word_str)
# # write out the records
# dump_str = "\n".join(out_str)
# wiki_out_fp.write_text(dump_str)


## Inspect keys and pos


In [None]:
# collect every key name and every `pos` value found in the filtered records
seen_keys = set()
seen_pos = set()

for word_data in words_data:
    walk_object(word_data, level=0, seen_keys=seen_keys, seen_pos=seen_pos)

# cell output: number of distinct keys
len(seen_keys)


In [None]:
# cell output: the distinct keys collected from the filtered records
seen_keys


In [None]:
# cell output: the distinct `pos` values collected from the filtered records
seen_pos


## Analyze some words


In [None]:
# Scan the sorted records once, collecting:
# - `words_acc_data`: the records whose word holds at least one accented char
# - `word_search`: the records whose word is exactly `the_word`
words_acc_data = []
word_search = []

# the word to look for: it does not change during the scan, so it is set once
# here (uncomment one of the alternatives to look for another word)
the_word = "chaise"
# the_word = "abîme"
# the_word = "abime"
# the_word = "abimes"
# the_word = "angariés"
# the_word = "angaries"
# the_word = "arrière"
# the_word = "Alexia"

for word_data in words_data_sort:
    word = word_data["word"]

    # a non-empty intersection means the word has an accented character
    word_letters = set(word)
    is_accent = accent.intersection(word_letters)
    if is_accent:
        words_acc_data.append(word_data)
        # print(f"accent! {word}")
        # break

    # word with a space
    # if " " in word:
    #     print(f"found! {word}")
    #     break

    if word == the_word:
        print(f"found! {word} {word_data['pos']}")
        word_search.append(word_data)

    # if word_data["pos"] == "name":
    #     print(f"found! {word} {word_data['pos']}")
    #     break

    # sanity check: every record left should have passed the pos filter
    if word_data["pos"] not in good_pos:
        print("yo why no filter me")


print(f"{len(words_acc_data)=}")


In [None]:
# show the first record collected in `word_search`
# (indexing raises IndexError when the list is empty)
pprint(word_search[0], width=150)


In [None]:
# pick a random record and show it; the commented lines do the same over all
# the sorted records instead of only the accented ones
# ri = randint(0, len(words_data_sort) - 1)
# word_data = words_data_sort[ri]
# print(f"{ri=} {word_data['word']}")
# pprint(word_data, width=150)

# randint includes both bounds, hence the -1 to stay inside the list
ri = randint(0, len(words_acc_data) - 1)
acc_data = words_acc_data[ri]
print(f"{ri=} {acc_data['word']}")
pprint(acc_data, width=150)


In [None]:
# try access_by_path on the random record; the active call asks for a path
# through the key "miss", to see what comes back for a path that is probably absent
# access_by_path(acc_data, ["senses", "examples"])
# access_by_path(acc_data, ["senses", "examples", "english"])
access_by_path(acc_data, ["senses", "miss", "english"])


## Get the structure of the json


In [None]:
# run the thing on the whole dataset
# run the thing on the whole dataset
# ([:] takes every record; narrow the slice to scan fewer of them)
struct = {}
for word_data in words_data_sort[:]:
    build_structure(word_data, 0, struct, "")
# reduce the collected tags to a list of strings and show it
struct_str = structure_to_strlist(struct)
pprint(struct_str)


In [None]:
# search a sample of a path: show the first value found and stop
for word_data in words_data_sort[:]:
    # data = access_by_path(word_data, ["senses", "examples", "ref"])
    # data = access_by_path(word_data, ["senses", "examples", "note"])
    data = access_by_path(word_data, ["senses", "examples", "type"])
    # None means this record has nothing at that path: try the next one
    if data is not None:
        pprint(data)
        break


In [None]:
# collect the distinct values found at a path across all the records
# (access_by_path returns a single value per record, the first one it finds;
# values are stored in a set, so they must be hashable)
data_all = set()
for word_data in words_data_sort[:]:
    data = access_by_path(word_data, ["senses", "examples", "type"])
    if data is not None:
        data_all.add(data)

pprint(data_all)


## Analyze the structure of the json


In [None]:
# from a list of nodes? string of paths?
# get the list of keys for a node
# Experiment: use regexes to strip a known path from the beginning of a tag
# string and extract the key that follows it.

import re

# separators between keys: zero to two, or one to two, of "#" and ">"
rs_sep0 = "[#>]{,2}"
rs_sep1 = "[#>]{1,2}"


# path = []
# search_in = "#senses>#coordinate_terms>#english"

path = ["senses", "coordinate_terms"]
search_in = "#senses>#coordinate_terms>#english"

# path = ["pos"]
# search_in = "#pos"

# build a regex to match this path
# we join in the center with the separator
rs_path_part = rs_sep0.join([f"{key}" for key in path])
# we add a separator in the beginning:
# an empty path will still be populated
# the final group captures whatever follows the path
rs_path_full = rs_sep0 + rs_path_part + rs_sep0 + "(.*)"
print(f"{rs_path_full=}")
re_path = re.compile(rs_path_full)
print(f"{re_path=}")

# parse the current path
print(f"{search_in=}")
match_full_key = re_path.match(search_in)
print(f"{match_full_key=}")
# NOTE: match() returns None when the string does not start with the path,
# and the next line would then raise AttributeError
full_key_str = match_full_key.group(1)
print(f"{full_key_str=}")

# extract the key

# first look for a separator after a key
rs_key = "(.*?)" + rs_sep1
re_key = re.compile(rs_key)
print(f"{re_key=}")
match_key = re_key.match(full_key_str)
print(f"{match_key=}")

# if we did not match then the whole str is the key
if match_key is None:
    print(f"matched all {full_key_str}")


In [None]:
# Experiment: capture all the keys of a tag string with a repeated group.
# Uses `rs_sep1`, which is defined in the previous cell.
search_in = "#senses>#coordinate_terms>#english"
print(f"{search_in=}")

# one separator followed by one key made of lowercase letters and underscores
# rs_sep1_key = rs_sep1 + "(.*?)"
# rs_sep1_key = rs_sep1 + "(.*)"
rs_sep1_key = rs_sep1 + "([a-z_]+)"
# the same, repeated any number of times inside a non-capturing group
rs_sep1_key_rep = f"(?:{rs_sep1_key})*"
print(f"{rs_sep1_key_rep=}")
re_sep1_key_rep = re.compile(rs_sep1_key_rep)
print(f"{re_sep1_key_rep=}")
m_sep1_key_rep = re_sep1_key_rep.match(search_in)
print(f"{m_sep1_key_rep=}")
# a group that is repeated only retains its last capture
for g in m_sep1_key_rep.groups():
    print(f"{g=}")

# a fixed pattern with two separators and two greedy groups
# rs_boring_two = "[#>]{1,2}(.*?)[#>]{1,2}(.*?)"
rs_boring_two = "[#>]{1,2}(.*)[#>]{1,2}(.*)"
re_boring_two = re.compile(rs_boring_two)
# cell output: the match object, if any
re_boring_two.match(search_in)


We had fun with regex and groups but dear Lord, you can do this in 3 lines.

In [None]:
# Same goal without regex groups: split the tag string on the separators and
# compare it with the path key by key. Switch `path` to try other cases.
# path = ["senses", "coordinate_terms", "english", "other"]
# path = ["senses", "coordinate_terms", "english"]
# path = ["senses", "coordinate_terms"]
# path = ["senses"]
path = ["other"]
# path = []
search_in = "#senses>#coordinate_terms>#english"

# keys of the tag string, without the empty parts produced by the split
search_path = [x for x in re.split("[#>]", search_in) if x]
# zip stops at the shortest of the two lists
if all(p == s for p, s in zip(path, search_path)):
    # what is left of the tag string after the path
    rest = search_path[len(path) :]
    print(f"matches {rest}")
    if len(rest) == 0:
        print(f"matches but not longer than path")

# cell output: the same check done by the helper function
match_str_to_path(search_in, path)


In [None]:
# this is the wrong thing: objects should be homogeneous
# at most you can miss a key in an object, that's ok
# but all the objects in the list are the same thing
# (here the list mixes a dict, a list of dicts and a list of lists of dicts)
nested_lists = [
    {"a": "data"},
    [
        {"a": "data"},
        {"b": "data"},
    ],
    [
        [
            {"c": "data"},
            {"d": "data"},
        ],
        [
            {"e": "data"},
            {"f": "data"},
        ],
    ],
]
# build the tags for this sample and print the resulting list of strings
struct_nest = {}
build_structure(nested_lists, 0, struct_nest, "")
for k in structure_to_strlist(struct_nest):
    # for k in struct_nest.keys():
    print(k)


In [None]:
# an object can have a list as a *value*, that's ok
# (the sample has a plain value, a dict, a list of dicts with a missing key,
# and a list of lists of dicts)
nested_lists = [
    {
        "a": "data",
        "b": {"b0": "data"},
        "c": [
            {"c0": "data0", "c1": "data0"},
            {"c0": "data1"},
        ],
        "d": [
            [
                {"d0": "data", "d1": "data"},
            ],
            [
                # the key "d1" was written twice in this literal: a dict keeps
                # a single entry per key, so one is enough
                {"d1": "data"},
            ],
        ],
    },
]
# build the tags for this sample and print the resulting list of strings
struct_nest = {}
build_structure(nested_lists, 0, struct_nest, "")
for k in structure_to_strlist(struct_nest):
    # for k in struct_nest.keys():
    print(k)
