# Open a wiki file and learn something


Download raw data from:
https://kaikki.org/dictionary/French/index.html


## Record structure

### Word: `word`

It's the word.

### Part of speech: `pos`

Not every `pos` value is useful: we keep a set of good ones (`good_pos`, defined below) and only use those.

### Meaning of the word: `senses`

Can be more than one (it's a list).

- `glosses`: The meaning of the word, also a list.
- `raw_glosses`: A more informative definition.
- `examples`: Examples.

Each `sense` has a `categories` list,
that can be useful for clustering words
or to train by topic.


## Frequent words

1. Get the big corpus.
1. For each word, get the non-inflected version.
1. Compute the frequency of each non-inflected word, and you are done.


## Constants


In [None]:
import json
from pathlib import Path
from random import randint
from pprint import pprint


In [None]:
# The data lives in a "dataset" folder that sits next to the current
# working directory (i.e. inside the parent of the cwd).
dataset_fol = Path.cwd().parent / "dataset"

# File names of the two dumps downloaded from kaikki.org.
wiki_fn_word_forms = "kaikki.org-dictionary-French-words.json"
wiki_fn_non_inflected_senses = "kaikki.org-dictionary-French-all-no-wFNY2q.json"

# Path of the dump that the rest of the notebook loads.
# To load the other dump instead, swap in the commented line:
# wiki_fp = dataset_fol / wiki_fn_word_forms
wiki_fp = dataset_fol.joinpath(wiki_fn_non_inflected_senses)


In [None]:
# Accented letters (lower and upper case) used later to spot words
# that contain at least one of them.
accent = {letter for letter in "àÀâÂéÉèÈêÊëËîÎïÏœŒôÔùÙûÛüÜçÇ"}
print(accent)


In [None]:
# Part-of-speech ("pos") values that the trimming step accepts: a record is
# only processed there when its "pos" is a member of this set.
#
# Commented-out entries are values that are deliberately NOT accepted; they
# are kept in place (alphabetically interleaved) so that one can be enabled
# by simply uncommenting its line.
# NOTE(review): the rejected values look like they were copied from the
# `seen_pos` output further down — confirm before relying on completeness.
good_pos = {
    # "",
    # '<i class="Jpan mention" lang="ja">かみかぜ</i> (kamikaze, “suicide flyer”, literally “divine wind”)',
    # 'Modern French <i class="Latn mention" lang="fr">chair</i>',
    # "a",
    # "a commune in Normandy, France",
    # "a restoration of the Latin 3rd-person-singular -t",
    # "abbrev",
    # "ablative",
    # "accusative plural",
    "adj",
    # "adjectival suffix",
    "adjective",
    # "adjective-forming suffix",
    "adv",
    "adverb",
    # "affirmative particle",
    # "affix",
    # "an anchovy-based condiment",
    # "an apocopic form of la, la before a vowel",
    "article",
    # "augmentative suffix",
    # "character",
    # "conj",
    # "det",
    # "dialectal",
    # "diminutive ending",
    # "diminutive suffix",
    # "first-person plural present indicative ending",
    # 'from an Illyrian word probably from Proto-Indo-European <i class="Latinx mention" lang="ine-pro">*sab-</i> (“taste”)',
    # "infix",
    # "instrumental suffix",
    # "intensifier",
    # "interfix",
    # "interjection used in deer-hunting",
    # "intj",
    # "n",
    "name",
    # "name of a Celtic tribe in Southern Germany, which later emigrated to Gaul",
    # "nominal suffix",
    "noun",
    # "noun suffix",
    "nouns",
    "num",
    # "onomatopoeia of the lowing of cattle",
    "particle",
    # "past participle of dire (“to say”)",
    # "past passive participle",
    "phrase",
    # "postp",
    "prefix",
    "prep",
    "prep_phrase",
    "pron",
    # "pronounced /le‿ʁital(jɛ̃)/",
    "proverb",
    # "punct",
    # "reflexive pronoun",
    # "second-person singular",
    # "stem libr-",
    # "suffix",
    # "suffix added to noun stems to form adjectives",
    # "suffix added to verbal stems forming neuter nouns denoting the result of, a particular instance of, or the object of an action",
    # "suffix denoting occupation",
    # "suffix forming adjectives from nouns",
    # "suffix forming adjectives meaning ‘belonging to, relating to’",
    # "suffix forming augmentatives",
    # "suffix forming diminutives",
    # "suffix forming infinitives of first-conjugation verbs",
    # "suffix forming nouns usually denoting diseased conditions",
    # "suffix meaning ‘of or pertaining to’",
    # "suffix with the sense ‘relating’ to forming adjectives",
    # "surname",
    # "symbol",
    # "v",
    "verb",
    "verb and noun",
}
# Cell output: how many pos values are currently accepted.
len(good_pos)


## Funcs


In [None]:
def walk_object(obj, level, seen_keys, seen_pos):
    """Recursively visit nested dicts/lists and record what is found.

    Every dict key met anywhere in ``obj`` is added to ``seen_keys``; the
    value stored under any key named ``"pos"`` is added to ``seen_pos``.
    Objects that are neither dicts nor lists are ignored.

    Args:
        obj: The object to traverse.
        level: Current nesting depth; only forwarded (incremented) to the
            recursive calls.
        seen_keys: Set updated in place with the dict keys encountered.
        seen_pos: Set updated in place with the ``"pos"`` values encountered.

    Returns:
        None. The two sets are mutated in place.
    """
    deeper = level + 1

    if isinstance(obj, dict):
        for name, value in obj.items():
            seen_keys.add(name)
            if name == "pos":
                seen_pos.add(value)
            walk_object(value, deeper, seen_keys, seen_pos)
        return

    if isinstance(obj, list):
        for item in obj:
            walk_object(item, deeper, seen_keys, seen_pos)


## Load the data

One record per line


In [None]:

# Load the dump: one JSON record per line.
# The file is opened explicitly as UTF-8 so that the accented French words
# are decoded the same way on every platform (the default encoding depends
# on the locale). Blank lines are skipped because json.loads raises on them.
with wiki_fp.open(encoding="utf-8") as wf:
    words_data = [json.loads(line) for line in wf if line.strip()]


In [None]:
# Top-level keys that are kept in each trimmed word record.
keep_keys = [
    "categories",
    "form_of",
    "pos",
    "senses",
    "word",
]

for word_data in words_data:

    # Only trim records whose pos is in good_pos (a record with no "pos"
    # key is treated like one with an unaccepted pos).
    # NOTE(review): records with another pos are skipped here, not removed
    # from words_data, so they stay in the list untrimmed — confirm whether
    # they should be dropped instead.
    if word_data.get("pos") not in good_pos:
        continue

    # Keep only the keys listed in keep_keys. The previous check compared
    # each key against the record itself, which is always true, so nothing
    # was ever deleted. Iterate over a snapshot of the keys because the dict
    # is mutated inside the loop.
    for word_data_key in list(word_data):
        if word_data_key not in keep_keys:
            del word_data[word_data_key]

    # Remove useless info in categories: keep only each category's "name"
    # (the rest is useless if you have a graph of cats).
    for sense in word_data.get("senses", []):
        for cat in sense.get("categories", []):
            for cat_key in list(cat):
                if cat_key != "name":
                    del cat[cat_key]


In [None]:
# Order the records by their "word" field, working on a copy so that
# words_data itself is left in its original order.
# The result is only "sort(ish)": one word can have more than one pos, and
# such records simply end up next to each other in their original order.
words_data_sort = list(words_data)
words_data_sort.sort(key=lambda record: record["word"])


## Inspect keys and pos


In [None]:
# Walk every record and gather all the dict keys and all the "pos" values
# that appear anywhere inside it.
seen_keys, seen_pos = set(), set()

for word_data in words_data:
    walk_object(word_data, 0, seen_keys, seen_pos)

# Cell output: number of distinct keys found.
len(seen_keys)


In [None]:
# Cell output: every distinct dict key collected by walk_object.
seen_keys


In [None]:
# Cell output: every distinct "pos" value collected by walk_object.
seen_pos


## Analyze some words


In [None]:
# Scan the sorted records once and collect:
#   - words_acc_data: records whose word contains at least one accented letter
#   - word_search: records whose word is exactly the_word
words_acc_data = []
word_search = []

# Word to look for. Set once, before the loop, since it does not depend on
# the record being examined. Alternative probes are kept for quick switching.
# the_word = "abîme"
# the_word = "abime"
# the_word = "abimes"
# the_word = "angariés"
# the_word = "angaries"
# the_word = "arrière"
the_word = "Alexia"

for word_data in words_data_sort:
    word = word_data["word"]

    # isdisjoint scans the letters of the word directly, so no temporary
    # set has to be built for every word.
    if not accent.isdisjoint(word):
        words_acc_data.append(word_data)
        # print(f"accent! {word}")
        # break

    # word with a space
    # if " " in word:
    #     print(f"found! {word}")
    #     break

    if word == the_word:
        print(f"found! {word} {word_data['pos']}")
        word_search.append(word_data)

    # if word_data["pos"] == "name":
    #     print(f"found! {word} {word_data['pos']}")
    #     break


print(f"{len(words_acc_data)=}")


In [None]:
# Show the first record that matched the searched word.
# word_search is empty when the word was not found; guard against that
# instead of failing with an IndexError.
if word_search:
    pprint(word_search[0], width=150)
else:
    print("word_search is empty: no matching record to display")


In [None]:
# Alternative: sample from every record instead of only the accented ones.
# ri = randint(0, len(words_data_sort) - 1)
# word_data = words_data_sort[ri]
# print(f"{ri=} {word_data['word']}")
# pprint(word_data, width=150)

# Pick one record at random among those whose word has an accented letter,
# then print its index and word followed by the full record.
last_index = len(words_acc_data) - 1
ri = randint(0, last_index)
acc_data = words_acc_data[ri]
print(f"{ri=} {acc_data['word']}")
pprint(acc_data, width=150)


In [None]:
# # output file
# wiki_out_fn = "kaikki.org-dictionary-French-all-no-filter.json"
# wiki_out_fp = dataset_fol / wiki_out_fn
# print(f"{wiki_out_fp}")
# # build all the records
# out_str = []
# for word_data in words_data_sort:
#     word_str = json.dumps(word_data)
#     out_str.append(word_str)
# # write out the records
# dump_str = "\n".join(out_str)
# wiki_out_fp.write_text(dump_str)
