This notebook builds the most-frequent-words lists.

In [7]:
import os
import glob
import json
import pandas as pd
from collections import Counter
from typing import List
import tqdm

def get_affixes(token):
    """Yield the short character n-gram features of ``token``.

    Emitted in this order:

    * the token itself, when it has at most three characters;
    * the two leading characters prefixed with ``_`` and the two trailing
      characters suffixed with ``_``, when it has at least two characters;
    * the three leading and the three trailing characters (no marker),
      when it has more than three characters.
    """
    size = len(token)
    if size < 4:
        # Short tokens are kept whole.
        yield token
    if size > 1:
        head, tail = token[:2], token[-2:]
        yield f"_{head}"
        yield f"{tail}_"
    if size >= 4:
        # Three-character affixes are emitted without an underscore marker.
        yield f"{token[:3]}"
        yield f"{token[-3:]}"
        

In [8]:
# Load the `file` column of the text index; each value is later used to
# build the path of a tagged file.
texts = pd.read_csv("tlg-texts.csv")["file"]

In [9]:
def read_tsv(file):
    """Yield the first two whitespace-separated fields of each line.

    Args:
        file: An iterable of text lines (for example an open file object).

    Yields:
        A list with the first two fields of each non-empty line. Fields are
        split on any run of whitespace, not only on tabs.

    Blank or whitespace-only lines are skipped: they would otherwise yield an
    empty list and break callers that unpack each row into two values. Rows
    with a single field are still yielded as-is so that malformed data is not
    hidden silently.
    """
    for line in file:
        fields = line.split()
        if not fields:
            continue
        yield fields[:2]

        
def get_trigrams(tokens: List[str]) -> List[str]:
    """Return every run of three consecutive items joined with ``-``.

    Sequences shorter than three items produce an empty list.
    """
    # zip stops at the shortest input, which yields exactly len(tokens) - 2
    # windows (or none when there are fewer than three items).
    windows = zip(tokens, tokens[1:], tokens[2:])
    return ["-".join(window) for window in windows]


# Corpus-wide frequency tables:
#   tokens   - lower-cased word forms
#   poses    - trigrams over the first character of each tag
#   trigrams - character affixes produced by get_affixes()
tokens = Counter()
poses = Counter()
trigrams = Counter()

for text in tqdm.tqdm(texts):
    path = f"./tagged/{text}-tagged.txt"
    if not os.path.exists(path):
        # Texts without a tagged file are skipped.
        continue
    # Read explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(path, encoding="utf-8") as f:
        doc = list(read_tsv(f))

    # Drop rows whose tag starts with "u" once, keeping only the lower-cased
    # form and the first tag character.
    # NOTE(review): "u" presumably marks punctuation and "n" nouns - confirm
    # against the tagset.
    kept = [(token.lower(), pos[0]) for token, pos in doc if pos[0] != "u"]

    # Counter.update() only touches the keys of the current document, whereas
    # `counter += Counter(...)` rescans the whole accumulated counter on every
    # iteration. The resulting counts are identical.
    tokens.update(form for form, tag in kept if tag != "n")
    poses.update(get_trigrams([tag for _, tag in kept]))
    trigrams.update(affix for form, _ in kept for affix in get_affixes(form))


# Persist the most frequent entries of each table as [item, count] pairs.
with open("mfw.json", "w") as f:
    json.dump(tokens.most_common(2000), f)

with open("mfp.json", "w") as f:
    json.dump(poses.most_common(2000), f)

with open("mft.json", "w") as f:
    json.dump(trigrams.most_common(4000), f)

100%|█████████████████████████████████████████| 700/700 [00:56<00:00, 12.41it/s]


In [10]:
# Display the 2000 most frequent entries of `trigrams`, the counter that is
# filled from get_affixes() output.
trigrams.most_common(2000)

[('_κα', 1362915),
 ('αὶ_', 1011492),
 ('καὶ', 998663),
 ('_το', 671315),
 ('ον_', 582629),
 ('αι_', 527717),
 ('οῦ_', 508962),
 ('ος_', 490618),
 ('_τὸ', 466351),
 ('ῶν_', 459466),
 ('τοῦ', 457479),
 ('ων_', 426925),
 ('ὸν_', 371509),
 ('ὴν_', 359940),
 ('ῆς_', 351562),
 ('ιν_', 350730),
 ('ας_', 342817),
 ('_πρ', 337763),
 ('δὲ_', 332646),
 ('_αὐ', 330532),
 ('αὐτ', 324018),
 ('_οὐ', 308892),
 ('_ἐπ', 294603),
 ('τὸ_', 294252),
 ('_δι', 294174),
 ('ου_', 292071),
 ('_δὲ', 290706),
 ('δὲ', 290678),
 ('τῶν', 286827),
 ('_πα', 286590),
 ('_ἐν', 285775),
 ('αν_', 284207),
 ('τὸ', 282257),
 ('τὴν', 277838),
 ('_εἰ', 274350),
 ('_τὴ', 254550),
 ('τῆς', 253918),
 ('τὸν', 253525),
 ('ις_', 252187),
 ('τὰ_', 249518),
 ('ὸς_', 238948),
 ('ται', 232421),
 ('_τῆ', 232348),
 ('εν_', 231357),
 ('τι_', 229572),
 ('_τῶ', 228310),
 ('ὁ', 227435),
 ('ῖς_', 223948),
 ('τα_', 223861),
 ('_ἀν', 210381),
 ('_ἀπ', 206271),
 ('ως_', 201306),
 ('ει_', 200434),
 ('ἐν_', 195473),
 ('ἐν', 195456),
 ('κατ', 1946