# Imports

In [1]:
import itertools
from collections import Counter
import string
import numpy as np
from tqdm.notebook  import tqdm

# Load text file

In [5]:
# Build the vocabulary (`words`) and the sorted alphabet (`characters`) from a
# one-word-per-line UTF-8 file, reading at most `limit` lines.
limit = int(1e6)
words = []
charset = set()

# Text mode with an explicit encoding decodes each line for us; stripping both
# "\r" and "\n" keeps Windows line endings out of the words and the alphabet.
with open("list/words_fr.txt", encoding="utf8") as text:
    for line in tqdm(itertools.islice(text, limit)):
        word = line.rstrip("\r\n").lower()
        # Keep only words of at least 4 characters.
        if len(word) < 4:
            continue
        charset.update(word)
        words.append(word)

# A sorted list gives every character a stable integer index.
characters = sorted(charset)
print(f"characters : {len(characters)}")
print(characters)
print(f"nb words : {len(words)}")
print(words[0:10])

0it [00:00, ?it/s]

characters : 41
['-', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'à', 'â', 'ä', 'ç', 'è', 'é', 'ê', 'ë', 'î', 'ï', 'ô', 'ö', 'û', 'ü']
nb words : 335968
['abaissa', 'abaissable', 'abaissables', 'abaissai', 'abaissaient', 'abaissais', 'abaissait', 'abaissâmes', 'abaissant', 'abaissante']


# N-grams

In [6]:
def n_grams(word, n):
    """Return every contiguous length-`n` slice of `word`, from left to right.

    The result is an empty list when `word` is shorter than `n`.
    """
    window_count = len(word) - n + 1
    grams = []
    for start in range(window_count):
        grams.append(word[start:start + n])
    return grams
# Sanity check: display the 3-grams of one sample word from the vocabulary.
n_grams(word=words[5], n=3)

['aba', 'bai', 'ais', 'iss', 'ssa', 'sai', 'ais']

# Probability tables

In [57]:
def create_probability_table(chain):
    """Count equal-length n-grams into a dense array indexed by character.

    Parameters
    ----------
    chain : iterable of str
        N-grams that all have the same length `k`. The iterable is consumed
        once.

    Returns
    -------
    numpy.ndarray
        Array with `k` axes, each of size `len(characters)`. The entry at
        `[i1, ..., ik]` is the number of times the n-gram
        `characters[i1] + ... + characters[ik]` occurred. Despite the function
        name these are raw counts; nothing is normalised here.

    Raises
    ------
    ValueError
        If `chain` is empty, if the n-grams do not all share one length, or if
        an n-gram contains a character missing from the module-level
        `characters` list.
    """
    counter = Counter(chain)
    if not counter:
        raise ValueError("cannot build a table from an empty sequence of n-grams")

    # The most frequent n-gram decides the number of axes of the table.
    order = len(counter.most_common(1)[0][0])
    # One dictionary lookup per character instead of a linear list scan.
    index_of = {char: idx for idx, char in enumerate(characters)}

    p = np.zeros(order * [len(characters)])
    for gram, count in counter.items():
        # A shorter n-gram would index a whole sub-array and have its count
        # broadcast over it, silently corrupting the table.
        if len(gram) != order:
            raise ValueError(
                f"n-gram {gram!r} has length {len(gram)}, expected {order}"
            )
        try:
            pos = tuple(index_of[char] for char in gram)
        except KeyError as err:
            raise ValueError(
                f"character {err.args[0]!r} is not in the alphabet"
            ) from None
        p[pos] = count
    return p

# Lazy streams of n-grams taken from every word: the first 1, 2 and 3
# characters (s1-s3), every 4-character window (m4), and the last 3 and 2
# characters (e3, e2). A plain generator expression is already a single
# iterable, so only the nested 4-gram lists need `chain.from_iterable`.
# Each stream can be consumed only once; re-run this cell to rebuild them.
chain_s1 = (word[:1] for word in tqdm(words))
chain_s2 = (word[:2] for word in tqdm(words))
chain_s3 = (word[:3] for word in tqdm(words))
chain_m4 = itertools.chain.from_iterable(n_grams(word, 4) for word in tqdm(words))
chain_e3 = (word[-3:] for word in tqdm(words))
chain_e2 = (word[-2:] for word in tqdm(words))

# One count table per stream; the number of axes equals the n-gram length.
ps1 = create_probability_table(chain_s1)
ps2 = create_probability_table(chain_s2)
ps3 = create_probability_table(chain_s3)
pm4 = create_probability_table(chain_m4)
pe3 = create_probability_table(chain_e3)
pe2 = create_probability_table(chain_e2)

  0%|          | 0/335968 [00:00<?, ?it/s]

  0%|          | 0/335968 [00:00<?, ?it/s]

  0%|          | 0/335968 [00:00<?, ?it/s]

  0%|          | 0/335968 [00:00<?, ?it/s]

  0%|          | 0/335968 [00:00<?, ?it/s]

  0%|          | 0/335968 [00:00<?, ?it/s]

(41,)
(41, 41)
(41, 41, 41)
(41, 41, 41, 41)
(41, 41, 41)
(41, 41)


# Generative model

In [116]:
def generate(n, s="", laplace=1e-1):
    """Append `n` randomly sampled characters to the seed `s` and return the word.

    Each character is drawn with `np.random.choice` from one row of a
    module-level count table, chosen from the position being filled in the
    final word:

    * positions 0, 1 and 2 use `ps1`, `ps2` and `ps3`;
    * the last position uses `pe2` and the one before it uses `pe3`;
    * every other position uses `pm4`, conditioned on the three previous
      characters.

    Parameters
    ----------
    n : int
        Number of characters to append; the result has `len(s) + n`
        characters (after lower-casing `s`).
    s : str
        Seed the word starts with. It is lower-cased first. Trailing seed
        characters are looked up in `characters`, so an unknown character
        raises `ValueError`.
    laplace : float
        Constant added to every count before normalising, so continuations
        with a zero count keep a non-zero probability.

    Returns
    -------
    str
        The seed followed by the `n` sampled characters.
    """
    s = s.lower()
    # `n` counts appended characters, so end-of-word positions are measured
    # against the final length, not against `n` itself (they differ as soon as
    # a seed is given).
    final_len = len(s) + n

    for _ in range(n):
        length = len(s)
        # Alphabet indices of up to three trailing characters, oldest first.
        context = tuple(characters.index(char) for char in s[-3:])

        # beginning of the word
        if length == 0:
            counts = ps1
        elif length == 1:
            counts = ps2[context[-1]]
        elif length == 2:
            counts = ps3[context[-2:]]
        # end of the word
        elif length >= final_len - 1:
            counts = pe2[context[-1]]
        elif length >= final_len - 2:
            counts = pe3[context[-2:]]
        # middle of the word
        else:
            counts = pm4[context]

        # Laplace smoothing into a NEW array: `counts` is a view of a shared
        # table, and an in-place `+=` would permanently alter that table on
        # every sampled character.
        p = counts + laplace
        p = p / p.sum()

        # select the next character
        s += str(np.random.choice(characters, p=p))
    return s

# Use cases

In [122]:
# Print 20 generated words; each length is drawn with np.random.randint(4, 15).
for sample_idx in range(20):
    word_length = int(np.random.randint(4, 15))
    print(generate(word_length))

érez
écur
quaitèresses
étantsiassesse
gara
prisâmestîmese
étêtés-joira
démont
radsa
resthérèrer
exploisés
séchse
détonnesczu
bara
friseras
maillerase
défristrese
figues
énoponnéessase
amoisaisas


In [125]:
# Print 20 words that extend the seed "cy" by 5 characters, in title case.
for sample_idx in range(20):
    seeded_word = generate(5, s="cy")
    print(seeded_word.title())

Cylerai
Cyasera
Cycpers
Cyasese
Cyaises
Cylseza
Cycites
Cypéese
Cylasez
Cyaseas
Cysease
Cycömba
Cycàces
Cycfezq
Cycbase
Cyaises
Cynesez
Cynezes
Cygeses
Cycàvez
