Aquest quadern és una versió neta del model amb 4-grames.

El podem exportar a un executable (.py) amb `File > Save and Export Notebook As > Executable Script`

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk import ngrams

In [None]:
# Load the names table. keep_default_na=False stops pandas from turning its
# default NA markers (for example the literal text "NA") into missing values,
# so every entry in the "Nom" column stays a string.
df = pd.read_csv("data/noms_net.csv", keep_default_na=False)

# Wrap every name in "#" so the same symbol marks both its start and its end.
df["nom#"] = "#" + df["Nom"] + "#"

# Vocabulary: the boundary marker takes index 0, followed by every character
# that appears in the names, in sorted order.
chars = ["#", *sorted(set("".join(df["Nom"])))]
nchar = len(chars)

# Lookup tables in both directions between characters and their indices.
i2c = dict(enumerate(chars))
c2i = {c: i for i, c in i2c.items()}

In [None]:
# Split the "#"-wrapped names and their frequencies by the "Sexe" column
# ("H" rows and "D" rows), keeping names and frequencies positionally aligned.
noms_home, freqs_home = (
    df.loc[df["Sexe"] == "H", col].to_list() for col in ("nom#", "freq")
)
noms_dona, freqs_dona = (
    df.loc[df["Sexe"] == "D", col].to_list() for col in ("nom#", "freq")
)

In [None]:
def get_P2(noms, freqs, c2i):
    """Estimate bigram transition probabilities from frequency-weighted names.

    Args:
        noms: Iterable of strings. Every pair of consecutive characters in
            each string is counted as one bigram.
        freqs: Weights paired positionally with ``noms``; each bigram found in
            the k-th name adds the k-th weight to its count. If the two
            iterables differ in length, the surplus entries are ignored.
        c2i: Mapping from character to array index.

    Returns:
        Float array of shape ``(len(c2i), len(c2i))``. Entry ``[a, b]`` is the
        weighted share of bigrams starting with ``a`` that continue with
        ``b``. Rows for characters never seen in first position are all zeros.

    Raises:
        KeyError: If a name contains a character that is not in ``c2i``.
    """
    nchar = len(c2i)
    # 64-bit accumulators: with int32 a weighted count above 2**31 - 1 would
    # overflow instead of producing the right total.
    N2 = np.zeros((nchar, nchar), dtype=np.int64)
    for nom, freq in zip(noms, freqs):
        for ch1, ch2 in zip(nom, nom[1:]):
            N2[c2i[ch1], c2i[ch2]] += freq

    # Normalise each row; `where` skips rows whose total is zero so they stay
    # at the zeros supplied through `out` instead of dividing by zero.
    N2_sum = N2.sum(axis=1, keepdims=True)
    P2 = np.divide(N2, N2_sum, out=np.zeros(N2.shape, dtype=float), where=N2_sum != 0)
    return P2

In [None]:
def get_P3(noms, freqs, c2i):
    """Estimate trigram transition probabilities from frequency-weighted names.

    Args:
        noms: Iterable of strings. Every run of three consecutive characters
            in each string is counted as one trigram; strings shorter than
            three characters contribute nothing.
        freqs: Weights paired positionally with ``noms``; each trigram found
            in the k-th name adds the k-th weight to its count. If the two
            iterables differ in length, the surplus entries are ignored.
        c2i: Mapping from character to array index.

    Returns:
        Float array of shape ``(len(c2i),) * 3``. Entry ``[a, b, c]`` is the
        weighted share of trigrams starting with ``a, b`` that continue with
        ``c``. Contexts that were never observed are all zeros.

    Raises:
        KeyError: If a name contains a character that is not in ``c2i``.
    """
    nchar = len(c2i)
    # 64-bit accumulators: with int32 a weighted count above 2**31 - 1 would
    # overflow instead of producing the right total.
    N3 = np.zeros((nchar, nchar, nchar), dtype=np.int64)
    for nom, freq in zip(noms, freqs):
        # Sliding window of three characters, built with the same zip-based
        # slicing that get_P2 uses for its pairs.
        for ch1, ch2, ch3 in zip(nom, nom[1:], nom[2:]):
            N3[c2i[ch1], c2i[ch2], c2i[ch3]] += freq

    # Normalise over the last axis; `where` skips contexts whose total is zero
    # so they stay at the zeros supplied through `out`.
    N3_sum = N3.sum(axis=2, keepdims=True)
    P3 = np.divide(N3, N3_sum, out=np.zeros(N3.shape, dtype=float), where=N3_sum != 0)
    return P3

In [None]:
def get_P4(noms, freqs, c2i):
    """Estimate 4-gram transition probabilities from frequency-weighted names.

    Args:
        noms: Iterable of strings. Every run of four consecutive characters
            in each string is counted as one 4-gram; strings shorter than
            four characters contribute nothing.
        freqs: Weights paired positionally with ``noms``; each 4-gram found
            in the k-th name adds the k-th weight to its count. If the two
            iterables differ in length, the surplus entries are ignored.
        c2i: Mapping from character to array index.

    Returns:
        Float array of shape ``(len(c2i),) * 4``. Entry ``[a, b, c, d]`` is
        the weighted share of 4-grams starting with ``a, b, c`` that continue
        with ``d``. Contexts that were never observed are all zeros.

    Raises:
        KeyError: If a name contains a character that is not in ``c2i``.
    """
    nchar = len(c2i)
    # 64-bit accumulators: with int32 a weighted count above 2**31 - 1 would
    # overflow instead of producing the right total.
    N4 = np.zeros((nchar,) * 4, dtype=np.int64)
    for nom, freq in zip(noms, freqs):
        # Sliding window of four characters, built with the same zip-based
        # slicing that get_P2 uses for its pairs.
        for ch1, ch2, ch3, ch4 in zip(nom, nom[1:], nom[2:], nom[3:]):
            N4[c2i[ch1], c2i[ch2], c2i[ch3], c2i[ch4]] += freq

    # Normalise over the last axis; `where` skips contexts whose total is zero
    # so they stay at the zeros supplied through `out`.
    N4_sum = N4.sum(axis=3, keepdims=True)
    P4 = np.divide(N4, N4_sum, out=np.zeros(N4.shape, dtype=float), where=N4_sum != 0)
    return P4

In [None]:
# The probability tables are kept as module-level globals; genera_noms reads
# them directly. The PH* tables come from the "H" names, the PD* tables from
# the "D" names.
PH2, PH3, PH4 = (
    estima(noms_home, freqs_home, c2i) for estima in (get_P2, get_P3, get_P4)
)
PD2, PD3, PD4 = (
    estima(noms_dona, freqs_dona, c2i) for estima in (get_P2, get_P3, get_P4)
)

In [None]:
def genera_noms(sexe):
    """Sample one name, character by character, from the global n-gram tables.

    The first character is drawn from the bigram table given the "#" start
    marker (index 0), the second from the trigram table, and every later one
    from the 4-gram table conditioned on the three previous symbols. Sampling
    stops as soon as index 0 (the "#" marker) is drawn.

    Args:
        sexe: "H" to use the PH2/PH3/PH4 tables, "D" to use PD2/PD3/PD4.

    Returns:
        The generated name as a string, without the "#" markers.

    Raises:
        ValueError: If ``sexe`` is neither "H" nor "D".
    """
    if sexe not in ("H", "D"):
        raise ValueError("sexe ha de ser H o D")
    P2, P3, P4 = (PH2, PH3, PH4) if sexe == "H" else (PD2, PD3, PD4)

    def mostreja(probs):
        # One multinomial trial over `probs`; argmax recovers the drawn index.
        return np.argmax(np.random.multinomial(1, probs))

    # First character: bigram table, conditioned on the start marker.
    ix1 = 0
    ix2 = mostreja(P2[ix1])
    lletres = [i2c[ix2]]

    # Second character: trigram table.
    ix3 = mostreja(P3[ix1, ix2])
    if ix3 == 0:
        return "".join(lletres)
    lletres.append(i2c[ix3])

    # Remaining characters: 4-gram table over a three-symbol sliding context.
    while True:
        ix4 = mostreja(P4[ix1, ix2, ix3])
        if ix4 == 0:
            return "".join(lletres)
        lletres.append(i2c[ix4])
        ix1, ix2, ix3 = ix2, ix3, ix4

In [None]:
# Seed NumPy's global random generator (the one np.random.multinomial draws
# from in genera_noms) so the names sampled below are reproducible.
np.random.seed(42)

In [None]:
# Generate ten names for each sex and print, next to each one, whether it
# already appears in the "Nom" column of the source data.
noms = df.Nom.tolist()
# Build the set once: membership tests on it are O(1), whereas `in` on the
# list rescans every known name for each generated one.
noms_coneguts = set(noms)

for _ in range(10):
    nom = genera_noms("D")
    print(nom, nom in noms_coneguts)

print()

for _ in range(10):
    nom = genera_noms("H")
    print(nom, nom in noms_coneguts)