<a href="https://colab.research.google.com/github/NxrFesdac/bourbaki-nlp-avanzado/blob/main/modulo2/n_grams_text_generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import re, random, math
from collections import Counter, defaultdict

random.seed(44)

# ------------------ Config ------------------
N = 3                 # trigram model
ALPHA = 0.4           # backoff factor (Stupid Backoff)
TOP_K = 30            # sample from top-k tokens for coherence (None/0 disables)
TOP_P = 0.9           # nucleus sampling threshold (None/0/1.0 disables)
TEMPERATURE = 0.9     # softens/sharpens distribution (1.0 = off)
MAX_LEN = 40          # max tokens to generate
NUM_SAMPLES = 5       # how many sentences to sample in demo

START, END = "<s>", "</s>"

# ------------------ Toy public-domain-ish corpus ------------------
CORPUS = """
No es que muera de amor, muero de ti.
Muero de ti, amor, de amor de ti,
de urgencia mía de mi piel de ti,
de mi alma, de ti y de mi boca
y del insoportable que yo soy sin ti.

Muero de ti y de mi, muero de ambos,
de nosotros, de ese,
desgarrado, partido,
me muero, te muero, lo morimos.

Morimos en mi cuarto en que estoy solo,
en mi cama en que faltas,
en la calle donde mi brazo va vacío,
en el cine y los parques, los tranvías,
los lugares donde mi hombro
acostumbra tu cabeza
y mi mano tu mano
y todo yo te sé como yo mismo.

Morimos en el sitio que le he prestado al aire
para que estés fuera de mí,
y en el lugar en que el aire se acaba
cuando te echo mi piel encima
y nos conocemos en nosotros,
separados del mundo, dichosa, penetrada,
y cierto , interminable.

Morimos, lo sabemos, lo ignoran, nos morimos
entre los dos, ahora, separados,
del uno al otro, diariamente,
cayéndonos en múltiples estatuas,
en gestos que no vemos,
en nuestras manos que nos necesitan.

Nos morimos, amor, muero en tu vientre
que no muerdo ni beso,
en tus muslos dulcísimos y vivos,
en tu carne sin fin, muero de máscaras,
de triángulos oscuros e incesantes.
Muero de mi cuerpo y de tu cuerpo,
de nuestra muerte ,amor, muero, morimos.
En el pozo de amor a todas horas,
inconsolable, a gritos,
dentro de mi, quiero decir, te llamo,
te llaman los que nacen, los que vienen
de atrás, de ti, los que a ti llegan.
Nos morimos, amor, y nada hacemos
sino morirnos más, hora tras hora,
y escribirnos y hablarnos y morirnos.
"""

# ------------------ Tokenization ------------------
def tokenize(text):
    # Split into tokens: words or punctuation
    # Lowercasing for counts; you can keep case if preferred
    return re.findall(r"[A-Za-z']+|[0-9]+|[^\w\s]", text.lower())

def detok(tokens):
    # Simple detokenizer with decent punctuation spacing
    s = ""
    for i, t in enumerate(tokens):
        if t in {".", ",", "!", "?", ":", ";", ")", "]", "'"}:
            s += t
        elif t in {"(", "["}:
            if s and not s.endswith(" "): s += " "
            s += t
        elif i == 0:
            s += t.capitalize()
        else:
            if s and not s.endswith(" "): s += " "
            s += t
    # Ensure ending punctuation
    if s and s[-1].isalnum():
        s += "."
    return s

# ------------------ N-gram counting ------------------
def build_counts(lines, n=N):
    counts = [Counter() for _ in range(n+1)]  # counts[k] holds k-gram counts (k>=1)
    vocab = set()
    for line in lines:
        toks = [START]*(n-1) + tokenize(line) + [END]
        vocab.update(toks)
        for k in range(1, n+1):
            for i in range(len(toks)-k+1):
                gram = tuple(toks[i:i+k])
                counts[k][gram] += 1
    # We remove START from sampling vocab; END is allowed as a stop signal
    vocab.discard(START)
    return counts, sorted(vocab)

# ------------------ Stupid Backoff probability (unnormalized) ------------------
def sb_score(counts, context, w, alpha=ALPHA):
    # Returns an unnormalized "score" proportional to probability
    # If n-gram unseen, back off to smaller context and multiply by alpha each step.
    n = len(counts)-1
    for k in range(n, 0, -1):
        ctx = tuple(context[-(k-1):]) if k > 1 else tuple()
        gram = ctx + (w,)
        c = counts[k].get(gram, 0)
        if c > 0:
            # denom: sum of all next-token counts from this context
            if k == 1:
                denom = sum(counts[1].values())
            else:
                denom = 0
                for g, v in counts[k].items():
                    if g[:-1] == ctx:
                        denom += v
                if denom == 0:
                    denom = 1
            steps = n - k
            return (alpha**steps) * (c / denom)
    return 1e-12

# ------------------ Sampling: top-k + top-p + temperature ------------------
def top_k_top_p_sample(scores, top_k=TOP_K, top_p=TOP_P, temperature=TEMPERATURE):
    """
    scores: dict[token] -> nonnegative score (unnormalized ok)
    Applies temperature, then optional top-k, then optional top-p, then samples.
    """
    items = [(w, s) for w, s in scores.items() if s > 0]
    if not items:
        items = list(scores.items())
    if not items:
        raise ValueError("No tokens to sample from (scores is empty).")

    # Sort by score descending
    items.sort(key=lambda x: x[1], reverse=True)

    # Apply temperature (power transform works with your probability-like scores)
    if temperature is not None and temperature > 0 and temperature != 1.0:
        items = [(w, s ** (1.0 / temperature)) for w, s in items]

    # Normalize across all items (needed for correct nucleus cutoff)
    total = sum(s for _, s in items)
    if total <= 0:
        return random.choice([w for w, _ in items])

    probs = [(w, s / total) for w, s in items]

    # Top-k truncation
    if top_k is not None and top_k > 0:
        probs = probs[:min(top_k, len(probs))]

    # Top-p (nucleus) truncation
    if top_p is not None and 0 < top_p < 1.0:
        cum = 0.0
        nucleus = []
        for w, p in probs:
            nucleus.append((w, p))
            cum += p
            if cum >= top_p:
                break
        probs = nucleus

    # Renormalize after truncation, then sample
    total_p = sum(p for _, p in probs)
    if total_p <= 0:
        return random.choice([w for w, _ in probs])

    r = random.random()
    cum = 0.0
    for w, p in probs:
        cum += p / total_p
        if r < cum:
            return w
    return probs[-1][0]

def generate(counts, vocab, max_len=MAX_LEN, top_k=TOP_K, top_p=TOP_P, temperature=TEMPERATURE):
    n = len(counts)-1
    context = [START]*(n-1)
    out = []
    for _ in range(max_len):
        scores = {w: sb_score(counts, context, w) for w in vocab if w != START}
        scores[END] = sb_score(counts, context, END)

        w = top_k_top_p_sample(scores, top_k=top_k, top_p=top_p, temperature=temperature)
        if w == END:
            break
        out.append(w)
        context.append(w)
    return detok(out)

# ------------------ Train + Demo ------------------
lines = [l.strip() for l in CORPUS.strip().splitlines() if l.strip()]
counts, vocab = build_counts(lines, n=N)

print(f"Model: {N}-gram with Stupid Backoff α={ALPHA}, top-k={TOP_K}, top-p={TOP_P}, T={TEMPERATURE}")
print("Vocabulary size:", len(vocab))

print("\n=== Samples ===")
for i in range(NUM_SAMPLES):
    print(f"{i+1}.", generate(counts, vocab))

# Show a quick conditional sample for intuition
def show_next_tokens(context_words, top=15):
    ctx = [START]*(N-1) + tokenize(" ".join(context_words)) if context_words else [START]*(N-1)
    scores = {w: sb_score(counts, ctx, w) for w in vocab if w != START}
    items = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top]
    print(f"\nTop continuations for context: {' '.join(context_words) if context_words else '<start>'}")
    for w, s in items:
        print(f"{w:15s} {s:.6f}")

show_next_tokens(["muero"])

Model: 3-gram with Stupid Backoff α=0.4, top-k=30, top-p=0.9, T=0.9
Vocabulary size: 144

=== Samples ===
1. De nuestra muerte, amor, muero de ti, lo sabemos, lo sabemos, lo sabemos, lo ignoran, muero de ambos,
2. De nuestra muerte, amor, muero, te muero, te muero, morimos.
3. Morimos en el lugar en que estoy solo,
4. En el cine y los parques, los que a ti, amor, muero de ambos,
5. Muero de ambos,

Top continuations for context: muero
de              1.000000
,               0.120000
en              0.040000
</s>            0.014314
y               0.004771
que             0.004135
mi              0.003817
.               0.003181
muero           0.003181
ti              0.002863
morimos         0.002545
amor            0.002227
los             0.002227
el              0.001590
m               0.001590
