# Examen Parcial CC0C2

## Ejercicio 1

Dadas tres oraciones "all models are wrong", "a model is wrong" y "some models are useful", y el vocabulario {< s >, < /s >, a, all, are, model, models, some, useful, wrong}. En código responde las siguientes preguntas 

### a) Calcule las probabilidades de todos los bigramas sin suavizado.

In [199]:
import re


def tokenize(text):
    # Tokeniza el texto y remueve signos de puntuación
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text.split()

corpus = """
        all models are wrong
        a model is wrong
        some models are useful
        """

tokens = tokenize(corpus)
print("Tokens:", tokens)

Tokens: ['all', 'models', 'are', 'wrong', 'a', 'model', 'is', 'wrong', 'some', 'models', 'are', 'useful']


In [200]:
# Clase2/Modelos-lenguaje1.ipynb
from collections import Counter


def build_ngram_counts(tokens, n):
    ngrams = zip(*[tokens[i:] for i in range(n)])
    return Counter(ngrams)


# Conteo de unigramas, bigramas y trigramas
unigrams = build_ngram_counts(tokens, 1)
bigrams = build_ngram_counts(tokens, 2)
trigrams = build_ngram_counts(tokens, 3)

print("Bigramas:", bigrams)

Bigramas: Counter({('models', 'are'): 2, ('all', 'models'): 1, ('are', 'wrong'): 1, ('wrong', 'a'): 1, ('a', 'model'): 1, ('model', 'is'): 1, ('is', 'wrong'): 1, ('wrong', 'some'): 1, ('some', 'models'): 1, ('are', 'useful'): 1})


In [201]:
# Clase2/Modelos-lenguaje1.ipynb
def bigram_prob(bigrams, unigrams, word1, word2):
    return bigrams[(word1, word2)] / unigrams[(word1,)]


# @Overglitch
# Respuesta de la pregunta
print("Probabilidades de bigramas sin suavizado:\n")
for bigram in bigrams:
    word1, word2 = bigram
    print(f"P({word2} | {word1}) = {bigram_prob(bigrams, unigrams, word1, word2)}")

Probabilidades de bigramas sin suavizado:

P(models | all) = 1.0
P(are | models) = 1.0
P(wrong | are) = 0.5
P(a | wrong) = 0.5
P(model | a) = 1.0
P(is | model) = 1.0
P(wrong | is) = 1.0
P(some | wrong) = 0.5
P(models | some) = 1.0
P(useful | are) = 0.5


### b) Calcule las probabilidades de todos los bigramas y el bigrama no visto "a models" con suavizado de add-one.

In [202]:
# Clase2/Modelos-lenguaje2.ipynb

# Suavizado de Laplace para bigramas
def laplace_smoothing_bigram(corpus, k=1):
    # Conteo de bigramas y unigrams
    bigram_counts = {}
    unigram_counts = {}

    # Construir bigramas
    for i in range(len(corpus) - 1):
        bigram = (corpus[i], corpus[i + 1])
        unigram = corpus[i]

        if bigram in bigram_counts:
            bigram_counts[bigram] += 1
        else:
            bigram_counts[bigram] = 1

        if unigram in unigram_counts:
            unigram_counts[unigram] += 1
        else:
            unigram_counts[unigram] = 1

    # Contar el último unigrama
    last_word = corpus[-1]
    if last_word in unigram_counts:
        unigram_counts[last_word] += 1
    else:
        unigram_counts[last_word] = 1

    # Tamaño del vocabulario
    V = len(unigram_counts)

    # Cálculo de las probabilidades suavizadas para bigramas
    laplace_probabilities = {}
    for bigram, bigram_count in bigram_counts.items():
        w_n_1 = bigram[0]
        # Aplicando la ecuación P_Laplace(w_n | w_n-1) = (C(w_n-1 w_n) + 1) / (C(w_n-1) + V)
        # Aquí está el add-one smoothing
        laplace_probabilities[bigram] = (bigram_count + k) / (unigram_counts[w_n_1] + k * V)

    # Probabilidad para un bigrama no visto
    laplace_probabilities[('a', 'models')] = k / (V * (V + k))

    return laplace_probabilities


# Ejemplo de uso
corpus = """
        all models are wrong
        a model is wrong
        some models are useful
        """
corpus = tokenize(corpus)
laplace_prob_bigrams = laplace_smoothing_bigram(corpus)
print("\nProbabilidades de bigramas suavizadas con add-one:")

# Imprimir las probabilidades de los bigramas
for bigram, prob in laplace_prob_bigrams.items():
    if bigram == ('a', 'models'):
        print(f"P({bigram[1]} | {bigram[0]}) = {prob:.4f} (bigrama no visto)")
    else:
        print(f"P({bigram[1]} | {bigram[0]}) = {prob:.4f}")



Probabilidades de bigramas suavizadas con add-one:
P(models | all) = 0.2000
P(are | models) = 0.2727
P(wrong | are) = 0.1818
P(a | wrong) = 0.1818
P(model | a) = 0.2000
P(is | model) = 0.2000
P(wrong | is) = 0.2000
P(some | wrong) = 0.1818
P(models | some) = 0.2000
P(useful | are) = 0.1818
P(models | a) = 0.0111 (bigrama no visto)


### c) Calcule las probabilidades de todos los bigramas y el bigrama no visto "a models" con suavizado de add-k. Pruebe con k = 0.05 y k = 0.15.

In [204]:
# Clase2/Modelos-lenguaje2.ipynb

# Ejemplo de uso
corpus = """
        all models are wrong
        a model is wrong
        some models are useful
        """
corpus = tokenize(corpus)
laplace_prob_bigrams = laplace_smoothing_bigram(corpus, k=0.05)
print("\nProbabilidades de bigramas suavizadas con add-k (k = 0.05):")


for bigram, prob in laplace_prob_bigrams.items():
    if bigram == ('a', 'models'):
        print(f"P({bigram[1]} | {bigram[0]}) = {prob:.4f} (bigrama no visto)")
    else:
        print(f"P({bigram[1]} | {bigram[0]}) = {prob:.4f}")

laplace_prob_bigrams = laplace_smoothing_bigram(corpus, k=0.15)
print("\nProbabilidades de bigramas suavizadas con add-k (k = 0.15):")


for bigram, prob in laplace_prob_bigrams.items():
    if bigram == ('a', 'models'):
        print(f"P({bigram[1]} | {bigram[0]}) = {prob:.4f} (bigrama no visto)")
    else:
        print(f"P({bigram[1]} | {bigram[0]}) = {prob:.4f}")


Probabilidades de bigramas suavizadas con add-k (k = 0.05):
P(models | all) = 0.7241
P(are | models) = 0.8367
P(wrong | are) = 0.4286
P(a | wrong) = 0.4286
P(model | a) = 0.7241
P(is | model) = 0.7241
P(wrong | is) = 0.7241
P(some | wrong) = 0.4286
P(models | some) = 0.7241
P(useful | are) = 0.4286
P(models | a) = 0.0006 (bigrama no visto)

Probabilidades de bigramas suavizadas con add-k (k = 0.15):
P(models | all) = 0.4894
P(are | models) = 0.6418
P(wrong | are) = 0.3433
P(a | wrong) = 0.3433
P(model | a) = 0.4894
P(is | model) = 0.4894
P(wrong | is) = 0.4894
P(some | wrong) = 0.3433
P(models | some) = 0.4894
P(useful | are) = 0.3433
P(models | a) = 0.0018 (bigrama no visto)


### d) Calcule las probabilidades de todos los bigramas y el bigrama no visto "a models" con back-off y stupid-backoff.

In [212]:
from typing import List, Tuple


# Implementación del NGramModel
class NGramModel:
    def __init__(self, n: int):
        self.n = n
        self.ngram_counts = collections.Counter()
        self.context_counts = collections.Counter()
        self.vocab = set()
        self.total_ngrams = 0

    def train(self, corpus: List[List[str]]):
        for document in corpus:
            tokens = ['<s>'] * (self.n - 1) + document + ['</s>']
            self.vocab.update(tokens)
            for i in range(len(tokens) - self.n + 1):
                ngram = tuple(tokens[i:i + self.n])
                context = tuple(tokens[i:i + self.n - 1])
                self.ngram_counts[ngram] += 1
                self.context_counts[context] += 1
                self.total_ngrams += 1

    def get_ngram_prob(self, ngram: Tuple[str, ...]) -> float:
        count = self.ngram_counts.get(ngram, 0)
        context = ngram[:-1]
        context_count = self.context_counts.get(context, 0)
        if context_count == 0:
            return 0.0
        else:
            return count / context_count

    def get_sentence_probability(self, sentence: List[str]) -> float:
        tokens = ['<s>'] * (self.n - 1) + sentence + ['</s>']
        probability = 1.0
        for i in range(len(tokens) - self.n + 1):
            ngram = tuple(tokens[i:i + self.n])
            prob = self.get_ngram_prob(ngram)
            if prob > 0:
                probability *= prob
            else:
                # Asignamos una pequeña probabilidad para evitar cero
                probability *= 1e-6
        return probability


# Implementación de Backoff Estándar

class BackoffNGramModel(NGramModel):
    def __init__(self, n: int, models: List[NGramModel]):
        super().__init__(n)
        self.models = models  # Lista de modelos de diferentes órdenes, ordenados de mayor a menor
        # Actualizamos self.vocab con la unión de los vocabularios de los modelos
        self.vocab = set()
        for model in self.models:
            self.vocab.update(model.vocab)

    def get_ngram_prob(self, ngram: Tuple[str, ...]) -> float:
        for model in self.models:
            ngram_adjusted = ngram[-model.n:]
            prob = model.get_ngram_prob(ngram_adjusted)
            if prob > 0:
                return prob
        # Si ningún modelo tiene el n-grama, asignamos una pequeña probabilidad
        return 1e-6


# Implementación del Stupid Backoff

class StupidBackoffNGramModel(NGramModel):
    def __init__(self, n: int, models: List[NGramModel], alpha: float = 0.4):
        super().__init__(n)
        self.models = models  # Lista de modelos de diferentes órdenes, ordenados de mayor a menor
        self.alpha = alpha  # Factor de escala fijo
        # Actualizamos self.vocab con la unión de los vocabularios de los modelos
        self.vocab = set()
        for model in self.models:
            self.vocab.update(model.vocab)

    def get_ngram_prob(self, ngram: Tuple[str, ...]) -> float:
        for i, model in enumerate(self.models):
            ngram_adjusted = ngram[-model.n:]
            prob = model.get_ngram_prob(ngram_adjusted)
            if prob > 0:
                return (self.alpha ** i) * prob
        # Si ningún modelo tiene el n-grama, asignamos una pequeña probabilidad
        return (self.alpha ** len(self.models)) * (1.0 / len(self.vocab))


# Entrenamiento de los modelos
corpus = """
        all models are wrong
        a model is wrong
        some models are useful
        """

train_corpus = tokenize(corpus)
vocab = ['<s>', '</s>', 'a', 'all', 'are', 'model', 'models', 'some', 'useful', 'wrong']
vocab_size = len(vocab)

unigram_model = NGramModel(1)
bigram_model = NGramModel(2)
trigram_model = NGramModel(3)

for model in [unigram_model, bigram_model, trigram_model]:
    model.train([train_corpus])

#muestra todas las probabilidades de los bigramas
print("Probabilidades de bigramas sin suavisado:\n")
for bigram in bigram_model.ngram_counts:
    prob = bigram_model.get_ngram_prob(bigram)
    print(f"P({bigram[1]} | {bigram[0]}) = {prob:.4f}")
print("Probabilidades del bigrama no visto 'a models':")
print(f"P(models | a) = {bigram_model.get_ngram_prob(('a', 'models')):.4f}")

#muestra todas las probabilidades de los bigramas con backoff
print("\nProbabilidades de bigramas con backoff:\n")
backoff_model = BackoffNGramModel(2, [unigram_model, bigram_model])
for bigram in bigram_model.ngram_counts:
    prob = backoff_model.get_ngram_prob(bigram)
    print(f"P({bigram[1]} | {bigram[0]}) = {prob:.4f}")
print("Probabilidades del bigrama no visto 'a models':")
print(f"P(models | a) = {backoff_model.get_ngram_prob(('a', 'models')):.4f}")

#muestra todas las probabilidades de los bigramas con stupid backoff
print("\nProbabilidades de bigramas con stupid backoff:\n")
stupid_backoff_model = StupidBackoffNGramModel(2, [unigram_model, bigram_model])
for bigram in bigram_model.ngram_counts:
    prob = stupid_backoff_model.get_ngram_prob(bigram)
    print(f"P({bigram[1]} | {bigram[0]}) = {prob:.4f}")
print("Probabilidades del bigrama no visto 'a models':")
print(f"P(models | a) = {stupid_backoff_model.get_ngram_prob(('a', 'models')):.4f}")


Probabilidades de bigramas sin suavisado:

P(all | <s>) = 1.0000
P(models | all) = 1.0000
P(are | models) = 1.0000
P(wrong | are) = 0.5000
P(a | wrong) = 0.5000
P(model | a) = 1.0000
P(is | model) = 1.0000
P(wrong | is) = 1.0000
P(some | wrong) = 0.5000
P(models | some) = 1.0000
P(useful | are) = 0.5000
P(</s> | useful) = 1.0000
Probabilidades del bigrama no visto 'a models':
P(models | a) = 0.0000

Probabilidades de bigramas con backoff:

P(all | <s>) = 0.0769
P(models | all) = 0.1538
P(are | models) = 0.1538
P(wrong | are) = 0.1538
P(a | wrong) = 0.0769
P(model | a) = 0.0769
P(is | model) = 0.0769
P(wrong | is) = 0.1538
P(some | wrong) = 0.0769
P(models | some) = 0.1538
P(useful | are) = 0.0769
P(</s> | useful) = 0.0769
Probabilidades del bigrama no visto 'a models':
P(models | a) = 0.1538

Probabilidades de bigramas con stupid backoff:

P(all | <s>) = 0.0769
P(models | all) = 0.1538
P(are | models) = 0.1538
P(wrong | are) = 0.1538
P(a | wrong) = 0.0769
P(model | a) = 0.0769
P(is | m