# Juan Carlos Perez Ramirez
## Procesamiento de Lenguaje Natural
## Practica 5: PLM

# Lectura de datos

In [2]:
import os
import re
from keras.preprocessing.text import Tokenizer

def get_texts_from_file(path_corpus, path_truth):
    tr_txt = []
    tr_y = []

    with open(path_corpus, "r") as f_corpus, open(path_truth, "r") as f_truth:
        for tweet in f_corpus:
            tr_txt += [tweet]
        for label in f_truth:
            tr_y += [label]
    return tr_txt, tr_y

2025-03-08 18:17:59.899536: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-08 18:18:00.275466: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-03-08 18:18:00.279097: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
tr_txt, tr_y = get_texts_from_file("../../Corpus/mex20_train.txt", "../../Corpus/mex20_train_labels.txt")
tr_y = list(map(int, tr_y))

# Preprocesamiento y tratamiento de datos

In [None]:
from nltk.probability import FreqDist
class TrigramData:

    def __init__(self, vocab_max, tokenizer):
        self.vocab_max = vocab_max
        self.tokenizer = tokenizer
        self.UNK = "<unk>"
        self.SOS = "<s>"
        self.EOS = "</s>"
        self.final_vocabulary = set()

    def fit(self, raw_texts):

        freq_dist = FreqDist()
        tokenized_corpus = []

        for txt in raw_texts:
            tokens = self.tokenizer.tokenize(txt)
            tokenized_corpus.append(tokens)
            for w in tokens:
                freq_dist[w] += 1

        self.final_vocabulary = {tok for tok, _ in freq_dist.most_common(self.vocab_max)}
        self.final_vocabulary.update([self.UNK, self.SOS, self.EOS])

        transformed_corpus = []
        for tokens in tokenized_corpus:
            transformed_corpus.append(self.transform(tokens))
        return transformed_corpus
    
    def mask_oov(self, w):
        return self.UNK if w not in self.final_vocabulary else w
    
    def add_sos_eos(self, tokens):
        return [self.SOS, self.SOS] + tokens + [self.EOS]

    def transform(self, tokens):
        transformed = []
        for w in tokens:
            transformed.append(self.mask_oov(w))
        transformed = self.add_sos_eos(transformed)
        return transformed

# TrigramLM

In [45]:
class TrigramLanguageModel:

    def __init__(self, lambda1=0.4, lambda2=0.3, lambda3=0.3):
        self.lambda1 = lambda1 # trigramas
        self.lambda2 = lambda2 # bigramas
        self.lambda3 = lambda3 # unigramas

        # Contadores
        self.unigram_counts = {}
        self.bigram_counts = {}
        self.trigram_counts = {}

        self.vocab = 0
        self.total_tokens = 0
        self.V = 0

    def train(self, transformed_corpus, final_vocabulary):
        self.vocab = final_vocabulary
        self.V = len(final_vocabulary)

        for tokens in transformed_corpus:
            for i, w in enumerate(tokens):

                # Unigramas
                self.unigram_counts[w] = self.unigram_counts.get(w, 0) + 1

                # Bigramas
                if i > 0:
                    w_prev = tokens[i-1]
                    self.bigram_counts[(w_prev, w)] = self.bigram_counts.get((w_prev, w), 0) + 1
                    
                # Trigramas
                if i > 1:
                    w_prev2 = tokens[i-2]
                    self.trigram_counts.get((w_prev2, w_prev, w), 0) + 1
            self.total_tokens = sum(self.unigram_counts.values())

    def mask_oov(self, w):
        return "<unk>" if w not in self.vocab else w

    def unigram_probability(self, w):
        return (self.unigram_counts.get(self.mask_oov(w), 0) + 1) / (self.total_tokens + self.V)
    
    def bigram_probability(self, w_prev, w):
        w_prev = self.mask_oov(w_prev)
        w = self.mask_oov(w)
        
        numerator = self.bigram_counts.get((w_prev, w), 0) + 1
        denominator = self.unigram_counts.get(w_prev, 0) + self.V
        return numerator / denominator
    
    def trigram_probability(self, w_prev2, w_prev, w):
        w_prev2 = self.mask_oov(w_prev2)
        w_prev = self.mask_oov(w_prev)
        w = self.mask_oov(w)

        numerator = self.trigram_counts.get((w_prev2, w_prev, w), 0) + 1
        denominator = self.bigram_counts.get((w_prev2, w_prev), 0) + self.V
        return numerator / denominator
    
    def probability_of_word(self, w_prev2, w_prev, w):
        return self.lambda1 * self.trigram_probability(w_prev2, w_prev, w) + \
                self.lambda2 * self.bigram_probability(w_prev, w) + \
                self.lambda3 * self.unigram_probability(w)
    
    def sequence_probability(self, sequence):
        import math
        log_prob = 0.0
        for i in range(2, len(sequence)):
            w_prev2 = sequence[i-2]
            w_prev = sequence[i-1]
            w = sequence[i]

            p = self.probability_of_word(w_prev2, w_prev, w)
            log_prob += math.log(p)
        return math.exp(log_prob)
    
    def check_prob(self):
        print(sum(self.unigram_probability(w) for w in self.vocab))

        print(sum(self.bigram_probability("hola", w) for w in self.vocab))

        print(sum(self.trigram_probability("hola", "como", w) for w in self.vocab))

In [46]:
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()

trigram_data = TrigramData(13580, tokenizer)
transformed_corpus = trigram_data.fit(tr_txt)
final_vocab = trigram_data.final_vocabulary

In [47]:
trigram_lm = TrigramLanguageModel(lambda1=6.0, lambda2=3.0, lambda3=1.0)

In [48]:
trigram_lm.train(transformed_corpus, final_vocab)

In [49]:
trigram_lm.check_prob()

1.000000000000098
0.9999999999998532
1.0000000000002154


# Pruebitas

In [36]:
w_prev2, w_prev, w = "<s>", "hola", "mundo"
p_w = trigram_lm.probability_of_word(w_prev2, w_prev, w)
print(f"\nP('{w}' | '{w_prev2}', '{w_prev}') = {p_w:.16f}")


P('mundo' | '<s>', 'hola') = 0.0011064817715329


In [37]:
w_prev2, w_prev, w = "<s>", "saludos", "a"
p_w = trigram_lm.probability_of_word(w_prev2, w_prev, w)
print(f"\nP('{w}' | '{w_prev2}', '{w_prev}') = {p_w:.16f}")


P('a' | '<s>', 'saludos') = 0.0167546573857528


In [38]:
w_prev2, w_prev, w = "vete", "a", "la"
p_w = trigram_lm.probability_of_word(w_prev2, w_prev, w)
print(f"\nP('{w}' | '{w_prev2}', '{w_prev}') = {p_w:.16f}")


P('la' | 'vete', 'a') = 0.0717871315164460


In [39]:
w_prev2, w_prev, w = "hijo", "de", "tu"
p_w = trigram_lm.probability_of_word(w_prev2, w_prev, w)
print(f"\nP('{w}' | '{w_prev2}', '{w_prev}') = {p_w:.16f}")


P('tu' | 'hijo', 'de') = 0.0123598509137938


In [50]:
seq_example = ["hola", "como", "has", "estado", "</s>"]
seq_prob = trigram_lm.sequence_probability(seq_example)
print(f"\nP({seq_example}) = {seq_prob:.16f}")


P(['hola', 'como', 'has', 'estado', '</s>']) = 0.0000000264188843
