N-grams Parte 1

Vamos a entrenar un modelo ngrams "from scratch"

## Download WikiText-2 dataset

In [None]:
! wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt
! wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt
! wget https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/test.txt

--2024-04-16 13:54:14--  https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10797148 (10M) [text/plain]
Saving to: ‘train.txt’


2024-04-16 13:54:15 (139 MB/s) - ‘train.txt’ saved [10797148/10797148]

--2024-04-16 13:54:15--  https://raw.githubusercontent.com/pytorch/examples/master/word_language_model/data/wikitext-2/valid.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1121681 (1.1M) [text/plain]
Saving to: ‘valid.txt’


2024

In [None]:
!pip install nltk

import nltk
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from collections import Counter, defaultdict
import math
import copy
import random
import operator




[nltk_data] Downloading package wordnet to /root/nltk_data...


### Preparación de datos


In [None]:

def flatten(lst):
    return [item for sublist in lst for item in sublist]

def prepare_data(filename):
    lemmatizer = WordNetLemmatizer()

    # Read lines, strip them, lower them, split by spaces, and append </s> token
    data = [l.strip().lower().split() + ['</s>'] for l in open(filename, 'r', encoding='utf-8') if l.strip()]

    # Lemmatize words in data
    data = [[lemmatizer.lemmatize(word) for word in sentence] for sentence in data]

    corpus = flatten(data)
    vocab = set(corpus)
    return vocab, data

La probabilidad de una palabra depende la historia inmediata de la secuencia.

$P(w_t|w_{t-1}, w_{t-2}, ... , w_1)$

### Markov Assumption

- Unigram: $P(w_t|w_{t-1}, w_{t-2}, ... , w_1) = P(w_t)$
- Bigram: $P(w_t|w_{t-1}, w_{t-2}, ... , w_1) = P(w_t|w_{t-1}) $
- Trigram: $P(w_t|w_{t-1}, w_{t-2}, ... , w_1) = P(w_t|w_{t-1}, w_{t-2})$

Contamos palabras para estimar las probabilidades

$P(w_t|w_{t-1}, w_{t-2}, ... ,w_{t-N+1}) = \frac{C(w_t, w_{t-1}, w_{t-2}, ... ,w_{t-N+1} )}{C(w_{t-1}, w_{t-2}, ... ,w_{t-N+1})}$

## Train with toy dataset

At this step, let's train a Bigram language model on the toy dataset.

In [None]:
class NGramLM():
    def __init__(self, N):
        self.N = N
        self.vocab = set()
        self.data = []
        self.prob = {}
        self.counts = defaultdict(Counter)

    def train(self, vocab, data, smoothing_k=0):
        self.vocab = vocab
        self.data = data
        self.smoothing_k = smoothing_k

        if self.N == 1:
            self.counts = Counter(flatten(data))
            self.prob = self.get_prob(self.counts)
        else:
            self.vocab.add('<s>')
            counts = self.count_ngram()

            self.prob = {}
            for context, counter in counts.items():
                self.prob[context] = self.get_prob(counter)

    def count_ngram(self):
        counts = defaultdict(Counter)
        for sentence in self.data:
            sentence = (self.N - 1) * ['<s>'] + sentence
            for i in range(len(sentence)-self.N+1):
                context = sentence[i:i+self.N-1]
                context = " ".join(context)
                word = sentence[i+self.N-1]
                counts[context][word] += 1

        self.counts = counts
        return counts

    def get_prob(self, counter):
        total = float(sum(counter.values()))
        k = self.smoothing_k

        prob = {}
        for word, count in counter.items():
            prob[word] = (count + k) / (total + len(self.vocab) * k)
        return prob

    def get_ngram_logprob(self, word, seq_len, context=""):
        if self.N == 1 and word in self.prob.keys():
            return math.log(self.prob[word]) / seq_len
        elif self.N > 1 and not self._is_unseen_ngram(context, word):
            return math.log(self.prob[context][word]) / seq_len
        else:
            # assign a small probability to the unseen ngram
            # to avoid log of zero and to penalise unseen word or context
            return math.log(1/len(self.vocab)) / seq_len

    def get_ngram_prob(self, word, context=""):
        if self.N == 1 and word in self.prob.keys():
            return self.prob[word]
        elif self.N > 1 and not self._is_unseen_ngram(context, word):
            return self.prob[context][word]
        elif word in self.vocab and self.smoothing_k > 0:
            # probability assigned by smoothing
            return self.smoothing_k / (sum(self.counts[context].values()) + self.smoothing_k*len(self.vocab))
        else:
            # unseen word or context
            return 0

    # In this method, the perplexity is measured at the sentence-level, averaging over all sentences.
    # Actually, it is also possible to calculate perplexity by merging all sentences into a long one.
    def perplexity(self, test_data):
        log_ppl = 0
        if self.N == 1:
            for sentence in test_data:
                for word in sentence:
                    log_ppl += self.get_ngram_logprob(word=word, seq_len=len(sentence))
        else:
            for sentence in test_data:
                for i in range(len(sentence)-self.N+1):
                    context = sentence[i:i+self.N-1]
                    context = " ".join(context)
                    word = sentence[i+self.N-1]
                    log_ppl += self.get_ngram_logprob(context=context, word=word, seq_len=len(sentence))

        log_ppl /= len(test_data)
        ppl = math.exp(-log_ppl)
        return ppl

    def _is_unseen_ngram(self, context, word):
        if context not in self.prob.keys() or word not in self.prob[context].keys():
            return True
        else:
            return False

    # generate the most probable k words
    def generate_next(self, context, k):
        context = (self.N-1) * '<s> ' + context
        context = context.split()
        ngram_context_list = context[-self.N+1:]
        ngram_context = " ".join(ngram_context_list)

        if ngram_context in self.prob.keys():
            candidates = self.prob[ngram_context]
            most_probable_words = sorted(candidates.items(), key=lambda kv: kv[1], reverse=True)
            for i in range(min(k, len(most_probable_words))):
                print(" ".join(context[self.N-1:])+" "+most_probable_words[i][0]+"\t P={}".format(most_probable_words[i][1]))
        else:
            print("Unseen context!")

    # generate the next n words with greedy search Poner en un slide
    def generate_next_n(self, context, n):
        context = (self.N-1) * '<s> ' + context
        context = context.split()
        ngram_context_list = context[-self.N+1:]
        ngram_context = " ".join(ngram_context_list)

        for i in range(n):
            try:
                candidates = self.prob[ngram_context]
                most_likely_next = max(candidates.items(), key=operator.itemgetter(1))[0]
                context += [most_likely_next]
                ngram_context_list = ngram_context_list[1:] + [most_likely_next]
                ngram_context = " ".join(ngram_context_list)
            except:
                break
        print(" ".join(context[self.N-1:]))


In [None]:
corpus = ["I like ice cream",
         "I like chocolate",
         "I hate beans"]
data = [l.strip().lower().split() + ['</s>'] for l in corpus if l.strip()]
vocab = set(flatten(data))
print(data)
print(vocab)

[['i', 'like', 'ice', 'cream', '</s>'], ['i', 'like', 'chocolate', '</s>'], ['i', 'hate', 'beans', '</s>']]
{'ice', 'like', 'chocolate', '</s>', 'i', 'cream', 'beans', 'hate'}


In [None]:
def print_probability(lm):
    for context in lm.vocab:
        for word in lm.vocab:
            prob = lm.get_ngram_prob(word, context)
            print("P({}\t|{}) = {}".format(word, context, prob))
        print("--------------------------")

## Smoothing
Smoothing se utiliza para contrarrestar la dispersión en el Modelo de Lenguaje N-Gram. La masa de probabilidad se desplaza hacia las palabras menos frecuentes.


$$P(w_t | context) = \frac{C(w_t, context)+1}{C(context)+|V|}$$

El problema es que si hay muchas palabras con 0, entonces las palabras mas frecuentas sacrificarán más probabilidad.

In [None]:
lm = NGramLM(2)
lm.train(vocab, data, smoothing_k=0)

print_probability(lm)

P(ice	|ice) = 0
P(like	|ice) = 0
P(chocolate	|ice) = 0
P(<s>	|ice) = 0
P(</s>	|ice) = 0
P(i	|ice) = 0
P(cream	|ice) = 1.0
P(beans	|ice) = 0
P(hate	|ice) = 0
--------------------------
P(ice	|like) = 0.5
P(like	|like) = 0
P(chocolate	|like) = 0.5
P(<s>	|like) = 0
P(</s>	|like) = 0
P(i	|like) = 0
P(cream	|like) = 0
P(beans	|like) = 0
P(hate	|like) = 0
--------------------------
P(ice	|chocolate) = 0
P(like	|chocolate) = 0
P(chocolate	|chocolate) = 0
P(<s>	|chocolate) = 0
P(</s>	|chocolate) = 1.0
P(i	|chocolate) = 0
P(cream	|chocolate) = 0
P(beans	|chocolate) = 0
P(hate	|chocolate) = 0
--------------------------
P(ice	|<s>) = 0
P(like	|<s>) = 0
P(chocolate	|<s>) = 0
P(<s>	|<s>) = 0
P(</s>	|<s>) = 0
P(i	|<s>) = 1.0
P(cream	|<s>) = 0
P(beans	|<s>) = 0
P(hate	|<s>) = 0
--------------------------
P(ice	|</s>) = 0
P(like	|</s>) = 0
P(chocolate	|</s>) = 0
P(<s>	|</s>) = 0
P(</s>	|</s>) = 0
P(i	|</s>) = 0
P(cream	|</s>) = 0
P(beans	|</s>) = 0
P(hate	|</s>) = 0
--------------------------
P(ice	|i

## Entrenando en WikiText-2 dataset and calculando perplexity
### Evaluating perplexity

$ PPL(W) = P(w_1, w_2, ... , w_n)^{-\frac{1}{n}}$

$ log(PPL(W)) = -\frac{1}{n}\sum^n_{k=1}log(P(w_k|w_1, w_2, ... , w_{k-1}))$

In [None]:
vocab, train_data = prepare_data('train.txt')
_, valid_data = prepare_data('valid.txt')
_, test_data = prepare_data('test.txt')
print(len(vocab))

25580


In [None]:
lm = NGramLM(3)
lm.train(vocab, train_data)

In [None]:
print(lm.perplexity(train_data))
print(lm.perplexity(valid_data))
print(lm.perplexity(test_data))

1.1519542711642219
1407.4468802577903
1087.8162622740592


## Generando Texto


In [None]:
print(" ".join(valid_data[12]))

context = "the eggs hatch at night , and the larvae swim to the water surface where they"
lm.generate_next(context, 3)


the egg hatch at night , and the larva swim to the water surface where they drift with the ocean current , preying on <unk> . this stage involves three <unk> and last for 15 – 35 day . after the third moult , the juvenile take on a form closer to the adult , and adopts a <unk> lifestyle . the juvenile are rarely seen in the wild , and are poorly known , although they are known to be capable of digging extensive burrow . it is estimated that only 1 larva in every 20 @,@ 000 survives to the <unk> phase . when they reach a carapace length of 15 mm ( 0 @.@ 59 in ) , the juvenile leave their burrow and start their adult life . </s>
Unseen context!


In [None]:

contexts = ["the eggs",
            "the",
            ""]
for context in contexts:
  lm.generate_next(context, 3)
  print("---")

Unseen context!
---
the first	 P=0.03398926654740608
the <unk>	 P=0.020274299344066785
the episode	 P=0.015802027429934407
---
 =	 P=0.26132873311734756
 the	 P=0.14112004039214035
 in	 P=0.06353347077881095
---


In [None]:
context = "the eggs hatch at night , and the larvae swim to the water surface where they"

lm.generate_next_n(context, 10)


lm.generate_next_n(context, 20)

the eggs hatch at night , and the larvae swim to the water surface where they were not able to get the part of the <unk>
the eggs hatch at night , and the larvae swim to the water surface where they were not able to get the part of the <unk> of the <unk> of the <unk> of the <unk> of


Effecto de N

In [None]:
for n in range(1,6):
    lm = NGramLM(n)
    lm.train(vocab, train_data)
    print("************************")
    print("{}-gram LM perplexity on train set: {}".format(n, lm.perplexity(train_data)))
    print("{}-gram LM perplexity on valid set: {}".format(n, lm.perplexity(valid_data)))
    print("{}-gram LM perplexity on test  set: {}".format(n, lm.perplexity(test_data)))

************************
1-gram LM perplexity on train set: 646.3365800668498
1-gram LM perplexity on valid set: 546.7786237397039
1-gram LM perplexity on test  set: 505.95615743496415
************************
2-gram LM perplexity on train set: 49.40224900454232
2-gram LM perplexity on valid set: 121.9653187945607
2-gram LM perplexity on test  set: 106.71637569105857
************************
3-gram LM perplexity on train set: 6.250848177738436
3-gram LM perplexity on valid set: 340.3955446416988
3-gram LM perplexity on test  set: 275.8197272910593
************************
4-gram LM perplexity on train set: 1.7915093585002564
4-gram LM perplexity on valid set: 974.7081693443893
4-gram LM perplexity on test  set: 758.0041880544288
************************
5-gram LM perplexity on train set: 1.1519542711642219
5-gram LM perplexity on valid set: 1407.4468802577903
5-gram LM perplexity on test  set: 1087.8162622740592


## Interpolation


In [None]:
class InterpolateNGramLM(NGramLM): #Hacer slide interpolation

    def __init__(self, N):
        super(InterpolateNGramLM, self).__init__(N)
        self.ngram_lms = []
        self.lambdas = []

    def train(self, vocab, data, smoothing_k=0, lambdas=[]):
        assert len(lambdas) == self.N
        assert sum(lambdas) - 1 < 1e-9
        self.vocab = vocab
        self.lambdas = lambdas

        for i in range(self.N, 0, -1):
            lm = NGramLM(i)
            print("Training {}-gram language model".format(i))
            lm.train(vocab, data, smoothing_k)
            self.ngram_lms.append(lm)

    def get_ngram_logprob(self, word, seq_len, context):
        prob = 0
        for i, (coef, lm) in enumerate(zip(self.lambdas, self.ngram_lms)):
            context_words = context.split()
            cutted_context = " ".join(context_words[-self.N + i + 1:])
            prob += coef * lm.get_ngram_prob(context=cutted_context, word=word)
        return math.log(prob) / seq_len


In [None]:
ilm = InterpolateNGramLM(3) #Aprende weights
ilm.train(vocab, train_data, lambdas=[0.5, 0.4, 0.1]) #Valores arbitrarios

Training 3-gram language model
Training 2-gram language model
Training 1-gram language model


In [None]:
print(ilm.perplexity(valid_data))
print(ilm.perplexity(test_data))

116.44308000312209
95.62538365005771
