In [1]:
# Tools
import os
import time
import shutil
import random
from argparse import Namespace
from collections import Counter
import matplotlib.pyplot as plt

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
import pandas as pd
import numpy as np

# PyTorch
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

# scikit-learn
from sklearn.metrics import accuracy_score

In [2]:
# Seed every random number generator used in this notebook (Python, NumPy,
# PyTorch) so that a fresh "Restart & Run All" reproduces the same numbers.
seed = 1111
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
# Disabling the cuDNN autotuner is not enough on its own: also request
# deterministic cuDNN kernels, otherwise GPU runs may still differ.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


In [3]:
from pathlib import Path

# The data folder lives next to the notebook's working directory (one level up).
base = Path.cwd().parent

# Each file is parsed by pandas with '\r\n' as separator and no header row;
# column 0 is then taken as a plain Python list (presumably one tweet per
# entry -- verify against the raw files).
x_train, x_val = (
    pd.read_csv(base / 'Datos y Scripts-20250828' / file_name, sep='\r\n', engine='python', header=None)
    .loc[:, 0]
    .values
    .tolist()
    for file_name in ("mex20_train.txt", "mex20_val.txt")
)

# Report the size and the first ten entries of each split.
print("N√∫mero de ejemplos de entrenamiento:", len(x_train))
print(x_train[:10])
print("N√∫mero de ejemplos de validaci√≥n:", len(x_val))
print(x_val[:10])

Número de ejemplos de entrenamiento: 5278
['@USUARIO @USUARIO @USUARIO Q se puede esperar del maricon de closet de la Ya√±ez aun recuerdo esa ves q lo vi en zona rosa viendo quien lo levantada', '@USUARIO La piel nueva siempre arde un poquito los primeros d√≠as... y m√°s con este puto clima', 'Ustedes no se enamoran de m√≠‚Ä¶ por tontas.', 'Me las va a pagar esa puta gorda roba tuits...', '@USUARIO LA GENTE ES TONTA PORQUE NO SE DAN CUENTA QUE T√ö HACES A BATMAN AZUL', 'Estoy muy encabronada con las pseudo feministas por tontas e iletradas, a veces me averg√ºenza ser mujer; preferir√≠a tener un falo. #NiUnaMas', 'Anden putos, recuerdan el #noerapenal #Holanda fuera de #Rusia2018, esto se llama #karma ehhhhhhhh #puuuuuutos', 'Si no tienen chichis no traten de ense√±ar se ven muy mal y m√°s cuando son prietas.', 'Ojal√° asi me agarrars cuando te digo que me voy en lugar de correrme a la verga cada 5 minutos.', '@USUARIO @USUARIO @USUARIO @USUARIO Es solo un HDP aprovechado y que su "Dio

In [4]:
# Shared configuration object; N is the n-gram order (N - 1 context tokens
# are used to predict the N-th one).
args = Namespace(N=4)

In [5]:
# Tokens that are ignored when counting words for the vocabulary:
# punctuation marks, stray symbols and the '<url>' / '<@usuario>' placeholders.
lista_excluidas = {
    '.', ',', ';', ':', '!', '?', '¬ø', '¬°', '"', "'", '(', ')', '[', ']', '{', '}', '-', '_', '‚Äî', '...',
    '@', '#', '$', '%', '^', '&', '*', '/', '|', '~', '`', '<', '>', '¬´', '¬ª', '‚Äú', '‚Äù', '‚Äò', '‚Äô', '<url>', '<@usuario>',
}

class NgramData:
    """Builds a vocabulary from a corpus and turns documents into n-gram windows.

    After ``fit``, the instance exposes ``vocab`` (set of kept words plus the
    three special tokens), ``word_to_id`` / ``id_to_word`` (contiguous ids
    ``0 .. size - 1``) and, when an embeddings model was supplied,
    ``embedding_matriz`` with one row per id.

    Args:
        N: n-gram order; each window holds ``N - 1`` context ids and 1 target id.
        vocab_size: total vocabulary size, including the 3 special tokens.
        tokenizer: callable ``str -> list[str]``; defaults to ``str.split``.
        embeddinds_model: optional pretrained embeddings object. ``fit`` reads
            its ``vector_size`` attribute and uses ``word in model`` and
            ``model[word]`` (presumably a gensim ``KeyedVectors`` -- confirm).
    """

    def __init__(self, N: int, vocab_size: int, tokenizer = None, embeddinds_model = None):
        self.tokenizer = tokenizer if tokenizer is not None else self.default_tokenizer
        self.punct = lista_excluidas
        self.N = N
        self.vocab_size = vocab_size
        self.UNK = "<unk>"
        self.SOS = "<s>"
        self.EOS = "</s>"
        self.embeddinds_model = embeddinds_model


    def default_tokenizer(self, text: str) -> list:
        """Fallback tokenizer: split on whitespace."""
        return text.split()


    def remove_word(self, word: str) -> bool:
        """Return True for tokens excluded from the vocabulary count.

        A token is excluded when its lowercase form is in ``self.punct`` or
        when it consists only of digits.
        """
        word = word.lower()
        return word in self.punct or word.isdigit()


    def get_vocab(self, corpus: list) -> set:
        """Return the ``vocab_size - 3`` most frequent lowercase tokens.

        Three slots are reserved for the special tokens added by ``fit``.
        ``Counter`` is used directly; NLTK's ``FreqDist`` is a ``Counter``
        subclass, so ``most_common`` behaves the same.
        """
        counts = Counter()
        for sentence in corpus:
            counts.update(
                token.lower()
                for token in self.tokenizer(sentence)
                if not self.remove_word(token)
            )
        return {word for word, _ in counts.most_common(self.vocab_size - 3)}


    def fit(self, corpus: list) -> None:
        """Build the vocabulary, the id mappings and (optionally) the embedding matrix.

        Regular words receive ids in order of first appearance in ``corpus``;
        ``<unk>``, ``<s>`` and ``</s>`` always take the last three ids.
        """
        special_tokens = (self.UNK, self.SOS, self.EOS)

        self.vocab = self.get_vocab(corpus)
        self.vocab.update(special_tokens)

        self.word_to_id = {}
        self.id_to_word = {}

        use_embeddings = self.embeddinds_model is not None
        if use_embeddings:
            # Start from a random normal matrix so that every row (including
            # the special tokens and words missing from the pretrained model)
            # is initialised; known words are overwritten below.
            dim = self.embeddinds_model.vector_size
            self.embedding_matriz = np.random.normal(size=(len(self.vocab), dim))

        next_id = 0
        for doc in corpus:
            for word in self.tokenizer(doc):
                word_ = word.lower()
                # Special tokens that literally occur in the corpus must not
                # consume a regular id, otherwise the ids stop being contiguous
                # and the largest id falls outside the embedding table.
                if word_ in special_tokens:
                    continue
                if word_ not in self.vocab or word_ in self.word_to_id:
                    continue

                self.word_to_id[word_] = next_id
                self.id_to_word[next_id] = word_
                if use_embeddings and word_ in self.embeddinds_model:
                    self.embedding_matriz[next_id] = self.embeddinds_model[word_]
                next_id += 1

        for offset, token in enumerate(special_tokens):
            self.word_to_id[token] = next_id + offset
            self.id_to_word[next_id + offset] = token


    def replace_unk(self, doc_tokens: list) -> list:
        """Replace, in place, every token whose lowercase form is not in the vocabulary.

        The same list object is returned, as callers may rely on the mutation.
        """
        doc_tokens[:] = [
            token if token.lower() in self.vocab else self.UNK
            for token in doc_tokens
        ]
        return doc_tokens


    def get_ngram_doc(self, doc: str) -> list:
        """Return all length-``N`` windows (tuples of words) for one document.

        The token sequence is left-padded with ``N - 1`` ``<s>`` tokens and
        terminated with ``</s>``, so every document yields at least one window.
        """
        # NOTE(review): tokens rejected by remove_word (punctuation, digits)
        # are not dropped here; they are outside the vocabulary and therefore
        # become <unk>. Confirm this is intended.
        doc_tokens = self.replace_unk(list(self.tokenizer(doc)))
        doc_tokens = [w.lower() for w in doc_tokens]
        padded = [self.SOS] * (self.N - 1) + doc_tokens + [self.EOS]
        return [tuple(padded[i:i + self.N]) for i in range(len(padded) - self.N + 1)]


    def transform(self, corpus: list) -> tuple[np.ndarray, np.ndarray]:
        """Convert documents into ``(contexts, targets)`` id arrays.

        Returns:
            ``x`` of shape ``(num_windows, N - 1)`` and ``y`` of shape
            ``(num_windows,)``, both ``int64``. An empty corpus yields arrays
            of shape ``(0, N - 1)`` and ``(0,)``.
        """
        x_ngrams = []
        y = []

        for doc in corpus:
            for words_in_window in self.get_ngram_doc(doc):
                ids = [self.word_to_id[w] for w in words_in_window]
                x_ngrams.append(ids[:-1])
                y.append(ids[-1])

        if not x_ngrams:
            # Keep the documented 2-D shape even when there is nothing to encode.
            return (np.empty((0, self.N - 1), dtype=np.int64),
                    np.empty((0,), dtype=np.int64))

        return np.array(x_ngrams, dtype=np.int64), np.array(y, dtype=np.int64)


    # =========== PROPERTIES ===========
    @property
    def size(self) -> int:
        """Number of entries in the fitted vocabulary (special tokens included)."""
        return len(self.vocab)




In [6]:
# Tokenize with NLTK's TweetTokenizer and fit a 5,000-entry vocabulary
# (special tokens included) on the training split only.
tk = TweetTokenizer()

ngram_data = NgramData(N=args.N, vocab_size=5_000, tokenizer=tk.tokenize)
ngram_data.fit(x_train)


In [7]:
print(f"Tama√±o del vocabulario: {ngram_data.size:,}")

Tamaño del vocabulario: 5,000


In [8]:
x_ngram_train, y_ngram_train = ngram_data.transform(x_train)
x_ngram_val, y_ngram_val = ngram_data.transform(x_val)


In [9]:
x_ngram_train

array([[4998, 4998, 4998],
       [4998, 4998,    0],
       [4998,    0,    0],
       ...,
       [4997,  937,   32],
       [ 937,   32, 2524],
       [  32, 2524, 4997]], shape=(102751, 3))

In [10]:
y_ngram_train

array([   0,    0,    0, ..., 2524, 4997, 4999], shape=(102751,))

In [11]:
# Tama√±os de los ngrams
x_train_shape = x_ngram_train.shape
y_train_shape = y_ngram_train.shape

x_val_shape = x_ngram_val.shape
y_val_shape = y_ngram_val.shape
print("TAMA√ëO DE LOS NGRAMS DE ENTRENAMIENTO")
print(f"x_ngram_train: {x_train_shape}")
print(f"y_ngram_train: {y_train_shape}")
print("TAMA√ëO DE LOS NGRAMS DE VALIDACI√ìN")
print(f"x_ngram_val: {x_val_shape}")
print(f"y_ngram_val: {y_val_shape}")

TAMAÑO DE LOS NGRAMS DE ENTRENAMIENTO
x_ngram_train: (102751, 3)
y_ngram_train: (102751,)
TAMAÑO DE LOS NGRAMS DE VALIDACIÓN
x_ngram_val: (11558, 3)
y_ngram_val: (11558,)


In [12]:
# Decode the first 22 context windows back into words to sanity-check the encoding.
lista_palabras = [
    [ngram_data.id_to_word[word_id] for word_id in window]
    for window in x_ngram_train[:22]
]
for numero, palabras in enumerate(lista_palabras, start=1):
    print(f"{numero}: {palabras}")

1: ['<s>', '<s>', '<s>']
2: ['<s>', '<s>', '@usuario']
3: ['<s>', '@usuario', '@usuario']
4: ['@usuario', '@usuario', '@usuario']
5: ['@usuario', '@usuario', 'q']
6: ['@usuario', 'q', 'se']
7: ['q', 'se', 'puede']
8: ['se', 'puede', 'esperar']
9: ['puede', 'esperar', 'del']
10: ['esperar', 'del', 'maricon']
11: ['del', 'maricon', 'de']
12: ['maricon', 'de', 'closet']
13: ['de', 'closet', 'de']
14: ['closet', 'de', 'la']
15: ['de', 'la', 'ya√±ez']
16: ['la', 'ya√±ez', 'aun']
17: ['ya√±ez', 'aun', 'recuerdo']
18: ['aun', 'recuerdo', 'esa']
19: ['recuerdo', 'esa', 'ves']
20: ['esa', 'ves', 'q']
21: ['ves', 'q', 'lo']
22: ['q', 'lo', 'vi']


In [13]:
y_ngram_train

array([   0,    0,    0, ..., 2524, 4997, 4999], shape=(102751,))

In [14]:
# Decode the first 22 target ids; each one is the word that follows the
# context window with the same position in x_ngram_train.
lista_palbras_en_sus_ys = [
    ngram_data.id_to_word[word_id] for word_id in y_ngram_train[:22]
]
for numero, palabra in enumerate(lista_palbras_en_sus_ys, start=1):
    print(f"{numero}: {palabra}")

1: @usuario
2: @usuario
3: @usuario
4: q
5: se
6: puede
7: esperar
8: del
9: maricon
10: de
11: closet
12: de
13: la
14: ya√±ez
15: aun
16: recuerdo
17: esa
18: ves
19: q
20: lo
21: vi
22: en


In [15]:
# Mini-batch size and number of DataLoader worker processes; both are read by
# data_loader() below.
args.batch_size = 64
args.num_workers = 0
# Integer dtype that tensor_dataset() applies to both the context windows and
# the targets.
DTYPE = torch.int64

def tensor_dataset(x: np.ndarray, y: np.ndarray) -> TensorDataset:
    """Wrap context windows ``x`` and targets ``y`` in a ``TensorDataset``.

    Both arrays are copied into new tensors of dtype ``DTYPE``.
    """
    features, targets = (torch.tensor(array, dtype=DTYPE) for array in (x, y))
    return TensorDataset(features, targets)

def data_loader(dataset: TensorDataset, shuffle: bool) -> DataLoader:
    """Create a ``DataLoader`` over ``dataset``.

    Batch size and worker count come from the notebook-level ``args``;
    ``shuffle`` is forwarded unchanged.
    """
    loader_options = {
        "batch_size": args.batch_size,
        "num_workers": args.num_workers,
        "shuffle": shuffle,
    }
    return DataLoader(dataset, **loader_options)


# Build both datasets first, then wrap them in DataLoaders.
# Only the training loader shuffles; validation keeps the original order.
train_dataset = tensor_dataset(x_ngram_train, y_ngram_train)
val_dataset = tensor_dataset(x_ngram_val, y_ngram_val)

train_loader = data_loader(train_dataset, shuffle=True)
val_loader = data_loader(val_dataset, shuffle=False)

In [16]:
# Pull one training batch (contexts, targets) and show the shape of each part.
batch = next(iter(train_loader))
print(*(f'{name} shape: {part.shape}' for name, part in zip('XY', batch)), sep='\n')

X shape: torch.Size([64, 3])
Y shape: torch.Size([64])


In [17]:
batch[0]

tensor([[4998,    0,  235],
        [  32,   27, 4997],
        [   7,  218,  161],
        [ 109,    7, 1139],
        [4998, 4998, 4998],
        [  48,  342,   45],
        [  43,    9, 1325],
        [4997, 4997,    7],
        [4997, 1799,  225],
        [4998, 4997,   48],
        [   7,    9,  565],
        [4998,  700, 4997],
        [4997, 4997, 4997],
        [3538, 4997,  165],
        [4998, 4998, 4998],
        [  15, 4997, 4997],
        [ 114,    2, 3261],
        [ 455,   51,  456],
        [  45, 4997,   83],
        [  48,  375,   48],
        [ 942,   43, 2500],
        [  60,   45, 2302],
        [ 253,   48,  255],
        [  55,   48,   21],
        [ 419, 2462,   33],
        [4998, 4998, 4998],
        [4997, 4997,   66],
        [ 931, 4997, 4997],
        [  48,  166,  128],
        [1595,  337,   17],
        [ 114,   48,   46],
        [ 980, 4997,  696],
        [4998, 4998, 4998],
        [ 112,  272, 4997],
        [1559,  192,  705],
        [ 106, 4997,

In [18]:
[[ngram_data.id_to_word[w] for w in tw] for tw in batch[0].tolist()]

[['<s>', '@usuario', 'luchona'],
 ['y', 'un', '<unk>'],
 ['de', 'perra', 'como'],
 ['lugar', 'de', 'unas'],
 ['<s>', '<s>', '<s>'],
 ['a', 'mi', 'me'],
 ['por', 'la', 'noche'],
 ['<unk>', '<unk>', 'de'],
 ['<unk>', 'instagram', 'para'],
 ['<s>', '<unk>', 'a'],
 ['de', 'la', 'palabra'],
 ['<s>', 'ah', '<unk>'],
 ['<unk>', '<unk>', '<unk>'],
 ['favoritas', '<unk>', 'pero'],
 ['<s>', '<s>', '<s>'],
 ['lo', '<unk>', '<unk>'],
 ['solo', 'se', 'escucha'],
 ['dise', 'gorda', 'nena'],
 ['me', '<unk>', 'el'],
 ['a', 'comprar', 'a'],
 ['triste', 'por', 'joe'],
 ['que', 'me', 'anda'],
 ['tiene', 'a', 'gusto'],
 ['es', 'a', 'quien'],
 ['hay', 'momento', 'm√°s'],
 ['<s>', '<s>', '<s>'],
 ['<unk>', '<unk>', 'muy'],
 ['jugar', '<unk>', '<unk>'],
 ['a', 'tu', 'madre'],
 ['a√∫n', 'sigo', 'en'],
 ['solo', 'a', 'las'],
 ['soluci√≥n', '<unk>', 'luego'],
 ['<s>', '<s>', '<s>'],
 ['cada', 'foto', '<unk>'],
 ['mamadas', '..', 'eso'],
 ['te', '<unk>', 'si'],
 ['hoy', 'en', 'd√≠a'],
 ['<s>', 'uuuugh', 'me'],
 

In [19]:
batch[1]

tensor([  32, 4997,   15, 4997,    7, 3093, 1304,  575,  160, 2407, 4999,  165,
        3840,  955,   35, 4999,   60,  242, 1207,    9, 3119, 3050, 4997,   33,
        4997,    0, 4045,   45,  166, 4997,   33,   57,   83,   55,   39,  106,
         101,   65, 4997,  109, 4997,  100,   46,  111,    7,   48, 4997, 1916,
          34,    0, 4997,  419,   48, 4997,    0, 4997,   93,   70, 4997,  128,
         318,  385,  923,   32])

In [20]:
class NeuralLM(nn.Module):
    """Feed-forward n-gram language model.

    Pipeline: embedding lookup for the ``N - 1`` context ids -> concatenation
    -> linear layer + ReLU -> dropout -> bias-free linear projection to
    vocabulary logits.

    Reads ``args.N``, ``args.d``, ``args.vocab_size`` and ``args.dropout``.
    NOTE(review): the hidden layer width is ``args.d``; ``args.d_h`` is not
    read anywhere in this class -- confirm whether that is intended.
    """

    def __init__(self, args):
        super().__init__()
        context_len = args.N - 1
        self.window_size = context_len
        self.embedding_size = args.d

        # Layers are created in this fixed order so seeded initialisation is stable.
        self.emb = nn.Embedding(args.vocab_size, args.d)
        self.fc1 = nn.Linear(args.d * context_len, args.d)
        self.drop1 = nn.Dropout(p=args.dropout)
        self.fc2 = nn.Linear(args.d, args.vocab_size, bias=False)

    def forward(self, x):
        """Map context ids to unnormalised vocabulary logits.

        The embedded input is flattened to rows of
        ``embedding_size * window_size`` values, so any leading dimensions of
        ``x`` are merged into the batch dimension.
        """
        flat_width = self.embedding_size * self.window_size
        embedded = self.emb(x).view(-1, flat_width)
        hidden = self.drop1(F.relu(self.fc1(embedded)))
        return self.fc2(hidden)

In [21]:
def get_preds(raw_logits):
    """Return the index of the highest logit in each row as a NumPy array.

    Softmax is monotonic, so the arg-max of the logits is the arg-max of the
    probabilities. Skipping the softmax also avoids rounding nearly equal
    logits into an artificial tie.

    Args:
        raw_logits: tensor whose dimension 1 indexes the classes.

    Returns:
        ``numpy.ndarray`` of predicted class indices (computed on CPU).
    """
    return torch.argmax(raw_logits.detach(), dim=1).cpu().numpy()

In [22]:
def model_eval(data, model, gpu=False):
    """Compute the accuracy of ``model`` over every batch yielded by ``data``.

    Args:
        data: iterable of ``(window_words, labels)`` batches; ``labels`` must
            be CPU tensors because they are converted with ``.numpy()``.
        model: callable returning logits for a batch of context windows.
        gpu: when True, inputs are moved to CUDA before the forward pass.

    Returns:
        The value of ``accuracy_score`` over all targets and predictions.
    """
    preds, tgts = [], []
    with torch.no_grad():
        for window_words, labels in data:
            inputs = window_words.cuda() if gpu else window_words
            preds.extend(get_preds(model(inputs)))
            tgts.extend(labels.numpy())

    return accuracy_score(tgts, preds)

In [23]:
def save_checkpoint(state, is_best, checkpoint_path, filename="checkpoint.pt"):
    """Serialise ``state`` with ``torch.save`` and optionally mark it as best.

    The checkpoint is always written to ``checkpoint_path/filename``. When
    ``is_best`` is truthy, that file is also copied to
    ``checkpoint_path/model_best.pt``.
    """
    target = os.path.join(checkpoint_path, filename)
    torch.save(state, target)
    if not is_best:
        return
    shutil.copyfile(target, os.path.join(checkpoint_path, "model_best.pt"))

In [24]:
# Model hyparameters
args.vocab_size = ngram_data.size
args.d = 100
args.d_h = 200
args.dropout = 0.1

# Training hyperparameters
args.lr = 2.3e-1
args.num_epochs = 100
args.patience = 20

# Scheduler hyperparameters
args.lr_patience = 10
args.lr_factor = 0.5

# Saving directory
args.savedir = 'model'
os.makedirs(args.savedir, exist_ok=True)

# Create model
model = NeuralLM(args)

# Send to GPU
args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
    model.cuda()

# Loss, Optimizer and Scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=args.lr_patience,
    factor=args.lr_factor
)

In [25]:
start_time = time.time()
best_metric = 0
# Epochs elapsed since the last validation improvement. It must exist before
# the loop: a first epoch without improvement would otherwise hit a NameError.
no_improve = 0
metric_history = []
train_metric_history = []

for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model.train()

    for window_words, labels in train_loader:

        # If GPU available
        if args.use_gpu:
            window_words = window_words.cuda()
            labels = labels.cuda()

        # Forward pass
        outputs = model(window_words)
        loss = criterion(outputs, labels)
        loss_epoch.append(loss.item())

        # Get training metrics (per-batch accuracy, averaged after the epoch)
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt, y_pred))

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)

    # Get metric in validation dataset
    model.eval()
    tuning_metric = model_eval(val_loader, model, gpu=args.use_gpu)
    metric_history.append(tuning_metric)

    # Update scheduler with the validation accuracy
    scheduler.step(tuning_metric)

    # Check for metric improvement
    is_improvement = tuning_metric > best_metric
    if is_improvement:
        best_metric = tuning_metric
        no_improve = 0
    else:
        no_improve += 1

    # Write a checkpoint every epoch; save_checkpoint additionally copies it
    # to model_best.pt when the validation metric improved.
    save_checkpoint(
        {
            'epoch': epoch + 1,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict(),
            'best_metric': best_metric,
        },
        is_improvement,
        args.savedir,
    )

    # Early stopping
    if no_improve >= args.patience:
        print("No improvement. Breaking out of loop.")
        break

    print(f'Train acc {mean_epoch_metric:.4f}')
    print(f'Epoch: {epoch}, Loss: {np.mean(loss_epoch):.4f}, Val acc: {tuning_metric:.4f}')
    epoch_time = time.time() - epoch_start_time
    print(f'Epoch time: {epoch_time:.4f}')

Train acc 0.1774
Epoch: 0, Loss: 5.5258, Val acc: 0.1738
Epoch time: 6.8022
Train acc 0.1844
Epoch: 1, Loss: 5.1077, Val acc: 0.2257
Epoch time: 6.1975
Train acc 0.1885
Epoch: 2, Loss: 4.9256, Val acc: 0.2016
Epoch time: 5.9357
Train acc 0.1923
Epoch: 3, Loss: 4.7968, Val acc: 0.2252
Epoch time: 9.1137
Train acc 0.1925
Epoch: 4, Loss: 4.6915, Val acc: 0.2022
Epoch time: 5.4584
Train acc 0.1970
Epoch: 5, Loss: 4.5930, Val acc: 0.2291
Epoch time: 6.2871
Train acc 0.1952
Epoch: 6, Loss: 4.5077, Val acc: 0.2278
Epoch time: 6.6156
Train acc 0.1937
Epoch: 7, Loss: 4.4343, Val acc: 0.2313
Epoch time: 6.2607
Train acc 0.1965
Epoch: 8, Loss: 4.3639, Val acc: 0.1871
Epoch time: 6.9253
Train acc 0.1953
Epoch: 9, Loss: 4.3018, Val acc: 0.1678
Epoch time: 7.5081
Train acc 0.1954
Epoch: 10, Loss: 4.2397, Val acc: 0.2131
Epoch time: 7.7477
Train acc 0.1986
Epoch: 11, Loss: 4.1808, Val acc: 0.1678
Epoch time: 6.8429
Train acc 0.1981
Epoch: 12, Loss: 4.1288, Val acc: 0.2141
Epoch time: 7.3568
Train acc

In [26]:
def print_closest_words(embeddings, ngram_data, word, n):
    """Print the ``n`` vocabulary words closest to ``word`` in embedding space.

    Distance is the Euclidean norm between rows of ``embeddings.weight``.
    Each output line holds a neighbouring word and its distance, nearest first.

    Args:
        embeddings: embedding layer exposing ``weight`` and callable on an id tensor.
        ngram_data: object providing ``word_to_id`` and ``id_to_word``.
        word: query word; must be a key of ``ngram_data.word_to_id``.
        n: number of neighbours to print.
    """
    query_id = torch.LongTensor([ngram_data.word_to_id[word]])
    query_vector = embeddings(query_id)
    distances = torch.norm(embeddings.weight - query_vector, dim=1).detach().numpy()
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1])
    # The first entry is skipped: presumably the query word itself at distance 0.
    for neighbour_id, distance in ranked[1:n + 1]:
        print(ngram_data.id_to_word[neighbour_id], distance)

In [27]:
# Model with learned embeddings from scratch
best_model = NeuralLM(args)
best_model.load_state_dict(torch.load('model/model_best.pt')['state_dict'])
best_model.train(False)

print("-" * 30)
print("Learned embeddings")
print("-" * 30)
print_closest_words(best_model.emb, ngram_data, "chivas", 10)


------------------------------
Learned embeddings
------------------------------
<unk> 10.375457
dejarte 10.661759
laferte 10.887369
mueven 10.964158
pared 10.99325
muchas 11.043353
tenia 11.092116
vaso 11.1542015
haciendose 11.167336
pregunt√≥ 11.192324


In [28]:
def parse_text(text, tokenizer):
  all_tokens = [w.lower() if w in ngram_data.word_to_id else '<unk>' for w in tokenizer.tokenize(text)]
  tokens_id = [ngram_data.word_to_id[word.lower()] for word in all_tokens]
  return all_tokens, tokens_id

In [29]:
def sample_next_word(logits, temperature=1.0):
    logits = np.asarray(logits).astype('float64')
    preds = logits / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [30]:
def predict_next_token(model, token_ids):
    """Sample the id of the next token given one context window.

    ``token_ids`` is turned into a batch of size one, passed through
    ``model``, and the resulting logits are sampled with
    ``sample_next_word`` at temperature 1.0.
    """
    context = torch.LongTensor(token_ids).unsqueeze(0)
    logits = model(context).squeeze(0).detach().numpy()
    return sample_next_word(logits, 1.0)

In [31]:
def generate_sentence(model, initial_text, tokenizer, max_len=100):
    """Generate text by repeatedly sampling the next token from ``model``.

    The prompt is converted to ids with ``parse_text``. The context window is
    then normalised to ``model.window_size`` ids (falling back to the prompt
    length if the model has no such attribute): longer prompts keep only
    their last ids, shorter ones are left-padded with the ``<s>`` id. This
    allows prompts of any length; previously only prompts of exactly the
    window size produced a valid model input.

    Uses the notebook-level ``ngram_data`` for id/word lookups.

    Args:
        model: language model accepted by ``predict_next_token``.
        initial_text: prompt string.
        tokenizer: object exposing ``tokenize``.
        max_len: maximum number of tokens to generate.

    Returns:
        Prompt tokens followed by the generated tokens, joined by spaces.
        Generation stops early once the end-of-sentence token is sampled.
    """
    all_tokens, window_word_ids = parse_text(initial_text, tokenizer)

    window_size = getattr(model, "window_size", len(window_word_ids))
    sos_id = ngram_data.word_to_id[ngram_data.SOS]
    missing = window_size - len(window_word_ids)  # <= 0 means nothing to pad
    window_word_ids = [sos_id] * missing + window_word_ids
    window_word_ids = window_word_ids[len(window_word_ids) - window_size:]

    for _ in range(max_len):
        y_pred = predict_next_token(model, window_word_ids)
        next_word = ngram_data.id_to_word[y_pred]
        all_tokens.append(next_word)

        if next_word == ngram_data.EOS:
            break
        # Slide the window: drop the oldest id, append the sampled one.
        window_word_ids = (window_word_ids + [y_pred])[1:]

    return " ".join(all_tokens)

In [32]:
# Prompt made of exactly N - 1 = 3 tokens, starting with the <s> marker.
initial_tokens = '<s> las chivas'

print('-' * 30, 'Learned embeddings', '-' * 30, sep='\n')
print(generate_sentence(best_model, initial_tokens, tk))

------------------------------
Learned embeddings
------------------------------
<s> las chivas bien dinero <unk> como estaba ver un <unk> üò© üò© </s>


In [33]:
# Second prompt: three ordinary words, no sentence-start marker.
initial_tokens = 'yo opino que'

print('-' * 30, 'Learned embeddings', '-' * 30, sep='\n')
print(generate_sentence(best_model, initial_tokens, tk))

------------------------------
Learned embeddings
------------------------------
yo opino que <unk> tecatito letra puto <unk> anda a gustar brayan t√≠a una los <unk> esos de sexo <unk> hasta es que me hago <unk> <unk> <unk> üçÜ üò¢ o tenemos <unk> es la masa que ya <unk> par de ser <unk> <unk> a todos cu√°ntas <unk> su culera <unk> <unk> pero igual hdp üé∂ </s>


In [34]:
def log_likelihood(model, text, ngram_data, skip=2):
    """Sum of log-probabilities the model assigns to the tokens of ``text``.

    ``ngram_data.transform`` produces the context windows and their target
    ids. The first ``skip`` windows, whose contexts consist mostly of ``<s>``
    padding, are ignored. Log-probabilities come from ``log_softmax``, so
    very unlikely tokens yield a large negative number instead of
    ``log(0) = -inf``.

    Args:
        model: callable mapping a ``(num_windows, N - 1)`` id tensor to logits
            of shape ``(num_windows, vocab_size)``.
        text: sentence to score.
        ngram_data: fitted object exposing ``transform``.
        skip: number of leading windows to discard (default 2).

    Returns:
        NumPy ``float32`` scalar; ``0.0`` when no window remains after skipping.
    """
    X, y = ngram_data.transform([text])
    X, y = X[skip:], y[skip:]
    if len(y) == 0:
        return np.float32(0.0)

    with torch.no_grad():
        logits = model(torch.as_tensor(X, dtype=torch.long))
        log_probs = F.log_softmax(logits, dim=1)

    targets = torch.as_tensor(y, dtype=torch.long)
    # Pick, for every window, the log-probability of its actual next token.
    picked = log_probs[torch.arange(len(y)), targets]
    return picked.numpy().sum()

In [35]:
print("log likelihood:", log_likelihood(best_model, "Estamos en la clase de procesamiento de lenguaje natural", ngram_data))

log likelihood: -32.4111


In [36]:
print("log likelihood:", log_likelihood(best_model, "Estamos procesamiento clase en la de natural de lenguaje", ngram_data))

log likelihood: -41.06167


# Estructuras sintácticas correctas

In [37]:
from itertools import permutations
from random import shuffle

word_list = "sino gano me voy a la chingada".split(' ')
perms = [' '.join(perm) for perm in permutations(word_list)]

# Score every permutation exactly once (7! = 5040 forward passes) and sort
# from most to least likely. The ranking is reused for both listings below
# instead of being recomputed.
ranked_perms = sorted(
    ((log_likelihood(best_model, text, ngram_data), text) for text in perms),
    reverse=True,
)

# Five most likely word orders
print('-' * 50)
for p, t in ranked_perms[:5]:
    print(f"{p:.4f} {t}")

# Five least likely word orders
print('-' * 50)
for p, t in ranked_perms[-5:]:
    print(f"{p:.4f} {t}")


--------------------------------------------------


-21.7664 sino gano a la chingada me voy
-22.4687 gano sino a la chingada me voy
-22.5653 gano sino me voy a la chingada
-23.4178 sino gano me voy a la chingada
-24.7224 gano sino la chingada me voy a
--------------------------------------------------
-63.4815 la a voy sino chingada gano me
-63.8410 me a voy chingada gano sino la
-63.9567 a la sino voy chingada gano me
-64.6745 a me chingada voy sino gano la
-64.9590 la me a voy chingada gano sino
