In [1]:
# Tools
import os
import time
import shutil
import random
from argparse import Namespace
import matplotlib.pyplot as plt

# Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.tokenize import TweetTokenizer
from nltk import FreqDist
import pandas as pd
import numpy as np

# PyTorch
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.nn.functional as F

# scikit-learn
from sklearn.metrics import accuracy_score

In [2]:
sedd = 1111
random.seed(sedd)
np.random.seed(sedd)
torch.manual_seed(sedd)
torch.backends.cudnn.benchmark = False # ????


In [3]:
x_train = pd.read_csv("data/mex/mex20_train.txt", sep='\r\n', engine='python', header=None).loc[:,0].values.tolist()
x_val = pd.read_csv("data/mex/mex20_val.txt", sep='\r\n', engine='python', header=None).loc[:,0].values.tolist()
print("N√∫mero de ejemplos de entrenamiento:", len(x_train))
print(x_train[:10])
print("N√∫mero de ejemplos de validaci√≥n:", len(x_val))
print(x_val[:10])

N√∫mero de ejemplos de entrenamiento: 5278
['@USUARIO @USUARIO @USUARIO Q se puede esperar del maricon de closet de la Ya√±ez aun recuerdo esa ves q lo vi en zona rosa viendo quien lo levantada', '@USUARIO La piel nueva siempre arde un poquito los primeros d√≠as... y m√°s con este puto clima', 'Ustedes no se enamoran de m√≠‚Ä¶ por tontas.', 'Me las va a pagar esa puta gorda roba tuits...', '@USUARIO LA GENTE ES TONTA PORQUE NO SE DAN CUENTA QUE T√ö HACES A BATMAN AZUL', 'Estoy muy encabronada con las pseudo feministas por tontas e iletradas, a veces me averg√ºenza ser mujer; preferir√≠a tener un falo. #NiUnaMas', 'Anden putos, recuerdan el #noerapenal #Holanda fuera de #Rusia2018, esto se llama #karma ehhhhhhhh #puuuuuutos', 'Si no tienen chichis no traten de ense√±ar se ven muy mal y m√°s cuando son prietas.', 'Ojal√° asi me agarrars cuando te digo que me voy en lugar de correrme a la verga cada 5 minutos.', '@USUARIO @USUARIO @USUARIO @USUARIO Es solo un HDP aprovechado y que su "Dio

In [4]:
args = Namespace()
args.N = 4

In [5]:
lista_excluidas = set(['.', ',', ';', ':', '!', '?', '¬ø', '¬°', '"', "'", '(', ')', '[', ']', '{', '}', '-', '_', '‚Äî', '...',
                       '@', '#', '$', '%', '^', '&', '*', '/', '|', '~', '`', '<', '>', '¬´', '¬ª', '‚Äú', '‚Äù', '‚Äò', '‚Äô','<url>','<@usuario>',
                       
                       ])

class NgramData:
    def __init__(self, N: int, vocab_size: int, tokenizer = None, embeddinds_model = None):
        self.tokenizer = tokenizer if tokenizer is not None else self.default_tokenizer
        self.punct = lista_excluidas
        self.N = N
        self.vocab_size = vocab_size
        self.UNK = "<unk>"
        self.SOS = "<s>"
        self.EOS = "</s>"
        self.embeddinds_model = embeddinds_model # TODO: implementar
    
        
    def default_tokenizer(self, text: str) -> list:
        return text.split()


    def remove_word(self, word: str) -> bool:
        word = word.lower()
        is_punct = word in self.punct
        is_digit = word.isdigit()
        return is_punct or is_digit
    

    def get_vocab(self, corpus: list) -> set:
        freq_dist = FreqDist()
        for sentence in corpus:
            tokens = self.tokenizer(sentence)
            tokens = [token.lower() for token in tokens if not self.remove_word(token)]
            freq_dist.update(tokens)
        most_common = freq_dist.most_common(self.vocab_size - 3)  # Tengo que reservar espacio para UNK, SOS, EOS
        vocab = set([word for word, _ in most_common])
        return vocab


    def fit(self, corpus: list) -> None:
        self.vocab = self.get_vocab(corpus)
        self.vocab.add(self.UNK)
        self.vocab.add(self.SOS)
        self.vocab.add(self.EOS)
        
        self.word_to_id = {}
        self.id_to_word = {}
        
        if self.embeddinds_model is not None:
            self.embedding_matriz = np.empty([len(self.vocab), self.embeddinds_model.vector_size])
            
        id = 0
        for doc in corpus:
            for word in self.tokenizer(doc):
                word_ = word.lower()
                if word_ in self.vocab and  word_ not in self.word_to_id:
                    self.word_to_id[word_] = id
                    self.id_to_word[id] = word_
                    
                    if self.embeddinds_model is not None:
                        if word_ in self.embeddinds_model:
                            self.embedding_matriz[id] = self.embeddinds_model[word_]
                        else:
                            self.embedding_matriz[id] = np.random.normal(self.embeddinds_model.vector_size)
                    id += 1

        self.word_to_id.update({self.UNK: id, self.SOS: id + 1, self.EOS: id + 2})
        self.id_to_word.update({id: self.UNK, id + 1: self.SOS, id + 2: self.EOS})
    
    
    def replace_unk(self, doc_tokens: list) -> list:
        for i, token in enumerate(doc_tokens):
            if token.lower() not in self.vocab:
                doc_tokens[i] = self.UNK
        return doc_tokens
    
    
    def get_ngram_doc(self, doc: str) -> list:
        doc_tokens = self.tokenizer(doc)
        doc_tokens = self.replace_unk(doc_tokens)
        doc_tokens = [w.lower() for w in doc_tokens]
        doc_tokens = [self.SOS] * (self.N - 1) + doc_tokens + [self.EOS]
        return list(ngrams(doc_tokens, self.N))
    
    
    def transform(self, corpus: list) -> tuple[np.ndarray, np.ndarray]:
        x_ngrams = []
        y = []
        
        for doc in corpus:
            doc_ngram = self.get_ngram_doc(doc)
            for words_in_window in doc_ngram:
                words_in_window_ids = [self.word_to_id[w] for w in words_in_window]
                x_ngrams.append(list(words_in_window_ids[:-1]))
                y.append(words_in_window_ids[-1])
        
        return np.array(x_ngrams), np.array(y)
    
    
    # =========== PROPOEDADES ===========    
    @property
    def size(self) -> int:
        return len(self.vocab)
        
                    
    

In [6]:
tk = TweetTokenizer()

ngram_data = NgramData(args.N, 5_000, tokenizer=tk.tokenize)
ngram_data.fit(x_train)


In [7]:
print(f"Tama√±o del vocabulario: {ngram_data.size:,}")

Tama√±o del vocabulario: 5,000


In [8]:
x_ngram_train, y_ngram_train = ngram_data.transform(x_train)
x_ngram_val, y_ngram_val = ngram_data.transform(x_val)


In [9]:
x_ngram_train

array([[4998, 4998, 4998],
       [4998, 4998,    0],
       [4998,    0,    0],
       ...,
       [4997,  937,   32],
       [ 937,   32, 2524],
       [  32, 2524, 4997]])

In [10]:
y_ngram_train

array([   0,    0,    0, ..., 2524, 4997, 4999])

In [11]:
# Tama√±os de los ngrams
x_train_shape = x_ngram_train.shape
y_train_shape = y_ngram_train.shape

x_val_shape = x_ngram_val.shape
y_val_shape = y_ngram_val.shape
print("TAMA√ëO DE LOS NGRAMS DE ENTRENAMIENTO")
print(f"x_ngram_train: {x_train_shape}")
print(f"y_ngram_train: {y_train_shape}")
print("TAMA√ëO DE LOS NGRAMS DE VALIDACI√ìN")
print(f"x_ngram_val: {x_val_shape}")
print(f"y_ngram_val: {y_val_shape}")

TAMA√ëO DE LOS NGRAMS DE ENTRENAMIENTO
x_ngram_train: (102751, 3)
y_ngram_train: (102751,)
TAMA√ëO DE LOS NGRAMS DE VALIDACI√ìN
x_ngram_val: (11558, 3)
y_ngram_val: (11558,)


nota: creo que los tama√±os varian segun la lista de palabras excluidas que tengo

In [12]:
lista_palabras = [[ngram_data.id_to_word[w]  for w in tw] for tw in x_ngram_train[:22]]
for i, palabras in enumerate(lista_palabras):
    print(f"{i+1}: {palabras}")

1: ['<s>', '<s>', '<s>']
2: ['<s>', '<s>', '@usuario']
3: ['<s>', '@usuario', '@usuario']
4: ['@usuario', '@usuario', '@usuario']
5: ['@usuario', '@usuario', 'q']
6: ['@usuario', 'q', 'se']
7: ['q', 'se', 'puede']
8: ['se', 'puede', 'esperar']
9: ['puede', 'esperar', 'del']
10: ['esperar', 'del', 'maricon']
11: ['del', 'maricon', 'de']
12: ['maricon', 'de', 'closet']
13: ['de', 'closet', 'de']
14: ['closet', 'de', 'la']
15: ['de', 'la', 'ya√±ez']
16: ['la', 'ya√±ez', 'aun']
17: ['ya√±ez', 'aun', 'recuerdo']
18: ['aun', 'recuerdo', 'esa']
19: ['recuerdo', 'esa', 'ves']
20: ['esa', 'ves', 'q']
21: ['ves', 'q', 'lo']
22: ['q', 'lo', 'vi']


In [13]:
y_ngram_train

array([   0,    0,    0, ..., 2524, 4997, 4999])

In [14]:
lista_palbras_en_sus_ys = [ngram_data.id_to_word[w] for w in y_ngram_train[:22]]
for i, palabra in enumerate(lista_palbras_en_sus_ys):
    print(f"{i+1}: {palabra}")

1: @usuario
2: @usuario
3: @usuario
4: q
5: se
6: puede
7: esperar
8: del
9: maricon
10: de
11: closet
12: de
13: la
14: ya√±ez
15: aun
16: recuerdo
17: esa
18: ves
19: q
20: lo
21: vi
22: en


In [15]:
args.batch_size = 64
args.num_workers = 2
DTYPE = torch.int64


def tensor_dataset(x: np.ndarray, y: np.ndarray) -> TensorDataset:
    tensor_data = TensorDataset(
        torch.tensor(x, dtype=DTYPE),
        torch.tensor(y, dtype=DTYPE))
    return tensor_data
    
    
def data_loader(dataset: TensorDataset, shuffle: bool) -> DataLoader:
    dataloader = DataLoader(dataset, 
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        shuffle=shuffle)
    return dataloader

    
# Crear los DataLoaders
train_dataset = tensor_dataset(x_ngram_train, y_ngram_train)

train_loader = data_loader(train_dataset, shuffle=True)

val_dataset = tensor_dataset(x_ngram_val, y_ngram_val)

val_loader = data_loader(val_dataset, shuffle=False)

In [16]:
batch = next(iter(train_loader))

print(f"X shape: {batch[0].shape}")
print(f"Y shape: {batch[1].shape}")

X shape: torch.Size([64, 3])
Y shape: torch.Size([64])


In [17]:
batch[0]

tensor([[4998,    0,  235],
        [  32,   27, 4997],
        [   7,  218,  161],
        [ 109,    7, 1139],
        [4998, 4998, 4998],
        [  48,  342,   45],
        [  43,    9, 1325],
        [4997, 4997,    7],
        [4997, 1799,  225],
        [4998, 4997,   48],
        [   7,    9,  565],
        [4998,  700, 4997],
        [4997, 4997, 4997],
        [3538, 4997,  165],
        [4998, 4998, 4998],
        [  15, 4997, 4997],
        [ 114,    2, 3261],
        [ 455,   51,  456],
        [  45, 4997,   83],
        [  48,  375,   48],
        [ 942,   43, 2500],
        [  60,   45, 2302],
        [ 253,   48,  255],
        [  55,   48,   21],
        [ 419, 2462,   33],
        [4998, 4998, 4998],
        [4997, 4997,   66],
        [ 931, 4997, 4997],
        [  48,  166,  128],
        [1595,  337,   17],
        [ 114,   48,   46],
        [ 980, 4997,  696],
        [4998, 4998, 4998],
        [ 112,  272, 4997],
        [1559,  192,  705],
        [ 106, 4997,

In [18]:
lista_palabras = [[ngram_data.id_to_word[w]  for w in tw] for tw in batch[0].tolist()]
for i, palabras in enumerate(lista_palabras):
    print(f"{i+1}: {palabras}")

1: ['<s>', '@usuario', 'luchona']
2: ['y', 'un', '<unk>']
3: ['de', 'perra', 'como']
4: ['lugar', 'de', 'unas']
5: ['<s>', '<s>', '<s>']
6: ['a', 'mi', 'me']
7: ['por', 'la', 'noche']
8: ['<unk>', '<unk>', 'de']
9: ['<unk>', 'instagram', 'para']
10: ['<s>', '<unk>', 'a']
11: ['de', 'la', 'palabra']
12: ['<s>', 'ah', '<unk>']
13: ['<unk>', '<unk>', '<unk>']
14: ['favoritas', '<unk>', 'pero']
15: ['<s>', '<s>', '<s>']
16: ['lo', '<unk>', '<unk>']
17: ['solo', 'se', 'escucha']
18: ['dise', 'gorda', 'nena']
19: ['me', '<unk>', 'el']
20: ['a', 'comprar', 'a']
21: ['triste', 'por', 'joe']
22: ['que', 'me', 'anda']
23: ['tiene', 'a', 'gusto']
24: ['es', 'a', 'quien']
25: ['hay', 'momento', 'm√°s']
26: ['<s>', '<s>', '<s>']
27: ['<unk>', '<unk>', 'muy']
28: ['jugar', '<unk>', '<unk>']
29: ['a', 'tu', 'madre']
30: ['a√∫n', 'sigo', 'en']
31: ['solo', 'a', 'las']
32: ['soluci√≥n', '<unk>', 'luego']
33: ['<s>', '<s>', '<s>']
34: ['cada', 'foto', '<unk>']
35: ['mamadas', '..', 'eso']
36: ['te', '<u

In [19]:
batch[1]

tensor([  32, 4997,   15, 4997,    7, 3093, 1304,  575,  160, 2407, 4999,  165,
        3840,  955,   35, 4999,   60,  242, 1207,    9, 3119, 3050, 4997,   33,
        4997,    0, 4045,   45,  166, 4997,   33,   57,   83,   55,   39,  106,
         101,   65, 4997,  109, 4997,  100,   46,  111,    7,   48, 4997, 1916,
          34,    0, 4997,  419,   48, 4997,    0, 4997,   93,   70, 4997,  128,
         318,  385,  923,   32])

In [20]:
lista_palbras_en_sus_ys = [ngram_data.id_to_word[w] for w in batch[1].tolist()]
for i, palabra in enumerate(lista_palbras_en_sus_ys):
    print(f"{i+1}: {palabra}")

1: y
2: <unk>
3: lo
4: <unk>
5: de
6: estuviera
7: dejan
8: lado
9: putas
10: pura
11: </s>
12: pero
13: escribe
14: antes
15: este
16: </s>
17: que
18: eres
19: saludo
20: la
21: jonas
22: jodiendo
23: <unk>
24: m√°s
25: <unk>
26: @usuario
27: caro
28: me
29: tu
30: <unk>
31: m√°s
32: porque
33: el
34: es
35: no
36: te
37: son
38: estoy
39: <unk>
40: lugar
41: <unk>
42: cuando
43: las
44: verga
45: de
46: a
47: <unk>
48: don
49: con
50: @usuario
51: <unk>
52: hay
53: a
54: <unk>
55: @usuario
56: <unk>
57: si
58: e
59: <unk>
60: madre
61: as√≠
62: ‚ô•
63: nadie
64: y


In [21]:
# Tama√±o del vocabulario
args.vocab_size = ngram_data.size

# Dimensionalidad del word embedding
args.d = 50

# Dimension por capa oculta
args.d_h = 100

# Dropout
args.dropout = 0.1

In [22]:
class NeuralLM(nn.Module):
    def __init__(self,args , embeddings = None):
        super(NeuralLM, self).__init__()
        self.window_size = args.N - 1
        self.embedding_dim = args.d
        self.emb = embeddings if embeddings is not None else nn.Embedding(args.vocab_size, args.d)
        # Capas fully connected 1
        self.fc1 = nn.Linear(args.d * (args.N - 1), args.d_h)
        self.drop1 = nn.Dropout(p=args.dropout)
        # Capa fully connected 2 que va a predecir la siguiente palabra
        self.fc2 = nn.Linear(args.d_h, args.vocab_size, bias=False)
        
    def forward(self, x):
        x = self.emb(x)
        x = x.view(-1, self.window_size * self.embedding_dim)
        h = F.relu(self.fc1(x)) # relu(z) = max(0, z)
        h = self.drop1(h)
        out = self.fc2(h)
        return out

In [23]:
def get_preds(raw_logits):
    probs = F.softmax(raw_logits.detach(), dim=1)
    y_pred = torch.argmax(probs, dim=1).cpu().numpy()
    return y_pred

In [24]:
def model_eval(data,model, gpu = False):
    with torch.no_grad():
        preds, tgts = [], []
        for window_words, labels in data:
            if gpu:
                window_words = window_words.cuda()
                labels = labels.cuda()
            outputs = model(window_words)
            # Obtener predicci√≥n
            y_pred = get_preds(outputs)
            
            tgt = labels.cpu().numpy()
            tgts.append(tgt)
            preds.append(y_pred)
            
    tgts = [e for seq in tgts for e in seq ]
    preds = [e for seq in preds for e in seq ]
    acurracy = accuracy_score(tgts, preds)
    return acurracy

In [25]:
def save_checkpoint(state, is_best, check_point_path, filename='checkpoint.pt'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, os.path.join(check_point_path, 'model_best.pt'))

In [26]:
# Model hyperparameters
args.vocab_size = ngram_data.size
args.d = 100 # Dimensionalidad del word embedding
args.d_h = 200 # Dimension por capa oculta
args.dropout = 0.1 

# Training hyperparameters
args.lr = 2.3e-1
args.num_epochs = 100
args.patience = 20

# Scheduler hyperparameters
args.lr_patience = 10 # N√∫mero de √©pocas sin mejora para reducir lr
args.lr_factor = 0.5  # Factor de reducci√≥n del lr

# Saving directory
args.savedir = 'model'
os.makedirs(args.savedir, exist_ok=True)

# Create model
model = NeuralLM(args)

# Send to GPU if available
args.gpu = torch.cuda.is_available()
if args.gpu:
    model = model.cuda()
    
# Loss, Optimizer and Scheduler
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=args.lr)


scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=args.lr_patience,
    #verbose=True,
    factor=args.lr_factor
)

In [29]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.metrics._classification")

start_time = time.perf_counter()
base_metric = 0
n_no_improve = 0
metric_history = []
tran_metric_history = []

for epoch in range(args.num_epochs):
    epoch_start_time = time.perf_counter()
    loss_epoch = []
    training_metric = []
    model.train()
    
    # ======== Training ========
    for window_word, labels in train_loader:
        if args.gpu:
            window_word = window_word.cuda()
            labels = labels.cuda()
        
        
        # Forward pass
        outputs = model(window_word)
        loss = criterion(outputs, labels)
        loss_epoch.append(loss.item())
        
        # Get training metric
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt, y_pred))
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
    # Get metric in training dataset
    mean_epoch_metric = np.mean(training_metric)
    tran_metric_history.append(mean_epoch_metric)
    
    
    # Get metric in validation dataset
    model.eval()
    tunning_metric = model_eval(val_loader, model, gpu=args.gpu)
    metric_history.append(mean_epoch_metric)
    
    # Update scheduler
    scheduler.step(tunning_metric)
    
    # Check for improvement
    is_improvement = tunning_metric > base_metric
    if is_improvement:
        base_metric = tunning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1
        
    # Save checkpoint
    save_checkpoint({
        'epoch': epoch + 1,
        'state_dict': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        'scheduler': scheduler.state_dict(),
        'base_metric': base_metric,
    }, is_improvement, 
        args.savedir,)
    
    # Early stopping
    if n_no_improve >= args.patience:
        print(f"No hubo mejora en las √∫ltimas {args.patience} √©pocas. Terminando entrenamiento.")
        break
    print(f'Traing accuracy: {mean_epoch_metric}')
    print(f"√âpoca [{epoch+1}/{args.num_epochs}] "
          f"Tiempo: {time.perf_counter() - epoch_start_time:.2f}s "
          f"P√©rdida: {np.mean(loss_epoch):.4f} "
          f"Precisi√≥n Entrenamiento: {mean_epoch_metric:.4f} "
          f"Precisi√≥n Validaci√≥n: {tunning_metric:.4f} "
          f"{'MEJORA' if is_improvement else ''}")
print(f"Tiempo total de entrenamiento: {time.perf_counter() - start_time:.2f}s")

Traing accuracy: 0.41084058520467603
√âpoca [1/100] Tiempo: 6.32s P√©rdida: 2.4705 Precisi√≥n Entrenamiento: 0.4108 Precisi√≥n Validaci√≥n: 0.2035 MEJORA
Traing accuracy: 0.40885458512433215
√âpoca [2/100] Tiempo: 5.23s P√©rdida: 2.4723 Precisi√≥n Entrenamiento: 0.4089 Precisi√≥n Validaci√≥n: 0.2055 MEJORA
Traing accuracy: 0.4123517404491222
√âpoca [3/100] Tiempo: 5.82s P√©rdida: 2.4645 Precisi√≥n Entrenamiento: 0.4124 Precisi√≥n Validaci√≥n: 0.2060 MEJORA
Traing accuracy: 0.4112887533643997
√âpoca [4/100] Tiempo: 5.21s P√©rdida: 2.4626 Precisi√≥n Entrenamiento: 0.4113 Precisi√≥n Validaci√≥n: 0.2074 MEJORA
Traing accuracy: 0.41057915377817056
√âpoca [5/100] Tiempo: 5.03s P√©rdida: 2.4656 Precisi√≥n Entrenamiento: 0.4106 Precisi√≥n Validaci√≥n: 0.2064 
Traing accuracy: 0.40935861738239665
√âpoca [6/100] Tiempo: 5.13s P√©rdida: 2.4654 Precisi√≥n Entrenamiento: 0.4094 Precisi√≥n Validaci√≥n: 0.2031 
Traing accuracy: 0.4141899203591371
√âpoca [7/100] Tiempo: 5.41s P√©rdida: 2.4389 Precisi√

In [30]:
def print_closest_words(embeddings, ngram_data, word, k=5):
    if word not in ngram_data.vocab:
        print(f"La palabra '{word}' no est√° en el vocabulario.")
        return
    
    word_id = torch.tensor([ngram_data.word_to_id[word]])
    word_embed = embeddings(word_id)
    dist = torch.norm(embeddings.weight - word_embed, dim=1).detach()
    lst = sorted(enumerate(dist.numpy()), key=lambda x: x[1]) # Ordenar por la menor distancia
    for idx, difference in lst[1:k+1]:  # Saltar el primero porque es la misma palabra
        print(f"Palabra: {ngram_data.id_to_word[idx]}, Distancia: {difference:.4f}")
    

In [36]:
best_model = NeuralLM(args)
best_model.load_state_dict(torch.load('model/model_best.pt')["state_dict"])
best_model.train(False)


print("-"*30)
print("Learning word embeddings")
print("-"*30)

print_closest_words(best_model.emb, ngram_data, word="jaja", k=10)

------------------------------
Learning word embeddings
------------------------------
Palabra: <unk>, Distancia: 10.9262
Palabra: examen, Distancia: 11.0856
Palabra: pens√©, Distancia: 11.0972
Palabra: ojotes, Distancia: 11.1100
Palabra: el, Distancia: 11.1328
Palabra: mordidas, Distancia: 11.1361
Palabra: l, Distancia: 11.2239
Palabra: #aborto, Distancia: 11.2842
Palabra: reglas, Distancia: 11.3070
Palabra: üë¨, Distancia: 11.3209


In [37]:
def parse_text(text, tokenizador):
    all_tokens = [w.lower() if w in ngram_data.word_to_id else '<unk>' for w in tokenizador(text)]
    tokens_ids = [ngram_data.word_to_id[w] for w in all_tokens]
    return all_tokens, tokens_ids

In [38]:
def sample_next_word(raw_logits, temperature=1.0):
    logits = np.asanyarray(raw_logits).astype(np.float64)
    
    preds = logits / temperature
    expo_preds = np.exp(preds)
    preds = expo_preds / np.sum(expo_preds)
    
    proabs = np.random.multinomial(1, preds)
    return np.argmax(proabs)

In [39]:
def pred_next_token(model, token_ids):
    word_ids_tensor = torch.LongTensor(token_ids).unsqueeze(0)
    y_raw_pred = model(word_ids_tensor).squeeze(0).detach().numpy()

    
    y_pred = sample_next_word(y_raw_pred, temperature=1.0)
    return y_pred

In [40]:
def generar_texto(model, initial_text, tokenizador):
    all_tokens, window_word_ids = parse_text(initial_text, tokenizador)
    for i in range(100):
        y_pred = pred_next_token(best_model, window_word_ids)
        next_word = ngram_data.id_to_word[y_pred]
        all_tokens.append(next_word)
        if next_word == ngram_data.EOS:
            break
        else:
            window_word_ids.pop(0)
            window_word_ids.append(y_pred)
            
    return ' '.join(all_tokens)

In [91]:
initial_tokens = "<s> <s> fernando "

print("-"*30)
print("Learning word embeddings")
print("-"*30)
print(generar_texto(best_model, initial_tokens, tk.tokenize))

------------------------------
Learning word embeddings
------------------------------
<s> <s> fernando sus putas <unk> para que vean que ya te gano a <unk> o estar mamando <unk> gracias <unk> <unk> </s>


In [93]:
def log_likelihood(model, text, ngram_model):
    x,y = ngram_model.transform([text])
    x,y = x[2:], y[2:]
    x = torch.LongTensor(x).unsqueeze(0)
    logits = model(x).detach()
    probs = F.softmax(logits, dim=1).numpy()
    
    y = np.sum([np.log(probs[i][word]) for i, word in enumerate(y)])
    return y

In [97]:
print(log_likelihood(best_model, "hola como estas", ngram_data))
print(log_likelihood(best_model, "estas hola como ", ngram_data))

-16.62249
-18.758595


# ESTRUCTURAS SINTACTICAS CORRECTAS

In [109]:
from itertools import permutations
from random import shuffle

word_list = "si no gano me voy a la chingada".split(' ')
permutaciones = [" ".join(p) for p in permutations(word_list)]

permutaciones_ordenadas = sorted([(log_likelihood(best_model, text, ngram_data), text) for text in permutaciones], reverse=True)[:5]

print("-"*30)
print("MEJORES PERMUTACIONES SEG√öN LOG-LIKELIHOOD")
print("-"*30)
for p, t in permutaciones_ordenadas:
    print(f"Log-Likelihood: {p:.4f} | Texto: {t}")
    
peores_permutaciones_ordenadas = sorted([(log_likelihood(best_model, text, ngram_data), text) for text in permutaciones], reverse=True)[-5:]

print("-"*30)
print("PEORES PERMUTACIONES SEG√öN LOG-LIKELIHOOD")
print("-"*30)
for p, t in peores_permutaciones_ordenadas:
    print(f"Log-Likelihood: {p:.4f} | Texto: {t}")
    


------------------------------
MEJORES PERMUTACIONES SEG√öN LOG-LIKELIHOOD
------------------------------
Log-Likelihood: -23.7495 | Texto: si gano no me voy a la chingada
Log-Likelihood: -24.1518 | Texto: no gano si me voy a la chingada
Log-Likelihood: -25.5432 | Texto: gano no me voy a la chingada si
Log-Likelihood: -25.8474 | Texto: no gano me voy a la chingada si
Log-Likelihood: -27.2372 | Texto: gano si no me voy a la chingada
------------------------------
PEORES PERMUTACIONES SEG√öN LOG-LIKELIHOOD
------------------------------
Log-Likelihood: -109.0404 | Texto: si no gano voy chingada la me a
Log-Likelihood: -109.8371 | Texto: no la voy chingada gano me a si
Log-Likelihood: -110.4188 | Texto: a no gano voy chingada la me si
Log-Likelihood: -111.0062 | Texto: la no a voy chingada gano me si
Log-Likelihood: -112.5344 | Texto: a no la voy chingada gano me si
