# Practica 5

Eric Lemus Avalos

In [1]:
# Librerias necesarias 
import os 
import time 
import shutil 
import random 
from typing import Tuple 
from argparse import Namespace

import pandas as pd
import numpy as np  

from nltk import ngrams # extraer los N-gramas
from nltk.probability import FreqDist
from nltk.tokenize import TweetTokenizer

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

from sklearn.metrics import accuracy_score

In [2]:
# Reproducibility: seed every random number generator this notebook draws from.
seed = 777
random.seed(seed)
# NumPy has its own global RNG; np.random.multinomial is used later when
# sampling generated text, so it must be seeded as well.
np.random.seed(seed)
torch.manual_seed(seed)

# cudnn.benchmark lets cuDNN auto-tune which algorithm it uses based on the
# tensor sizes it sees; it is switched off (and deterministic kernels are
# requested) so repeated runs pick the same algorithms.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.cuda.empty_cache()

In [92]:
# Report the installed PyTorch version and the CUDA version it was built against.
print(torch.__version__)          
print(torch.version.cuda) 


2.4.1
12.4


Trabajamos sobre los datos (tweets) de la practica 4. 

In [3]:
# Directory holding the Practica 4 tweet corpus. The default assumes the
# notebook runs from the Practica_5 folder, next to Practica_4; set the
# MEX_DATA_DIR environment variable to point somewhere else.
data_dir = os.environ.get('MEX_DATA_DIR', os.path.join('..', 'Practica_4', 'Data'))

path_train = os.path.join(data_dir, 'mex_train.txt')
path_val = os.path.join(data_dir, 'mex_val.txt')
path_trg_val = os.path.join(data_dir, 'mex_val_labels.txt')
path_trg_train = os.path.join(data_dir, 'mex_train_labels.txt')

# One tweet (or one label) per line, no header row.
# NOTE(review): read_csv splits on commas by default; this presumably relies on
# the tweets containing no commas -- confirm against the data files.
df_train = pd.read_csv(path_train, header=None)
df_trg_train = pd.read_csv(path_trg_train, header=None)
df_val = pd.read_csv(path_val, header=None)
df_trg_val = pd.read_csv(path_trg_val, header=None)

# Column 0 holds the tweet text.
x_train = df_train[0].tolist()
x_val   = df_val[0].tolist()

In [4]:
# Peek at the first ten raw training tweets.
x_train[:10]

['lo peor de todo es que no me dan por un tiempo y luego vuelven estoy hasta la verga de estl',
 'a la vga no seas mamón 45 putos minutos después me dices que apenas sales no me querías avisar en 3 horas? 😑',
 'considero que lo más conveniente seria que lo retes a unos vergazos mi jelipe! rómpele la madre a ese pinchi joto!',
 'el marica de mi ex me tiene bloqueada de todo así  uno no puede admirar la "belleza" de su garnacha 😂',
 'mujer despechadaya pinche amlo hazle esta que se pela la loca #reynosafollow #reynosa',
 'putos. no tienen madre. ambriados mantenidos. ojetes. como es posible. mejor matarlos',
 'ustedes si puden andar de chanceros pero cuidadito y seamos nosotras porque luego luego empiezan a mamar hijos de la chingada.',
 '@usuario jajjaja te digo esa madre si está buena ajjaja',
 'odio los putos trámites de titulación 😡😡😡😡😡😡😡😡😡😡😡 pero me urge la precedula.',
 '@usuario no te equivocabas mi madre y tu tenían muchísima razón siempre es mejor lo que viene 💚']

In [5]:
# 4-grams: start from an empty Namespace and store the n-gram order N on it.
args = Namespace()
args.N = 4


In [96]:
# El archivo NgramData.py contiene la clase del mismo nombre utilizada para crear los n-gramas
from NgramData import NgramData

Creamos un objeto TweetTokenizer  

In [57]:
# Tokenizer for tweets; only its bound `tokenize` method is handed to NgramData.
tok = TweetTokenizer(reduce_len = True)
tokenizador = tok.tokenize
# NgramData is project-local. NOTE(review): the second argument (5000) is
# presumably the maximum vocabulary size -- the vocabulary size printed in a
# later cell is 5000 -- confirm against NgramData.py.
ngram_data = NgramData(args.N, 5000, tokenizador)
ngram_data.fit(x_train)

In [8]:
# Show how many entries the fitted vocabulary has.
print('Vocal size: ', ngram_data.get_vocab_size())
# ngram_data.vocab

Vocal size:  5000


In [9]:
# First ten vocabulary entries in the iteration order of `ngram_data.vocab`.
list(ngram_data.vocab)[:10]

['ta',
 'alfa',
 'puedes',
 'caricias',
 'regalar',
 'wau',
 'libro',
 'conoces',
 'mande',
 'ética']

In [10]:
# Inspect the first ten (word -> id) pairs of the vocabulary mapping.
w = ngram_data.word2id 
muestra_m = dict(list(w.items())[:10])
print(muestra_m)

{'lo': 0, 'peor': 1, 'de': 2, 'todo': 3, 'es': 4, 'que': 5, 'no': 6, 'me': 7, 'dan': 8, 'por': 9}


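**Nota:** Necesitamos dejar los tokens listos para ser tomados por el modelo. La función `transform` de la clase convierte cada texto en ventanas de contexto (secuencias de ids) junto con la palabra objetivo de cada ventana.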


&nbsp;


Ejemplo: 

 ```python
                                  [
                                    ['<s>', '<s>', '<s>'],
                                    ['<s>', '<s>', 'lo'],
 Lo peor de todo es que no me ->    ['<s>', 'lo', 'peor'],
                                    ['lo', 'peor', 'de'],
                                    ['peor', 'de', 'todo'],
                                    ['de', 'todo', 'es'],
                                    ['todo', 'es', 'que'],
                                    ['es', 'que', 'no'],
                                    ['que', 'no', 'me']
                                                         ]

```


### Dataset y Dataloader 


Preparamos los datos para ingresarlos a un TensorDataset

In [11]:
# Turn each tweet into context windows (X) and the word id that follows each window (y).
X_ngram_train, y_ngram_train = ngram_data.transform(x_train)
X_ngram_val, y_ngram_val   = ngram_data.transform(x_val)

In [93]:
# Sanity check: X has one row per n-gram and N-1 columns; y has one target per row.
print(f'Shape of X_ngram_train: {X_ngram_train.shape}')
print(f'Shape of y_ngram_train: {y_ngram_train.shape}')


Shape of X_ngram_train: (106823, 3)
Shape of y_ngram_train: (106823,)


In [94]:
# Same shape check for the validation split.
print(f'Shape of X_ngram_val: {X_ngram_val.shape}')
print(f'Shape of y_ngram_val: {y_ngram_val.shape}')


Shape of X_ngram_val: (11587, 3)
Shape of y_ngram_val: (11587,)


In [15]:
# Map the ids of the first 20 context windows back to words to inspect them.
[[ngram_data.id2word[id] for id in lista] for lista in X_ngram_train[:20]]


[['<s>', '<s>', '<s>'],
 ['<s>', '<s>', 'lo'],
 ['<s>', 'lo', 'peor'],
 ['lo', 'peor', 'de'],
 ['peor', 'de', 'todo'],
 ['de', 'todo', 'es'],
 ['todo', 'es', 'que'],
 ['es', 'que', 'no'],
 ['que', 'no', 'me'],
 ['no', 'me', 'dan'],
 ['me', 'dan', 'por'],
 ['dan', 'por', 'un'],
 ['por', 'un', 'tiempo'],
 ['un', 'tiempo', 'y'],
 ['tiempo', 'y', 'luego'],
 ['y', 'luego', 'vuelven'],
 ['luego', 'vuelven', 'estoy'],
 ['vuelven', 'estoy', 'hasta'],
 ['estoy', 'hasta', 'la'],
 ['hasta', 'la', 'verga']]

In [16]:
# Batching options shared by both loaders.
args.batch_size, args.num_workers = 32, 2

# Datasets: pair every context window with its target word id, both as int64
# tensors (the index dtype expected by nn.Embedding and CrossEntropyLoss).
train_dataset = TensorDataset(
    torch.tensor(X_ngram_train, dtype=torch.int64),
    torch.tensor(y_ngram_train, dtype=torch.int64),
)
val_dataset = TensorDataset(
    torch.tensor(X_ngram_val, dtype=torch.int64),
    torch.tensor(y_ngram_val, dtype=torch.int64),
)

# Loaders: only the training split is shuffled; validation keeps corpus order.
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    pin_memory=False,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
)
valid_loader = DataLoader(
    val_dataset,
    shuffle=False,
    pin_memory=False,
    batch_size=args.batch_size,
    num_workers=args.num_workers,
)

In [17]:
# Pull a single batch to verify the shapes the model will receive.
batch = next(iter(train_loader))
print(f'X shape: {batch[0].shape}')
print(f'y shape: {batch[1].shape}')

X shape: torch.Size([32, 3])
y shape: torch.Size([32])


In [97]:
muestra_batch = batch[0]
# Use the id2word dictionary to view the batch contents as words.
[[ngram_data.id2word[id] for id in lista] for lista in muestra_batch.tolist()][:20]


[['de', 'calor', 'encima'],
 ['bastardo', 'de', '<unk>'],
 ['con', 'sus', 'quejas'],
 ['quiero', 'vivir', 'sola'],
 ['la', 'que', 'no'],
 ['mil', 'putas', 'juntas'],
 ['empedar', 'la', 'vida'],
 ['me', 'gusta', 'alguien'],
 ['<s>', '<s>', 'hola'],
 ['la', 'madre', 'no'],
 ['a', '<unk>', '<unk>'],
 ['a', 'ti', 'te'],
 ['de', 'la', 'verga'],
 ['<s>', 'putos', 'chairos'],
 ['<s>', 'este', 'hdp'],
 ['de', '<unk>', 'donde'],
 ['de', 'los', 'videos'],
 ['poner', 'al', 'tu'],
 ['weyes', 'a', 'lado'],
 ['se', 'vienen', 'los']]

**Notas**

In [99]:
# Example of what an embedding table looks like: 10 rows (ids), 3 dimensions each.
e = nn.Embedding(10,3)
print(e.weight) 

print('-----'*10)
print('\n')

print('Hacemos lo equivalente a la operacion de usar vectores one-hot\n')
# Indexing with ids 0, 2 and 5 returns rows 0, 2 and 5 of the weight matrix.
e(torch.tensor([0,2,5])) 


Parameter containing:
tensor([[-0.5027,  0.7398,  0.0321],
        [-0.7081,  0.9164, -2.3439],
        [ 0.4917, -1.1725, -0.7897],
        [ 0.7475, -1.8150, -0.8740],
        [ 2.1855,  0.2381, -0.4865],
        [-0.3414, -0.8560,  0.8810],
        [-1.1504,  0.8726, -0.2438],
        [ 0.7136,  0.3745,  1.3340],
        [ 0.6958,  1.3113,  0.0672],
        [-1.0748, -1.1580, -0.3790]], requires_grad=True)
--------------------------------------------------


Hacemos lo equivalente a la operacion de usar vectores one-hot



tensor([[-0.5027,  0.7398,  0.0321],
        [ 0.4917, -1.1725, -0.7897],
        [-0.3414, -0.8560,  0.8810]], grad_fn=<EmbeddingBackward0>)

In [100]:
# detach(): returns a new Tensor, detached from the current autograd graph.
tensor1 = torch.tensor([7.8, 3.2, 4.4, 4.3, 3.3], requires_grad=True) 
print(tensor1) 
  
# The detached tensor holds the same values but no longer requires grad.
print(tensor1.detach()) 

tensor([7.8000, 3.2000, 4.4000, 4.3000, 3.3000], requires_grad=True)
tensor([7.8000, 3.2000, 4.4000, 4.3000, 3.3000])


### Modelo de Bengio 

In [19]:
class Model_NLM_Bengio(nn.Module):
    """Feed-forward neural language model over a fixed context window.

    The ``args.N - 1`` context word ids are embedded, their embeddings are
    concatenated into one vector, passed through a ReLU hidden layer with
    dropout, and projected to one logit per vocabulary entry.

    ``args`` must expose ``N``, ``vocab_size``, ``d_embeddings``, ``d_h`` and
    ``dropout``.
    """

    def __init__(self, args):
        super().__init__()
        context_len = args.N - 1
        emb_dim = args.d_embeddings

        self.window_size = context_len
        self.embedding_size = emb_dim

        self.embedding = nn.Embedding(args.vocab_size, emb_dim)
        self.fc1 = nn.Linear(emb_dim * context_len, args.d_h)
        self.drop1 = nn.Dropout(p=args.dropout)
        # Output projection has no bias term.
        self.fc2 = nn.Linear(args.d_h, args.vocab_size, bias=False)

    def forward(self, x):
        """Return raw (unnormalised) logits with one row per context window."""
        flat_dim = self.window_size * self.embedding_size
        # Look up the embeddings and concatenate each window into one vector.
        embedded = self.embedding(x).view(-1, flat_dim)
        # ReLU(x) = max(0, x), followed by dropout on the hidden activations.
        hidden = self.drop1(F.relu(self.fc1(embedded)))
        return self.fc2(hidden)


In [101]:
# Turn raw logits into hard class predictions.
def get_preds(raw_logits):
    '''
    Return the most probable class id for every row of ``raw_logits``.

    The logits are detached from the autograd graph, normalised with a softmax
    over dim 1, and the arg-max of each row is returned as a NumPy array on the
    CPU.
    '''
    row_probs = F.softmax(raw_logits.detach(), dim = 1)
    return row_probs.argmax(dim = 1).cpu().numpy()


**Evaluacion del modelo**

In [98]:
def model_eval(data,model, gpu = False):
    '''
    Compute the accuracy of ``model`` over every batch yielded by ``data``.

    ``data`` yields ``(window_words, labels)`` batches. When ``gpu`` is true the
    inputs are moved to CUDA before the forward pass; labels are read on the CPU.
    Gradients are disabled for the whole pass.
    '''
    predicted, expected = [], []
    with torch.no_grad():
        for window_words, labels in data:
            inputs = window_words.cuda() if gpu else window_words
            # Accumulate per-example predictions and targets as flat lists.
            predicted.extend(get_preds(model(inputs)))
            expected.extend(labels.numpy())

    return accuracy_score(expected, predicted)


def save_checkpoint(state, is_best, checkpoint_patn, filename = 'checkpoint.pt'):
    '''
    Serialise ``state`` to ``<checkpoint_patn>/<filename>`` with ``torch.save``.

    When ``is_best`` is true the file just written is also copied to
    ``<checkpoint_patn>/model_best.pt``.
    '''
    target = os.path.join(checkpoint_patn, filename)
    torch.save(state, target)
    if not is_best:
        return
    best_copy = os.path.join(checkpoint_patn, 'model_best.pt')
    shutil.copyfile(target, best_copy)

In [35]:
# Model hyperparameters
args.vocab_size = ngram_data.get_vocab_size()
args.d_embeddings = 100
args.d_h = 200
args.dropout = 0.1

# Train hyperparameters
args.lr = 2.3e-1
args.num_epochs = 100
args.patience = 20  # epochs without validation improvement before early stopping

# Scheduler hyperparameters
args.lr_patience = 10
args.lr_factor = 0.5

# Directory where checkpoints are written
args.savedir = 'model'
os.makedirs(args.savedir, exist_ok=True)


# Model
model = Model_NLM_Bengio(args)

args.use_gpu = torch.cuda.is_available()
if args.use_gpu:
     model.cuda()
else:
     model.cpu()

criterion = nn.CrossEntropyLoss()
optimizador = torch.optim.SGD(model.parameters(), lr = args.lr)
# The training loop steps this scheduler with the validation *accuracy*, which
# should increase, so plateau detection must use mode='max'. With mode='min'
# every accuracy gain would count as a bad epoch and trigger LR cuts.
# The deprecated `verbose` argument is intentionally not passed.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer = optimizador, mode = 'max',
                                                      patience = args.lr_patience,
                                                      factor = args.lr_factor)


In [28]:
# Ask CUDA to launch kernels synchronously so errors are reported at the failing call.
# NOTE(review): presumably this only takes effect if set before CUDA is first
# initialised in the process -- confirm it runs before any .cuda() call.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

### Entrenamiento 

In [36]:
start_time = time.time()
best_metric = 0
n_no_improve = 0  # consecutive epochs without validation improvement
metric_history = []        # validation accuracy per epoch
train_metric_history = []  # mean training batch accuracy per epoch

for epoch in range(args.num_epochs):
    epoch_start_time = time.time()
    loss_epoch = []
    training_metric = []
    model.train()

    for window_words, labels in train_loader:

        # If GPU available
        if args.use_gpu:
            window_words = window_words.cuda()
            labels = labels.cuda()

        # Forward pass
        outputs = model(window_words)
        loss = criterion(outputs, labels)
        loss_epoch.append(loss.item())

        # Get training metrics
        y_pred = get_preds(outputs)
        tgt = labels.cpu().numpy()
        training_metric.append(accuracy_score(tgt, y_pred))

        # Backward and optimize
        optimizador.zero_grad()
        loss.backward()
        optimizador.step()

    # Get metric in training dataset
    mean_epoch_metric = np.mean(training_metric)
    train_metric_history.append(mean_epoch_metric)

    # Get metric in validation dataset
    model.eval()
    tuning_metric = model_eval(valid_loader, model, gpu = args.use_gpu)
    metric_history.append(tuning_metric)

    # Update scheduler with the validation metric
    scheduler.step(tuning_metric)

    # Check for metric improvement
    is_improvement = tuning_metric > best_metric
    if is_improvement:
        best_metric = tuning_metric
        n_no_improve = 0
    else:
        n_no_improve += 1

    # Always write the latest checkpoint; save_checkpoint additionally copies it
    # to model_best.pt when this epoch improved the validation metric.
    save_checkpoint(
        {
            "epoch": epoch + 1,
            "state_dict": model.state_dict(),
            "optimizer": optimizador.state_dict(),
            "scheduler": scheduler.state_dict(),
            "best_metric": best_metric
        },
        is_improvement,
        args.savedir
    )

    # Early stopping
    if n_no_improve >= args.patience:
        print("No improvement. Breaking out of loop.")
        break

    print('Train acc: {}'.format(mean_epoch_metric))
    print('Epoch [{}/{}], Loss: {:.4f} - Val accuracy: {:.4f} - Epoch time: {:.2f}'
          .format(epoch+1, args.num_epochs, np.mean(loss_epoch), tuning_metric, (time.time() - epoch_start_time)))

print("--- %s seconds ---" % (time.time() - start_time))


Train acc: 0.14828675180764128
Epoch [1/100], Loss: 5.5514 - Val accuracy: 0.1801 - Epoch time: 13.97
Train acc: 0.16396322680015402
Epoch [2/100], Loss: 5.1085 - Val accuracy: 0.1886 - Epoch time: 14.07
Train acc: 0.170683053095452
Epoch [3/100], Loss: 4.8670 - Val accuracy: 0.1952 - Epoch time: 13.94
Train acc: 0.17637337954049542
Epoch [4/100], Loss: 4.6739 - Val accuracy: 0.1768 - Epoch time: 14.36
Train acc: 0.17969585632995336
Epoch [5/100], Loss: 4.5045 - Val accuracy: 0.1887 - Epoch time: 14.18
Train acc: 0.18510006203739357
Epoch [6/100], Loss: 4.3469 - Val accuracy: 0.1988 - Epoch time: 14.07
Train acc: 0.18974618149146452
Epoch [7/100], Loss: 4.2001 - Val accuracy: 0.1446 - Epoch time: 14.35
Train acc: 0.19519049758268087
Epoch [8/100], Loss: 4.0738 - Val accuracy: 0.2114 - Epoch time: 14.01
Train acc: 0.2049694091473067
Epoch [9/100], Loss: 3.9507 - Val accuracy: 0.1296 - Epoch time: 14.00
Train acc: 0.21405708509819021
Epoch [10/100], Loss: 3.8444 - Val accuracy: 0.1832 - 

### Exploracion 

In [46]:
def print_closest_word(embedding,ngram_data, word, n):
     '''
     Print the ``n`` vocabulary entries whose embedding is closest to ``word``.

     Distance is the Euclidean norm between rows of ``embedding.weight`` and the
     embedding of ``word``. Each printed line is "<word> <distance>", nearest
     first. ``word`` must be a key of ``ngram_data.word2id``.
     '''
     query_id = torch.LongTensor([ngram_data.word2id[word]])
     query_vec = embedding(query_id)
     distances = torch.norm(embedding.weight - query_vec, dim = 1).detach().numpy()
     ranked_ids = sorted(range(len(distances)), key = lambda i: distances[i])
     # The first ranked entry is skipped (the query word itself is at distance 0).
     for idx in ranked_ids[1:n+1]:
          print(ngram_data.id2word[idx], distances[idx])


In [42]:
# Load the best checkpoint (with its trained embeddings) into a fresh model.
best_model = Model_NLM_Bengio(args)
# The training loop writes model_best.pt into args.savedir, so read it from
# there instead of a machine-specific absolute path.
bets_model_path = os.path.join(args.savedir, 'model_best.pt')
# map_location='cpu' lets a checkpoint saved from a GPU run load on a CPU-only
# machine. weights_only=False is explicit because the checkpoint also stores
# optimizer/scheduler state and metrics; only load checkpoints you produced.
checkpoint = torch.load(bets_model_path, map_location='cpu', weights_only=False)
best_model.load_state_dict(checkpoint['state_dict'])
best_model.eval()

  best_model.load_state_dict(torch.load(bets_model_path)['state_dict'])


Model_NLM_Bengio(
  (embedding): Embedding(5000, 100)
  (fc1): Linear(in_features=300, out_features=200, bias=True)
  (drop1): Dropout(p=0.1, inplace=False)
  (fc2): Linear(in_features=200, out_features=5000, bias=False)
)

In [49]:
# List the 10 vocabulary entries nearest to 'perro' in the learned embedding space.
print('-'*20)
print('Learned Embeddings')
print('-'*20)
print_closest_word(best_model.embedding, ngram_data, 'perro', 10)

--------------------
Learned Embeddings
--------------------
<s> 10.153593
<unk> 10.441092
que 10.998937
de 11.083012
parece 11.109263
aguas 11.172365
jajajaa 11.212874
ptm 11.261413
tendré 11.264101
cama 11.266591


Podemos utilizarlo para generar texto 

In [67]:
def parse_text(text,tokenizador):
    '''
    Tokenise ``text`` and map every token to its vocabulary id.

    ``tokenizador`` is any object with a ``tokenize(str) -> list[str]`` method.
    Tokens are lower-cased *before* the vocabulary lookup, so a capitalised word
    that is in the vocabulary is kept instead of being replaced; tokens still
    unknown after lower-casing become '<unk>'.

    Returns ``(tokens, ids)``: the normalised tokens and their ids in the global
    ``ngram_data.word2id`` mapping.
    '''
    vocab = ngram_data.word2id
    all_tokens = []
    for raw_token in tokenizador.tokenize(text):
        token = raw_token.lower()
        all_tokens.append(token if token in vocab else '<unk>')
    tokens_ids = [vocab[token] for token in all_tokens]
    return all_tokens, tokens_ids


def sample_next_word(logits, temperature = 1.0):
    '''
    Sample one index from the softmax distribution of ``logits``.

    ``temperature`` controls randomness: values below 1 sharpen the distribution,
    values above 1 flatten it. It must be strictly positive.

    Returns the sampled index (NumPy integer). Raises ``ValueError`` when
    ``temperature`` is not positive.
    '''
    if temperature <= 0:
        raise ValueError('temperature must be > 0')

    # float64 keeps the normalised probabilities accurate enough for multinomial.
    scaled = np.asarray(logits, dtype='float64') / temperature
    # Subtracting the maximum leaves the softmax unchanged but keeps np.exp from
    # overflowing to inf (and the result from becoming nan) on large logits.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    probs = exp_preds / np.sum(exp_preds)
    draw = np.random.multinomial(1, probs)
    return np.argmax(draw)

def predic_next_token(model, token_ids, temperature = 1.0):
    '''
    Sample the id of the next token given one context window.

    ``token_ids`` is the list of context ids; it is wrapped into a batch of one
    before being passed to ``model``. The resulting logits are handed to
    ``sample_next_word`` together with ``temperature``.
    '''
    word_ids_tensor = torch.LongTensor(token_ids).unsqueeze(0)
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        y_raw_pred = model(word_ids_tensor).squeeze(0).detach().cpu().numpy()

    return sample_next_word(y_raw_pred, temperature)


def generate_text(model, initial_text, tokenizador, n_pred = 100):
    '''
    Generate up to ``n_pred`` tokens that continue ``initial_text``.

    The prompt is fitted to the model's context size (``model.window_size`` when
    the model exposes it): shorter prompts are left-padded with '<s>' and longer
    ones keep only their most recent tokens. Without that, a prompt of the wrong
    length would be reshaped into the wrong number of windows. Generation stops
    early when '</s>' is produced.

    Returns the prompt tokens and the generated tokens joined by spaces.
    '''
    all_tokens, prompt_ids = parse_text(initial_text, tokenizador)

    window_word_ids = list(prompt_ids)
    window_size = getattr(model, 'window_size', len(window_word_ids))
    missing = window_size - len(window_word_ids)
    if missing > 0:
        window_word_ids = [ngram_data.word2id['<s>']] * missing + window_word_ids
    # Keep only the most recent `window_size` ids.
    window_word_ids = window_word_ids[len(window_word_ids) - window_size:]

    for _ in range(n_pred):
        y_pred = predic_next_token(model, window_word_ids)
        next_word = ngram_data.id2word[y_pred]
        all_tokens.append(next_word)

        if next_word == '</s>':
            break

        # Slide the window: drop the oldest id and append the new one.
        window_word_ids.pop(0)
        window_word_ids.append(y_pred)

    return ' '.join(all_tokens)


In [71]:
# Tokenise prompts with the same settings used when the vocabulary was built.
tk = TweetTokenizer(reduce_len = True)
# Start from an all-padding context so the model generates a tweet from scratch.
initial_text = '<s> <s> <s>'
print('-'*20)
print('Generated text')
print('-'*20)
generate_text(best_model,initial_text,tk)

--------------------
Learned Embeddings
--------------------


'<s> <s> <s> porq todos <unk> 😡 <unk> valió verga en <unk> hijo de tu puta pinche </s>'

In [72]:
# Seed the generator with a one-word prompt after the padding tokens.
initial_text = '<s> <s> hijo'
print('-'*20)
print('Generated text')
print('-'*20)
generate_text(best_model,initial_text,tk)

--------------------
Learned Embeddings
--------------------


'<s> <s> hijo de verga </s>'

In [82]:
# Prompt containing a word that is replaced by '<unk>' in the output shown below.
initial_text = '<s> hola python'
print('-'*20)
print('Generated text')
print('-'*20)
generate_text(best_model,initial_text,tk)

--------------------
Learned Embeddings
--------------------


'<s> hola <unk> <unk> <unk> se borrar a tu maricon <unk> chinga y abel ahi sale <unk> <unk> que tú darte <unk> puta arriba y yo putas que se piojo todo el amigo de algunas <unk> <unk> <unk> ratas y estás sin basta <unk> en excelentes ... se <unk> esas puta <unk> <unk> </s>'

Queremos una medida que nos diga qué tan probable es una secuencia de palabras dado

un contexto (nuestro modelo previamente entrenado con los tweets).

&nbsp;

**Nota:** La log verosimilitud no es una probabilidad. 

In [83]:
def log_likelihood(model, text, ngram_data, n_skip = 2):
    '''
    Sum of the log-probabilities the model assigns to each target word of ``text``.

    ``ngram_data.transform([text])`` supplies the context windows and their
    target ids. The first ``n_skip`` windows are dropped (default 2, the leading
    windows padded with '<s>').

    Log-probabilities come from ``log_softmax`` rather than ``log(softmax)``, so
    very unlikely words contribute a large negative value instead of -inf.

    Returns a Python float; 0.0 when no window is left after skipping.
    '''
    # Build the n-gram windows for this single text and drop the leading ones.
    X, y = ngram_data.transform([text])
    X, y = X[n_skip:], y[n_skip:]
    if len(y) == 0:
        return 0.0

    contexts = torch.as_tensor(np.asarray(X), dtype=torch.long)
    targets = torch.as_tensor(np.asarray(y), dtype=torch.long)

    with torch.no_grad():
        log_probs = F.log_softmax(model(contexts), dim = 1)

    # Pick, for every window, the log-probability of its actual next word.
    picked = log_probs.gather(1, targets.to(log_probs.device).unsqueeze(1)).squeeze(1)
    return float(picked.double().sum())

In [88]:
# Score a short sentence with the trained model.
llh = log_likelihood(best_model,"Arriva las chivas", ngram_data)
print("log likelihood: ", llh)

log likelihood:  -18.209625


In [89]:
# Score a second sentence for comparison with the previous one.
llh = log_likelihood(best_model,"csm el america", ngram_data)
print("log likelihood: ", llh)

log likelihood:  -13.571409


### Estructuras sintacticas correctas

In [91]:
from itertools import permutations

word_seq = "si no gana el chivas pierde mi familia".split(' ')
perms = [' '.join(perm) for perm in permutations(word_seq)]

# Score every permutation exactly once (8! = 40320 model calls) and sort from
# most to least likely; both listings below reuse this single ranking.
scored_perms = sorted(
    ((log_likelihood(best_model, text, ngram_data), text) for text in perms),
    reverse=True,
)

# Five most likely word orders
print('-'*20)
for p, t in scored_perms[:5]:
    print(p, t)


# Five least likely word orders
print('-'*20)
for p, t in scored_perms[-5:]:
    print(p, t)


--------------------
-41.57523 pierde gana si el chivas no mi familia
-42.840347 familia gana si no el pierde chivas mi
-43.065834 familia pierde si no el chivas mi gana
-43.298546 mi pierde gana si el chivas no familia
-43.375275 familia chivas no mi pierde gana si el
--------------------
-86.67063 si chivas gana familia pierde mi el no
-86.78021 no chivas familia gana pierde mi el si
-87.30734 si no chivas gana familia pierde mi el
-87.855225 si no chivas familia gana pierde el mi
-87.99476 no chivas gana familia pierde mi el si
