# Práctica 1
Modifica el código anterior para adaptar el modelo LSTM al uso de embeddings preentrenados. Para ello, usa from torchtext.vocab import GloVe y elige el conjunto de embeddings GloVe que prefieras. Puedes encontrar más información en https://pytorch.org/text/stable/vocab.html#torchtext.vocab.GloVe

Verifica si se produce una mejora en la precisión del modelo. ¿Qué ocurre si usas un conjunto de embeddings preentrenados de diferentes tamaños?



**Cargamos el dataset** <br>

In [29]:
from torchtext import datasets
from torchtext.data import to_map_style_dataset
import numpy as np
from torchtext.vocab import GloVe, vocab
from torchtext.datasets import AG_NEWS
from torchtext.data.utils import get_tokenizer
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
import torch
import torch.nn as nn
from collections import Counter, OrderedDict
from torch.utils.data import DataLoader

# Load the dataset
train_iter, test_iter = datasets.AG_NEWS(split=('train', 'test'))

train_ds = to_map_style_dataset(train_iter)
test_ds = to_map_style_dataset(test_iter)


**Tokenizamos el Dataset y creamos nuestro vocabulario**

In [30]:
tokenizer = get_tokenizer("basic_english")

my_vocab = build_vocab_from_iterator(map(lambda x: tokenizer(x[1]), train_ds), specials=['<pad>','<unk>'])
my_vocab.set_default_index(my_vocab["<unk>"])

**Usamos DataLoader para modificar los datos para el posterior entrenamiento.**

In [31]:
num_class = len(set([label for (label, _) in train_iter]))

text_pipeline = lambda x: my_vocab(tokenizer(x))
label_pipeline = lambda x: int(x) - 1

def collate_batch(batch):
    label_list, text_list = [], []
    for sample in batch:
        label, text = sample
        text_list.append(torch.tensor(text_pipeline(text), dtype=torch.long))
        label_list.append(label_pipeline(label))
    return torch.tensor(label_list, dtype=torch.long), torch.nn.utils.rnn.pad_sequence(text_list, batch_first=True, padding_value=my_vocab["<pad>"])

train_dataloader = DataLoader(
    train_iter, batch_size=64, shuffle=True, collate_fn=collate_batch
)

test_dataloader = DataLoader(
    test_iter, batch_size=64, shuffle=True, collate_fn=collate_batch
)

**Aquí cargamos el vocabulario de GloVe para 100, 200 y 300 dimensiones.**

In [32]:
glove_vectors_100 = GloVe(name='6B', dim=100)
glove_vectors_200 = GloVe(name='6B', dim=200)
glove_vectors_300 = GloVe(name='6B', dim=300)

glove_vectors = [glove_vectors_100, glove_vectors_200, glove_vectors_300]

unk_token = "<unk>"
unk_index = 0

x = 100

for i in glove_vectors:
    glove_vocab_name = f'glove_vocab_{x}'
    glove_vocab_x = vocab(i.stoi)
    glove_vocab_x.insert_token(unk_token, unk_index)
    glove_vocab_x.set_default_index(unk_index)
    globals()[glove_vocab_name] = glove_vocab_x
    x += 100



**Creamos un embedding para cada una de las dimensiones.**


In [33]:
# versión 1
def pretrain_embeddings(tokens, glove_vectors):
    
    embeddings = [glove_vectors[token] for token in tokens]

    return torch.stack( list( map(torch.tensor, embeddings) ) )

embeddings_100 = pretrain_embeddings(my_vocab.get_itos(), glove_vectors_100)
embeddings_200 = pretrain_embeddings(my_vocab.get_itos(), glove_vectors_200)
embeddings_300 = pretrain_embeddings(my_vocab.get_itos(), glove_vectors_300)


  return torch.stack( list( map(torch.tensor, embeddings) ) )


**Con los embeddings creamos tres modelos para la clasificación de texto**

In [34]:
import torch
import torch.nn as nn


class LSTMTextClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_class, pretrained_embeddings):
        super(LSTMTextClassificationModel, self).__init__()
        
        self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=True)

        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_class)

    def forward(self, text):
        embedded = self.embedding(text)  
        lstm_out, _ = self.lstm(embedded)
        last_output = lstm_out[:, -1, :]
        output = self.fc(last_output)
        return output

vocab_size = len(my_vocab)  
hidden_dim = 64
num_class = 4

model_100 = LSTMTextClassificationModel(vocab_size, 100, hidden_dim, num_class, embeddings_100)
model_200 = LSTMTextClassificationModel(vocab_size, 200, hidden_dim, num_class, embeddings_200)
model_300 = LSTMTextClassificationModel(vocab_size, 300, hidden_dim, num_class, embeddings_300)




**Entrenamos el modelo de 100 dimensiones**

In [35]:
import time
model = model_100
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

def train(dataloader):
    model.train()
    total_acc, total_count, max_acc = 0, 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| {:5d} batches '
                  '| accuracy {:8.3f}'.format(idx, total_acc / total_count))

            if max_acc < total_acc / total_count:
                max_acc = total_acc / total_count
                
            total_acc, total_count = 0, 0
            start_time = time.time()
    return max_acc


def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count

In [28]:
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    accu_train = train(train_dataloader)
    accu_val = evaluate(test_dataloader)

    #if accu_train > accu_val:
    #    scheduler.step()
    
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

|   500 batches | accuracy    0.254
|  1000 batches | accuracy    0.253
|  1500 batches | accuracy    0.362
-----------------------------------------------------------
| end of epoch   1 | time: 69.11s | valid accuracy    0.845 
-----------------------------------------------------------
|   500 batches | accuracy    0.798
|  1000 batches | accuracy    0.858
|  1500 batches | accuracy    0.889
-----------------------------------------------------------
| end of epoch   2 | time: 63.54s | valid accuracy    0.871 
-----------------------------------------------------------
|   500 batches | accuracy    0.885
|  1000 batches | accuracy    0.893
|  1500 batches | accuracy    0.910
-----------------------------------------------------------
| end of epoch   3 | time: 83.79s | valid accuracy    0.900 
-----------------------------------------------------------
|   500 batches | accuracy    0.894
|  1000 batches | accuracy    0.899
|  1500 batches | accuracy    0.916
-------------------------

**Entrenamos el modelo con 200 dimensiones**

In [36]:
import time
model = model_200
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

def train(dataloader):
    model.train()
    total_acc, total_count, max_acc = 0, 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| {:5d} batches '
                  '| accuracy {:8.3f}'.format(idx, total_acc / total_count))

            if max_acc < total_acc / total_count:
                max_acc = total_acc / total_count
                
            total_acc, total_count = 0, 0
            start_time = time.time()
    return max_acc


def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count

In [37]:
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    accu_train = train(train_dataloader)
    accu_val = evaluate(test_dataloader)

    #if accu_train > accu_val:
    #    scheduler.step()
    
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

|   500 batches | accuracy    0.258
|  1000 batches | accuracy    0.259
|  1500 batches | accuracy    0.343
-----------------------------------------------------------
| end of epoch   1 | time: 72.58s | valid accuracy    0.649 
-----------------------------------------------------------
|   500 batches | accuracy    0.828
|  1000 batches | accuracy    0.877
|  1500 batches | accuracy    0.900
-----------------------------------------------------------
| end of epoch   2 | time: 72.06s | valid accuracy    0.868 
-----------------------------------------------------------
|   500 batches | accuracy    0.887
|  1000 batches | accuracy    0.898
|  1500 batches | accuracy    0.911
-----------------------------------------------------------
| end of epoch   3 | time: 70.87s | valid accuracy    0.896 
-----------------------------------------------------------
|   500 batches | accuracy    0.898
|  1000 batches | accuracy    0.904
|  1500 batches | accuracy    0.917
-------------------------

**Entrenamos el modelo con 300 dimensiones**

In [38]:
import time
model = model_300
# Hyperparameters
EPOCHS = 10 # epoch
LR = 5  # learning rate

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LR)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)

def train(dataloader):
    model.train()
    total_acc, total_count, max_acc = 0, 0, 0
    log_interval = 500
    start_time = time.time()

    for idx, (label, text) in enumerate(dataloader):
        optimizer.zero_grad()
        predicted_label = model(text)
        loss = criterion(predicted_label, label)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        total_acc += (predicted_label.argmax(1) == label).sum().item()
        total_count += label.size(0)
        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| {:5d} batches '
                  '| accuracy {:8.3f}'.format(idx, total_acc / total_count))

            if max_acc < total_acc / total_count:
                max_acc = total_acc / total_count
                
            total_acc, total_count = 0, 0
            start_time = time.time()
    return max_acc


def evaluate(dataloader):
    model.eval()
    total_acc, total_count = 0, 0

    with torch.no_grad():
        for idx, (label, text) in enumerate(dataloader):
            predicted_label = model(text)
            loss = criterion(predicted_label, label)
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            total_count += label.size(0)
    return total_acc / total_count

In [39]:
for epoch in range(1, EPOCHS + 1):
    epoch_start_time = time.time()

    accu_train = train(train_dataloader)
    accu_val = evaluate(test_dataloader)

    #if accu_train > accu_val:
    #    scheduler.step()
    
    print("-" * 59)
    print(
        "| end of epoch {:3d} | time: {:5.2f}s | "
        "valid accuracy {:8.3f} ".format(
            epoch, time.time() - epoch_start_time, accu_val
        )
    )
    print("-" * 59)

|   500 batches | accuracy    0.256
|  1000 batches | accuracy    0.256
|  1500 batches | accuracy    0.282
-----------------------------------------------------------
| end of epoch   1 | time: 83.11s | valid accuracy    0.684 
-----------------------------------------------------------
|   500 batches | accuracy    0.690
|  1000 batches | accuracy    0.786
|  1500 batches | accuracy    0.879
-----------------------------------------------------------
| end of epoch   2 | time: 82.27s | valid accuracy    0.878 
-----------------------------------------------------------
|   500 batches | accuracy    0.883
|  1000 batches | accuracy    0.895
|  1500 batches | accuracy    0.910
-----------------------------------------------------------
| end of epoch   3 | time: 85.47s | valid accuracy    0.902 
-----------------------------------------------------------
|   500 batches | accuracy    0.899
|  1000 batches | accuracy    0.905
|  1500 batches | accuracy    0.921
-------------------------

## **Conclusión**
Podemos comprobar que usando GloVe se mejora la presición del modelo, y también podemos ver que cuantas más dimensiones cojas del set de embeddings de GloVe mayor será la presición del modelo, pero tampoco hay mucha diferencia entre las tres dimensiones con las que hemos entrenado.