# Juan Carlos Perez Ramirez
## Procesamiento de Lenguaje Natural
## Practica 6: GRU

# Ejemplo GRU

En este notebook presentamos un ejemplo de cómo utilizar una GRU para clasificar tweets como agresivos o no agresivos. El dataset proviene de la competencia **MEX-A3T: Fake News and Aggressiveness Analysis**.


In [None]:
import pandas as pd
import pickle
import numpy as np
import nltk
nltk.download('punkt')
from tqdm.auto import tqdm
import copy

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import torch.nn.functional as F

from sklearn.metrics import f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


La clase Dataset de Pytorch permite un manejo ordenado de nuestros datos y una interacción sencilla con el objeto DataLoader utilizado para crear y cargar los batch de datos.

In [None]:
class aggression_dataset(Dataset):
    """Tweet dataset for the aggressive / non-aggressive classification task.

    Every item is a tweet tokenized with NLTK whose tokens are mapped to row
    indices of a pretrained embedding matrix. Row 0 of that matrix is the
    ``[PAD]`` vector (all zeros) and row 1 is the ``[UNK]`` vector (mean of all
    loaded embeddings); real words start at row 2.
    """

    # Reserved rows of the embedding matrix.
    PAD_ID = 0
    UNK_ID = 1

    def __init__(self, split):
        """Load ``'<split>.csv'`` and the pretrained vocabulary/embeddings.

        Args:
            split: Name of the split; data is read from ``'<split>.csv'``.
        """
        # The original called super(Dataset, self), which skips Dataset itself
        # in the MRO; the zero-argument form initialises the real parent.
        super().__init__()
        self.load_data(split)
        self.vocab, self.emb_mat = self.load_vocab_embeddings()

    def __len__(self):
        """Return the number of rows in the loaded split."""
        return len(self.data)

    def __getitem__(self, index):
        """Return one observation.

        Returns:
            A tuple ``(word_ids, label, words)`` where ``word_ids`` is the list
            of vocabulary indices of the tokens, ``label`` is the value of the
            ``target`` column and ``words`` is the list of tokens.
        """
        label = self.data.iloc[index]['target']
        words, word_ids = self.preprocessed_text(index)
        return word_ids, label, words

    def preprocessed_text(self, index):
        """Tokenize the text of row ``index`` and map the tokens to ids.

        Returns:
            A tuple ``(words, word_ids)``.
        """
        text = self.data.iloc[index]['text']
        words = nltk.word_tokenize(text)
        return words, self._words_to_ids(words)

    def _words_to_ids(self, words):
        """Map tokens to vocabulary ids, using ``[UNK]`` for unknown tokens.

        The original mapped unknown tokens to the last row of the embedding
        matrix, which is the vector of an ordinary vocabulary word, so the
        ``[UNK]`` row was never used.
        """
        return [self.vocab.get(word, self.UNK_ID) for word in words]

    def load_data(self, split):
        """Read ``'<split>.csv'`` into ``self.data``.

        The text is expected in the ``text`` column and the class in the
        ``target`` column (these are the columns the other methods read).
        """
        self.data = pd.read_csv('%s.csv'%(split))

    def load_vocab_embeddings(self, path='word2vec_col.txt'):
        """Load pretrained word embeddings from a word2vec-style text file.

        The first line of the file is skipped (treated as a header); every
        other non-blank line is ``<word> <v1> <v2> ...``.

        Args:
            path: Location of the embedding file.

        Returns:
            A tuple ``(vocab, emb_mat)``: ``vocab`` maps each word to its row in
            ``emb_mat``; ``emb_mat`` stacks ``[PAD]``, ``[UNK]`` and the word
            vectors, in that order.

        Raises:
            ValueError: If the file contains no embedding lines.
        """
        embeddings_list = []
        self.vocab_dict = {}
        vocab = {}
        # Explicit encoding: the vocabulary contains non-ASCII (Spanish) words.
        with open(path, 'r', encoding='utf-8') as f:
            next(f, None)  # skip the header line
            for line in f:
                values = line.split()
                if not values:
                    continue  # tolerate blank lines (e.g. a trailing newline)
                # Rows 0 and 1 are reserved, so the first word gets id 2.
                word_id = len(embeddings_list) + 2
                self.vocab_dict[word_id] = values[0]
                vocab[values[0]] = word_id
                embeddings_list.append(np.asarray(values[1:], "float32"))
        if not embeddings_list:
            raise ValueError('no embeddings found in %r' % (path,))

        mean_vector = np.mean(np.vstack(embeddings_list), axis=0)
        # Size the [PAD] row from the loaded vectors instead of hard-coding 100.
        pad_vector = np.zeros(mean_vector.shape[0])
        self.vocab_dict[self.PAD_ID] = '[PAD]'
        self.vocab_dict[self.UNK_ID] = '[UNK]'
        vocab['[PAD]'] = self.PAD_ID
        vocab['[UNK]'] = self.UNK_ID
        emb_mat = np.vstack([pad_vector, mean_vector] + embeddings_list)

        return vocab, emb_mat

    def get_weights(self):
        """Return inverse-frequency class weights.

        The class with fewer observations gets the larger weight; the most
        frequent class gets weight 1.

        Raises:
            ZeroDivisionError: If one of the two classes has no observations.
        """
        cat_0 = len(self.data[self.data['target']==0])
        cat_1 = len(self.data[self.data['target']==1])
        maxi = max(cat_0, cat_1)
        return torch.tensor([maxi/cat_0, maxi/cat_1])

    def collate_fn(self, batch):
        """Assemble a batch for the DataLoader.

        All id sequences are concatenated into a single 1-D tensor; the
        accompanying ``lengths`` tensor records where each sequence ends so the
        model can split them again.

        Returns:
            A tuple ``(word_ids, lengths, labels, words)``.
        """
        ids_per_item, labels, words = list(zip(*batch))[:3]
        # Explicit dtype keeps the tensor integral even for an empty sequence,
        # where torch.tensor([]) would otherwise default to float.
        word_ids = torch.cat(
            [torch.tensor(ids, dtype=torch.long) for ids in ids_per_item], dim=0)
        lengths = torch.tensor([len(ids) for ids in ids_per_item])
        labels = torch.tensor(labels)
        return word_ids, lengths, labels, words

## GRU simple
El modelo se define heredando la clase nn.Module

In [None]:
class SimpleRNN(nn.Module):
    """GRU encoder followed by an MLP classifier with two output classes."""

    def __init__(self, input_size=100, hidden_size=128, num_layers=1,
                 bidirectional=False, emb_mat=None, dense_hidden_size=256):
        """Define the layers.

        Args:
            input_size: Size of the word embeddings fed to the GRU.
            hidden_size: Size of the GRU hidden state.
            num_layers: Number of stacked GRU layers.
            bidirectional: True for a bidirectional GRU.
            emb_mat: Embedding matrix of the vocabulary (one row per word id).
            dense_hidden_size: Size of the hidden layer of the classifier.
        """
        super(SimpleRNN, self).__init__()
        # Trainable embedding table initialised from the pretrained matrix.
        self.embeddings = nn.Embedding.from_pretrained(
            torch.FloatTensor(emb_mat), freeze=False)
        # Gated Recurrent Unit
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=bidirectional)
        # Number of directions of the GRU; also needed in forward().
        self.directions = 2 if bidirectional else 1
        # MLP classifier
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size * self.directions, dense_hidden_size),
            nn.BatchNorm1d(dense_hidden_size),
            nn.ReLU(),
            nn.Linear(dense_hidden_size, 2))

    def forward(self, input_seq, lengths):
        """Run the network on a batch.

        Args:
            input_seq: 1-D tensor with the word ids of every observation of the
                batch, concatenated one after another.
            lengths: Number of words in each observation of the batch. It is
                passed to ``pack_padded_sequence``, which (per the PyTorch
                docs) expects it on the CPU when it is a tensor.

        Returns:
            A tuple ``(logits, None)``; the ``None`` keeps the return signature
            consistent with the attention model, which also returns scores.
        """
        # Embed every word, then cut the flat tensor back into one sequence
        # per observation.
        embedded = self.embeddings(input_seq)
        sequences = embedded.split(lengths.tolist())
        # Pad and pack the sequences (faster computation in the GRU).
        packed = pack_padded_sequence(pad_sequence(sequences), lengths,
                                      enforce_sorted=False)
        _, hn = self.gru(packed)
        # hn stacks the final hidden state of every layer and direction. The
        # classifier expects hidden_size * directions features, so keep only
        # the last layer; concatenating every layer (as the original did)
        # breaks the classifier input size as soon as num_layers > 1.
        last_layer = hn[-self.directions:]
        features = torch.cat(list(last_layer), dim=-1)
        return self.classifier(features), None


In [None]:
def eval_model(model, dataloader, criterion, device):
    """Evaluate ``model`` on every batch of ``dataloader`` without gradients.

    Args:
        model: Network returning ``(logits, scores)``; ``scores`` may be None.
        dataloader: Iterable yielding ``(seq, seq_len, labels, words)`` batches.
        criterion: Loss applied to the log-probabilities and the labels.
        device: Device the inputs and labels are moved to.

    Returns:
        A tuple ``(mean_loss, f1, scores_list, words_list, pred_list)``. The
        last three lists are only filled when the model returns scores.
    """
    # Remember the current mode so it can be restored afterwards; the original
    # unconditionally switched to train mode, and not at all on an exception.
    was_training = model.training
    model.eval()
    losses = []
    batch_preds = []
    batch_targets = []
    scores_list = []
    words_list = []
    pred_list = []
    try:
        with torch.no_grad():
            for seq, seq_len, labels, words in tqdm(dataloader):
                torch.cuda.empty_cache()
                # seq_len is deliberately not moved to the device: the models
                # hand it to pack_padded_sequence.
                seq, labels = seq.to(device), labels.to(device)
                output, scores = model(seq, seq_len)
                log_probs = F.log_softmax(output, dim=1)
                losses.append(criterion(log_probs, labels).item())
                # The original applied log_softmax a second time here, which
                # leaves log-probabilities unchanged; argmax uses them directly.
                predictions = log_probs.argmax(1)
                batch_preds.append(predictions.cpu())
                batch_targets.append(labels.cpu())
                if scores is not None:
                    pred_list += predictions.tolist()
                    scores_list += scores.cpu().squeeze(2).tolist()
                    words_list += words
    finally:
        model.train(was_training)

    # Concatenate once at the end instead of growing a tensor on every batch.
    empty = torch.empty(0).long()
    preds = (torch.cat(batch_preds, dim=0) if batch_preds else empty).numpy()
    targets = (torch.cat(batch_targets, dim=0) if batch_targets else empty).numpy()
    f1 = f1_score(targets, preds, average='binary')

    return np.mean(losses), f1, scores_list, words_list, pred_list

Definimos los iterables "dataloader", que se encargarán de generar los batches de datos.

In [None]:
# Number of observations per batch, shared by every DataLoader.
batch_size = 128

In [None]:
# One dataset and one DataLoader per split. Only the training loader
# shuffles; validation and test keep the order of the CSV files.
train_dataset = aggression_dataset('train')
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=train_dataset.collate_fn,
)

val_dataset = aggression_dataset('val')
val_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=val_dataset.collate_fn,
)

test_dataset = aggression_dataset('test')
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=test_dataset.collate_fn,
)

Definimos los parámetros del optimizador Adam y el dispositivo en el que se entrenará la red: cuda o cpu.

In [None]:
# Adam hyperparameters and number of training epochs for the plain GRU model.
lr = 0.001
epochs = 10
weight_decay = 0.0001
beta1 = 0
beta2 = 0.999
# Use the GPU when there is one and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Definimos el modelo, el optimizador y la función de pérdida (Negative Log-Likelihood).

In [None]:
# Unidirectional GRU classifier initialised with the pretrained embeddings.
model = SimpleRNN(bidirectional=False, emb_mat=train_dataset.emb_mat)
model = model.to(device)
optimizer = optim.Adam(
    model.parameters(),
    lr=lr,
    betas=(beta1, beta2),
    weight_decay=weight_decay,
)
# Class weights computed from the training split feed the NLL loss.
weight = train_dataset.get_weights().to(device)
criterion = nn.NLLLoss(weight=weight)

Entrenamos el modelo durante las épocas deseadas. Se guarda el modelo con mejor f1_score en el conjunto de validación.

In [None]:
# Start below any reachable F1 so the first epoch is always recorded, and take
# a snapshot of the current weights up front: best_state_dict is then always
# bound and always belongs to this model, even if no epoch runs. The original
# left it unbound whenever the validation F1 never exceeded 0.
best_val_f1 = float('-inf')
best_state_dict = copy.deepcopy(model.state_dict())
for epoch in range(epochs):
    for data in tqdm(train_dataloader):
        # Release cached GPU memory
        torch.cuda.empty_cache()
        # Reset the accumulated gradients
        optimizer.zero_grad()
        # Unpack the batch produced by the dataloader
        seq, seq_len, labels, _ = data
        # Move the data to the device the model lives on
        seq, labels = seq.to(device), labels.to(device)
        output, _ = model(seq, seq_len)
        output = F.log_softmax(output, dim=1)
        loss = criterion(output, labels)
        # Compute the gradient of the loss
        loss.backward()
        # Take one optimization step
        optimizer.step()

    # Evaluate the model on the training and validation sets
    train_loss, train_f1, _, _, _ = eval_model(model, train_dataloader, criterion, device)
    val_loss, val_f1, _, _, _ = eval_model(model, val_dataloader, criterion, device)
    print('epoch: %d'%(epoch))
    print('train_loss: %5f | val_loss: %5f | train_f1: %5f | val_f1: %5f'%(train_loss, val_loss, train_f1, val_f1))
    # Keep a copy of the weights with the best validation F1 seen so far
    if val_f1>best_val_f1:
        best_val_f1=val_f1
        best_state_dict=copy.deepcopy(model.state_dict())

HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 0
train_loss: 0.535174 | val_loss: 0.623940 | train_f1: 0.599655 | val_f1: 0.510176


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 1
train_loss: 0.401649 | val_loss: 0.592191 | train_f1: 0.701754 | val_f1: 0.562147


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 2
train_loss: 0.392249 | val_loss: 0.588069 | train_f1: 0.688654 | val_f1: 0.585621


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 3
train_loss: 0.242259 | val_loss: 0.565033 | train_f1: 0.860325 | val_f1: 0.647378


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 4
train_loss: 0.224205 | val_loss: 0.712979 | train_f1: 0.881320 | val_f1: 0.609808


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 5
train_loss: 0.145965 | val_loss: 0.820152 | train_f1: 0.908430 | val_f1: 0.639847


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 6
train_loss: 0.147542 | val_loss: 0.732497 | train_f1: 0.882591 | val_f1: 0.643333


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 7
train_loss: 0.575726 | val_loss: 1.108819 | train_f1: 0.643498 | val_f1: 0.545238


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 8
train_loss: 0.075380 | val_loss: 0.855311 | train_f1: 0.943111 | val_f1: 0.648452


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 9
train_loss: 0.066686 | val_loss: 0.993887 | train_f1: 0.955703 | val_f1: 0.644068


Una vez que acabamos de entrenar cargamos el mejor modelo y lo evaluamos en los tres conjuntos.

In [None]:
# Restore the weights with the best validation F1 and report loss and F1 on
# the three splits; only the first two values returned by eval_model are used.
model.load_state_dict(best_state_dict)
train_loss, train_f1 = eval_model(model, train_dataloader, criterion, device)[:2]
val_loss, val_f1 = eval_model(model, val_dataloader, criterion, device)[:2]
test_loss, test_f1 = eval_model(model, test_dataloader, criterion, device)[:2]
print('train_loss: %5f | train_f1: %5f' % (train_loss, train_f1))
print('val_loss: %5f | val_f1: %5f' % (val_loss, val_f1))
print('test_loss: %5f | test_f1: %5f' % (test_loss, test_f1))

HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


train_loss: 0.075632 | train_f1: 0.943111
val_loss: 0.855311 | val_f1: 0.648452
test_loss: 0.870495 | test_f1: 0.632402


## GRU con atención

La sintaxis del modelo es similar al anterior, pero se añade un módulo de atención. El módulo de atención toma los vectores de salida $h_t$ de la GRU y calcula una representación $s$ como suma ponderada:

$$ s = \sum_t \alpha_t h_t,$$

donde

\begin{align*}
    u_{t} &= \tanh(Wh_{t}+b),\\
    \alpha_{t} &= \frac{\exp(u_t^Tu)}{\sum_i\exp(u_{i}^Tu)}.
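\end{align*}

Aquí $W$ y $b$ son los parámetros de una capa lineal y $u$ es un vector de contexto entrenable (una segunda capa lineal sin sesgo).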


In [None]:
class AttnModule(nn.Module):
    """Additive attention that pools the GRU outputs into a single vector."""

    def __init__(self, input_size, attn_hidden_size=128):
        """Create the two linear layers that score each time step.

        Args:
            input_size: Size of the GRU output vectors.
            attn_hidden_size: Size of the hidden layer of the scorer.
        """
        super(AttnModule, self).__init__()
        self.fc1 = nn.Linear(input_size, attn_hidden_size)
        self.fc2 = nn.Linear(attn_hidden_size, 1, bias=False)

    def forward(self, seq, lengths):
        """Compute the attention-weighted sum of the GRU outputs.

        Args:
            seq: Packed sequence with the GRU output vectors.
            lengths: Number of words in each observation.

        Returns:
            A tuple ``(context, weights)``: the weighted sum of the outputs of
            each observation and the attention weights, batch first.
        """
        hidden, _ = pad_packed_sequence(seq)
        n_steps, n_batch, n_hid = hidden.size()
        # Score every time step: fc2(tanh(fc1(h))).
        flat = hidden.view(n_batch * n_steps, n_hid)
        raw = self.fc2(torch.tanh(self.fc1(flat))).view(n_steps, n_batch, 1)
        # Pack and re-pad the scores so padded positions get -100, which
        # makes their weight negligible after the softmax.
        repacked = nn.utils.rnn.pack_padded_sequence(
            raw, lengths=lengths, enforce_sorted=False)
        masked, _ = nn.utils.rnn.pad_packed_sequence(repacked, padding_value=-100)
        # Normalise over the time axis, then put the batch axis first.
        weights = F.softmax(masked, dim=0).transpose(0, 1)
        # Batched product of the outputs (batch, hidden, time) with the
        # weights (batch, time, 1) gives one pooled vector per observation.
        context = torch.bmm(hidden.permute(1, 2, 0), weights)
        return context.squeeze(2), weights

In [None]:
class AttnRNN(nn.Module):
    """GRU encoder pooled with attention, followed by an MLP classifier."""

    def __init__(self, input_size=100, hidden_size=128, num_layers=1,
                 bidirectional=False, emb_mat=None, dense_hidden_size=256,
                 attn_hidden_size=128):
        """Define the layers.

        Args:
            input_size: Size of the word embeddings fed to the GRU.
            hidden_size: Size of the GRU hidden state.
            num_layers: Number of stacked GRU layers.
            bidirectional: True for a bidirectional GRU.
            emb_mat: Embedding matrix of the vocabulary (one row per word id).
            dense_hidden_size: Size of the hidden layer of the classifier.
            attn_hidden_size: Size of the hidden layer of the attention module.
        """
        super(AttnRNN, self).__init__()
        # Trainable embedding table initialised from the pretrained matrix.
        self.embeddings = nn.Embedding.from_pretrained(
            torch.FloatTensor(emb_mat), freeze=False)
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=bidirectional)
        directions = 2 if bidirectional else 1
        # Forward attn_hidden_size: the original accepted the argument but
        # never passed it on, so the attention module always used its default.
        self.attn = AttnModule(input_size=hidden_size*directions,
                               attn_hidden_size=attn_hidden_size)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_size*directions, dense_hidden_size),
            nn.BatchNorm1d(dense_hidden_size),
            nn.ReLU(),
            nn.Linear(dense_hidden_size, 2))

    def forward(self, input_seq, lengths):
        """Run the network on a batch.

        Args:
            input_seq: 1-D tensor with the word ids of every observation of the
                batch, concatenated one after another.
            lengths: Number of words in each observation of the batch.

        Returns:
            A tuple ``(logits, scores)`` where ``scores`` are the attention
            weights, detached from the graph.
        """
        # Embed every word, then cut the flat tensor back into one sequence
        # per observation.
        x = self.embeddings(input_seq)
        x = x.split(lengths.tolist())
        # Pad and pack the sequences before feeding them to the GRU.
        x = pad_sequence(x)
        x = pack_padded_sequence(x, lengths, enforce_sorted=False)
        output, hn = self.gru(x)
        # Pool the per-step outputs with attention instead of using hn.
        x, scores = self.attn(output, lengths)
        x = self.classifier(x)
        return x, scores.detach()

In [None]:
# Adam hyperparameters and number of training epochs for the attention model.
lr = 0.0001
epochs = 20
# Use the GPU when there is one and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
weight_decay = 0.0001
beta1 = 0
beta2 = 0.999

In [None]:
# Unidirectional GRU with attention, initialised with the pretrained embeddings.
model = AttnRNN(bidirectional=False, emb_mat=train_dataset.emb_mat)
model = model.to(device)
optimizer = optim.Adam(
    model.parameters(),
    lr=lr,
    betas=(beta1, beta2),
    weight_decay=weight_decay,
)
# Class weights computed from the training split feed the NLL loss.
weight = train_dataset.get_weights().to(device)
criterion = nn.NLLLoss(weight=weight)

In [None]:
# Start below any reachable F1 so the first epoch is always recorded, and take
# a snapshot of the current weights up front. Without it, a best_state_dict
# left over from an earlier training cell (a different architecture) could be
# loaded into this model if the validation F1 never exceeded 0.
best_val_f1 = float('-inf')
best_state_dict = copy.deepcopy(model.state_dict())
for epoch in range(epochs):
    for data in tqdm(train_dataloader):
        # Release cached GPU memory
        torch.cuda.empty_cache()
        # Reset the accumulated gradients
        optimizer.zero_grad()
        # Unpack the batch and move it to the device the model lives on
        seq, seq_len, labels, _ = data
        seq, labels = seq.to(device), labels.to(device)
        output, _ = model(seq, seq_len)
        output = F.log_softmax(output, dim=1)
        loss = criterion(output, labels)
        loss.backward()
        optimizer.step()

    # Evaluate the model on the training and validation sets
    train_loss, train_f1, _, _, _ = eval_model(model, train_dataloader, criterion, device)
    val_loss, val_f1, _, _, _ = eval_model(model, val_dataloader, criterion, device)
    print('epoch: %d'%(epoch))
    print('train_loss: %5f | val_loss: %5f | train_f1: %5f | val_f1: %5f'%(train_loss, val_loss, train_f1, val_f1))
    # Keep a copy of the weights with the best validation F1 seen so far
    if val_f1>best_val_f1:
        best_val_f1=val_f1
        best_state_dict=copy.deepcopy(model.state_dict())

HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 0
train_loss: 0.630384 | val_loss: 0.644028 | train_f1: 0.551771 | val_f1: 0.523154


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 1
train_loss: 0.537141 | val_loss: 0.571356 | train_f1: 0.626154 | val_f1: 0.573034


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 2
train_loss: 0.491195 | val_loss: 0.536931 | train_f1: 0.652726 | val_f1: 0.598496


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 3
train_loss: 0.455115 | val_loss: 0.510185 | train_f1: 0.686576 | val_f1: 0.626623


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 4
train_loss: 0.428186 | val_loss: 0.495324 | train_f1: 0.702874 | val_f1: 0.630225


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 5
train_loss: 0.405654 | val_loss: 0.483289 | train_f1: 0.719542 | val_f1: 0.644940


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 6
train_loss: 0.391759 | val_loss: 0.478862 | train_f1: 0.728858 | val_f1: 0.654723


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 7
train_loss: 0.372891 | val_loss: 0.468356 | train_f1: 0.748987 | val_f1: 0.672598


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 8
train_loss: 0.364508 | val_loss: 0.464894 | train_f1: 0.750216 | val_f1: 0.657051


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 9
train_loss: 0.344391 | val_loss: 0.457674 | train_f1: 0.766833 | val_f1: 0.667870


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 10
train_loss: 0.330464 | val_loss: 0.455144 | train_f1: 0.777709 | val_f1: 0.677362


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 11
train_loss: 0.320043 | val_loss: 0.452905 | train_f1: 0.781964 | val_f1: 0.683587


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 12
train_loss: 0.308999 | val_loss: 0.448087 | train_f1: 0.801124 | val_f1: 0.700544


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 13
train_loss: 0.294551 | val_loss: 0.454638 | train_f1: 0.809267 | val_f1: 0.688766


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 14
train_loss: 0.280904 | val_loss: 0.448510 | train_f1: 0.817929 | val_f1: 0.692857


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 15
train_loss: 0.267780 | val_loss: 0.445987 | train_f1: 0.828773 | val_f1: 0.691622


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 16
train_loss: 0.255050 | val_loss: 0.451424 | train_f1: 0.836722 | val_f1: 0.684397


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 17
train_loss: 0.242980 | val_loss: 0.456088 | train_f1: 0.850891 | val_f1: 0.686025


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 18
train_loss: 0.231322 | val_loss: 0.462273 | train_f1: 0.860553 | val_f1: 0.692168


HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))


epoch: 19
train_loss: 0.219862 | val_loss: 0.467937 | train_f1: 0.872716 | val_f1: 0.700186


In [None]:
# Restore the weights with the best validation F1 and evaluate the three
# splits, keeping the attention scores, tokens and predictions of each one.
model.load_state_dict(best_state_dict)
(train_loss, train_f1,
 train_scores, train_words, train_pred) = eval_model(
    model, train_dataloader, criterion, device)
(val_loss, val_f1,
 val_scores, val_words, val_pred) = eval_model(
    model, val_dataloader, criterion, device)
(test_loss, test_f1,
 test_scores, test_words, test_pred) = eval_model(
    model, test_dataloader, criterion, device)
print('train_loss: %5f | train_f1: %5f' % (train_loss, train_f1))
print('val_loss: %5f | val_f1: %5f' % (val_loss, val_f1))
print('test_loss: %5f | test_f1: %5f' % (test_loss, test_f1))

HBox(children=(FloatProgress(value=0.0, max=39.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))


train_loss: 0.309125 | train_f1: 0.801124
val_loss: 0.448087 | val_f1: 0.700544
test_loss: 0.464076 | test_f1: 0.687982


## Visualizando la atención

Uno de los beneficios de los mecanismos de atención es que nos permiten identificar qué elementos de las oraciones resultan más importantes.

In [None]:
from IPython.display import display, HTML
import matplotlib
import matplotlib.pyplot as plt

In [None]:
def colorize(words, color_array):
    """Render words as HTML spans shaded according to their weights.

    Adapted from
    https://gist.github.com/ihsgnef/f13c35cd46624c8f458a4d23589ac768

    Args:
        words: List of words (strings).
        color_array: Numbers between 0 and 1, one per word; pairs beyond the
            shorter of the two inputs are ignored.

    Returns:
        A string of concatenated ``<span>`` elements whose background colour
        comes from the ``Reds`` colormap.
    """
    import html  # stdlib; imported locally so the block is self-contained

    # matplotlib.cm.get_cmap was deprecated and later removed; prefer the
    # colormap registry and fall back only on old Matplotlib versions.
    try:
        cmap = matplotlib.colormaps['Reds']
    except AttributeError:
        cmap = matplotlib.cm.get_cmap('Reds')
    # The original tag had a stray ';' between attributes and used '&nbsp'
    # without the terminating ';'.
    template = '<span class="barcode" style="color: black; background-color: {}">{}</span>'
    spans = []
    for word, color in zip(words, color_array):
        # float() so an integer weight is read as a value in [0, 1] rather
        # than as an index into the colormap lookup table.
        hex_color = matplotlib.colors.rgb2hex(cmap(float(color))[:3])
        # Tokens come from raw tweets: escape them so characters such as '<'
        # or '&' are shown literally instead of being parsed as markup.
        spans.append(template.format(hex_color, '&nbsp;' + html.escape(word) + '&nbsp;'))
    return ''.join(spans)

Las palabras con más atención se muestran en un rojo más intenso y aquellas con menor atención en tonos más claros, casi blancos.

In [None]:
# Colour-scale legend: 50 blank cells whose weights sweep from 0 to 1.
att = np.linspace(0, 1, num=50)
p = 50 * [' ']
s = colorize(p, att)
# Render the HTML string in the notebook.
display(HTML(s))

In [None]:
# Rank the training tweets by their largest attention weight, highest first.
max_attn = [np.max(scores) for scores in train_scores]
maxi = np.flip(np.argsort(max_attn))
# Slice instead of looping over range(30) so that fewer than 30 available
# tweets does not raise an IndexError.
for i in maxi[:30]:
    tokens = train_words[i]
    # The scores are padded to the longest tweet of their batch; keep only
    # as many as there are tokens.
    s = colorize(tokens, train_scores[i][:len(tokens)])
    category = 'Agresivo' if train_pred[i]==1 else 'No agresivo'
    print('Categoría predicha: %s'%(category))
    # Render the HTML string in the notebook.
    display(HTML(s))

Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: No agresivo


Categoría predicha: No agresivo


Categoría predicha: Agresivo


Categoría predicha: No agresivo


Categoría predicha: No agresivo


Categoría predicha: No agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: No agresivo


Categoría predicha: No agresivo


Categoría predicha: Agresivo


Categoría predicha: No agresivo


Categoría predicha: Agresivo


Categoría predicha: No agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: No agresivo


Categoría predicha: No agresivo


Categoría predicha: Agresivo


Categoría predicha: Agresivo


Categoría predicha: No agresivo


Categoría predicha: Agresivo


Categoría predicha: No agresivo


Categoría predicha: No agresivo


Categoría predicha: No agresivo


Categoría predicha: Agresivo
