In [1]:
import pandas as pd
import pickle
import numpy as np
import nltk
nltk.download('punkt')
from tqdm.auto import tqdm
import copy
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pad_packed_sequence, pack_padded_sequence
import torch.nn.functional as F
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ulises\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
train_data = pd.read_fwf("mex_train.txt", header = None, encoding = 'utf-8', errors='replace')
train_label = pd.read_fwf("mex_train_labels.txt", header = None, encoding = 'utf-8', errors='replace')
val_data = pd.read_fwf("mex_val.txt", header = None, encoding = 'utf-8', errors='replace')
val_label = pd.read_fwf("mex_val_labels.txt", header = None, encoding = 'utf-8', errors='replace')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(train_data, train_label, test_size=0.1, random_state=42)

In [4]:
train = pd.concat([X_train, y_train], axis=1)
test = pd.concat([X_test, y_test], axis=1)
val = pd.concat([val_data,val_label], axis=1)
print(len(train),len(test),len(val))

4989 555 616


In [5]:
train.to_csv('train.csv', header=['text','target'], index=False, encoding = 'utf-8-sig')
test.to_csv('test.csv', header=['text','target'], index=False, encoding = 'utf-8-sig')
val.to_csv('val.csv', header=['text','target'], index=False, encoding = 'utf-8-sig')

In [6]:
class agression_dataset(Dataset):
    def __init__(self, split):
        super(Dataset, self).__init__()
        self.load_data(split)
        self.vocab, self.emb_mat = self.load_vocab_embeddings()
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        '''Método principal para cargar una observación del dataset.
           label: categoría a la que pertenece la observación.
           words_ids: lista de índices de las palabras en el vocabulario.
        '''
        label = self.data.iloc[index]['target']
        words, words_ids = self.preprocessed_text(index)
        return words_ids, label, words
    
    def preprocessed_text(self, index):
        '''Preprocess text and '''
        text = self.data.iloc[index]['text']
        words = nltk.word_tokenize(text)
        words_ids = [self.vocab[word] if word in self.vocab.keys() else self.emb_mat.shape[0]-1\
                        for word in words]
        return words, words_ids
    
    def load_data(self, split):
        '''Método para cargar los datos.
           El texto está en la columna "text" y las categorías en la columna "target".        
        '''
        self.data = pd.read_csv('%s.csv'%(split))

    def load_vocab_embeddings(self):
        '''Embeddings preentrenados en twitter.
           emb_mat: Matriz de embeddings. Un vector de tamaño 200 para cada palabra del vocabulario.
           vocab: Diccionario, asigna a cada palabra su renglón correspondiente en la matriz de embeddings.        
        '''
        embeddings_list = []
        self.vocab_dict = {}
        vocab = {}
        with open('word2vec_col.txt', 'r') as f:
            for i, line in enumerate(f):
                if i!=0:
                    values = line.split()
                    self.vocab_dict[i+1] = values[0]
                    vocab[values[0]] = i+1
                    vector = np.asarray(values[1:], 'float32')
                    embeddings_list.append(vector)
        embeddings_list.insert(0,np.mean(np.vstack(embeddings_list), axis=0))
        embeddings_list.insert(0,np.zeros(100))
        self.vocab_dict[0] = '[PAD]'
        self.vocab_dict[1] = '[UNK]'
        vocab['[PAD]'] = 0
        vocab['[UNK]'] = 1
        emb_mat = np.vstack(embeddings_list)

        return vocab, emb_mat

    def get_weights(self):
        '''Devuelve pesos inversos para cada categoría. Mayor peso para la categoría con menos observaciones.'''
        cat_0 = len(self.data[self.data['target']==0])
        cat_1 = len(self.data[self.data['target']==1])
        maxi = max(cat_0, cat_1)
        return torch.tensor([maxi/cat_0, maxi/cat_1])
    
    def collate_fn(self, batch):
        '''Función que ejecuta el dataloader para formar batches de datos.'''
        zipped_batch = list(zip(*batch))
        word_ids = [torch.tensor(t) for t in zipped_batch[0]]
        word_ids = torch.cat(word_ids, dim=0)
        lengths = torch.tensor([len(t) for t in zipped_batch[0]])
        labels = torch.tensor(zipped_batch[1])
        words = zipped_batch[2]
        return word_ids, lengths, labels, words

In [7]:
class SimpleRNN(nn.Module):
    def __init__(self, input_size=100, hidden_size=128, num_layers=1, 
                 bidirectional=False, emb_mat=None, dense_hidden_size=256):
        '''Constructor, aquí definimos las capas.
        input:
            input_size: Tamaño de los embeddings de las palabras.
            hidden_size: Tamaño de la capa oculta de la GRU. 
            num_layers: Número de capas de la GRU.
            bidirectional: True si se quiere una GRU bidireccional. 
            emb_mat: Matriz de embeddings del vocabulario.
            dense_hidden_size: Tamaño de la capa oculta del clasificador.
        '''
        super(SimpleRNN, self).__init__()
        # Matriz entrenable de embeddings, tamaño del vocab_size x 100
        self.embeddings = nn.Embedding.from_pretrained(\
                            torch.FloatTensor(emb_mat), freeze=False)
        # Gated Recurrent Unit
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                          num_layers=num_layers, bidirectional=bidirectional)
        # Número de direcciones de la GRU
        directions = 2 if bidirectional else 1
        # Clasificador MLP
        self.classifier = nn.Sequential(\
                            nn.Linear(hidden_size*directions, dense_hidden_size),
                            nn.BatchNorm1d(dense_hidden_size),
                            nn.ReLU(),
                            nn.Linear(dense_hidden_size, 2))
        
    def forward(self, input_seq, lengths):
        '''Función feed-forward de la red.
        input:
            input_seq: Lista de ids para cada palabra.
            lengths: Número de palabras en cada una de las observaciones del batch.
        output:
            x: vectores para clasificar.
            return None for consistency with the next model
        '''
        # Calcula el embedding para cada palabra.
        x = self.embeddings(input_seq)
        # Forma las secuencias de palabras que entran a la GRU.
        x = x.split(lengths.tolist())
        # Añade pading y empaqueta las secuencias (mayor velocidad de cómputo).
        x = pad_sequence(x)
        x = pack_padded_sequence(x, lengths, enforce_sorted=False)
        output, hn = self.gru(x)
        hn = torch.cat([h for h in hn], dim= 1)
        x = self.classifier(hn)
        return x, None

In [8]:
def eval_model(model, dataloader, criterion, device):
    '''Función para evaluar el modelo.'''
    with torch.no_grad():
        model.eval()
        losses = []
        preds = torch.empty(0).long()
        targets = torch.empty(0).long()
        scores_list = []
        words_list = []
        pred_list = []
        for data in tqdm(dataloader):
            torch.cuda.empty_cache()
            seq, seq_len, labels, words = data 
            seq, labels = seq.to(device), labels.to(device)
            output, scores = model(seq, seq_len)
            output = F.log_softmax(output, dim=1)
            loss = criterion(output, labels)
            losses.append(loss.item())
            predictions = F.log_softmax(output, dim=1).argmax(1)
            preds = torch.cat([preds, predictions.cpu()],dim=0)
            targets = torch.cat([targets, labels.cpu()],dim=0)
            if scores is not None:
                pred_list += predictions.tolist()
                scores = scores.cpu().squeeze(2).tolist()
                scores_list += scores 
                words_list += words 

        model.train()
        preds = preds.numpy()
        targets = targets.numpy()
        f1 = f1_score(targets, preds, average='binary')

        return np.mean(losses), f1, scores_list, words_list, pred_list

In [11]:
batch_size=32

In [12]:
train_dataset = agression_dataset('train')
val_dataset = agression_dataset('val')
test_dataset = agression_dataset('test')
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn = train_dataset.collate_fn, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn = val_dataset.collate_fn, shuffle=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn = test_dataset.collate_fn, shuffle=False)

In [13]:
lr = 0.001
epochs = 10
weight_decay=0.0001
beta1=0
beta2=0.999
device = torch.device('cpu') #no tengo gpu :(

In [14]:
model = SimpleRNN(emb_mat=train_dataset.emb_mat, bidirectional=False).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr,weight_decay=weight_decay, betas = (beta1, beta2))
weight = train_dataset.get_weights().to(device)
criterion = nn.NLLLoss(weight = weight)

In [15]:
best_val_f1 = 0
for epoch in range(epochs):
    for data in tqdm(train_dataloader):
        # Limpia basura de la memoria GPU
        #torch.cuda.empty_cache()
        # Reiniciamos el cálculo del gradiente
        optimizer.zero_grad()
        # Desempaca los datos que salen del dataloader
        seq, seq_len, labels, _ = data 
        # Mueve los datos al mismo device en el que esté el modelo
        seq, labels = seq.to(device), labels.to(device)
        output, _ = model(seq, seq_len)
        output = F.log_softmax(output, dim=1)
        loss = criterion(output, labels)
        # Calcula el gradiente de la pérdida
        loss.backward()
        # Realiza un paso de la optimización
        optimizer.step()

    # Evalúa los modelos en los conjuntos de entrenamiento y validación
    train_loss, train_f1, _, _, _ = eval_model(model, train_dataloader, criterion, device)
    val_loss, val_f1, _, _, _ = eval_model(model, val_dataloader, criterion, device)
    print('epoch: %d'%(epoch))
    print('train_loss: %5f | val_loss: %5f | train_f1: %5f | val_f1: %5f'%(train_loss, val_loss, train_f1, val_f1))
    if val_f1 > best_val_f1:
        best_val_f1=val_f1
        best_state_dict=copy.deepcopy(model.state_dict())

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

epoch: 0
train_loss: 0.459784 | val_loss: 0.510845 | train_f1: 0.726678 | val_f1: 0.704830


  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

epoch: 1
train_loss: 0.332247 | val_loss: 0.488197 | train_f1: 0.833100 | val_f1: 0.706161


  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

epoch: 2
train_loss: 0.277221 | val_loss: 0.484871 | train_f1: 0.841369 | val_f1: 0.718147


  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

epoch: 3
train_loss: 0.189715 | val_loss: 0.508579 | train_f1: 0.924204 | val_f1: 0.674877


  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

epoch: 4
train_loss: 0.108768 | val_loss: 0.573001 | train_f1: 0.952329 | val_f1: 0.728929


  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

epoch: 5
train_loss: 0.103890 | val_loss: 0.832332 | train_f1: 0.958111 | val_f1: 0.690000


  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

epoch: 6
train_loss: 0.059130 | val_loss: 0.788103 | train_f1: 0.971117 | val_f1: 0.725738


  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

epoch: 7
train_loss: 0.048851 | val_loss: 0.787183 | train_f1: 0.974891 | val_f1: 0.741053


  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

epoch: 8
train_loss: 0.049602 | val_loss: 1.044419 | train_f1: 0.980834 | val_f1: 0.701176


  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

epoch: 9
train_loss: 0.031544 | val_loss: 0.955065 | train_f1: 0.982320 | val_f1: 0.717391


In [16]:
model.load_state_dict(best_state_dict)
train_loss, train_f1, _, _, _ = eval_model(model, train_dataloader, criterion, device)
val_loss, val_f1, _, _, _ = eval_model(model, val_dataloader, criterion, device)
test_loss, test_f1, _, _, _ = eval_model(model, test_dataloader, criterion, device)
print('train_loss: %5f | train_f1: %5f'%(train_loss, train_f1))
print('val_loss: %5f | val_f1: %5f'%(val_loss, val_f1))
print('test_loss: %5f | test_f1: %5f'%(test_loss, test_f1))

  0%|          | 0/156 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

train_loss: 0.049209 | train_f1: 0.974891
val_loss: 0.787183 | val_f1: 0.741053
test_loss: 0.740891 | test_f1: 0.714286
