### En este notebook usaré un modelo transformer para ver si mejora el modelo sin llegar a sobreentrenar.

In [23]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
import shutil
import sys

In [24]:
train_df = pd.read_csv('sem_eval_train_es.csv')
test_df = pd.read_csv('sem_eval_test_blank_es.csv')

In [25]:
train_df.columns

Index(['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy',
       'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
      dtype='object')

In [26]:
target_list = ['anger', 'anticipation', 'disgust', 'fear', 'joy',
       'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust']

In [27]:
MAX_LEN = 140 #El límite de los tweets era de 140 caracteres.
TRAIN_BS = 32
VALID_BS = 32
EPOCHS = 2
LR = 1e-05

In [28]:
from transformers import BertTokenizer, BertModel

In [29]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [30]:
example_test = 'Este es un texto de prueba. Para ver cómo funciona el encoder.'
encodings = tokenizer.encode_plus(
    example_test,
    add_special_tokens = True,
    max_length = MAX_LEN,
    padding = 'max_length',
    truncation = True,
    return_attention_mask = True,
    return_tensors = 'pt'
)

In [31]:
encodings

{'input_ids': tensor([[  101, 28517,  9686,  4895,  3793,  2080,  2139, 10975,  5657,  3676,
          1012, 11498,  2310,  2099, 18609,  4569, 10446,  2050,  3449,  4372,
         16044,  2099,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [32]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, df, tokenizer, max_len):
        self.df = df
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.title = self.df['Tweet']
        self.targets = self.df[target_list].values

    def __len__(self):
        return len(self.title)
    
    def __getitem__(self, index):
        title = str(self.title[index])
        tititle = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens = True,
            max_length = self.max_len,
            padding = 'max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask = True,
            return_tensors= 'pt'
        )

        return{
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs['token_type_ids'].flatten(),
            'targets': torch.FloatTensor(self.targets[index])
        }

In [33]:
train_size = 0.8
train_df, val_df = train_test_split(train_df, test_size = 0.2)

In [34]:
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
val_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)

In [35]:
train_data_loader = torch.utils.data.DataLoader(
    train_dataset,
    shuffle = True,
    batch_size = TRAIN_BS,
    num_workers = 0
)

val_data_loader = torch.utils.data.DataLoader(
    val_dataset,
    shuffle = False,
    batch_size = VALID_BS,
    num_workers = 0
)

In [36]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

cuda


In [37]:
def load_ckp(checkpoint_fpath, model, optimizer):
    checkpoint = torch.load(checkpoint_fpath)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    valid_loss_min = checkpoint['valid_loss_min']
    return model, optimizer, checkpoint['epoch'], valid_loss_min.item()

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    f_path = checkpoint_path
    torch.save(state, f_path)

    if is_best:
        best_fpath = best_model_path
        shutil.copyfile(f_path, best_fpath)

In [41]:
class BERTClass(nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased', return_dict = True)
        self.dropout = nn.Dropout(0.3)
        self.linear = nn.Linear(768, 11)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.bert_model(input_ids, attention_mask, token_type_ids)
        output_dropout = self.dropout(output.pooler_output)
        output = self.linear(output_dropout)
        return output


model = BERTClass()
model.to(device)

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [42]:
def loss_fn(outputs, targets):
    return nn.BCEWithLogitsLoss() (outputs, targets)

optimizer = torch.optim.Adam(params=model.parameters(), lr = LR)

In [44]:
def train_model(n_epochs, training_loader, validation_loader, model, optimizer, checkpoint_path, best_model_path):
    valid_loss_min = np.inf
    for epoch in range(1, n_epochs+1):
        train_loss = 0
        valid_loss = 0
        model.train()

        # Training loop
        for index, batch in enumerate(training_loader):
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)
            outputs = model(input_ids, attention_mask, token_type_ids)
            optimizer.zero_grad()
            loss = loss_fn(outputs, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss = train_loss + (1/(index+1)(loss.item()-train_loss))


        #Validation loop
        model.eval()
        with torch.no_grad():
            for index, batch in enumerate(training_loader):
                input_ids = batch['input_ids'].to(device, dtype=torch.long)
                attention_mask = batch['attention_mask'].to(device, dtype=torch.long)
                token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
                targets = batch['targets'].to(device, dtype=torch.float)
                outputs = model(input_ids, attention_mask, token_type_ids)
                loss = loss_fn(outputs, targets)
                valid_loss = valid_loss + (1/(index+1)(loss.item()-valid_loss))

        checkpoint = {
            'epoch': epoch+1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        save_ckp(checkpoint, False, checkpoint_path, best_model_path)

    return model

In [46]:
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, "/currBERT_ckpt", 'bestBERT.pt')

KeyError: 2507