In [1]:

from torchtext.vocab import build_vocab_from_iterator
from transformers import AutoTokenizer, AutoModelForPreTraining

import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import random
import time
import re
from random import randint
from googletrans import Translator
import plotly.express as px

In [2]:
emb_model = AutoModelForPreTraining.from_pretrained('bert-base-multilingual-cased')
tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')


In [3]:
LABELS = {'Historia': 0, 'Administracao': 1, 'Geografia': 2, 'Biologia': 3, 
          'Literatura': 4, 'Artes': 5, 'Matematica': 6}

In [4]:
x_train = pd.read_csv('trainLivraria.csv')
x_test = pd.read_csv('testLivraria.csv')

# x_train['alt_title'] = x_train['alt_title'].map(lambda title: \
#                                                 tokenizer.encode_plus(title, 
#                                                                       add_special_tokens=True,
#                                                                       padding='longest'))
# x_test['alt_title'] = x_test['alt_title'].map(lambda title: \
#                                               tokenizer.encode_plus(title, 
#                                                                     add_special_tokens=True,
#                                                                     padding='longest'))

In [5]:
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, dataframe: pd.DataFrame, tokenizer, max_len: int = None) -> object:
        
        self.data = dataframe
        self.tokenizer = tokenizer 
        self.max_len = max_len 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        textos = self.data.loc[index, 'Titulo'], 
        labels = self.data.loc[index, 'Genero']

        encoding = tokenizer(textos, 
                             max_length=self.max_len,
                             padding='max_length',
                             truncation=True,
                             return_tensors='pt')
        encoding2 = tokenizer(labels, 
                             max_length=self.max_len,
                             padding='max_length',
                             truncation=True,
                             return_tensors='pt')
        
        inputs = encoding['input_ids']
        tkn_type = encoding['token_type_ids']
        att_mask = encoding['attention_mask']

        labels = encoding2['input_ids']
        return inputs, labels


In [6]:
train_dataset = MyDataset(x_train, tokenizer=tokenizer, max_len=32)
test_dataset = MyDataset(x_test, tokenizer=tokenizer, max_len=32)


#del x_train, x_test

In [7]:
BATCH_SIZE = 100

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [8]:
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)

LEARNING_RATE = 0.005
#BATCH_SIZE = 100
NUM_EPOCHS = 20

DEVICE = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 100
HIDDEN_DIM = 256
NUM_CLASSES = 7

In [43]:
# Construção do Modelo LSTM

class RNN(torch.nn.Module):
    
    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim):
        super().__init__()

        self.embedding = torch.nn.Embedding(input_dim, embedding_dim)
        
        self.rnn = torch.nn.LSTM(embedding_dim,
                                 hidden_dim, batch_first=True)        
        
        self.fc = torch.nn.Linear(hidden_dim, output_dim)
        

    def forward(self, text):
        #  dimensão do text: [sentence length, batch size]
        
        embedded = self.embedding(text)
        #  dimensão embedded: [sentence length, batch size, embedding dim]
        
        output, (hidden, cell) = self.rnn(embedded)
        #  dimensão output: [sentence length, batch size, hidden dim]
        #  dimensão hidden: [1, batch size, hidden dim]

        hidden.squeeze_(0)
        #  dimensão hidden: [batch size, hidden dim]
        
        output = self.fc(hidden)
        print(output.dim())
        return output

In [44]:
# Inicializando o modelo

torch.manual_seed(RANDOM_SEED)

model = RNN(input_dim=len(tokenizer),
            embedding_dim=EMBEDDING_DIM,
            hidden_dim=HIDDEN_DIM,
            output_dim=NUM_CLASSES
)

model = model.to(DEVICE)

# Utilização do otimizador Adam
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

print(model)

RNN(
  (embedding): Embedding(119547, 100)
  (rnn): LSTM(100, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=7, bias=True)
)


In [54]:

import torch.nn as nn
from tqdm import tqdm

def train(model, train_data, learning_rate, epochs):

    # train, val = Dataset(train_data), Dataset(val_data)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(NUM_EPOCHS):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_loader):
                # mask = train_input['attention_mask'].to(device)
                # input_id = train_input['input_ids'].squeeze(1).to(device)
                train_input = train_input.squeeze(0)
                print(train_input.squeeze
                output = model(train_input)
                
                batch_loss = criterion(output, train_label)
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            """with torch.no_grad():

                for input_id, type_id, attention, label in val_dataloader:

                    val_label = label.to(device)
                    # mask = val_input['attention_mask'].to(device)
                    # input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id.squeeze(1), attention.squeeze(1), type_id.squeeze(1))

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc"""
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} \
                | Train Accuracy: {total_acc_train / len(train_data): .3f} \
                | Val Loss: {total_loss_val / len(val_data): .3f} \
                | Val Accuracy: {total_acc_val / len(val_data): .3f}')
                  
EPOCHS = 17
# model = BertClassifier()
LR = 1e-6
              
train(model, train_dataset, LEARNING_RATE, EPOCHS)

  0%|                                                                                           | 0/10 [00:00<?, ?it/s]


ValueError: LSTM: Expected input to be 2D or 3D, got 4D instead

In [39]:
def evaluate(model, test_data):

    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=2)

    total_acc_test = 0
    with torch.no_grad():

        for input_id, type_id, attention, label in test_loader:

              output = model(input_id.squeeze(1), attention.squeeze(1), type_id.squeeze(1))

              acc = (output.argmax(dim=1) == label).sum().item()
              total_acc_test += acc
    
    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')
    
evaluate(model, test_dataset)


ValueError: not enough values to unpack (expected 4, got 2)

In [None]:
# torch.save(model.state_dict(), '../model/embeddigns-tf/model.pt')

In [None]:
for i in enumerate(model.parameters()): 
    print(i)

In [None]:
# valores de treino e validação nas 17 epochs para alteração de palavras por contexto
# [0.398, 0.591, 0.674, 0.737, 0.793, 0.842, 0.876, 0.903, 0.923, 0.936, 0.949, 0.960, 0.966, 0.964, 0.969, 0.971, 0.976]
# [0.517 0.647 0.671 0.700 0.695 0.702 0.699 0.683 0.682 0.686 0.695 0.706 0.690 0.710 0.700 0.690 0.695]