In [1]:
import pandas as pd
import numpy as np
import torch

from torchtext.vocab import build_vocab_from_iterator
from transformers import AutoTokenizer, AutoModelForPreTraining

#import config as cfg


ModuleNotFoundError: No module named 'transformers'

In [None]:
# Hub checkpoint identifier shared by the model and its tokenizer; named once so
# the two `from_pretrained` calls cannot drift apart.
PRETRAINED_NAME = 'neuralmind/bert-base-portuguese-cased'

# Model with pre-training heads, loaded from the checkpoint above.
emb_model = AutoModelForPreTraining.from_pretrained(PRETRAINED_NAME)
# Tokenizer loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)


### Importação da base de dados

In [None]:
# Subject name -> integer class id. Ids are assigned by position in the list
# below, starting at 0.
LABELS = {
    subject: class_id
    for class_id, subject in enumerate(
        ['Historia', 'Administracao', 'Geografia', 'Biologia',
         'Literatura', 'Artes', 'Matematica']
    )
}

In [None]:
# Read the processed training split from CSV.
x_train = pd.read_csv('../data/processed/train_final.csv')

# Read the test split and discard its 'titulo' and 'genero' columns in the same
# expression, so `x_test` is only ever bound to the trimmed frame.
x_test = (
    pd.read_csv('../data/processed/test.csv')
    .drop(columns=['titulo', 'genero'])
)

# x_train['alt_title'] = x_train['alt_title'].map(lambda title: \
#                                                 tokenizer.encode_plus(title, 
#                                                                       add_special_tokens=True,
#                                                                       padding='longest'))
# x_test['alt_title'] = x_test['alt_title'].map(lambda title: \
#                                               tokenizer.encode_plus(title, 
#                                                                     add_special_tokens=True,
#                                                                     padding='longest'))

In [None]:
class MyDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one title per item, on demand.

    Each item is built from one row of the wrapped DataFrame: the text is read
    from the ``alt_title`` column and the target from the ``label`` column.
    """

    def __init__(self, dataframe: pd.DataFrame, tokenizer, max_len: int = None) -> None:
        """Store the data and the tokenization settings.

        Args:
            dataframe: Frame providing the ``alt_title`` and ``label`` columns.
            tokenizer: Callable invoked as ``tokenizer(text, max_length=...,
                padding='max_length', truncation=True, return_tensors='pt')``.
                Its result must support lookup of ``'input_ids'``,
                ``'token_type_ids'`` and ``'attention_mask'``.
            max_len: Forwarded to the tokenizer as ``max_length``.
        """
        # Items are requested by position (0 .. len - 1). Renumbering the rows
        # keeps the label-based lookups below valid even when the incoming frame
        # was filtered or shuffled and no longer has a 0..n-1 index.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (items) in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the title at position ``index`` and return it with its label.

        Returns:
            A 4-tuple ``(input_ids, token_type_ids, attention_mask, label)``.
            The first three are whatever the tokenizer produced for those keys;
            ``label`` is the raw value stored in the ``label`` column.
        """
        # Plain scalar lookups: a trailing comma here would wrap the title in a
        # 1-tuple and hand the tokenizer a batch instead of a single text.
        text = self.data.loc[index, 'alt_title']
        label = self.data.loc[index, 'label']

        # Use the tokenizer given to this instance, not a module-level global.
        # NOTE(review): with a Hugging Face tokenizer and return_tensors='pt' the
        # tensors presumably keep a leading axis of size 1 (callers squeeze it).
        encoding = self.tokenizer(text,
                                  max_length=self.max_len,
                                  padding='max_length',
                                  truncation=True,
                                  return_tensors='pt')

        return (encoding['input_ids'],
                encoding['token_type_ids'],
                encoding['attention_mask'],
                label)


In [None]:
# Wrap both splits with identical tokenization settings (same tokenizer, titles
# padded/truncated to 32 tokens); train first, then test.
train_dataset, test_dataset = (
    MyDataset(frame, tokenizer=tokenizer, max_len=32)
    for frame in (x_train, x_test)
)

# Drop the notebook-level names for the raw frames now that they are wrapped.
del x_train, x_test

In [None]:
# Number of examples per mini-batch, shared by both loaders.
BATCH_SIZE = 128

# One loader per split: the training split is reshuffled every epoch, the test
# split is iterated in its stored order.
train_loader, test_loader = (
    torch.utils.data.DataLoader(split, batch_size=BATCH_SIZE, shuffle=reshuffle)
    for split, reshuffle in ((train_dataset, True), (test_dataset, False))
)

In [None]:
from transformers import BertModel, BertTokenizer

In [None]:
import torch
from torch import nn
from transformers import BertModel

class CustomBERTModel(nn.Module):
    """BERT encoder followed by a small GELU multi-layer perceptron classifier.

    The head maps the encoder's pooled output through four hidden linear layers
    (256 -> 128 -> 64 -> 32), each followed by GELU, and a final linear layer
    that produces one raw score (logit) per class.
    """

    def __init__(self, num_classes, pretrained_name='neuralmind/bert-base-portuguese-cased'):
        """Build the encoder and the classification head.

        Args:
            num_classes: Size of the output layer (number of target classes).
            pretrained_name: Checkpoint identifier passed to
                ``BertModel.from_pretrained``. Defaults to the checkpoint this
                notebook has always used.
        """
        super().__init__()

        # Pre-trained BERT encoder.
        self.bert = BertModel.from_pretrained(pretrained_name)

        # Freeze the embedding layer; the rest of the encoder stays trainable.
        for p in self.bert.embeddings.parameters():
            p.requires_grad = False

        # Classification head. Attribute names are kept as fc1..fc5 so existing
        # state dicts remain loadable.
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 256)
        self.gelu = nn.GELU()
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.fc4 = nn.Linear(64, 32)
        self.fc5 = nn.Linear(32, num_classes)

    def forward(self, input_ids, attention_mask, token_tp_ids):
        """Return raw class logits for a batch.

        Args:
            input_ids: Token ids, forwarded to the encoder as ``input_ids``.
            attention_mask: Forwarded to the encoder as ``attention_mask``.
            token_tp_ids: Forwarded to the encoder as ``token_type_ids``.

        Returns:
            The output of the last linear layer. No softmax is applied here, so
            the result can be fed directly to a loss that expects logits.
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_tp_ids)

        # Sentence-level representation: the encoder's pooled output.
        x = outputs.pooler_output

        # Hidden layers, each followed by GELU.
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = self.gelu(layer(x))

        # Final projection to class scores. The per-call debug print of the
        # logits was removed: it flooded the output on every batch.
        return self.fc5(x)

# One output unit per entry of the LABELS mapping, so the size of the
# classification layer follows the label set instead of a hard-coded count.
model = CustomBERTModel(num_classes=len(LABELS))


In [None]:
from tqdm import tqdm

# Training hyper-parameters.
NUM_EPOCHS = 10
# The previous Adam step size of 0.1 is far too large for fine-tuning a
# pre-trained encoder; 2e-5 is within the range commonly recommended for BERT.
LEARNING_RATE = 2e-5

# Loss on raw logits against integer class labels, and the optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Defined up front so the final report below is always valid.
accuracy = 0.0

# Train the model
for epoch in range(NUM_EPOCHS):
    model.train()  # Set the model to training mode

    # Per-epoch running statistics.
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = len(train_loader)

    for batch_idx, (inputs, tkn_type, att_mask, labels) in enumerate(train_loader):

        optimizer.zero_grad()

        # Forward pass. Each encoded tensor carries a size-1 axis at position 1
        # (one per sample), which is removed before the tensors reach the model.
        logits = model(inputs.squeeze(1), att_mask.squeeze(1), tkn_type.squeeze(1))

        loss = criterion(logits, labels)

        # Backward pass and parameter update.
        loss.backward()
        # Optional gradient clipping would go here, between backward() and step():
        # torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
        optimizer.step()

        total_loss += loss.item()

        # Predicted class = index of the largest logit. Comparing the softmax
        # probabilities themselves with the labels (as before) never counted
        # correct predictions.
        predicted = logits.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

        # Running statistics: the loss is averaged over the batches seen so far,
        # not over the total number of batches in the epoch.
        accuracy = 100 * correct / total
        print(f'batch_idx [{batch_idx + 1}/{num_batches}], Loss: {total_loss / (batch_idx + 1):.4f}, Accuracy: {accuracy:.2f}%', end='\r')

    # Print training statistics for the epoch (guarded against an empty loader).
    epoch_loss = total_loss / max(num_batches, 1)
    accuracy = 100 * correct / max(total, 1)
    print(f'Epoch [{epoch + 1}/{NUM_EPOCHS}], Loss: {epoch_loss:.4f}, Accuracy: {accuracy:.2f}%')

# Final accuracy after training (training accuracy of the last epoch).
print('Final Accuracy:', accuracy)