# Importar Bibliotecas


In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd

# Carregando a base de dados


In [2]:
# Read the labelled industry-sector corpus from disk into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Industry Sector.csv')

In [3]:
# Show how many rows carry each value of the 'class' column.
df.loc[:, 'class'].value_counts()

Unnamed: 0_level_0,count
class,Unnamed: 1_level_1
services,2595
consumer,1548
financial,959
basic,947
capital,635
transportation,505
technology,495
healthcare,397
energy,354
utilities,282


In [4]:
# Build the two lookup tables between class names and integer ids.
# Ids follow the order in which each class first appears in the 'class' column.
class_names = df['class'].unique().tolist()
encode_class = {name: idx for idx, name in enumerate(class_names)}
decode_class = {idx: name for idx, name in enumerate(class_names)}

# Parallel lists: the raw documents and their integer-encoded classes.
texts = df['text'].tolist()
labels = [encode_class[name] for name in df['class'].tolist()]

# Fazendo a divisão entre treino, validação e teste

In [6]:
# First cut: hold out 20% of the corpus as the test set, stratified by label.
train_val_texts, test_texts, train_val_labels, test_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
# Second cut: reserve 17.5% of the remaining pool for validation, again stratified.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_val_texts,
    train_val_labels,
    test_size=0.175,
    random_state=42,
    stratify=train_val_labels,
)

# Treinando o BERT

In [7]:
class TextClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Sequence of raw input texts.
        labels: Sequence of integer class ids, aligned index-by-index with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, return_tensors='pt',
            max_length=..., padding='max_length', truncation=True)``; its result
            must expose ``'input_ids'`` and ``'attention_mask'`` tensors.
        max_length: Value forwarded to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        # Fail fast: a mismatch would otherwise only surface as an IndexError
        # part-way through an epoch.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            outputs flattened to 1-D) and ``'label'`` (a scalar ``torch.long`` tensor).
        """
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        return {
            # The tokenizer output is flattened so that batching stacks 1-D tensors.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Explicit integer dtype: class-index targets must be long for cross-entropy.
            'label': torch.tensor(label, dtype=torch.long),
        }

In [13]:
# Settings shared by the tokenizer, the datasets and the data loaders.
max_length = 128
batch_size = 16
bert_model_name = 'bert-base-uncased'

# Load the tokenizer that belongs to the selected pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

In [14]:
# Wrap the training and validation splits as datasets that tokenize on access.
train_dataset = TextClassificationDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)
val_dataset = TextClassificationDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
    max_length=max_length,
)

# Only the training batches are shuffled; validation keeps the dataset order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

Step 4: Build our custom BERT classifier

In [16]:
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and one linear output layer.

    Args:
        bert_model_name: Checkpoint name or path handed to ``BertModel.from_pretrained``.
        num_classes: Number of output logits produced per input sequence.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(p=0.1)
        # The linear head maps the encoder's hidden size to one logit per class.
        encoder_width = self.bert.config.hidden_size
        self.fc = nn.Linear(encoder_width, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the class logits computed from the encoder's pooled output."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))

Step 5: Define the train() function

In [17]:
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch over ``data_loader``.

    Each batch must be a mapping with ``'input_ids'``, ``'attention_mask'`` and
    ``'label'`` tensors. For every batch the gradients are reset, the
    cross-entropy loss between the model output and the labels is
    back-propagated, and both the optimizer and the scheduler are stepped.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that
            returns class logits.
        data_loader: Iterable of batches as described above.
        optimizer: Optimizer updating the model parameters.
        scheduler: Learning-rate scheduler, stepped once per batch.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: Mean cross-entropy loss per example over the epoch, or ``0.0``
        when ``data_loader`` yields no batches.
    """
    model.train()
    # The loss module is stateless, so build it once instead of once per batch.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    total_examples = 0
    for batch in data_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        scheduler.step()
        # Weight by batch size so a smaller final batch does not skew the mean.
        batch_examples = labels.size(0)
        total_loss += loss.item() * batch_examples
        total_examples += batch_examples
    return total_loss / total_examples if total_examples else 0.0

Step 6: Build our evaluation method

In [18]:
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without tracking gradients.

    Each batch must be a mapping with ``'input_ids'``, ``'attention_mask'`` and
    ``'label'`` tensors. The predicted class of an example is the index of its
    largest logit.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` that
            returns class logits.
        data_loader: Iterable of batches as described above.
        device: Device the model inputs are moved to before the forward pass.

    Returns:
        tuple: ``(accuracy, report)`` — the result of ``accuracy_score`` and of
        ``classification_report`` over all examples seen.
    """
    model.eval()
    predictions = []
    actual_labels = []
    with torch.no_grad():
        for batch in data_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions.extend(outputs.argmax(dim=1).cpu().tolist())
            # Labels are only compared as Python ints, so they never need to
            # be moved to the compute device.
            actual_labels.extend(batch['label'].tolist())
    accuracy = accuracy_score(actual_labels, predictions)
    # zero_division=0 reports 0.0 for classes that were never predicted without
    # emitting an UndefinedMetricWarning for each of them.
    report = classification_report(actual_labels, predictions, zero_division=0)
    return accuracy, report

Step 7: Build our prediction method

In [19]:
def predict_class(text, model, tokenizer, device, max_length=128):
    """Return the class name that ``model`` predicts for a single ``text``.

    The text is tokenized with ``max_length`` padding and truncation, run
    through the model in eval mode without gradients, and the index of the
    largest logit is translated through the module-level ``decode_class`` table.
    """
    model.eval()
    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )
    with torch.no_grad():
        logits = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )
        best_index = logits.max(dim=1).indices
    return decode_class[best_index.item()]

Step 8: Define our model’s parameters

In [20]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One output logit per distinct value of the 'class' column.
num_classes = len(df['class'].unique())

# Build the classifier, then move its parameters to the chosen device.
model = BERTClassifier(bert_model_name, num_classes)
model = model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [22]:
# Optimisation setup: AdamW with a linearly decaying learning rate and no warm-up.
num_epochs = 4
learning_rate = 2e-5
# Use PyTorch's own AdamW: the `AdamW` class exported by `transformers` is
# deprecated and has been removed from recent releases. `eps` and `weight_decay`
# are pinned to the defaults of that older implementation so the training
# configuration stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)
# The schedule length is the number of batches per epoch times the number of epochs.
total_steps = len(train_dataloader) * num_epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)



In [None]:
# Each epoch: one pass over the training batches, then a validation report.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")
    train(
        model=model,
        data_loader=train_dataloader,
        optimizer=optimizer,
        scheduler=scheduler,
        device=device,
    )
    accuracy, report = evaluate(model=model, data_loader=val_dataloader, device=device)
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/4


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Accuracy: 0.6907
              precision    recall  f1-score   support

           0       0.66      0.59      0.62       122
           1       0.78      0.29      0.42        97
           2       0.00      0.00      0.00        13
           3       0.68      0.69      0.69       200
           4       0.70      0.61      0.65        49
           5       0.85      0.65      0.74       143
           6       0.80      0.74      0.77        66
           7       0.68      0.87      0.76       339
           8       0.55      0.66      0.60        76
           9       0.60      0.80      0.69        86
          10       0.85      0.66      0.74        44

    accuracy                           0.69      1235
   macro avg       0.65      0.60      0.61      1235
weighted avg       0.70      0.69      0.68      1235

Epoch 2/4


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Validation Accuracy: 0.7927
              precision    recall  f1-score   support

           0       0.71      0.75      0.73       122
           1       0.79      0.57      0.66        97
           2       0.00      0.00      0.00        13
           3       0.77      0.85      0.81       200
           4       0.76      0.76      0.76        49
           5       0.86      0.80      0.83       143
           6       0.83      0.91      0.87        66
           7       0.83      0.83      0.83       339
           8       0.63      0.84      0.72        76
           9       0.84      0.86      0.85        86
          10       0.94      0.70      0.81        44

    accuracy                           0.79      1235
   macro avg       0.72      0.72      0.71      1235
weighted avg       0.79      0.79      0.79      1235

Epoch 3/4


In [None]:
# Persist only the learned weights (the state dict), not the whole module object.
torch.save(obj=model.state_dict(), f="bert_classifier.pth")

In [None]:
# Set up parameters
# (The stray leading space on each assignment was removed: it made this cell
# fail with an IndentationError.)
bert_model_name = 'bert-base-uncased'
# Derive the class count from the data instead of hard-coding 2, so it matches
# the output size the classifier was built with.
num_classes = len(df['class'].unique())
max_length = 128
batch_size = 16
num_epochs = 4
learning_rate = 2e-5