In [None]:
!pip install transformers

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
import re


Successfully installed huggingface-hub-0.16.4 safetensors-0.3.2 tokenizers-0.13.3 transformers-4.31.0


In [None]:
from google.colab import drive

# Mount Google Drive so the dataset stored there becomes visible under /content/drive.
drive.mount('/content/drive', force_remount=True)

# Load the training and test CSV files, decoding them as ISO-8859-1.
train_data, test_data = (
    pd.read_csv(csv_path, encoding='ISO-8859-1')
    for csv_path in (
        '/content/drive/MyDrive/NLP/disaster/train.csv',
        '/content/drive/MyDrive/NLP/disaster/test.csv',
    )
)

Mounted at /content/drive


In [None]:
# Remove URLs and convert the text to lowercase
def preprocess_text(text):
    """Strip URLs from ``text`` and lowercase what remains.

    Every run of non-whitespace characters starting with ``http`` is removed
    (the match is case-sensitive and happens before lowercasing). Whitespace
    around a removed URL is left untouched.

    Args:
        text: The raw text. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` first.

    Returns:
        The cleaned, lowercased string.
    """
    if not isinstance(text, str):
        # `x != x` is only true for NaN; guard it so missing cells don't crash re.sub.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)
    text = re.sub(r'http\S+', '', text)
    return text.lower()

# Clean the `text` column of both splits with the same preprocessing function.
for frame in (train_data, test_data):
    frame['text'] = frame['text'].apply(preprocess_text)


In [None]:
# Tokenize the training and test sets

# Every sequence is padded or truncated to exactly this many tokens.
MAX_LENGTH = 256

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)


def encode_texts(texts, tokenizer, max_length=MAX_LENGTH):
    """Tokenize a collection of strings into fixed-length tensors.

    Args:
        texts: Iterable of strings (e.g. a pandas Series of tweets).
        tokenizer: The tokenizer used to encode the texts.
        max_length: Length every sequence is padded/truncated to.

    Returns:
        A ``(input_ids, attention_mask)`` tuple of tensors with one row per
        input text and ``max_length`` columns.
    """
    encoded = tokenizer(
        list(texts),
        add_special_tokens=True,
        max_length=max_length,
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        padding='max_length',
        # Explicit truncation: the tokenizer otherwise warns and falls back to it implicitly.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']


# Training set: token ids, attention masks and the target labels
train_input_ids, train_attention_masks = encode_texts(train_data['text'], tokenizer)
train_labels = torch.tensor(train_data['target'].values)

# Test set: token ids and attention masks only (no labels available)
test_input_ids, test_attention_masks = encode_texts(test_data['text'], tokenizer)


Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# Load the pre-trained BERT model with a 2-class sequence-classification head
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False
)

# Build the full label tensor from the DataFrame instead of reusing `train_labels`:
# this cell reassigns `train_labels` to the training subset below, so splitting on it
# would fail with mismatched lengths if the cell were run a second time.
all_labels = torch.tensor(train_data['target'].values)

# Split the data into training (90%) and validation (10%) subsets.
# train_test_split returns a (train, validation) pair for each input, in input order.
train_inputs, val_inputs, train_labels, val_labels, train_masks, val_masks = train_test_split(
    train_input_ids, all_labels, train_attention_masks, test_size=0.1, random_state=2021)

# Create the TensorDatasets for training and validation
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Training parameters (adjust to your machine)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 8
epochs = 4

# The batches are moved to `device` below, so the model has to live there too;
# otherwise the first forward pass fails on a GPU runtime.
model.to(device)

# Create the DataLoaders. Training batches are shuffled; validation batches are not,
# since shuffling brings nothing to evaluation and only makes it non-deterministic.
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
validation_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=batch_size)

# Optimizer and learning-rate scheduler.
# torch.optim.AdamW replaces the deprecated transformers.AdamW; weight_decay=0.0
# matches the default of the optimizer it replaces.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.0)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)

# Training loop: one pass over the training set, then one over the validation set, per epoch
for epoch in range(epochs):
    # Training
    model.train()
    total_loss = 0.0
    for step, batch in enumerate(train_dataloader):
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs[0]  # with `labels` given, the first output is the loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()  # the schedule is defined per optimizer step, not per epoch

    # Mean training loss over the batches of this epoch
    avg_train_loss = total_loss / len(train_dataloader)

    # Validation
    model.eval()
    eval_loss = 0.0
    nb_eval_steps = 0
    nb_correct = 0
    nb_examples = 0
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        loss, logits = outputs[0], outputs[1]
        eval_loss += loss.item()
        nb_eval_steps += 1
        nb_correct += (logits.argmax(dim=1) == b_labels).sum().item()
        nb_examples += b_labels.size(0)
    # max(..., 1) avoids a ZeroDivisionError if the validation set is empty
    avg_val_loss = eval_loss / max(nb_eval_steps, 1)
    val_accuracy = nb_correct / max(nb_examples, 1)

    # Report the metrics; previously they were computed but never shown
    print(
        f"Epoch {epoch + 1}/{epochs} - "
        f"train loss: {avg_train_loss:.4f} - "
        f"val loss: {avg_val_loss:.4f} - "
        f"val accuracy: {val_accuracy:.4f}"
    )




KeyboardInterrupt: ignored

In [None]:
from torch.utils.data import SequentialSampler

# 1. Wrap the tokenized test set in a DataLoader that walks it in its original order,
#    so predictions line up with the rows of `test_data`.
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=32)

# 2. Run the fine-tuned BERT model over the test set
model.eval()  # switch the model to evaluation mode
predictions = []

with torch.no_grad():  # no gradients are needed for inference
    for ids, mask in test_dataloader:
        result = model(ids.to(device), token_type_ids=None, attention_mask=mask.to(device))
        scores = result[0].detach().cpu().numpy()  # without labels, the first output is the logits
        # The predicted class is the index of the highest logit in each row
        predictions.extend(np.argmax(scores, axis=1))

# 3. Write the predictions in the expected `id,target` format
submission = pd.DataFrame({'id': test_data['id'], 'target': predictions})
submission.to_csv('submission_BERT.csv', index=False)

Colab não aguentou o treino do transformer