In [3]:
# Import all required packages
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, get_linear_schedule_with_warmup
from torch.optim import AdamW
import pandas as pd
import numpy as np
from tqdm.auto import tqdm # For a nice progress bar


# Read the training and test CSV files into DataFrames (paths are relative to the notebook)
traindata, testdata = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [4]:
# Pick the compute device once: a CUDA GPU when PyTorch can see one, the CPU otherwise
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Report which device was selected
if device.type == "cuda":
    print(f"Success! There are {torch.cuda.device_count()} GPU(s) available.")
    print("We will use the GPU:", torch.cuda.get_device_name(0))
else:
    print("No GPU available, using the CPU instead.")

Success! There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce RTX 4050 Laptop GPU


In [5]:
# Load the pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def encode_texts(texts, max_length=128):
    """Tokenize a collection of raw strings into fixed-length PyTorch tensors.

    Both data splits are routed through this single helper so that their
    tokenization settings cannot drift apart when one call is edited.

    Args:
        texts: Iterable of strings to tokenize (e.g. a DataFrame text column).
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer output for ``texts``, requested as PyTorch tensors
        (``return_tensors='pt'``).
    """
    return tokenizer(
        list(texts),
        max_length=max_length,
        padding='max_length',  # pad every example up to max_length
        truncation=True,       # cut longer examples down to max_length
        return_tensors='pt'
    )


# Tokenize and format the training and test data with identical settings
train_encodings = encode_texts(traindata['text'])
test_encodings = encode_texts(testdata['text'])

# Prepare the labels as a tensor
train_labels = torch.tensor(traindata['target'].values)

In [6]:
class DisasterTweetDataset(Dataset):
    """Map-style dataset over tokenized examples with optional labels.

    Each item is a dict holding the ``idx``-th entry of every field in
    ``encodings``; a ``'labels'`` entry is added only when labels were given.
    """

    def __init__(self, encodings, labels=None):
        """Store the encodings mapping and, optionally, the per-example labels."""
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, measured on the ``'input_ids'`` field."""
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        """Return the example at position ``idx`` as a field-name -> value dict."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        # Unlabeled datasets (e.g. the test split) return the features only
        if self.labels is None:
            return sample
        sample['labels'] = self.labels[idx]
        return sample

# Wrap the tokenized splits in datasets; the test split is built without labels
train_dataset = DisasterTweetDataset(encodings=train_encodings, labels=train_labels)
test_dataset = DisasterTweetDataset(encodings=test_encodings)

# Batch both splits in groups of 16; only the training loader shuffles,
# so test batches keep the row order of the test encodings
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
test_dataloader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=16)

In [7]:
# Seed PyTorch's random number generators before the model is built.
# The classification head is newly (randomly) initialized and the training
# loader shuffles its batches, so without a seed every run gives different results.
# NOTE: some GPU kernels are nondeterministic, so small run-to-run differences may remain.
SEED = 42
torch.manual_seed(SEED)

# Load the PyTorch version of the pre-trained BERT model with a two-class head
# (num_labels=2 is the library default; stated explicitly for readers)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Move the model to the selected device (GPU when available, otherwise CPU)
model.to(device)

# Number of passes over the training data the schedule is planned for.
# Keep this equal to the epoch count used by the training loop; otherwise the
# linear decay reaches zero too early or never finishes.
NUM_EPOCHS = 2

# Set up the optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=3e-5)
# One scheduler step is taken per batch, so the schedule spans batches-per-epoch * epochs
total_steps = len(train_dataloader) * NUM_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
print("Starting training...")
model.train()  # switch the model to training mode before the first batch

for epoch in range(2):  # two full passes over the training data
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}")
    for batch in progress_bar:
        # Start each step from clean gradients
        optimizer.zero_grad()

        # Move exactly the tensors the model consumes onto the target device
        inputs = {
            name: batch[name].to(device)
            for name in ('input_ids', 'attention_mask', 'labels')
        }

        # Forward pass; because labels are supplied, the output carries the loss
        outputs = model(**inputs)
        loss = outputs.loss

        # Backward pass, then advance the optimizer and the per-batch LR schedule
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Show the current batch loss next to the progress bar
        progress_bar.set_postfix(loss=loss.item())

print("Training finished.")

Starting training...


Epoch 1:   0%|          | 0/476 [00:00<?, ?it/s]

Epoch 2:   0%|          | 0/476 [00:00<?, ?it/s]

Training finished.


In [9]:
print("Starting prediction...")
model.eval()  # switch the model to evaluation mode
all_predictions = []

# Gradients are not needed for inference, so disable their tracking
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Predicting"):
        # Run the batch through the model on the target device
        logits = model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
        ).logits

        # The predicted class is the index of the largest logit in each row;
        # bring the result back to the CPU as a numpy array before collecting it
        batch_predictions = logits.argmax(dim=1).cpu().numpy()
        all_predictions.extend(batch_predictions)

print("Prediction finished.")

Starting prediction...


Predicting:   0%|          | 0/204 [00:00<?, ?it/s]

Prediction finished.


In [10]:
# Build the submission table: the test ids paired with the predicted targets
submission_bert_pytorch = testdata[['id']].assign(target=all_predictions)

# Write it to disk without the DataFrame index column
submission_bert_pytorch.to_csv('submission_bert_pytorch.csv', index=False)

# Confirm the write and preview the first rows
print("\nSubmission file 'submission_bert_pytorch.csv' created successfully!")
print(submission_bert_pytorch.head())


Submission file 'submission_bert_pytorch.csv' created successfully!
   id  target
0   0       1
1   2       1
2   3       1
3   9       1
4  11       1
