In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.cuda.amp import GradScaler, autocast
import pandas as pd
import json

In [2]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("Using device:", device)

Using device: cuda


In [3]:
# Function to load the dataset from a JSONL file
def load_data(file_path):
    """Read a JSON Lines file into a pandas DataFrame.

    Every non-blank line is parsed as one JSON document and becomes one row.
    Blank or whitespace-only lines (for example an empty line at the end of
    the file) are skipped instead of aborting the load with a
    ``json.JSONDecodeError``.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        pandas.DataFrame built from the parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # line.strip() is empty for blank lines, which json.loads cannot parse
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame(records)

# Number of target classes for Subtask B; update this if the label set changes
num_labels = 6
# The tokenizer and the model weights are loaded from the same checkpoint name
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)
model = BertForSequenceClassification.from_pretrained(checkpoint_name, num_labels=num_labels)
# Kept as the last expression so the cell still displays the model summary
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [4]:
# Preprocess and tokenize the data
def preprocess_data(data, tokenizer, max_length=512):
    """Tokenize the ``text`` column of ``data`` to a fixed sequence length.

    Args:
        data: Table with a ``text`` column; the column is turned into a plain
            list with ``.tolist()`` before tokenization.
        tokenizer: Callable invoked with the list of texts and the padding,
            truncation and tensor-format options below.
        max_length: Length passed to the tokenizer as ``max_length``; with
            ``padding='max_length'`` and ``truncation=True`` this is the
            length requested for every sequence.

    Returns:
        Whatever ``tokenizer`` returns for the texts (requested as PyTorch
        tensors via ``return_tensors="pt"``).
    """
    texts = data['text'].tolist()
    return tokenizer(
        texts,
        padding='max_length',
        max_length=max_length,
        truncation=True,
        return_tensors="pt",
    )

In [5]:
# Load and prepare Subtask B training data
# The path uses forward slashes: they work on Windows and POSIX alike, whereas
# backslashes in a normal string literal form invalid escape sequences that
# newer Python versions warn about and that only resolve on Windows.
train_data = load_data('data/SubtaskB/subtaskB_train.jsonl')  # Update with correct path
tokenized_inputs = preprocess_data(train_data, tokenizer)
labels = torch.tensor(train_data['label'].tolist())

# Bundle input ids, attention masks and labels row-by-row, then batch them
# (16 examples per batch, reshuffled on every pass)
dataset = TensorDataset(tokenized_inputs['input_ids'], tokenized_inputs['attention_mask'], labels)
data_loader = DataLoader(dataset, batch_size=16, shuffle=True)

In [7]:
def save_preprocessed_data(tokenized_inputs, labels, file_path='preprocessed_data_train_subtaskB.pt'):
    """Store the tokenized inputs and their labels in one file via ``torch.save``.

    Args:
        tokenized_inputs: Mapping that provides ``'input_ids'`` and
            ``'attention_mask'`` entries.
        labels: Labels saved alongside the inputs under the ``'labels'`` key.
        file_path: Destination handed to ``torch.save``.
    """
    payload = {key: tokenized_inputs[key] for key in ('input_ids', 'attention_mask')}
    payload['labels'] = labels
    torch.save(payload, file_path)

# Save the tokenized training inputs and labels; no path is given, so the
# function's default file name is used
save_preprocessed_data(tokenized_inputs, labels)

In [6]:
# Define optimizer and scaler for training
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are given explicitly with the values transformers.AdamW
# applied by default, because torch's own defaults differ (eps=1e-8,
# weight_decay=0.01) and would otherwise change the training setup.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)
# Gradient scaler for mixed-precision training
scaler = GradScaler()

# Function to train the model
def train_model(model, data_loader, optimizer, device, epochs=1, scaler=None):
    """Train ``model`` on the batches of ``data_loader`` using autocast + GradScaler.

    Every batch must unpack into ``(input_ids, attention_mask, labels)``. The
    model is called as ``model(input_ids, attention_mask=..., labels=...)`` and
    its output must expose ``.loss``. The loss is printed every 20 batches and
    the mean batch loss is printed at the end of each epoch.

    Args:
        model: Model to train; it is switched to training mode.
        data_loader: Iterable of batches, iterated once per epoch. It does not
            need to support ``len()``.
        optimizer: Optimizer stepped once per batch.
        device: Device (string or ``torch.device``) the batch tensors are
            moved to.
        epochs: Number of passes over ``data_loader``.
        scaler: Optional ``GradScaler`` to use. When omitted, a new one is
            created, enabled only when ``device`` is a CUDA device.

    Returns:
        None. Progress is reported through ``print``.
    """
    # Mixed precision is only requested on CUDA; on other devices autocast and
    # the default scaler are created disabled instead of warning and disabling
    # themselves.
    use_amp = torch.device(device).type == "cuda"
    if scaler is None:
        scaler = GradScaler(enabled=use_amp)

    model.train()
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for batch_idx, batch in enumerate(data_loader):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            optimizer.zero_grad()

            with autocast(enabled=use_amp):
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            # Scale the loss before backward, then let the scaler drive the
            # optimizer step and adjust its scale factor.
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Read the scalar once and reuse it for the total and the log line
            batch_loss = loss.item()
            total_loss += batch_loss
            num_batches += 1

            if batch_idx % 20 == 0:  # Logging for visibility
                print(f'Epoch: {epoch+1}, Batch: {batch_idx}, Loss: {batch_loss}')

        if num_batches == 0:
            # An empty loader would otherwise cause a division by zero
            print(f'End of Epoch {epoch+1}, no batches were processed')
            continue

        avg_loss = total_loss / num_batches
        print(f'End of Epoch {epoch+1}, Average Loss: {avg_loss}')



In [8]:
# Run the training loop with the model, batches and optimizer prepared earlier
train_model(model=model, data_loader=data_loader, optimizer=optimizer, device=device)

# Saving the trained model
output_dir = './my_fine_tuned_bert_multiclass'
model.save_pretrained(output_dir)

Epoch: 1, Batch: 0, Loss: 1.80242919921875
Epoch: 1, Batch: 20, Loss: 1.7625732421875
Epoch: 1, Batch: 40, Loss: 1.2254180908203125
Epoch: 1, Batch: 60, Loss: 1.4940338134765625
Epoch: 1, Batch: 80, Loss: 1.1616249084472656
Epoch: 1, Batch: 100, Loss: 1.0655441284179688
Epoch: 1, Batch: 120, Loss: 0.9373207092285156
Epoch: 1, Batch: 140, Loss: 0.9362010955810547
Epoch: 1, Batch: 160, Loss: 0.8044834136962891
Epoch: 1, Batch: 180, Loss: 1.2731742858886719
Epoch: 1, Batch: 200, Loss: 0.5685396194458008
Epoch: 1, Batch: 220, Loss: 0.7308998107910156
Epoch: 1, Batch: 240, Loss: 0.8597469329833984
Epoch: 1, Batch: 260, Loss: 0.9117484092712402
Epoch: 1, Batch: 280, Loss: 0.5755653381347656
Epoch: 1, Batch: 300, Loss: 0.5395126342773438
Epoch: 1, Batch: 320, Loss: 0.9269037246704102
Epoch: 1, Batch: 340, Loss: 0.8286418914794922
Epoch: 1, Batch: 360, Loss: 0.5047273635864258
Epoch: 1, Batch: 380, Loss: 0.27229976654052734
Epoch: 1, Batch: 400, Loss: 0.7296180725097656
Epoch: 1, Batch: 420, L

In [9]:
# Save the tokenizer files into the same directory as the trained model, so
# both can be reloaded from a single path
tokenizer.save_pretrained('./my_fine_tuned_bert_multiclass')

('./my_fine_tuned_bert_multiclass\\tokenizer_config.json',
 './my_fine_tuned_bert_multiclass\\special_tokens_map.json',
 './my_fine_tuned_bert_multiclass\\vocab.txt',
 './my_fine_tuned_bert_multiclass\\added_tokens.json')

Test: evaluate the fine-tuned model on the Subtask B dev set

In [10]:
# Load the tokenizer and model for testing
tokenizer = BertTokenizer.from_pretrained('./my_fine_tuned_bert_multiclass')
model = BertForSequenceClassification.from_pretrained('./my_fine_tuned_bert_multiclass', num_labels=6)
model.to(device)
model.eval()  # evaluation mode for inference

# Load and prepare test data
# The path uses forward slashes: they work on Windows and POSIX alike, whereas
# backslashes in a normal string literal form invalid escape sequences that
# newer Python versions warn about and that only resolve on Windows.
test_data = load_data('data/SubtaskB/subtaskB_dev.jsonl')  # Update with correct path
tokenized_test = preprocess_data(test_data, tokenizer)
# Only the model inputs go into the dataset; shuffle=False keeps the batches
# in the same order as the rows of test_data
test_dataset = TensorDataset(tokenized_test['input_ids'], tokenized_test['attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)


In [11]:
# Function to generate predictions
def generate_predictions(model, dataloader, device=None):
    """Predict one class index per example yielded by ``dataloader``.

    The model is put into evaluation mode for the duration of the call, so
    layers such as dropout do not perturb the predictions, and its previous
    training/eval mode is restored afterwards. Gradients are not tracked.

    Args:
        model: ``torch.nn.Module`` called as
            ``model(input_ids=..., attention_mask=...)``; its output must
            expose ``.logits`` with the classes along dimension 1.
        dataloader: Iterable of batches whose first two items are the input
            ids and the attention mask.
        device: Device the batch tensors are moved to. Defaults to the device
            of the model's first parameter (CPU if the model has none), so the
            function no longer depends on a notebook-level ``device`` variable.

    Returns:
        list: One NumPy integer (the argmax class index) per example, in the
        order the dataloader produced them.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()
    predictions = []
    try:
        with torch.no_grad():
            for batch in dataloader:
                inputs = {'input_ids': batch[0].to(device),
                          'attention_mask': batch[1].to(device)}
                logits = model(**inputs).logits
                preds = torch.argmax(logits, dim=1)
                predictions.extend(preds.cpu().numpy())
    finally:
        # Leave the model in whatever mode the caller had it in
        model.train(was_training)
    return predictions

# Predicted class index for every example in test_loader, in loader order
predictions = generate_predictions(model, test_loader)

In [12]:
# True labels for evaluation, read from the 'label' column of the test data
test_labels = torch.tensor(test_data['label'].tolist())  # This assumes labels are present in your test data JSONL

from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report

# Print evaluation metrics
def print_evaluation_metrics(predictions, true_labels):
    """Print accuracy and weighted-average precision, recall and F1.

    Args:
        predictions: Predicted labels, one per example.
        true_labels: Reference labels, aligned with ``predictions``.
    """
    accuracy = accuracy_score(true_labels, predictions)
    # The fourth element of the result is not reported, so it is not unpacked
    scores = precision_recall_fscore_support(true_labels, predictions, average='weighted')
    precision, recall, f1 = scores[0], scores[1], scores[2]
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')

# Report the metrics; the label tensor is passed as a NumPy array
print_evaluation_metrics(predictions, test_labels.numpy())

# Optional: Save predictions to a JSONL file for further analysis or submission
def save_predictions(predictions, data, file_path):
    """Write one ``{"id": ..., "label": ...}`` JSON object per line to ``file_path``.

    Predictions are matched to rows of ``data`` by position: the i-th
    prediction is written with the i-th value of the ``id`` column.

    Args:
        predictions: Iterable of predicted labels; each is converted with
            ``int()`` so NumPy integers serialize as plain JSON numbers.
        data: pandas DataFrame with an ``id`` column; ids are written as
            strings.
        file_path: Output path; an existing file is overwritten.

    Raises:
        ValueError: If the number of predictions differs from the number of
            rows in ``data``, which would otherwise produce a truncated or
            misaligned file.
    """
    predictions = list(predictions)
    if len(predictions) != len(data):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(data)} rows; "
            "they must be the same length"
        )

    # Read ids from the column itself. Going through data.iloc[idx] builds a
    # row Series with one common dtype, which can turn an integer id into a
    # float ("10.0") when the remaining columns are floats.
    ids = data['id'].tolist()
    with open(file_path, 'w', encoding='utf-8') as f:
        for example_id, pred in zip(ids, predictions):
            result = {
                "id": str(example_id),  # Convert ID to string if needed
                "label": int(pred)  # Ensure label is a Python int
            }
            f.write(json.dumps(result) + "\n")

# Write the id/label pairs for the test data to a JSONL results file
save_predictions(predictions, test_data, 'BERT_SubtaskB_results.jsonl')

Accuracy: 0.6213333333333333
Precision: 0.6440920563111239
Recall: 0.6213333333333333
F1 Score: 0.6129514471012584
