#Train

In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.cuda.amp import GradScaler, autocast
import pandas as pd
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Function to load the dataset from a JSONL file
def load_data(file_path):
    data = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            data.append(json.loads(line))
    return pd.DataFrame(data)

# Preprocess and tokenize the data
def preprocess_data(data, tokenizer, max_length=512):
    tokenized = tokenizer(data['text'].tolist(), padding='max_length', max_length=max_length, truncation=True, return_tensors="pt")
    return tokenized

In [3]:
# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

Using device: cuda


In [4]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [5]:
def save_preprocessed_data(tokenized_inputs, labels, file_path='preprocessed_data.pt'):
    torch.save({
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask'],
        'labels': labels
    }, file_path)

def load_preprocessed_data(file_path='preprocessed_data.pt'):
    data = torch.load(file_path)
    return data

In [None]:

# Load and prepare data
train_data = load_data('data/subtaskA/subtaskA_train_monolingual.jsonl')
tokenized_inputs = preprocess_data(train_data, tokenizer)
labels = torch.tensor(train_data['label'].tolist()).unsqueeze(1)

# Save the preprocessed data
save_preprocessed_data(tokenized_inputs, labels)

In [6]:

# Load the preprocessed data
preprocessed_data = load_preprocessed_data()
# Create TensorDataset and DataLoader
#dataset = TensorDataset(tokenized_inputs['input_ids'], tokenized_inputs['attention_mask'], labels)
dataset = TensorDataset(preprocessed_data['input_ids'], preprocessed_data['attention_mask'], preprocessed_data['labels'])
data_loader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=4)

# Define optimizer and loss
optimizer = AdamW(model.parameters(), lr=5e-5)
scaler = GradScaler()

In [7]:
# Training loop
def train_model(model, data_loader, optimizer, device, epochs=1):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch_idx, batch in enumerate(data_loader):
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            optimizer.zero_grad()

            with autocast():
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

            if batch_idx % 20 == 0:  # Log every 100 batches
                print(f'Epoch: {epoch+1}, Batch: {batch_idx}, Loss: {loss.item()}')

        avg_loss = total_loss / len(data_loader)
        print(f'End of Epoch {epoch+1}, Average Loss: {avg_loss}')


In [8]:
train_model(model, data_loader, optimizer, device)

# Save the trained model
model.save_pretrained('./my_fine_tuned_bert')
tokenizer.save_pretrained('./my_fine_tuned_bert')

Epoch: 1, Batch: 0, Loss: 0.7237396240234375
Epoch: 1, Batch: 20, Loss: 0.593963623046875
Epoch: 1, Batch: 40, Loss: 0.5186614990234375
Epoch: 1, Batch: 60, Loss: 0.34515380859375
Epoch: 1, Batch: 80, Loss: 0.09571170806884766
Epoch: 1, Batch: 100, Loss: 0.39281463623046875
Epoch: 1, Batch: 120, Loss: 0.42992591857910156
Epoch: 1, Batch: 140, Loss: 0.2562241554260254
Epoch: 1, Batch: 160, Loss: 0.0980074405670166
Epoch: 1, Batch: 180, Loss: 0.3342905044555664
Epoch: 1, Batch: 200, Loss: 0.6742444038391113
Epoch: 1, Batch: 220, Loss: 0.37153661251068115
Epoch: 1, Batch: 240, Loss: 0.19369721412658691
Epoch: 1, Batch: 260, Loss: 0.5417435169219971
Epoch: 1, Batch: 280, Loss: 0.2115495204925537
Epoch: 1, Batch: 300, Loss: 0.11365318298339844
Epoch: 1, Batch: 320, Loss: 0.392169713973999
Epoch: 1, Batch: 340, Loss: 0.17237114906311035
Epoch: 1, Batch: 360, Loss: 0.4928017854690552
Epoch: 1, Batch: 380, Loss: 0.09472620487213135
Epoch: 1, Batch: 400, Loss: 0.49457454681396484
Epoch: 1, Batc

('./my_fine_tuned_bert\\tokenizer_config.json',
 './my_fine_tuned_bert\\special_tokens_map.json',
 './my_fine_tuned_bert\\vocab.txt',
 './my_fine_tuned_bert\\added_tokens.json')

Test

In [4]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('./my_fine_tuned_bert')
model = BertForSequenceClassification.from_pretrained('./my_fine_tuned_bert', num_labels=2)
model.to(device)
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [14]:
def save_preprocessed_data_test(tokenized_inputs, file_path='preprocessed_data.pt'):
    torch.save({
        'input_ids': tokenized_inputs['input_ids'],
        'attention_mask': tokenized_inputs['attention_mask']
    }, file_path)

In [15]:
# Load and prepare data
test_data = load_data('data\SemEval2024-Task8-test\subtaskA_monolingual.jsonl')
tokenized_test = preprocess_data(test_data, tokenizer)
#labels = torch.tensor(test_data['label'].tolist()).unsqueeze(1)

# Save the preprocessed data
save_preprocessed_data_test(tokenized_test, 'preprocessed_test_data.pt')

In [16]:
# Create TensorDataset and DataLoader
preprocessed_data = load_preprocessed_data('preprocessed_test_data.pt')
test_dataset = TensorDataset(preprocessed_data['input_ids'], preprocessed_data['attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [17]:
# Generate predictions
def generate_predictions(model, dataloader):
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            inputs = {'input_ids': batch[0].to(device),
                      'attention_mask': batch[1].to(device)}
            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            predictions.extend(preds.cpu().numpy())
    return predictions

predictions = generate_predictions(model, test_loader)

In [18]:
# Save predictions to JSONL for scoring
def save_predictions(predictions, data, file_path):
    with open(file_path, 'w') as f:
        for idx, pred in enumerate(predictions):
            result = {
                "id": str(data.iloc[idx]['id']),  # Convert ID to string if needed
                "label": int(pred)  # Ensure label is a Python int
            }
            f.write(json.dumps(result) + "\n")

save_predictions(predictions, test_data, 'BERT_SubtaskA_mono_results.jsonl')

In [11]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Print evaluation metrics (optional, for immediate insight) on the dev dataset
def print_evaluation_metrics(predictions, true_labels):
    accuracy = accuracy_score(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='binary')
    print(f'Accuracy: {accuracy}')
    print(f'Precision: {precision}')
    print(f'Recall: {recall}')
    print(f'F1 Score: {f1}')

print_evaluation_metrics(predictions, test_data['label'].values)

Accuracy: 0.7068
Precision: 0.882396449704142
Recall: 0.4772
F1 Score: 0.6194184839044652
