In [None]:
import os
import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np
# Compatibility shim: re-create the `np.object` alias, which recent NumPy
# releases no longer provide. Presumably one of the libraries imported below
# still references it -- TODO confirm which one and remove this when possible.
np.object = object 

In [None]:
# Run configuration. These names are read as globals by the cells below.

# Maximum number of tokens per encoded example; get_feature truncates or pads
# every input to this length.
input_max_len = 512

# Number of examples per DataLoader batch.
batch_size = 4

# Optimizer step size and number of passes over the training data.
learning_rate = 2e-5
epochs = 3

# Available input layouts. `concat_option_id` indexes into this list and is
# the default `tag` of concat_text.
concat_options = ["without-options", "options-in-between", "options-at-end"]
concat_option_id = 2

# Data preparation

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset
import torch
from torch.utils.data import DataLoader
import json

In [None]:
# Directory holding the SemEval JSONL files. Both the train and the dev
# splits are read from this same folder.
semeval_data_dir = '../input/semevaldataset/training_data'

# Training splits, one file per task.
task_1_train_data_path = f'{semeval_data_dir}/Task_1_train.jsonl'
task_2_train_data_path = f'{semeval_data_dir}/Task_2_train.jsonl'

# Development splits, used as validation data during training.
task_1_eval_data_path = f'{semeval_data_dir}/Task_1_dev.jsonl'
task_2_eval_data_path = f'{semeval_data_dir}/Task_2_dev.jsonl'

In [None]:
def concat_text(question, article, options, tag = concat_option_id):
    """Build the single input string that is later passed to the tokenizer.

    The ``@placeholder`` marker in ``question`` is replaced by ``[MASK]`` and
    the selected parts are joined with ``' [SEP] '``.

    Args:
        question: Question text containing ``@placeholder``.
        article: Article text.
        options: Iterable of option strings; joined with single spaces when
            the chosen layout includes them.
        tag: Layout selector.
            0 -> ``question [SEP] article``
            1 -> ``question [SEP] options [SEP] article``
            2 -> ``question [SEP] article [SEP] options``
            The default is the value ``concat_option_id`` had when this
            definition was executed (defaults are bound at definition time).

    Returns:
        The concatenated string.

    Raises:
        ValueError: If ``tag`` is not 0, 1 or 2. Without this check an
            unknown tag would silently produce ``None``.
    """
    masked_question = question.replace("@placeholder", '[MASK]')

    if tag == 0:
        parts = [masked_question, article]
    elif tag == 1:
        parts = [masked_question, ' '.join(options), article]
    elif tag == 2:
        parts = [masked_question, article, ' '.join(options)]
    else:
        raise ValueError(f"Unknown concat tag {tag!r}; expected 0, 1 or 2.")

    return ' [SEP] '.join(parts)

In [None]:
def read_examples(input_file):
    """Load a JSONL file into a list of example dicts.

    Every non-blank line is parsed as one JSON object. The keys ``article``,
    ``question``, ``label`` and ``option_0`` .. ``option_4`` are read, each
    defaulting to ``''`` when absent.

    Args:
        input_file: Path of the UTF-8 encoded JSONL file.

    Returns:
        A list of dicts with the keys ``"text"`` (output of ``concat_text``),
        ``"options"`` (list of the five option strings) and ``"label"``
        (the label converted with ``int``).

    Raises:
        ValueError: If a line is not valid JSON, or if its label is missing
            or cannot be converted to ``int``.
    """
    examples = []
    with open(input_file, 'r', encoding='utf-8') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                # Blank lines are not records; json.loads('') would raise.
                continue

            record = json.loads(stripped)
            article = record.get('article', '')
            label = record.get('label', '')
            question = record.get('question', '')
            options = [record.get(f'option_{i}', '') for i in range(5)]
            examples.append({
                "text" :  concat_text(question, article, options),
                "options" : options,
                "label" : int(label)
            })
    return examples

In [None]:
def _jsonl_to_dataset(path):
    """Read one JSONL split and wrap the parsed examples in a `Dataset`."""
    return Dataset.from_pandas(pd.DataFrame(read_examples(path)))


# Training splits.
task_1_train_data = _jsonl_to_dataset(task_1_train_data_path)
task_2_train_data = _jsonl_to_dataset(task_2_train_data_path)

# Development (validation) splits.
task_1_eval_data = _jsonl_to_dataset(task_1_eval_data_path)
task_2_eval_data = _jsonl_to_dataset(task_2_eval_data_path)

In [None]:
# Show the first task 1 training example to check its "text", "options" and "label" fields.
task_1_train_data[0]

# Modeling

In [None]:
from transformers import BertTokenizer, BertForMaskedLM

In [None]:
# Load the tokenizer for 'bert-base-uncased', the same checkpoint name the
# training cells pass to BertForMaskedLM.from_pretrained.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')  

In [None]:
def get_feature(example):
    """Tokenize one example and attach its masked-LM training targets.

    ``example['text']`` is encoded with the module-level ``tokenizer``,
    truncated or padded to ``input_max_len`` tokens. Each position holding
    the tokenizer's mask token is labelled with the id of the first token of
    the correct option (``example['options'][example['label']]``); every
    other position is labelled -100 so the loss function ignores it.

    Args:
        example: Mapping with the keys ``'text'``, ``'options'`` and
            ``'label'``.

    Returns:
        The same ``example`` object with ``"input_ids"``,
        ``"attention_mask"`` and ``"labels"`` added.
    """
    encoding = tokenizer(
        example['text'],
        max_length=input_max_len,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
    )

    targets = []
    for token_id in encoding.input_ids:
        if token_id == tokenizer.mask_token_id:
            # Only the first sub-token of the correct option is used as target.
            answer = example['options'][example['label']]
            answer_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(answer))
            targets.append(answer_ids[0])
        else:
            targets.append(-100)

    example["input_ids"] = encoding.input_ids
    example["attention_mask"] = encoding.attention_mask
    example["labels"] = targets

    return example

In [None]:
# Raw columns passed as `remove_columns` to Dataset.map, i.e. dropped after get_feature has run.
remove_columns=["text", "label", "options"]
# Feature columns that set_format exposes as torch tensors.
columns = ["input_ids", "attention_mask", "labels"]

In [None]:
# Encode every split with get_feature and drop the raw columns afterwards.
# Processing order: task 1 train, task 1 dev, task 2 train, task 2 dev.
(
    task_1_train_tokenize,
    task_1_val_tokenize,
    task_2_train_tokenize,
    task_2_val_tokenize,
) = [
    split.map(get_feature, remove_columns=remove_columns)
    for split in (task_1_train_data, task_1_eval_data, task_2_train_data, task_2_eval_data)
]

In [None]:
# Make each tokenized split return torch tensors for the model-input columns.
for tokenized_split in (
    task_1_train_tokenize,
    task_2_train_tokenize,
    task_1_val_tokenize,
    task_2_val_tokenize,
):
    tokenized_split.set_format(type='torch', columns=columns)

In [None]:
# Display the tokenized task 1 training split.
task_1_train_tokenize

In [None]:
# Display the tokenized task 1 validation split.
task_1_val_tokenize

# Training

In [None]:
from torch.optim import AdamW

In [None]:
def train(model, train_dataloader, val_dataloader, output_checkpoint):
    """Fine-tune ``model`` and save the checkpoint with the lowest validation loss.

    Runs ``epochs`` passes over ``train_dataloader`` using AdamW with
    ``learning_rate`` (both are module-level settings). After each pass the
    mean per-batch loss on ``val_dataloader`` is computed; whenever it is
    lower than the best value seen so far, the model is written to
    ``output_checkpoint`` with ``save_pretrained``.

    Args:
        model: Model whose call accepts ``input_ids``, ``attention_mask`` and
            ``labels`` and returns an object with a ``loss`` attribute.
        train_dataloader: Sized iterable of training batches, each indexable
            by ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        val_dataloader: Sized iterable of validation batches with the same
            keys.
        output_checkpoint: Destination passed to ``model.save_pretrained``.

    Returns:
        None. Progress and losses are printed.
    """
    lowest_val_loss = float('inf')

    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    model = model.to(device)

    def batch_loss(batch):
        """Move one batch to ``device`` and return the model's loss for it."""
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        return model(input_ids, attention_mask=attention_mask, labels=labels).loss

    for _ in range(epochs):
        # ---- training pass ----
        model.train()
        running_loss = 0
        for step, batch in enumerate(train_dataloader):
            optimizer.zero_grad()

            # Report on every 100th batch, skipping the very first one.
            log_this_step = step != 0 and step % 100 == 0
            if log_this_step:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

            loss = batch_loss(batch)
            loss.backward()
            optimizer.step()

            if log_this_step:
                print('Step loss: ', loss.item())

            running_loss += loss.item()

        avg_train_loss = running_loss / len(train_dataloader)
        print(f'Training loss: {avg_train_loss}')

        # ---- validation pass (no gradients needed) ----
        model.eval()
        val_loss_sum = 0
        with torch.no_grad():
            for batch in val_dataloader:
                val_loss_sum += batch_loss(batch).item()

        avg_val_loss = val_loss_sum / len(val_dataloader)
        print(f'Validation loss: {avg_val_loss}')

        # Keep only the checkpoint with the best validation loss so far.
        if avg_val_loss < lowest_val_loss:
            lowest_val_loss = avg_val_loss
            model.save_pretrained(output_checkpoint)
            print('best model saved.')

In [None]:
# Training batches are reshuffled every epoch. Validation batches keep a
# fixed order: train() averages the per-batch validation loss and uses it to
# pick the best checkpoint, so the batch composition should not change from
# one epoch to the next.
task_1_train_dataloader = DataLoader(task_1_train_tokenize, batch_size=batch_size, shuffle=True)
task_1_val_dataloader = DataLoader(task_1_val_tokenize, batch_size=batch_size, shuffle=False)

task_2_train_dataloader = DataLoader(task_2_train_tokenize, batch_size=batch_size, shuffle=True)
task_2_val_dataloader = DataLoader(task_2_val_tokenize, batch_size=batch_size, shuffle=False)

In [None]:
# Fine-tune a freshly loaded pretrained masked-LM on task 1; train() writes
# the best checkpoint to '../working/task_1_checkpoint'.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
train(model, task_1_train_dataloader, task_1_val_dataloader, '../working/task_1_checkpoint')

In [None]:
# Reload the pretrained weights so task 2 does not start from the model
# fine-tuned on task 1, then fine-tune on task 2; train() writes the best
# checkpoint to '../working/task_2_checkpoint'.
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
train(model, task_2_train_dataloader, task_2_val_dataloader, '../working/task_2_checkpoint')