In [1]:
import pandas as pd
from transformers import RobertaTokenizer, RobertaForMultipleChoice
import torch

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForMultipleChoice.from_pretrained('roberta-base')

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'classifier.weight', 'roberta.pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import json

def format_data_for_roberta(json_file_path):
    # Load the JSON file
    with open(json_file_path, 'r') as file:
        data = json.load(file)

    # Format data
    formatted_data = []
    for item in data:
        context = item['context']
        question = item['question']
        correct_answer = item['correct_answer']

        # Loop through each answer
        for i in range(4):
            answer_key = f'answer{i}'
            answer = item[answer_key]

            formatted_data.append({
                'context': context,
                'question': question,
                'answer': answer,
                'correct_answer': correct_answer == answer
            })
    
    return formatted_data

# Example usage
json_file_path = 'dataset.json'  # Replace with your file path
formatted_data = format_data_for_roberta(json_file_path)


In [3]:
# Specify the file name for the JSON file
file_name = 'roBERTa.json'

# Writing the list of JSON objects to a file
with open(file_name, 'w') as file:
    json.dump(formatted_data, file, indent=4)


In [4]:
from datasets import DatasetDict
ds = DatasetDict.from_json({'train': 'roBERTa.json'})

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
train_test_split = ds["train"].train_test_split(test_size=0.3)

# Combine the new train and validation set with the other sets in the original DatasetDict
ds = DatasetDict({
    'train': train_test_split['train'],
    'validation': train_test_split['test'],
})

In [7]:
ds['train'][0]

{'question': 'In the argument, the portion in boldface plays which of the following roles?',
 'context': "Country X' s recent stock-trading scandal should not diminish investors' confidence in the country's stock market. For one thing, <b> the discovery of the scandal confirms that Country X has a strong regulatory system </b>, as the following considerations show. In any stock market, some fraudulent activity is inevitable. If a stock market is well regulated, any significant stock-trading fraud in it will very likely be discovered. This deters potential perpetrators and facilitates improvement in regulatory processes.",
 'answer': "It is a conclusion for which the argument provides support and which itself is used to support the argument's main conclusion.",
 'correct_answer': True}

In [9]:
import json
from transformers import RobertaTokenizer
import torch

# Function to tokenize data
def tokenize_roberta_data(formatted_data):
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    tokenized_data = []
    
    for item in formatted_data:
        encoded_dict = tokenizer.encode_plus(
            item['context'] + " " + tokenizer.sep_token + " " + item['question'] + " " + tokenizer.sep_token + " " + item['answer'],
            add_special_tokens = True,
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt',
        )
        tokenized_data.append({
            'input_ids': encoded_dict['input_ids'],
            'attention_mask': encoded_dict['attention_mask'],
            'label': 1 if item['correct_answer'] else 0
        })

    return tokenized_data

tokenized_data_train = tokenize_roberta_data(ds['train'])
tokenized_data_val = tokenize_roberta_data(ds['validation'])


In [10]:
def show_original_sample(tokenized_data, tokenizer, sample_index):
    # Extract the token IDs for the sample
    input_ids = tokenized_data[sample_index]['input_ids'].squeeze()

    # Decode the token IDs back to text
    decoded_text = tokenizer.decode(input_ids, skip_special_tokens=False)

    return decoded_text

# Example usage
sample_index = 0  # Index of the sample you want to display
original_text = show_original_sample(tokenized_data_train, tokenizer, sample_index)

print(original_text)


<s>Country X' s recent stock-trading scandal should not diminish investors' confidence in the country's stock market. For one thing, <b> the discovery of the scandal confirms that Country X has a strong regulatory system </b>, as the following considerations show. In any stock market, some fraudulent activity is inevitable. If a stock market is well regulated, any significant stock-trading fraud in it will very likely be discovered. This deters potential perpetrators and facilitates improvement in regulatory processes. </s> In the argument, the portion in boldface plays which of the following roles? </s> It is a conclusion for which the argument provides support and which itself is used to support the argument's main conclusion.</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pa

In [29]:
len([d['label'] for d in tokenized_data_train])

12986

In [33]:
from transformers import RobertaForMultipleChoice, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

# Assuming tokenized_train_data and tokenized_val_data are already prepared
# Each should be a list of dictionaries with 'input_ids', 'attention_mask', and 'label'

def create_dataloader(tokenized_data, batch_size):
    input_ids = torch.stack([d['input_ids'] for d in tokenized_data])
    attention_masks = torch.stack([d['attention_mask'] for d in tokenized_data])
    labels = torch.tensor([d['label'] for d in tokenized_data])

    dataset = TensorDataset(input_ids, attention_masks, labels)
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Create DataLoaders
train_dataloader = create_dataloader(tokenized_data_train, batch_size=8)
val_dataloader = create_dataloader(tokenized_data_val, batch_size=8)

In [34]:
# Initialize the model
model = RobertaForMultipleChoice.from_pretrained('roberta-base')
optimizer = AdamW(model.parameters(), lr=2e-5, no_deprecation_warning=True)

# Training Loop
num_epochs = 3  # Set the number of epochs
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    for batch in train_dataloader:
        b_input_ids, b_attention_masks, b_labels = batch    
        model.zero_grad()
        
        outputs = model(input_ids=b_input_ids, attention_mask=b_attention_masks, labels=b_labels)
        print(outputs)
        loss = outputs.loss
        total_train_loss += loss.item()
        
        loss.backward()
        optimizer.step()

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Epoch {epoch+1} | Training loss: {avg_train_loss}")

    # # Validation Loop
    # model.eval()
    # total_eval_accuracy = 0

    # for batch in val_dataloader:
    #     b_input_ids, b_attention_masks, b_labels = batch
        
    #     with torch.no_grad():
    #         outputs = model(input_ids=b_input_ids, attention_mask=b_attention_masks)
        
    #     logits = outputs.logits
    #     predictions = torch.argmax(logits, dim=-1)
    #     total_eval_accuracy += torch.sum(predictions == b_labels)

    # avg_val_accuracy = total_eval_accuracy / len(val_dataloader)
    # print(f"Epoch {epoch+1} | Validation Accuracy: {avg_val_accuracy}")

# Save the model
model.save_pretrained('roberta-bertje')


Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'classifier.weight', 'roberta.pooler.dense.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


IndexError: Target 1 is out of bounds.