In [1]:
from datasets import load_dataset
from datasets import Dataset
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
cbt_dataset = load_dataset('cbt','CN')

Found cached dataset cbt (/home/ys3344/.cache/huggingface/datasets/cbt/CN/1.1.0/dc4451a8a4b50cebb78fdb19fa9f964b27fcdcef915467b8b7055a3a8d8cef7b)
100%|██████████| 3/3 [00:00<00:00,  3.28it/s]


In [23]:
cbt_dataset = load_dataset('cbt','V')

Found cached dataset cbt (/home/ys3344/.cache/huggingface/datasets/cbt/V/1.1.0/dc4451a8a4b50cebb78fdb19fa9f964b27fcdcef915467b8b7055a3a8d8cef7b)
100%|██████████| 3/3 [00:00<00:00, 736.70it/s]


In [36]:
cbt_dataset = load_dataset('cbt','NE')

Found cached dataset cbt (/home/ys3344/.cache/huggingface/datasets/cbt/NE/1.1.0/dc4451a8a4b50cebb78fdb19fa9f964b27fcdcef915467b8b7055a3a8d8cef7b)
100%|██████████| 3/3 [00:00<00:00, 637.59it/s]


In [44]:
cbt_dataset = load_dataset('cbt','P')

Downloading and preparing dataset cbt/P to /home/ys3344/.cache/huggingface/datasets/cbt/P/1.1.0/dc4451a8a4b50cebb78fdb19fa9f964b27fcdcef915467b8b7055a3a8d8cef7b...


                                                                                         

Dataset cbt downloaded and prepared to /home/ys3344/.cache/huggingface/datasets/cbt/P/1.1.0/dc4451a8a4b50cebb78fdb19fa9f964b27fcdcef915467b8b7055a3a8d8cef7b. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:02<00:00,  1.25it/s]


In [3]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [4]:
def preprocess_function(examples):
    sentences = [[''.join(sent)] * 10 for sent in examples["sentences"]]
    question = examples["question"]
    possible_answer = [
        [q.replace('XXXXX', option) for option in examples["options"][i]] for i, q in enumerate(question)
    ]

    all_sentences = sum(sentences, [])
    all_possible_answer = sum(possible_answer, [])
    
    tokenized_examples = tokenizer(all_sentences, all_possible_answer, truncation=True)

    return {k: [v[i : i + 10] for i in range(0, len(v), 10)] for k, v in tokenized_examples.items()}

In [45]:
train_set = Dataset.from_dict(cbt_dataset['train'][:10000]).map(preprocess_function, batched = True).map(get_label)

                                                                   

In [46]:
validation = Dataset.from_dict(cbt_dataset['validation'][:]).map(preprocess_function, batched = True).map(get_label)

                                                                 

In [48]:
test = Dataset.from_dict(cbt_dataset['test'][:]).map(preprocess_function, batched = True).map(get_label)

                                                                 

In [14]:
#tokenized_data = cbt_dataset.map(preprocess_function, batched = True)

Loading cached processed dataset at /home/ys3344/.cache/huggingface/datasets/cbt/CN/1.1.0/dc4451a8a4b50cebb78fdb19fa9f964b27fcdcef915467b8b7055a3a8d8cef7b/cache-bef6552f34baf130.arrow
Loading cached processed dataset at /home/ys3344/.cache/huggingface/datasets/cbt/CN/1.1.0/dc4451a8a4b50cebb78fdb19fa9f964b27fcdcef915467b8b7055a3a8d8cef7b/cache-c5d6b83a5471067f.arrow


In [15]:
def get_label(example):
    options = example['options']
    answer = example['answer']
    label = options.index(answer)
    example['label'] = label
    return example

In [16]:
#train = tokenized_data['train'].map(get_label)

                                                                     

In [17]:
#validation = tokenized_data['validation'].map(get_label)

Loading cached processed dataset at /home/ys3344/.cache/huggingface/datasets/cbt/CN/1.1.0/dc4451a8a4b50cebb78fdb19fa9f964b27fcdcef915467b8b7055a3a8d8cef7b/cache-372918a9a442a067.arrow


In [18]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.
    """

    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [19]:
import evaluate

accuracy = evaluate.load("accuracy")

In [20]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [21]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
#model = AutoModelForMultipleChoice.from_pretrained("distilbert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [22]:
#train_set = train.select(range(10000))

In [16]:
#CN
torch.cuda.set_device(0)

training_args = TrainingArguments(
    output_dir="my_awesome_swag_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01, 
    fp16=True
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set ,
    eval_dataset=validation,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.9897,0.745847,0.7395


TrainOutput(global_step=1250, training_loss=1.3265567993164062, metrics={'train_runtime': 1670.2272, 'train_samples_per_second': 5.987, 'train_steps_per_second': 0.748, 'total_flos': 2.63108692992e+16, 'train_loss': 1.3265567993164062, 'epoch': 1.0})

In [33]:
#V
torch.cuda.set_device(0)

training_args = TrainingArguments(
    output_dir="my_awesome_swag_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01, 
    fp16=True
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set ,
    eval_dataset=validation,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4952,0.396155,0.877


TrainOutput(global_step=1250, training_loss=0.6095053588867188, metrics={'train_runtime': 1655.7716, 'train_samples_per_second': 6.039, 'train_steps_per_second': 0.755, 'total_flos': 2.63108692992e+16, 'train_loss': 0.6095053588867188, 'epoch': 1.0})

In [42]:
#NE
torch.cuda.set_device(0)

training_args = TrainingArguments(
    output_dir="my_awesome_swag_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01, 
    fp16=True
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set ,
    eval_dataset=validation,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.7455,0.705701,0.762


TrainOutput(global_step=1250, training_loss=0.847780810546875, metrics={'train_runtime': 1654.4349, 'train_samples_per_second': 6.044, 'train_steps_per_second': 0.756, 'total_flos': 2.63108692992e+16, 'train_loss': 0.847780810546875, 'epoch': 1.0})

In [49]:
#P
torch.cuda.set_device(0)

training_args = TrainingArguments(
    output_dir="my_awesome_swag_model",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01, 
    fp16=True
)



trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set ,
    eval_dataset=validation,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy
1,0.519,0.443559,0.8735


TrainOutput(global_step=1250, training_loss=0.5717488525390625, metrics={'train_runtime': 1653.0135, 'train_samples_per_second': 6.05, 'train_steps_per_second': 0.756, 'total_flos': 2.63108692992e+16, 'train_loss': 0.5717488525390625, 'epoch': 1.0})

In [None]:
test = tokenized_data['test'].map(get_label)

In [20]:
#CN
trainer.evaluate(eval_dataset=test)

{'eval_loss': 0.799060046672821,
 'eval_accuracy': 0.7308,
 'eval_runtime': 132.8957,
 'eval_samples_per_second': 18.812,
 'eval_steps_per_second': 2.355,
 'epoch': 1.0}

In [35]:
#V
trainer.evaluate(eval_dataset=test)

{'eval_loss': 0.36293280124664307,
 'eval_accuracy': 0.8824,
 'eval_runtime': 132.0624,
 'eval_samples_per_second': 18.93,
 'eval_steps_per_second': 2.37,
 'epoch': 1.0}

In [43]:
#NE
trainer.evaluate(eval_dataset=test)

{'eval_loss': 0.6902772188186646,
 'eval_accuracy': 0.724,
 'eval_runtime': 52.7789,
 'eval_samples_per_second': 18.947,
 'eval_steps_per_second': 2.368,
 'epoch': 1.0}

In [50]:
#P
trainer.evaluate(eval_dataset=test)

{'eval_loss': 0.4269050657749176,
 'eval_accuracy': 0.872,
 'eval_runtime': 132.1437,
 'eval_samples_per_second': 18.919,
 'eval_steps_per_second': 2.369,
 'epoch': 1.0}