In [4]:
!pip install datasets accelerator transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
from datasets import load_dataset
# Load the 'train' split of the 'under-tree/labeled-multiple-choice' dataset.
data = load_dataset('under-tree/labeled-multiple-choice', split='train')



In [6]:
from pprint import pprint
# Inspect the first record to see which fields each example carries.
pprint(data[0])

{'answerKey': 'f',
 'combinedfact': 'beads of water can be formed by clouds.',
 'formatted_question': 'what type of water formation is formed by clouds? (a) '
                       'pearls (b) streams (c) shells (d) diamonds (e) rain '
                       '(f) beads (g) cooled (h) liquid',
 'topic': 'physics'}


In [7]:
def gen_prompt(elem):
    """Render one dataset record as a single training prompt.

    Reads the ``formatted_question``, ``answerKey`` and ``combinedfact``
    entries of ``elem`` and returns ``{'text': prompt}``, where the prompt
    has one ``question:``, ``answer:`` and ``context:`` line, each terminated
    by a newline.
    """
    question = elem["formatted_question"]
    answer = elem["answerKey"]
    context = elem["combinedfact"]
    prompt = f'question: {question}\nanswer: {answer}\ncontext: {context}\n'
    return {'text': prompt}
# Sanity-check the rendered prompt for the first record.
print(gen_prompt(data[0])['text'])

question: what type of water formation is formed by clouds? (a) pearls (b) streams (c) shells (d) diamonds (e) rain (f) beads (g) cooled (h) liquid
answer: f
context: beads of water can be formed by clouds.



In [8]:
# Render every record into its prompt, one example at a time, and drop the
# original columns so only the generated 'text' field remains.
data_with_prompt = data.map(
    gen_prompt,
    batched=False,
    num_proc=4,
    remove_columns=data.column_names,
)



In [9]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Checkpoint used for both the tokenizer and the causal LM.
checkpoint = 'distilgpt2'
# Load the tokenizer with '<|pad|>' registered as its pad token.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, pad_token='<|pad|>', use_fast=True)
# Register the three prompt field markers as additional special tokens.
special_tokens = {'additional_special_tokens': ['question: ', 'answer: ', 'context: ']}
tokenizer.add_special_tokens(special_tokens)

model = AutoModelForCausalLM.from_pretrained(checkpoint)
# Resize the embedding matrix to the enlarged vocabulary
# (the recorded output shows 50261 rows after this call).
model.resize_token_embeddings(len(tokenizer))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Embedding(50261, 768)

In [10]:
# Tokenize one prompt to inspect the resulting input ids and attention mask.
tokenizer(data_with_prompt[0]['text'], return_tensors='pt')

{'input_ids': tensor([[50258, 10919,  2099,   286,  1660,  9978,   318,  7042,   416, 15114,
            30,   357,    64,     8, 25286,  7278,   357,    65,     8, 15190,
           357,    66,     8, 19679,   357,    67,     8, 30984,   357,    68,
             8,  6290,   357,    69,     8, 36116,   357,    70,     8, 32162,
           357,    71,     8,  8122,   198, 50259,    69,   198, 50260,    65,
          1329,    82,   286,  1660,   460,   307,  7042,   416, 15114,    13,
           198]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [11]:
# Longest prompt measured in characters (len of each text string), not in tokens.
mx = max(map(len, data_with_prompt['text']))
print('Max: ', mx)

def encode(elem):
    """Tokenize the 'text' field of ``elem`` with truncation enabled."""
    return tokenizer(elem['text'], truncation=True)

# Tokenize in batches and drop the original columns, keeping only the tokenizer output.
data_encoded = data_with_prompt.map(encode, batched=True, remove_columns=data_with_prompt.column_names)



Max:  738


In [12]:
from itertools import chain

block_size = 128

def group_texts(examples):
    """Pack a batch of tokenized examples into fixed-size blocks.

    Every column of ``examples`` (a mapping of column name -> list of token
    lists) is flattened into one long sequence and cut into consecutive
    chunks of ``block_size`` items. The trailing remainder that does not fill
    a whole block is dropped.

    Returns a dict with the same columns, each a list of ``block_size``-long
    chunks, plus a ``labels`` column that is a copy of the ``input_ids``
    chunk list.
    """
    # Flatten each column. chain.from_iterable is linear in the total number
    # of tokens, whereas sum(list_of_lists, []) re-copies the accumulator on
    # every addition and is quadratic.
    concatenated_examples = {k: list(chain.from_iterable(examples[k])) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder; we could add padding instead if the model
    # supported it. Customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # The labels are the inputs themselves (a separate outer list, same chunks).
    result["labels"] = result["input_ids"].copy()
    return result

In [13]:
# Pack the tokenized examples into fixed-size blocks via group_texts (batched, num_proc=4).
data_lm = data_encoded.map(group_texts, batched=True, num_proc=4)



In [14]:
# Hold out 20% of the packed blocks for evaluation. The fixed seed makes the
# shuffled split reproducible across kernel restarts.
data_dict = data_lm.train_test_split(test_size=0.2, seed=42)

In [15]:
from transformers import Trainer, TrainingArguments
from accelerate import Accelerator
accelerator = Accelerator()
# Keep a copy of the split mapping; a later cell restores data_dict from it.
data_dict_copy = data_dict.copy()
# NOTE(review): Accelerator.prepare is normally given models, optimizers,
# dataloaders or schedulers; passing the dataset mapping is presumably a
# no-op — confirm. A later cell overwrites data_dict from data_dict_copy anyway.
data_dict = accelerator.prepare(data_dict)

# Trainer configuration: checkpoints go under ./results, evaluation runs once
# per epoch, and training lasts two epochs.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir='./results',   
    evaluation_strategy='epoch',
    num_train_epochs=2
)



In [18]:
# default args are pretty good: https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments
# Restore the split mapping saved earlier, replacing the accelerator.prepare result.
data_dict = data_dict_copy.copy()
# Fine-tune `model` on the 'train' split and evaluate on the 'test' split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data_dict['train'],
    eval_dataset=data_dict['test']
)

In [1]:
# NOTE(review): this cell sits before the trainer.train() cell, so a
# top-to-bottom run saves the model before fine-tuning — confirm the intended order.
# NOTE(review): 'result/' differs from the TrainingArguments output_dir './results' — confirm.
trainer.save_model('result/')

NameError: ignored

In [None]:
# Fine-tune, then persist the trained weights. Saving here (after training)
# guarantees that 'result/' holds the fine-tuned model on a top-to-bottom run.
train_result = trainer.train()
trainer.save_model('result/')
train_result



Epoch,Training Loss,Validation Loss


In [3]:
import torch
from accelerate import Accelerator
from transformers import get_scheduler

# Expose only the tensor columns the model consumes. with_format is applied
# per split because data_dict is a plain dict after the earlier `.copy()`
# calls and therefore has no `set_format` method of its own.
torch_columns = ['input_ids', 'attention_mask', 'labels']
train = data_dict['train'].with_format('torch', columns=torch_columns)
test = data_dict['test'].with_format('torch', columns=torch_columns)

train_loader = torch.utils.data.DataLoader(train, batch_size=8, shuffle=True)
test_loader = torch.utils.data.DataLoader(test, batch_size=8)

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# Prepare the model and optimizer together with the loaders so that
# parameters and batches end up on the same device.
accelerator = Accelerator()
model, optimizer, train_loader, test_loader = accelerator.prepare(
    model, optimizer, train_loader, test_loader
)

num_epochs = 3
# The scheduler is stepped once per batch, so the schedule length is the
# number of batches per epoch (len(train_loader)), not the number of examples.
num_training_steps = len(train_loader) * num_epochs
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

NameError: ignored

In [None]:
# NOTE(review): exit() presumably shuts down the kernel, which would discard the
# state the cells below rely on — confirm before a top-to-bottom run.
exit()

In [None]:
# Inspect the first example of the test split.
test[0]

In [None]:
# Print the values of the first training batch only, then stop iterating.
for batch in train_loader:
    print(batch.values())
    break

In [None]:
from tqdm.auto import tqdm

def train():
    """Run the manual fine-tuning loop for ``num_epochs`` passes over ``train_loader``.

    Relies on the notebook-level ``model``, ``optimizer``, ``scheduler``,
    ``train_loader`` and ``num_epochs``. For every batch it computes the
    model loss, backpropagates through the accelerator, steps the optimizer
    and scheduler, and clears the gradients. The loss is printed every 500
    steps of each epoch.

    NOTE(review): defining this function rebinds the name ``train``, which an
    earlier cell uses for the training split.
    """
    accelerator = Accelerator()
    # Batches from the prepared loader presumably arrive on the accelerator's
    # device; put the model there too so the forward pass does not mix devices.
    model.to(accelerator.device)
    model.train()
    # The context manager closes the progress bar even if training raises.
    with tqdm(total=num_epochs * len(train_loader)) as progress_bar:
        for epoch in range(num_epochs):
            for step, batch in enumerate(train_loader):
                outputs = model(**batch)
                loss = outputs.loss
                accelerator.backward(loss)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
                if step % 500 == 0:
                    print(f'Epoch: {epoch}, Step: {step}, Loss: {loss}')


In [None]:
from accelerate import notebook_launcher

# notebook_launcher(train)