In [1]:
from datasets import load_dataset
# Load the 'squad' dataset through the Hugging Face `datasets` library.
# Later cells read its 'train' and 'validation' splits.
dataset = load_dataset('squad')

Found cached dataset parquet (/Users/williammahnke/.cache/huggingface/datasets/parquet/plain_text-b38df7ca980d7b55/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Drop the columns that later cells never read; only 'question' is used below.
# NOTE(review): this rebinds `dataset`, so re-running the cell once the columns
# are gone will presumably fail — restart the kernel and run from the top.
dataset = dataset.remove_columns(['id','title','context','answers'])

def add_end_of_text(example):
    """Append the literal '<|endoftext|>' marker to the example's question.

    Parameters
    ----------
    example : dict-like
        A single record with a string under the 'question' key.

    Returns
    -------
    The same `example` object, updated in place.
    """
    eos_marker = '<|endoftext|>'
    terminated_question = example['question'] + eos_marker
    example['question'] = terminated_question
    return example

# Run add_end_of_text over the dataset and rebind `dataset` to the result.
dataset = dataset.map(add_end_of_text)

In [None]:
# Peek at the first ten training questions to confirm the marker was appended.
dataset['train']['question'][:10]

In [None]:
from transformers import AutoTokenizer
# Checkpoint name shared by the tokenizer here and the model loaded later.
model_checkpoint = 'distilgpt2'

# Ask for the fast tokenizer implementation of that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

In [None]:
# Split a sample sentence into token strings and show them.
sequence = 'Thanks for coming the last DS UCSB workshop this year!'
tokens = tokenizer.tokenize(sequence)
print(tokens)

In [None]:
# Convert the token strings from the previous cell into their integer ids.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

In [None]:
# Call the tokenizer directly on the raw sentence and display what it returns.
tokenizer(sequence)

In [None]:
def tokenize_function(examples):
    """Tokenize the 'question' entries of a batch with truncation enabled.

    Parameters
    ----------
    examples : dict-like
        A batch whose 'question' key holds the texts to tokenize.

    Returns
    -------
    Whatever the notebook-level `tokenizer` returns for those texts.
    """
    questions = examples['question']
    return tokenizer(questions, truncation=True)

# Tokenize in batches with num_proc=4 and drop the raw 'question' text
# column from the result.
tokenized_datasets = dataset.map(
    tokenize_function,
    remove_columns=['question'],
    num_proc=4,
    batched=True,
)

In [None]:
# Inspect the first ten tokenized training examples.
tokenized_datasets['train'][:10]

In [None]:
block_size = 256  # number of tokens in each training block

def group_texts(examples):
    """Concatenate a batch of tokenized examples and re-split it into fixed-size blocks.

    Parameters
    ----------
    examples : mapping of str -> list of lists
        One batch of tokenized data. Every key maps to one token-level list
        per example, and the batch must contain an 'input_ids' key.

    Returns
    -------
    dict
        The same keys, each mapping to a list of chunks of exactly
        ``block_size`` items, plus a 'labels' key holding a shallow copy of
        the 'input_ids' chunks. Trailing items that do not fill a whole
        block are dropped, so a batch shorter than ``block_size`` yields
        empty lists.
    """
    # Flatten each column in one pass; sum(lists, []) copies the accumulator
    # on every addition and is quadratic in the batch size.
    concatenated_examples = {
        k: [item for seq in examples[k] for item in seq]
        for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Round down to a whole number of blocks; the remainder is discarded.
    total_length = (total_length // block_size) * block_size

    # Slice every flattened column at the same block boundaries.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    # Labels start as a copy of the input ids.
    result['labels'] = result['input_ids'].copy()
    return result

# Regroup the tokenized examples into blocks with group_texts, handing it
# batches of 1000 examples and using num_proc=4.
lm_datasets = tokenized_datasets.map(
    group_texts,
    num_proc=4,
    batch_size=1000,
    batched=True,
)

In [None]:
# Show the raw token ids of the first grouped training block...
print(lm_datasets['train']['input_ids'][0])
# ...then decode the first two blocks back to text to check the grouping.
print(tokenizer.decode(lm_datasets['train']['input_ids'][0]))
print(tokenizer.decode(lm_datasets['train']['input_ids'][1]))

In [None]:
# Take a fixed-seed shuffle of each split and keep 100 rows from it so that
# training and evaluation stay quick.
small_train_dataset = (
    lm_datasets['train']
    .shuffle(seed=42)
    .select(range(100))
)
small_eval_dataset = (
    lm_datasets['validation']
    .shuffle(seed=42)
    .select(range(100))
)

In [None]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

In [None]:
# Load the causal language model for the same checkpoint the tokenizer uses.
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [None]:
# Training configuration. Output goes under '<checkpoint>-squad', evaluation
# is requested once per epoch, and external metric reporting is switched off.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir=f'{model_checkpoint}-squad',
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    report_to='none',
)

# Wire the model, the configuration and the two small subsets together.
trainer = Trainer(
    model=model,
    args=training_args,
    eval_dataset=small_eval_dataset,
    train_dataset=small_train_dataset,
)

In [None]:
# Fine-tune the model on small_train_dataset using the arguments configured above.
trainer.train()

Weights & Biases (WandB) is a useful tool for visualizing and interpreting metrics while training a model. (Reporting to it is disabled in this notebook via `report_to = 'none'`.)

In [None]:
import math
# Evaluate with the trainer's eval dataset (small_eval_dataset); the returned
# metrics are read for 'eval_loss' in the next cell.
eval_results = trainer.evaluate()

In [None]:
# Perplexity is exp(evaluation loss). The `:.2f` format spec has to sit inside
# the braces to apply to the number, and the outer quotes differ from the inner
# ones so the f-string also parses on Python versions before 3.12.
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")