In [1]:
from datasets import load_dataset
# Load the "squad" dataset through the `datasets` library. The name `dataset`
# is reassigned further down in this cell once the data has been post-processed.
dataset = load_dataset("squad")
# Uncomment to inspect the loaded object:
# print(dataset)

def add_end_of_text(example):
    """Append the end-of-text marker (followed by one space) to a question.

    Args:
        example: Mapping with a string under the 'question' key.

    Returns:
        The same ``example`` object, modified in place, whose 'question'
        value now ends with '<|endoftext|> '.
    """
    eos_suffix = '<|endoftext|> '
    original_question = example['question']
    example['question'] = original_question + eos_suffix
    return example

# Only the question text is used from here on, so every other column is dropped
# before the end-of-text marker is appended to each question.
dataset = dataset.remove_columns(['id', 'context', 'answers', 'title'])
dataset = dataset.map(add_end_of_text)

# Preview the first ten processed training questions. Slicing the rows first
# and then selecting the column avoids reading the whole 'question' column
# just to display ten items; the displayed list is the same.
dataset['train'][:10]['question']

['To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?<|endoftext|> ',
 'What is in front of the Notre Dame Main Building?<|endoftext|> ',
 'The Basilica of the Sacred heart at Notre Dame is beside to which structure?<|endoftext|> ',
 'What is the Grotto at Notre Dame?<|endoftext|> ',
 'What sits on top of the Main Building at Notre Dame?<|endoftext|> ',
 'When did the Scholastic Magazine of Notre dame begin publishing?<|endoftext|> ',
 "How often is Notre Dame's the Juggler published?<|endoftext|> ",
 'What is the daily student paper at Notre Dame called?<|endoftext|> ',
 'How many student news papers are found at Notre Dame?<|endoftext|> ',
 'In what year did the student paper Common Sense begin publication at Notre Dame?<|endoftext|> ']

In [2]:
from transformers import AutoTokenizer
model_checkpoint = "distilgpt2"
# Tokenizers come in two flavours: a pure-Python implementation and a "Fast"
# implementation written in Rust. use_fast=True asks for the Rust-backed one.
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    use_fast=True,
)

# Example sentence used to demonstrate tokenisation
sequence = "This tokenizer is being applied in CS197 at Harvard.<|endoftext|>"



In [3]:
# Split the example sentence into token strings and display them.
tokens = tokenizer.tokenize(sequence)
tokens

['This',
 'Ġtoken',
 'izer',
 'Ġis',
 'Ġbeing',
 'Ġapplied',
 'Ġin',
 'ĠCS',
 '197',
 'Ġat',
 'ĠHarvard',
 '.',
 '<|endoftext|>']

In [4]:
# Convert each token string to its integer id and display the ids.
ids = tokenizer.convert_tokens_to_ids(tokens)
ids

[1212, 11241, 7509, 318, 852, 5625, 287, 9429, 24991, 379, 11131, 13, 50256]

In [5]:
# Calling the tokenizer directly (__call__) chains the tokenize and
# convert-to-ids steps for us: the result holds the same ids under
# 'input_ids', together with an 'attention_mask'.
tokenizer(sequence)




{'input_ids': [1212, 11241, 7509, 318, 852, 5625, 287, 9429, 24991, 379, 11131, 13, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [6]:
# A second example sentence. Note that this rebinds `tokens`, replacing the
# token list produced for the first example.
sequence2 = "Hello my name is Andy and I'm just starting to use Jupyter Notebook"
tokens = tokenizer.tokenize(sequence2)
tokens


['Hello',
 'Ġmy',
 'Ġname',
 'Ġis',
 'ĠAndy',
 'Ġand',
 'ĠI',
 "'m",
 'Ġjust',
 'Ġstarting',
 'Ġto',
 'Ġuse',
 'ĠJ',
 'up',
 'y',
 'ter',
 'ĠNote',
 'book']

In [7]:
# import multiprocessing as mp
# # Check if multiprocessing context is suitable
# mp.set_start_method('spawn', force=True)
# from multiprocess import set_start_method
# set_start_method('spawn', force=True)


def tokenize_function(examples):
    """Tokenize the "question" entries of a batch with the notebook's tokenizer.

    Args:
        examples: Batch mapping that holds the question strings under the
            "question" key.

    Returns:
        Whatever the tokenizer returns for those questions when called with
        truncation enabled.
    """
    questions = examples["question"]
    encoded = tokenizer(questions, truncation=True)
    return encoded

# batched=True hands tokenize_function many examples per call,
# num_proc sets how many processes do the work,
# and the "question" column is removed because it is not needed afterwards.
tokenized_datasets = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=1,
    remove_columns=["question"],
)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [8]:
# Display the tokenized splits with their remaining features and row counts.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 10570
    })
})

In [9]:
# Number of tokens in each block produced by group_texts
block_size = 128


def group_texts(examples):
    """Concatenate a batch of tokenized examples and cut it into fixed-size blocks.

    Every column in ``examples`` (e.g. 'input_ids', 'attention_mask') is
    flattened into one long list and then split into consecutive chunks of
    ``block_size`` items. Items left over at the end of the batch that do not
    fill a whole block are dropped.

    Args:
        examples: Mapping from column name to a list of per-example lists.
            Must contain an 'input_ids' column.

    Returns:
        dict with the same columns, each a list of ``block_size``-long lists,
        plus a 'labels' column that is a copy of the 'input_ids' blocks.
    """
    # Imported here so the function does not rely on another cell's imports.
    from itertools import chain

    # Flatten each column in linear time. sum(list_of_lists, []) builds a new
    # list at every step and becomes quadratic on large batches.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }

    # All columns have the same flattened length, so measure the first one and
    # round it down to a whole number of blocks.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // block_size) * block_size

    # Populate each of input_ids and the other keys with block_size-sized chunks
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }

    # Add labels because we'll need them as the training target
    result["labels"] = result["input_ids"].copy()
    return result
    
# Regroup the tokenized questions into block_size-token chunks.
# group_texts is applied to batches of up to 1000 examples, and it drops the
# tokens at the end of each batch that do not fill a whole block.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=1
)

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [10]:
# Show the token ids of the first training block. Indexing the row before the
# column fetches a single row instead of reading the whole 'input_ids' column.
print(lm_datasets['train'][0]['input_ids'])

[2514, 4150, 750, 262, 5283, 5335, 7910, 1656, 287, 1248, 3365, 287, 406, 454, 8906, 4881, 30, 50256, 220, 2061, 318, 287, 2166, 286, 262, 23382, 20377, 8774, 11819, 30, 50256, 220, 464, 32520, 3970, 286, 262, 17380, 2612, 379, 23382, 20377, 318, 13970, 284, 543, 4645, 30, 50256, 220, 2061, 318, 262, 10299, 33955, 379, 23382, 20377, 30, 50256, 220, 2061, 10718, 319, 1353, 286, 262, 8774, 11819, 379, 23382, 20377, 30, 50256, 220, 2215, 750, 262, 3059, 349, 3477, 11175, 286, 23382, 288, 480, 2221, 12407, 30, 50256, 220, 2437, 1690, 318, 23382, 20377, 338, 262, 39296, 1754, 3199, 30, 50256, 220, 2061, 318, 262, 4445, 3710, 3348, 379, 23382, 20377, 1444, 30, 50256, 220, 2437, 867, 3710, 1705, 9473, 389, 1043, 379, 23382, 20377, 30]


In [11]:
# Decode the first training block back to text. Indexing the row before the
# column fetches a single row instead of reading the whole 'input_ids' column.
print(tokenizer.decode(lm_datasets['train'][0]['input_ids']))

To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?<|endoftext|> What is in front of the Notre Dame Main Building?<|endoftext|> The Basilica of the Sacred heart at Notre Dame is beside to which structure?<|endoftext|> What is the Grotto at Notre Dame?<|endoftext|> What sits on top of the Main Building at Notre Dame?<|endoftext|> When did the Scholastic Magazine of Notre dame begin publishing?<|endoftext|> How often is Notre Dame's the Juggler published?<|endoftext|> What is the daily student paper at Notre Dame called?<|endoftext|> How many student news papers are found at Notre Dame?


In [12]:
# Take 100 blocks from each split after a seeded shuffle (seed=42), giving a
# small subset for training and evaluation.
small_train_dataset = (
    lm_datasets["train"]
    .shuffle(seed=42)
    .select(range(100))
)
small_eval_dataset = (
    lm_datasets["validation"]
    .shuffle(seed=42)
    .select(range(100))
)

In [17]:
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer

# Load the model from the same checkpoint name the tokenizer was created from,
# so the two cannot drift apart if model_checkpoint is changed.
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# Token removed again

training_args = TrainingArguments(
    f"{model_checkpoint}-squad",  # output directory for this run
    eval_strategy="epoch",        # evaluate at the end of every epoch
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

# Untested: the line below should raise the number of epochs from the default of 3.
# Passing num_train_epochs=10 to TrainingArguments above is the usual way to do it.
#trainer.args.num_train_epochs = 10

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,4.097046
2,No log,3.491851
3,No log,3.359954


TrainOutput(global_step=39, training_loss=4.13212389823718, metrics={'train_runtime': 111.7001, 'train_samples_per_second': 2.686, 'train_steps_per_second': 0.349, 'total_flos': 9798628147200.0, 'train_loss': 4.13212389823718, 'epoch': 3.0})

In [18]:
import math
# Perplexity is the exponential of the evaluation loss
eval_results = trainer.evaluate()
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexity: {perplexity:.2f}")

Perplexity: 28.79


In [19]:
# Save the tokenizer files locally, then upload the tokenizer as well as the
# model to the 'gpt2-squad' repository. Without the tokenizer upload the
# published repository holds only the model weights and config.
tokenizer.save_pretrained('gpt2-squad')
tokenizer.push_to_hub('gpt2-squad')
# Keep the model upload last so the cell still displays its CommitInfo.
model.push_to_hub('gpt2-squad')

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/AndyRoberts/gpt2-squad/commit/f33753b8418f6f44a15cc301eb7ea51fd3d09412', commit_message='Upload model', commit_description='', oid='f33753b8418f6f44a15cc301eb7ea51fd3d09412', pr_url=None, pr_revision=None, pr_num=None)