In [None]:
# Install the required packages
# !pip install torch transformers pandas datasets "transformers[torch]"

In [None]:
# Initialize the DistilGPT2 model and tokenizer loaded from Hugging Face

from transformers import AutoTokenizer, AutoModelForCausalLM

# Identifier of the pretrained checkpoint shared by the model and its tokenizer.
model_name = "distilgpt2"

# Load the causal language model and the matching tokenizer for that checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Last expression of the cell: display the model object.
model

In [None]:
# Test out the base GPT2 model

prompt = "Once upon a time"

# Encode the prompt as PyTorch tensors.
inputs = tokenizer(prompt, return_tensors="pt")

# Sampling settings below can be tuned freely.
outputs = model.generate(
    inputs["input_ids"],
    do_sample=True,
    top_k=50,
    top_p=0.95,
    max_new_tokens=100,
)

# Decode the generated token ids back into text and display the result.
output_string = tokenizer.batch_decode(outputs)
output_string

In [None]:
# Import our dataset from Hugging Face
from datasets import load_dataset

short_stories_dataset = load_dataset("ShehryarAzhar/stories")

# `load_dataset` returns a DatasetDict keyed by split name, and
# `train_test_split` is only available on an individual Dataset, so the split
# has to be made on the "train" Dataset rather than on the DatasetDict.
# The Trainer cell evaluates on a "test" split, so create one (80/20) only
# when the loaded dataset does not already provide it. A fixed seed keeps the
# split identical across Restart & Run All.
if "test" not in short_stories_dataset:
    short_stories_dataset = short_stories_dataset["train"].train_test_split(
        train_size=0.8,
        seed=42,
    )

short_stories_dataset

In [None]:
# Character length of every story in the training split.
[len(story) for story in short_stories_dataset["train"]["story"]]

In [None]:
# Tokenize our dataset
def preprocess_batch(batch):
    """Tokenize a batch of stories, keeping only the first 500 characters of each.

    Args:
        batch: Mapping whose "story" entry holds the story texts of the batch.

    Returns:
        The result of calling the notebook-level `tokenizer` on the list of
        trimmed stories.
    """
    return tokenizer([story[:500] for story in batch["story"]])

# Run `preprocess_batch` over the dataset in batches of 10 rows and drop the
# original columns (taken from the "train" split) so that only the columns
# returned by the tokenizer remain.
tokenized_dataset = short_stories_dataset.map(
    preprocess_batch,
    batched=True,
    batch_size=10,
    remove_columns=short_stories_dataset["train"].column_names,
)

In [None]:
# Create a Data Collator
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token, then build the
# collator with masked-language-modelling switched off (mlm=False).
tokenizer.pad_token = tokenizer.eos_token
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

In [None]:
# Create our Trainer and TrainingArguments
from transformers import TrainingArguments, Trainer

import inspect

# The evaluation-schedule keyword is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Pick whichever
# name the installed TrainingArguments accepts so this cell runs on both.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_kwarg = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir='./output',          # checkpoints are written below this folder
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    **{eval_strategy_kwarg: 'epoch'},  # evaluate once per epoch
)

# Train on the "train" split and evaluate on the "test" split, batching
# examples with the collator created earlier.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=data_collator,
)

# Start training.
trainer.train()

In [None]:
# Load model from the latest checkpoint

from transformers.trainer_utils import get_last_checkpoint

# Resolve the newest `checkpoint-<step>` folder under the Trainer's output
# directory instead of hard-coding a step number that may not exist (or may
# not be the latest) for this particular run.
latest_checkpoint = get_last_checkpoint("./output")
if latest_checkpoint is None:
    raise FileNotFoundError(
        "No checkpoint-* directory found in ./output; run the training cell first."
    )

model = AutoModelForCausalLM.from_pretrained(latest_checkpoint)

In [None]:
# Start training


In [None]:
# Inference

prompt = 'Once'

# Tokenize the prompt into PyTorch tensors, then sample up to 100 new tokens
# from the current `model`.
inputs = tokenizer(prompt, return_tensors='pt')
outputs = model.generate(
    inputs["input_ids"],
    top_p=0.95,
    top_k=50,
    do_sample=True,
    max_new_tokens=100,
)

# Decode the generated token ids back into text and display the result.
output_string = tokenizer.batch_decode(outputs)
output_string