# Create and Train our Transformer

### Load the Dataset

In [3]:
from datasets import load_dataset


# Load the dataset
train_dataset, test_dataset = load_dataset("dataset/PokedexDataset.py", split=["train", "test"])

Using the latest cached version of the module from C:\Users\Will Sumerfield\.cache\huggingface\modules\datasets_modules\datasets\PokedexDataset\3aa69cafe54b66fd5280fcf12671ebf466fe770853e5254944dd13cefd7a6698 (last modified on Thu Feb 24 10:36:35 2022) since it couldn't be found locally at dataset/PokedexDataset.py\PokedexDataset.py or remotely (FileNotFoundError).
Reusing dataset pokedex_dataset (C:\Users\Will Sumerfield\.cache\huggingface\datasets\pokedex_dataset\plain_text\1.0.0\3aa69cafe54b66fd5280fcf12671ebf466fe770853e5254944dd13cefd7a6698)


### Tokenize the Dataset

In [4]:
from transformers import AutoTokenizer


# Create the tokenizer instance
tokenizer = AutoTokenizer.from_pretrained("distilgpt2")

# Set the padding token to use the end of sentence token
tokenizer.pad_token = tokenizer.eos_token


def tokenize_data(data):
    """This function maps our dataset to a tokenized dataset"""
    tokenized_data = tokenizer(data['text'], max_length=55, padding='max_length', truncation=True)
    tokenized_data['labels'] = tokenized_data['input_ids']
    return tokenized_data

# Tokenize our datasets
tokenized_train_dataset = train_dataset.map(tokenize_data, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_data, batched=True)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))




### Import our Model

In [77]:
from transformers import AutoModelForCausalLM
import torch


# Get a pretrained instance of the model
model = AutoModelForCausalLM.from_pretrained("distilgpt2").to('cuda:0')

# Add the new 'pokename' token to our model
tokenizer.add_tokens(["pokename"])

# Resize the model embeddings
model.resize_token_embeddings(len(tokenizer))

Embedding(50258, 768)

### Train the Model

In [78]:
from transformers import TrainingArguments
from transformers import Trainer


# Set the parameters for the trainer
training_args = TrainingArguments("model_checkpoints", per_device_train_batch_size=2, evaluation_strategy="epoch")

# Create a trainer with the provided arguments
trainer = Trainer(model=model,
                  args=training_args,
                  train_dataset=tokenized_train_dataset,
                  eval_dataset=tokenized_test_dataset
                  )

In [79]:
# Train the Model
trainer.train()

Epoch,Training Loss,Validation Loss


TrainOutput(global_step=4608, training_loss=1.5335284570852916, metrics={'train_runtime': 396.4695, 'train_samples_per_second': 11.623, 'total_flos': 5838783160320.0, 'epoch': 3.0})

In [80]:
# Save the model
model.save_pretrained("model")

In [81]:
import shutil


# Delete the checkpoints
shutil.rmtree('model_checkpoints')

In [20]:
from transformers import pipeline, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained('distilgpt2')

# Create the text generator
generator = pipeline('text-generation', tokenizer=tokenizer, device=0, model=model)

input_ids = tokenizer.encode('pikachu', return_tensors='pt').to('cuda:0')
greedy_output = model.generate(input_ids, max_length=50)
print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))

# Generate the entry
#for i in range(10):
    #print(generator("< type=fire bird > * This is fine.", max_length=30, num_return_sequences=1, pad_token_id=50256)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[   79,  1134, 32323]], device='cuda:0')
pikachu: I'm so excited for all of you guys who are excited for this story. I want to go to school and have some fun. I'm hoping to be in the best shape possible this season and will go down a rabbit path
