In [1]:
!pip install -U transformers datasets accelerate




In [2]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)



In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)
model.config.pad_token_id = tokenizer.eos_token_id
model.to(device)


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [4]:
raw_datasets = load_dataset("SparkleDark/Everything_about_dogs")


In [5]:
split = raw_datasets["train"].train_test_split(test_size=0.05, seed=42)
datasets = {
    "train": split["train"],
    "validation": split["test"]
}


In [6]:
def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=True)

tokenized = {
    split_name: ds.map(
        tokenize_fn,
        batched=True,
        remove_columns=["text"]
    )
    for split_name, ds in datasets.items()
}



Map:   0%|          | 0/652 [00:00<?, ? examples/s]

In [7]:
block_size = 128

def group_texts(examples):
    # Concatenate all input_ids/attention_mask and split into block_size chunks
    all_input_ids = sum(examples["input_ids"], [])
    all_attention_masks = sum(examples["attention_mask"], [])
    total_len = (len(all_input_ids) // block_size) * block_size

    input_id_chunks = [
        all_input_ids[i : i + block_size]
        for i in range(0, total_len, block_size)
    ]
    mask_chunks = [
        all_attention_masks[i : i + block_size]
        for i in range(0, total_len, block_size)
    ]
    return {"input_ids": input_id_chunks, "attention_mask": mask_chunks}

lm_datasets = {
    split_name: tokenized_ds.map(group_texts, batched=True)
    for split_name, tokenized_ds in tokenized.items()
}



Map:   0%|          | 0/652 [00:00<?, ? examples/s]

In [8]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)



In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
  output_dir="./gpt2-dogs",
  overwrite_output_dir=True,
  num_train_epochs=3,
  per_device_train_batch_size=4,
  per_device_eval_batch_size=4,
  do_eval=True,              
  eval_steps=500,           
  logging_steps=200,
  save_steps=500,
  save_total_limit=2,
  learning_rate=5e-5,
  weight_decay=0.01,
  fp16=torch.cuda.is_available(),
  report_to="none",
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_datasets["train"],
    eval_dataset=lm_datasets["validation"],
    data_collator=data_collator,
)



In [11]:
trainer.train()


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
200,2.9247
400,2.7048
600,2.5929
800,2.4614
1000,2.43
1200,2.3636
1400,2.3331
1600,2.316




TrainOutput(global_step=1614, training_loss=2.5141944625268344, metrics={'train_runtime': 14030.5144, 'train_samples_per_second': 0.46, 'train_steps_per_second': 0.115, 'total_flos': 421725339648000.0, 'train_loss': 2.5141944625268344, 'epoch': 3.0})

In [12]:
trainer.save_model("./gpt2-dogs")
tokenizer.save_pretrained("./gpt2-dogs")


('./gpt2-dogs\\tokenizer_config.json',
 './gpt2-dogs\\special_tokens_map.json',
 './gpt2-dogs\\vocab.json',
 './gpt2-dogs\\merges.txt',
 './gpt2-dogs\\added_tokens.json',
 './gpt2-dogs\\tokenizer.json')

In [13]:
finetuned_tokenizer = AutoTokenizer.from_pretrained("./gpt2-dogs", use_fast=True)
finetuned_model = AutoModelForCausalLM.from_pretrained("./gpt2-dogs").to(device)



In [47]:
prompt = "An  important  feature  of  the  treatment  is "
inputs = finetuned_tokenizer(prompt, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}

generated_ids = finetuned_model.generate(
    **inputs,
    max_length=100,
    do_sample=True,
    top_p=0.95,
    temperature=0.8,
    no_repeat_ngram_size=2,
    eos_token_id=finetuned_tokenizer.eos_token_id,
)

print(finetuned_tokenizer.decode(generated_ids[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


An  important  feature  of  the  treatment  is  that  it  does  not  cause the  bowels  and  abdomen  are  full  in  a  few  minutes,  but  this  becomes  much  more  frequent  as "Dr.  Clayton, — 'I  have  never  seen  anything  like  such  practice  shown  by  my  own self, 'cause  he  had
