In [47]:
import gc
import math
import os

import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

In [2]:
# Check whether PyTorch can see a CUDA device in this environment.
torch.cuda.is_available()

True

In [3]:
# Use the GPU when PyTorch can see one and fall back to the CPU otherwise.
# torch is already imported in the notebook's import cell, so it is not
# re-imported here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
# Run Python's garbage collector, then ask PyTorch to release its cached CUDA
# memory (presumably to start model loading with as much free memory as possible).
gc.collect()
torch.cuda.empty_cache()

In [7]:
# Hugging Face identifiers used by the rest of the notebook.
model_name = "gpt2"  # checkpoint used for both the model and the tokenizer
dataset_name = "vicgalle/alpaca-gpt4"  # dataset passed to load_dataset

In [8]:
# Load the pretrained causal-LM checkpoint named by `model_name` and move it to `device`.
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [17]:
# Load the train split of `dataset_name` and display it as a pandas DataFrame.
dataset = load_dataset(dataset_name, split='train')
dataset.to_pandas()

Unnamed: 0,instruction,input,output,text
0,Give three tips for staying healthy.,,1. Eat a balanced and nutritious diet: Make su...,Below is an instruction that describes a task....
1,What are the three primary colors?,,"The three primary colors are red, blue, and ye...",Below is an instruction that describes a task....
2,Describe the structure of an atom.,,An atom is the basic building block of all mat...,Below is an instruction that describes a task....
3,How can we reduce air pollution?,,There are several ways to reduce air pollution...,Below is an instruction that describes a task....
4,Describe a time when you had to make a difficu...,,"As an AI assistant, I do not have my own perso...",Below is an instruction that describes a task....
...,...,...,...,...
51997,Generate an example of what a resume should li...,,**John Doe**\n\n**Contact Information:**\n\n12...,Below is an instruction that describes a task....
51998,Arrange the items given below in the order to ...,"cake, me, eating",Me eating cake.,"Below is an instruction that describes a task,..."
51999,Write an introductory paragraph about a famous...,Michelle Obama,"Michelle Obama, born January 17, 1964, in Chic...","Below is an instruction that describes a task,..."
52000,Generate a list of five things one should keep...,,1. Evaluate your reasons: It's important to ta...,Below is an instruction that describes a task....


In [18]:
# Show the first formatted prompt together with its length in characters.
first_text = dataset['text'][0]
first_text, len(first_text)

('Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.',
 930)

In [19]:
# Length statistics for the `text` column. These are string lengths
# (characters), not token counts.
lens = [len(text) for text in dataset['text']]
# Longest text; 0 when the column is empty, matching the loop's old start value.
max_lens = max(lens, default=0)
# Distinct lengths in first-seen order. dict.fromkeys de-duplicates in a single
# pass instead of scanning a growing list for every element.
list_of_lens = list(dict.fromkeys(lens))
# Number of texts longer than 1024 characters.
more_than_1024 = sum(1 for n in lens if n > 1024)

In [20]:
# Display the longest text length and how many texts exceed 1024 characters.
max_lens, more_than_1024

(4929, 18842)

In [21]:
# Keep only the pre-formatted `text` column. Filter against the columns that
# are actually present so re-running this cell after they have already been
# removed does not raise.
columns_to_drop = [
    name for name in ('instruction', 'input', 'output')
    if name in dataset.column_names
]
if columns_to_drop:
    dataset = dataset.remove_columns(columns_to_drop)
dataset

Dataset({
    features: ['text'],
    num_rows: 52002
})

In [22]:
max_length = 1024 # Need to Tune


def tokenize_datasets(dataset):
    """Tokenize the ``text`` column of ``dataset`` in batches.

    Each batch is passed to the notebook-level ``tokenizer`` with truncation
    enabled and ``max_length`` as the limit. The ``text`` column is removed
    from the mapped result.

    Args:
        dataset: Object exposing ``map`` whose examples have a ``text`` field.

    Returns:
        Whatever ``dataset.map`` returns for the tokenized batches.
    """
    def encode_batch(batch):
        # `tokenizer` and `max_length` are looked up when this runs, so the
        # tokenizer only has to exist before tokenize_datasets is first called.
        return tokenizer(batch['text'], truncation=True, max_length=max_length)

    return dataset.map(encode_batch, batched=True, remove_columns=['text'])

In [23]:
# Shuffle, then split 90/10 into train and test with a fixed seed so the split
# is reproducible.
SPLIT_SEED = 42
shuffled = dataset.shuffle(seed=SPLIT_SEED)
# Select every shuffled row. The count comes from the dataset itself rather
# than a hard-coded 52002, so the cell keeps working if the dataset size changes.
dataset = shuffled.select(range(len(shuffled))).train_test_split(test_size=0.1, seed=SPLIT_SEED)
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 46801
    })
    test: Dataset({
        features: ['text'],
        num_rows: 5201
    })
})

In [24]:
# Pull the two splits out of the DatasetDict under shorter names.
train_dataset = dataset['train']
test_dataset = dataset['test']

In [25]:
# Load the tokenizer that matches `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Replace the raw-text splits with their tokenized versions.
# NOTE(review): these names are rebound in place, so re-running this cell on the
# already-tokenized splits will fail because the `text` column is gone.
train_dataset = tokenize_datasets(train_dataset)
test_dataset = tokenize_datasets(test_dataset)

Map:   0%|          | 0/5201 [00:00<?, ? examples/s]

In [26]:
# Lengths of the tokenized training examples that have at least 512 token ids;
# the cell displays how many such examples there are.
token_len = [n for n in map(len, train_dataset['input_ids']) if n >= 512]
len(token_len)

1040

In [27]:
# Build the batch collator passed to the Trainer. mlm=False turns off
# masked-language-model masking, which is what a causal LM such as GPT-2 needs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [28]:
# Training configuration.
# Per-device batch size and gradient accumulation are separate knobs (both 2
# here), so changing one no longer silently changes the other.
batch_size = 2
grad_accum_steps = 2
training_args = TrainingArguments(
    output_dir="./models/convo_gpt2",
    gradient_accumulation_steps=grad_accum_steps,
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # No checkpoints are saved and no evaluation runs during training, so there
    # is never a "best" checkpoint to reload at the end.
    load_best_model_at_end=False,
    save_strategy="no",
    # Only takes effect once save_strategy is switched on.
    save_total_limit=2,
    # Mixed precision needs a CUDA device; mirror the CPU fallback used for `device`.
    fp16=torch.cuda.is_available(),
    learning_rate=1e-05,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="none",
)

training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=2,
gradient_checkpointing=False,
greater_is_better=False,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=

In [29]:
# Wire the model, training arguments, tokenized splits and collator into a Trainer.
# The test split is registered as the evaluation dataset.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

In [30]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,2.6643
1000,1.9699
1500,1.902
2000,1.8591
2500,1.8139
3000,1.8037
3500,1.7793
4000,1.7713
4500,1.7358
5000,1.7565


TrainOutput(global_step=35100, training_loss=1.6693442202837039, metrics={'train_runtime': 20988.2038, 'train_samples_per_second': 6.69, 'train_steps_per_second': 1.672, 'total_flos': 1.9396275028992e+16, 'train_loss': 1.6693442202837039, 'epoch': 3.0})

In [33]:
# Publish the fine-tuned model and its tokenizer to the Hugging Face Hub.
MODEL_PATH = "Sharathhebbar24/convo_bot_gpt_v1"
# Read the access token from the environment so it is never stored in the
# notebook. When HF_TOKEN is unset this is None, and push_to_hub falls back to
# the credentials cached by `huggingface-cli login`.
HF_TOKEN = os.environ.get("HF_TOKEN")
tokenizer.push_to_hub(
    MODEL_PATH,
    token=HF_TOKEN
)
model.push_to_hub(
    MODEL_PATH,
    token=HF_TOKEN
)

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/498M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/Sharathhebbar24/convo_bot_gpt_v1/commit/17a1d2959229059a8427b3144619546d2ad36f51', commit_message='Upload model', commit_description='', oid='17a1d2959229059a8427b3144619546d2ad36f51', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Evaluate on the test split. trainer.evaluate() reports `eval_loss` but no
# `perplexity` entry, so perplexity is derived here as exp(eval_loss).
results = trainer.evaluate()
print("Perplexity:", math.exp(results["eval_loss"]))

In [35]:
# Display the raw metrics dictionary returned by trainer.evaluate().
results

{'eval_loss': 1.5569698810577393,
 'eval_runtime': 209.5887,
 'eval_samples_per_second': 24.815,
 'eval_steps_per_second': 12.41,
 'epoch': 3.0}

In [37]:
# Inspect the tokenized test split (its features and row count).
test_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 5201
})

In [45]:
# Pull the evaluation loss out of the metrics dictionary.
eval_loss = results['eval_loss']
eval_loss

1.5569698810577393

In [48]:
# Perplexity is the exponential of the evaluation loss.
perplexity = math.exp(eval_loss)
print("Perplexity:", perplexity)

Perplexity: 4.744423275536728
