In [1]:
#demonstrate the process of training a GPT model to follow instructions effectively, using a custom dataset for varied tasks.

In [2]:
# Run this code inside a virtual environment (python 3.10 for myself) and import all things inside the virtual environment using cmd***
# open cmd as administrator where the virtual env is (D:\MACHINE LEARNING\LLM\Codes\python 3.10 virtual)
# 3.10env\Scripts\activate

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset




In [5]:
dataset = load_dataset("hakurei/open-instruct-v1", split='train') #instruction-tuning dataset
dataset.to_pandas().sample(10)

Unnamed: 0,output,input,instruction
193942,"sorted_array = sorted(['this', 'is', 'a', 'tes...",,Reorder the items in an array according to the...
158917,The most effective way to lose weight quickly ...,,How can I lose weight quickly.
207040,7 times,,How many times Lewis Hamilton won the F1 Champ...
271091,The best ways to engage with customers on soci...,,What is the best way to engage with customers ...
253975,"Peanut butter, cheese, milk, fruit and popcorn",,Give me a list of five healthy snacks for kids
379395,"1. Coffee is a natural source of antioxidants,...",,What are the top 5 health benefits of drinking...
175545,Eat a balanced diet that includes plenty of fr...,,What are the best tips for healthy living.
197654,"string = ""Hello World!""\nstring = string.repla...",,"Using the input string, write a code to replac..."
157408,Advantages of using mobile phones:\n- Allows f...,,What are the advantages and disadvantages of u...
392863,5050,,Find out a solution to the following math prob...


In [6]:
dataset[1]

{'output': 'The three primary colors are red, blue, and yellow.',
 'input': '',
 'instruction': 'What are the three primary colors?'}

In [7]:
def preprocess(example):
    example['prompt'] = f"{example['instruction']} {example['input']} {example['output']}"

    return example

In [8]:
def tokenize_datasets(dataset):
    tokenized_dataset = dataset.map(lambda example: tokenizer(example['prompt'], truncation=True, max_length=128), batched=True, remove_columns=['prompt'])

    return tokenized_dataset

In [9]:
dataset = dataset.map(preprocess, remove_columns=['instruction', 'input', 'output'])
# dataset =  dataset.shuffle(42).select(range(100000)).train_test_split(test_size=0.1, seed=42)
dataset =  dataset.shuffle(42).select(range(10000)).train_test_split(test_size=0.1, seed=42) # using 10,000 for lowering the training time :(

In [10]:
train_dataset = dataset['train']
test_dataset = dataset['test']

In [11]:
train_dataset[0]

{'prompt': 'Rewrite the following sentence using the hypernym of the underlined word: Kevin loves to watch YouTube videos. Kevin loves to watch videos.'}

In [12]:
MODEL_NAME = "microsoft/DialoGPT-medium"

In [13]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token # Adding pad token because DialoGPT doesn't have this in default

In [14]:
train_dataset = tokenize_datasets(train_dataset)
test_dataset = tokenize_datasets(test_dataset)

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [15]:
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

In [16]:
# In the context of training transformer-based language models (like GPT), 
# the data collator is responsible for batching data in a way that is appropriate for the model training. 
# It helps prepare the dataset for training by padding, creating input masks, and ensuring that the input data is in the right format for the model.

In [17]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [18]:
# configure the training parameters
traing_args = TrainingArguments(output_dir="../Models/diablo_gpt",
                                num_train_epochs=1,
                                per_device_train_batch_size=32,
                                per_device_eval_batch_size=32)

In [19]:
# initiate the training process using our prepared datasets.
trainer = Trainer(model=model,
                    args=traing_args,
                    train_dataset=train_dataset,
                    eval_dataset=test_dataset,
                    data_collator=data_collator)

In [20]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=282, training_loss=2.8788707543772163, metrics={'train_runtime': 20903.117, 'train_samples_per_second': 0.431, 'train_steps_per_second': 0.013, 'total_flos': 2089576562688000.0, 'train_loss': 2.8788707543772163, 'epoch': 1.0})