# Install Necessary packages


In [2]:
# # !pip install 'transformers[torch]'
# !pip install datasets

# Importing Libraries

In [1]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset




In [7]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
torch.cuda.empty_cache()

In [4]:
# model_name = "gpt2-medium"
model_name = "gpt2"
dataset_name = "databricks/databricks-dolly-15k"

In [5]:
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [6]:
dataset = load_dataset(dataset_name, split='train')
# dataset.to_pandas().sample(2000)
dataset.to_pandas()

Unnamed: 0,instruction,context,response,category
0,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,closed_qa
1,Which is a species of fish? Tope or Rope,,Tope,classification
2,Why can camels survive for long without water?,,Camels use the fat in their humps to keep them...,open_qa
3,"Alice's parents have three daughters: Amy, Jes...",,The name of the third daughter is Alice,open_qa
4,When was Tomoaki Komorida born?,Komorida was born in Kumamoto Prefecture on Ju...,"Tomoaki Komorida was born on July 10,1981.",closed_qa
...,...,...,...,...
15006,How do i accept the change,,Embrace the change and see the difference,brainstorming
15007,What is a laser and who created it?,A laser is a device that emits light through a...,A laser is a device that emits light from an e...,summarization
15008,What is the difference between a road bike and...,,Road bikes are built to be ridden on asphalt a...,open_qa
15009,How does GIS help in the real estate investmen...,,"Real estate investors depend on precise, accur...",general_qa


In [8]:
def preprocess(example):
  # example['prompt'] = f'{example["instruction"]} {example["input"]} {example["output"]}'
  example['prompt'] = f'{example["context"]} {example["instruction"]} {example["response"]}'
  return example


def tokenize_datasets(dataset):
  tokenized_dataset = dataset.map(
      lambda example: tokenizer(
          example['prompt'],
          truncation=True,
          max_length=128,
          ),
      batched=True,
      remove_columns=['prompt']
  )
  return tokenized_dataset

In [9]:
dataset = dataset.map(
    preprocess, remove_columns=['context', 'instruction', 'response']
)
dataset = dataset.shuffle(42).select(range(15000)).train_test_split(test_size=0.1, seed=42)

In [10]:
dataset

DatasetDict({
    train: Dataset({
        features: ['category', 'prompt'],
        num_rows: 13500
    })
    test: Dataset({
        features: ['category', 'prompt'],
        num_rows: 1500
    })
})

In [11]:
train_dataset = dataset['train']
test_dataset = dataset['test']

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

train_dataset = tokenize_datasets(train_dataset)
test_dataset = tokenize_datasets(test_dataset)

In [13]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [14]:
batch_size = 8
training_args = TrainingArguments(
    output_dir="./models/tuned_text_gen",
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size
)


training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=au

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

In [16]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,3.1764
1000,3.0974
1500,3.0548
2000,2.8956
2500,2.8086
3000,2.7986
3500,2.7537
4000,2.6731
4500,2.6647
5000,2.6688


TrainOutput(global_step=5064, training_loss=2.8565759854670763, metrics={'train_runtime': 1736.109, 'train_samples_per_second': 23.328, 'train_steps_per_second': 2.917, 'total_flos': 2638849471488000.0, 'train_loss': 2.8565759854670763, 'epoch': 3.0})

In [17]:
MODEL_PATH = "Sharathhebbar24/Instruct_GPT_small_v1"
model.push_to_hub(
    MODEL_PATH, token="<HF_Token>"
)


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Sharathhebbar24/Instruct_GPT_small_v1/commit/c539b4779b1dee491a69c09bfc571889610a3b6e', commit_message='Upload model', commit_description='', oid='c539b4779b1dee491a69c09bfc571889610a3b6e', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
mod = "Sharathhebbar24/Instruct_GPT_small"
mod1 = AutoModelForCausalLM.from_pretrained(mod)

config.json:   0%|          | 0.00/907 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors:   0%|          | 0.00/498M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [19]:
def generate_text(prompt):
  inputs = tokenizer.encode(prompt, return_tensors='pt')
  outputs = mod1.generate(inputs, max_length=64, pad_token_id=tokenizer.eos_token_id)
  generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return generated[:generated.rfind(".")+1]

In [20]:
generate_text("What is the best way to cook paneer")

'What is the best way to cook paneer? The best way to cook paneer is to cook it in a pan.  You can use a stove top or a pan that is not too hot.  You can also use a pan that is not too hot.'

In [21]:
generate_text("Should I Invest in stocks")

'Should I Invest in stocks? Yes, you should invest in stocks.  You should invest in stocks because they are a safe investment.  They are a safe investment because they are not subject to market fluctuations.  They are not subject to the whims of the market.'

In [22]:
generate_text("What is the fastest route from NY City to Boston")

'What is the fastest route from NY City to Boston? The fastest route from NY City to Boston is the Boston Marathon. The fastest route from NY City to Boston is the Boston Marathon. The fastest route from NY City to Boston is the Boston Marathon. The fastest route from NY City to Boston is the Boston Marathon.'

In [23]:
generate_text("Who is the PM of India")

'Who is the PM of India? Narendra Modi is the Prime Minister of India. He is the son of former Prime Minister Manmohan Singh and his wife, Prabhupada. He is the son of former Prime Minister Manmohan Singh and his wife, Prabhupada.'