# Install Necessary packages


In [2]:
!pip install 'transformers[torch]'
!pip install datasets

Collecting transformers[torch]
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m55.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers[torch])
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m38.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers[torch])
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m103.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers[torch])
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m86.8 MB/s

# Importing Libraries

In [3]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset

In [1]:
model_name = "gpt2-medium"
dataset_name = "databricks/databricks-dolly-15k"

In [4]:
model = AutoModelForCausalLM.from_pretrained(model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
dataset = load_dataset(dataset_name, split='train')
dataset.to_pandas().sample(20)

Downloading readme:   0%|          | 0.00/8.20k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/13.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Unnamed: 0,instruction,context,response,category
4534,Could a nuclear explosion change the Earth's r...,,Nuclear explosions release more energy than an...,open_qa
5655,Was Endless Nights released in November or Dec...,Endless Nights in Aurora (Chinese: 極光之愛) is a ...,December,closed_qa
6436,give me a list of places I have to visit in Rome.,,1. Colosseum\n2. Fontana di trevi\n3. Vatican ...,brainstorming
654,Give me a list of some of the most popular son...,,1. Plastic Love\n2. Stay with Me\n3. September...,brainstorming
5310,What are the typical ingredient for Italian to...,,While there are so many variations of making I...,open_qa
9329,Where is Peloton Interactive based?,"Peloton Interactive, Inc. is an American exerc...","Peloton Interactive is based in New York City,...",summarization
3254,What is Twitch Interactive?,Twitch is an American video live streaming ser...,Twitch is a video live streaming service that ...,information_extraction
6653,"Tell me which of these is a Country, a City, a...",,New York - City\nNile - River\nAmazon - River\...,classification
2680,What are the pros and cons of denormalizing da...,,The pro for denormalizing data is to optimize ...,open_qa
13774,"Given this article about Operation Aurora, How...",Operation Aurora was a series of cyber attacks...,"The attack was named ""Operation Aurora"" by Dmi...",closed_qa


In [6]:
def preprocess(example):
  # example['prompt'] = f'{example["instruction"]} {example["input"]} {example["output"]}'
  example['prompt'] = f'{example["context"]} {example["instruction"]} {example["response"]}'
  return example


def tokenize_datasets(dataset):
  tokenized_dataset = dataset.map(
      lambda example: tokenizer(
          example['prompt'],
          truncation=True,
          max_length=128,
          ),
      batched=True,
      remove_columns=['prompt']
  )
  return tokenized_dataset

In [7]:
dataset = dataset.map(
    preprocess, remove_columns=['context', 'instruction', 'response']
)
dataset = dataset.shuffle(42).select(range(15000)).train_test_split(test_size=0.1, seed=42)

Map:   0%|          | 0/15011 [00:00<?, ? examples/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['category', 'prompt'],
        num_rows: 13500
    })
    test: Dataset({
        features: ['category', 'prompt'],
        num_rows: 1500
    })
})

In [9]:
train_dataset = dataset['train']
test_dataset = dataset['test']

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

train_dataset = tokenize_datasets(train_dataset)
test_dataset = tokenize_datasets(test_dataset)

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/13500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [11]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
data_collator

DataCollatorForLanguageModeling(tokenizer=GPT2TokenizerFast(name_or_path='gpt2-medium', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

In [12]:
batch_size = 16
training_args = TrainingArguments(
    output_dir="./models/tuned_text_gen",
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size
)


training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator
)

In [14]:
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,2.7788


TrainOutput(global_step=844, training_loss=2.7411062954726377, metrics={'train_runtime': 1265.68, 'train_samples_per_second': 10.666, 'train_steps_per_second': 0.667, 'total_flos': 3134364844032000.0, 'train_loss': 2.7411062954726377, 'epoch': 1.0})

In [21]:
MODEL_PATH = "Sharathhebbar24/Instruct_GPT"
model.push_to_hub(
    MODEL_PATH, token="<token_id>"
)


pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Sharathhebbar24/Instruct_GPT/commit/58092fab32e44a62db5586e7bf2a19ae92ef1579', commit_message='Upload model', commit_description='', oid='58092fab32e44a62db5586e7bf2a19ae92ef1579', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
mod = "Sharathhebbar24/Instruct_GPT"
mod1 = AutoModelForCausalLM.from_pretrained(mod)

Downloading (…)lve/main/config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/119 [00:00<?, ?B/s]

In [27]:
def generate_text(prompt):
  inputs = tokenizer.encode(prompt, return_tensors='pt')
  outputs = mod1.generate(inputs, max_length=64, pad_token_id=tokenizer.eos_token_id)
  generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
  return generated[:generated.rfind(".")+1]

In [28]:
generate_text("What is the best way to cook paneer")

'What is the best way to cook paneer? The best way to cook paneer is to use a rotisserie chicken.  The chicken should be cooked on a flat surface, with the skin on the outside and the skin on the inside.'

In [29]:
generate_text("Should I Invest in stocks")

"Should I Invest in stocks? Investing in stocks is a great way to diversify your portfolio.  You can invest in stocks based on the market's performance, or you can invest in stocks based on the company's performance."

In [30]:
generate_text("What is the fastest route from NY City to Boston")

'What is the fastest route from NY City to Boston? The fastest route from NY City to Boston is the Long Island Expressway.  The route takes about 2 hours and 30 minutes.  The route is a direct route from the city of New York to Boston.'