## Importing the Dependencies

In [1]:
import os
import torch
import pandas as pd
import time

from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
from llama import BasicModelRunner

## Getting the Pretrained Base LLM from HuggingFace

In [2]:
model_name = 'EleutherAI/pythia-410m'

## Getting the Dataset that we prepared

In [3]:
dataset_path = 'PiyushLavaniya/Small_Alpaca_Instruct'
use_hf = True

## Defining Config

In [4]:
training_config = {
    'model' : {
        'model_name' : model_name,
        'max_length' : 2048
    },
    'datasets' : {
        'use_hf' : use_hf,
        'path' : dataset_path
    },
    'verbose' : True
}

In [5]:
##Geting the Tokenizer from Huggingface and loading the Dataset
tokenizer = AutoTokenizer.from_pretrained(model_name)  ##Tokenizer must be same as the Model and also that you used when you tokenized your Data.
tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset(dataset_path)

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9000
    })
    test: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

In [7]:
train_dataset = dataset['train']
test_dataset = dataset['test']

print(train_dataset)
print(test_dataset)

Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 9000
})
Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1000
})


In [8]:
base_model = AutoModelForCausalLM.from_pretrained(model_name)  ##Downloading the Model from Huggingface for Finetuning

In [9]:
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/pythia-410m')

In [10]:
device_count = torch.cuda.device_count()  ##Getting the Device Count

In [11]:
if device_count > 0:
  device = torch.device('cuda')

else:
  device = torch.device('cpu')

In [12]:
device

device(type='cuda')

In [13]:
base_model.to(device)  ##To view the Summary of the Model

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

## Creating a Function for Inference

In [14]:
def inference(text, model, tokenizer, max_input_tokens=1000, max_output_tokens=100):
  # Tokenize
  input_ids = tokenizer.encode(
          text,
          return_tensors="pt",
          truncation=True,
          max_length=max_input_tokens
  )

  # Generate
  device = model.device
  generated_tokens_with_prompt = model.generate(
    input_ids=input_ids.to(device),
    max_length=max_output_tokens
  )

  # Decode
  generated_text_with_prompt = tokenizer.batch_decode(generated_tokens_with_prompt, skip_special_tokens=True)

  # Strip the prompt
  generated_text_answer = generated_text_with_prompt[0][len(text):]

  return generated_text_answer

In [15]:
##This is the Models's Output before Instruction finetuning
test_text = test_dataset[0]['input']
print('Question: ', test_text)
test_text_answer = test_dataset[0]['output']
print('Original Answer: ', test_text_answer)
response = inference(test_text, base_model, tokenizer)
print('Model Answer: ', response)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question:  Below is the instruction that describes the task, paired with an input that further describes the context. Write a response that appropirately completes the request.

### Instruction:
Create a plan for introducing a new technology

### Input:
The new technology is a robotic assistant that helps with customer service and retail operations.

### Response:
Original Answer:  The introduction of a robotic assistant to help with customer service and retail operations can be a daunting task, but with careful planning, it can be a success. Here are some steps to consider in the process: 

1. Research: Research the technology and understand its capabilities so that you can know how best to deploy it.
2. Test: Test the technology in a controlled environment and make sure it is ready for use in the real world.
3. Implement: Install the technology in the desired environment and begin training employees on how to use it.
4. Monitor: Monitor the technology and its performance regularly to

## Finetuning the Model

In [28]:
max_steps = 1200  ##We will finetune or train the Model for 1200 steps.
trained_model_name = f"instruction_finetuned_{max_steps}_steps"
output_dir = trained_model_name

### Specifying the Training Arguments to use during Training

In [29]:
training_arguments = TrainingArguments(
    learning_rate = 1.0e-5,

    num_train_epochs = 1,
    max_steps = max_steps,

    per_device_train_batch_size = 1,

    output_dir = output_dir,

    # Other arguments
    overwrite_output_dir=False, # Overwrite the content of the output directory
    disable_tqdm=False, # Disable progress bars
    eval_steps=120, # Number of update steps between two evaluations
    save_steps=120, # After # steps model is saved
    warmup_steps=1, # Number of warmup steps for learning rate scheduler
    per_device_eval_batch_size=1, # Batch size for evaluation
    evaluation_strategy="steps",
    logging_strategy="steps",
    logging_steps=1,
    optim="adafactor",
    gradient_accumulation_steps = 4,
    gradient_checkpointing=False,

    # Parameters for early stopping
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

In [34]:
import locale  ##This cell is optional. If you are unable to execute commands in the Notebook then run this cell.
locale.getpreferredencoding = lambda: "UTF-8"

### Creating the Trainer Instance for training
#### Here we will be using:
- The Model that we want to finetune. 
- The training_arguments that we specified above.
- And the Dataset we want to finetune our Model upon.

In [30]:
trainer = Trainer(
    model = base_model,
    args = training_arguments,
    train_dataset = train_dataset,
    eval_dataset = test_dataset
)

In [31]:
training_output = trainer.train()  ##Start the Training. This is a very GPU Intensive Process so it is not recommended to finetune LLMs that are larger in size until you have a very Powerful GPU.

Step,Training Loss,Validation Loss
120,1.4451,1.297023
240,1.2442,1.28371
360,1.2821,1.255742
480,1.2628,1.245138
600,1.2579,1.235909
720,1.3943,1.22394
840,1.11,1.214069
960,1.2382,1.207902
1080,1.54,1.199358
1200,1.0787,1.196713


In [32]:
save_dir = f'{output_dir}/final'  ##Saving the Model

trainer.save_model(save_dir)
print('saved Model to: ', save_dir)

saved Model to:  instruction_finetuned_1200_steps/final


In [62]:
!huggingface-cli login  ##Logging into the HuggingFace. You will need to type your token that should be of 'write' type.


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root

In [44]:
trainer.push_to_hub()  ##Pushing the Model to Huggingface hub

pytorch_model.bin:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

'https://huggingface.co/PiyushLavaniya/instruction_finetuned_1200_steps/tree/main/'

In [77]:
tokenizer.push_to_hub('Pythia-410m-1200_steps') ## 'Pythia-410m-1200_steps' -> name of your Model Repository. In my case, I changed the name of my Model Repository to 'Pythia-410m-1200_steps'.

tokenizer config file saved in /tmp/tmpe09a0ncg/tokenizer_config.json
Special tokens file saved in /tmp/tmpe09a0ncg/special_tokens_map.json
Uploading the following files to PiyushLavaniya/Pythia-410m-1200_steps: tokenizer_config.json,special_tokens_map.json,tokenizer.json


CommitInfo(commit_url='https://huggingface.co/PiyushLavaniya/Pythia-410m-1200_steps/commit/3b55004843d450c45dd1c2295fc96fad756e3fca', commit_message='Upload tokenizer', commit_description='', oid='3b55004843d450c45dd1c2295fc96fad756e3fca', pr_url=None, pr_revision=None, pr_num=None)

In [36]:
##This is if you have trained the Model on Google Collab.
!tar -czf instruction_finetuned_1200 instruction_finetuned_1200_steps

## Loading our Instruction Finetuned Model from the Directory

In [45]:
finetuned_slightly_model = AutoModelForCausalLM.from_pretrained(save_dir, local_files_only = True)

In [46]:
finetuned_slightly_model.to(device)

GPTNeoXForCausalLM(
  (gpt_neox): GPTNeoXModel(
    (embed_in): Embedding(50304, 1024)
    (emb_dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x GPTNeoXLayer(
        (input_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (post_attention_dropout): Dropout(p=0.0, inplace=False)
        (post_mlp_dropout): Dropout(p=0.0, inplace=False)
        (attention): GPTNeoXAttention(
          (rotary_emb): GPTNeoXRotaryEmbedding()
          (query_key_value): Linear(in_features=1024, out_features=3072, bias=True)
          (dense): Linear(in_features=1024, out_features=1024, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): GPTNeoXMLP(
          (dense_h_to_4h): Linear(in_features=1024, out_features=4096, bias=True)
          (dense_4h_to_h): Linear(in_features=4096, out_features=1024, bias=True)
  

### Inference using the Finetuned Model
- Notice that now the Model is able to understand that we are asking it a Question or giving it an Instruction and responds to that.
- Ofcourse this Model will not produce very good results because we just trained it for 1200 steps and it is also a pretty small Model.
- This Model was only of 410M Parameters. A larger Model having around 1B Parameters would work even better.
- Our goal was to Instruction Finetune our Base LLM to make it able to understand the Instruction and respond to that. 
- And now the Model is able to understand the Instruction and producing the Response based on that.
- Since Finetuning an LLM is an expensive process but if you have resources then you may try to train it for longer to produce even better results.

In [58]:
test_text = test_dataset[5]['input']
print("Question input (test):", test_text)
print(f"Correct answer: {test_dataset[5]['output']}")
print("Model's answer: ")
print(inference(test_text, finetuned_slightly_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Question input (test): Below is the instruction that describes the task, paired with an input that further describes the context. Write a response that appropirately completes the request.

### Instruction:
Express the following number in words.

### Input:
1275

### Response:
Correct answer: One thousand two hundred seventy-five.
Model's answer: 
1275 is a number. It is a number. It is a number. It is a number. It is a number. It is a number. It is a number. It is a number. It is a


In [59]:
print(inference("what is Earth?", finetuned_slightly_model, tokenizer))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.




A:

The Earth is a sphere with a diameter of about 6,000 miles. It is made of a solid core surrounded by an atmosphere of gases and particles. The atmosphere is composed of hydrogen, helium, and other elements. The Earth's surface is covered with a layer of ice, which is a thin layer of water. The atmosphere is composed of hydrogen and helium, and is the most important part of the atmosphere.

A:

The


In [76]:
##This is to push the Model to HuggingFace.

##finetuned_slightly_model.push_to_hub('instruction-finetuned-1200-steps')
##tokenizer.push_to_hub('instruction-finetuned-1200-steps')

Configuration saved in /tmp/tmpktlyzw0v/config.json
Configuration saved in /tmp/tmpktlyzw0v/generation_config.json
Model weights saved in /tmp/tmpktlyzw0v/pytorch_model.bin
Uploading the following files to PiyushLavaniya/instruction-finetuned-1200-steps: config.json,generation_config.json,pytorch_model.bin


pytorch_model.bin:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

tokenizer config file saved in /tmp/tmp5qw1h63f/tokenizer_config.json
Special tokens file saved in /tmp/tmp5qw1h63f/special_tokens_map.json
Uploading the following files to PiyushLavaniya/instruction-finetuned-1200-steps: tokenizer_config.json,special_tokens_map.json,tokenizer.json


CommitInfo(commit_url='https://huggingface.co/PiyushLavaniya/instruction-finetuned-1200-steps/commit/37e4387c4458d00189e52949d4306ac490f03e38', commit_message='Upload tokenizer', commit_description='', oid='37e4387c4458d00189e52949d4306ac490f03e38', pr_url=None, pr_revision=None, pr_num=None)

In [79]:
from transformers import pipeline  ##Let's download our Finetuned Model from HugingFace and test it using Huggingface's pipeline

In [88]:
model_name = 'PiyushLavaniya/Pythia-410m-1200_steps'

In [93]:
pipe = pipeline(
    'text-generation',
    model = model_name,
    tokenizer = tokenizer,
    eos_token_id = tokenizer.eos_token_id,
    max_length = 60,
    device = 'cuda'
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--PiyushLavaniya--Pythia-410m-1200_steps/snapshots/3b55004843d450c45dd1c2295fc96fad756e3fca/config.json
Model config GPTNeoXConfig {
  "_name_or_path": "PiyushLavaniya/Pythia-410m-1200_steps",
  "architectures": [
    "GPTNeoXForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.1,
  "eos_token_id": 0,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 2048,
  "model_type": "gpt_neox",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "rope_scaling": null,
  "rotary_emb_base": 10000,
  "rotary_pct": 0.25,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.34.0",
  "use_cache": true,
  "use_parallel_residual": true,
  "vocab_size": 50304
}

loading configuration file config.json from 

In [101]:
model_output = pipe('What do you think of Mars?')
print(model_output[0]['generated_text'].split('###')[-1])  ##As you can see that our Model really was able to understand what we were asking it.
                                                           ##It also outputs an Answer based on the Instruction.

Generate config GenerationConfig {
  "bos_token_id": 0,
  "eos_token_id": 0
}

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


What do you think of Mars?

I think it's a great place to explore. It's a great place to live and work. It's a great place to learn and grow. It's a great place to explore the universe. It's a great place to live and work. It


In [65]:
model = AutoModelForCausalLM.from_pretrained('PiyushLavaniya/Pythia-410m-1200_steps')  ##Downloading the Model

Downloading (…)lve/main/config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.62G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [78]:
tokenizer = AutoTokenizer.from_pretrained('PiyushLavaniya/Pythia-410m-1200_steps')  ##Downloading the Tokenizer

Downloading (…)okenizer_config.json:   0%|          | 0.00/4.80k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

loading file vocab.json from cache at None
loading file merges.txt from cache at None
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--PiyushLavaniya--Pythia-410m-1200_steps/snapshots/3b55004843d450c45dd1c2295fc96fad756e3fca/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--PiyushLavaniya--Pythia-410m-1200_steps/snapshots/3b55004843d450c45dd1c2295fc96fad756e3fca/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--PiyushLavaniya--Pythia-410m-1200_steps/snapshots/3b55004843d450c45dd1c2295fc96fad756e3fca/tokenizer_config.json
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [102]:
print(inference('what do you think of Mars?', model, tokenizer))  ##Inference using the Model

Generate config GenerationConfig {
  "bos_token_id": 0,
  "eos_token_id": 0
}

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.




I think of Mars as a place where we can explore the unknown and explore the unknowns. We can explore the unknowns and explore the unknowns. We can explore the unknowns and explore the unknowns. We can explore the unknowns and explore the unknowns. We can explore the unknowns and explore the unknowns. We can explore the unknowns and explore the unknowns. We can explore the unknowns and explore the unknowns.


## Finetuning larger LLMs
- If you want to Finetune a larger LLM on limited resources, you may want to use 'Qlora' or 'Ludwig'.
- Qlora stands for 'Quantized Low-Rank Adaptation' of Large language Models.
- Ludwig and Qlora are open-source libraries that makes it really easy to Finetune large language Models on limited Resources.
### Refer to the following link if you want to learn how to Finetune large LLMs using Qlora and Ludwig:-
- https://github.com/PiyushLavaniya/Finetuning-Llama2

## Also the link to the Finetuned Model is :-
- https://huggingface.co/PiyushLavaniya/Pythia-410m-1200_steps
- Go to this link and test out the Model yourself.