## Importing the Dependencies

In [None]:
import itertools
from pprint import pprint

import jsonlines
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer

## Loading a Tokenizer from a Pretrained LLM

In [None]:
tokenizer = AutoTokenizer.from_pretrained('EleutherAI/pythia-410m')

Downloading (…)okenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

## Downloading the Alpaca Dataset from HuggingFace

In [None]:
instruction_tuned_dataset = load_dataset("tatsu-lab/alpaca", split="train", streaming=True)

Downloading readme:   0%|          | 0.00/7.47k [00:00<?, ?B/s]

In [None]:
## A first look at the raw dataset: pull the first `m` streamed records and print them.
m = 10
top_m = [*itertools.islice(instruction_tuned_dataset, m)]
for record in top_m:
  print(record)

{'instruction': 'Give three tips for staying healthy.', 'input': '', 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}
{'instruction': 'What are the three primary colors?', 'input': '', 'output': 'The three primary colors are red, blue, and yellow.', 'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat are the three primary colors?\n\n### Respo

## Creating Prompt Templates for Finetuning the Dataset
- There are two formats that we will use.
- One would be the Prompt Template with the Input Specified in the User's query
- Second would be the Prompt Template without the Input Specified in the User's query

In [None]:
## Prompt used when an example carries extra context in its 'input' field.
## Placeholders {instruction} and {input} are filled in with str.format().
## The prompt ends at "### Response:" so the expected output can be appended after it.
prompt_with_input_template = """Below is the instruction that describes the task, paired with an input that further describes the context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""

In [None]:
## Prompt used when an example has no 'input' field content.
## The only placeholder is {instruction}, filled in with str.format().
prompt_without_input_template = """Below is the instruction that describes the task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""

In [None]:
## Materialize the first 10000 streamed examples; they are the basis of our final dataset.
n = 10000
first_n_stream = itertools.islice(instruction_tuned_dataset, n)
examples = list(first_n_stream)

In [None]:
len(examples)

10000

In [None]:
examples[:5]

[{'instruction': 'Give three tips for staying healthy.',
  'input': '',
  'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.',
  'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'},
 {'instruction': 'What are the three primary colors?',
  'input': '',
  'output': 'The three primary colors are red, blue, and yellow.',
  'text': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nWhat are the three primary color

In [None]:
## Render every example into a prompt. The with-input template is used only when
## the example's 'input' field is non-empty; otherwise the shorter template is used.
## Each resulting record keeps the rendered prompt under 'input' and the answer under 'output'.
finetuning_dataset = [
  {
    'input': (
      prompt_with_input_template.format(instruction = example['instruction'], input = example['input'])
      if example['input']
      else prompt_without_input_template.format(instruction = example['instruction'])
    ),
    'output': example['output'],
  }
  for example in examples
]

In [None]:
## Each record now has two keys, 'input' and 'output', with their corresponding values.
## (`pprint` is imported in the notebook's import cell at the top.)
pprint(finetuning_dataset[0])

{'input': 'Below is the instruction that describes the task, Write a response '
          'that appropirately completes the request.\n'
          '\n'
          '### Instruction:\n'
          'Give three tips for staying healthy.\n'
          '\n'
          '### Response:',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits '
           'and vegetables. \n'
           '2. Exercise regularly to keep your body active and strong. \n'
           '3. Get enough sleep and maintain a consistent sleep schedule.'}


In [None]:
##You can also save your Prepared Dataset like this.
## Plain string literal: the file name has no placeholders, so no f-string is needed.
with jsonlines.open('alpaca_processed_10000.jsonl', 'w') as writer:
  writer.write_all(finetuning_dataset)

In [None]:
type(finetuning_dataset)

list

In [None]:
##This is how you can load your saved dataset.
##data = []

##with jsonlines.open(f'alpaca_processed_10000.jsonl', 'r') as reader:
##  for line in reader:
##   data.append(line)

In [None]:
print(finetuning_dataset[0]['input'])
print(finetuning_dataset[0]['output'])

Below is the instruction that describes the task, Write a response that appropirately completes the request.

### Instruction:
Give three tips for staying healthy.

### Response:
1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. 
2. Exercise regularly to keep your body active and strong. 
3. Get enough sleep and maintain a consistent sleep schedule.


In [None]:
text = finetuning_dataset[0]['input'] + finetuning_dataset[0]['output']

In [None]:
text

'Below is the instruction that describes the task, Write a response that appropirately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'

In [None]:
## A look at what the text becomes after tokenizing; truncation with max_length = 3 keeps the demo output tiny.
tokenizer.pad_token = tokenizer.eos_token  ## reuse the EOS token as the padding token
tokenized_text = tokenizer(text, padding = True, truncation = True, max_length = 3)

In [None]:
tokenized_text['input_ids']  ##'input_ids' contain the tokenized values for your Sentence or Sequence

[30003, 310, 253]

## Creating the Tokenize Function

In [None]:
def tokenize_function(examples, max_length=2048):
    """Tokenize the first example of a batch as one prompt+completion string.

    Written for ``Dataset.map(..., batched=True, batch_size=1)``: only index 0
    of each column is read, so any further rows in ``examples`` are ignored.

    The text is built from the first matching set of columns:
    ``question`` + ``answer``, else ``input`` + ``output``, else ``text`` alone.

    Args:
        examples: Mapping of column name to a list of values (a batched
            ``map`` input).
        max_length: Upper bound on the number of tokens kept. Defaults to 2048.

    Returns:
        The tokenizer's output for the text, requested as NumPy arrays
        (``return_tensors="np"``).

    Side effects:
        Sets ``pad_token`` and ``truncation_side`` on the module-level
        ``tokenizer``.
    """
    # NOTE(review): the two fields are joined with no separator, so a prompt
    # ending in "### Response:" is immediately followed by the answer text.
    if "question" in examples and "answer" in examples:
        text = examples["question"][0] + examples["answer"][0]
    elif "input" in examples and "output" in examples:
        text = examples["input"][0] + examples["output"][0]
    else:
        text = examples["text"][0]

    tokenizer.pad_token = tokenizer.eos_token
    # Truncate from the left so that the end of the text survives when the
    # sequence is longer than max_length.
    tokenizer.truncation_side = "left"

    # One truncating pass is enough: tokenizing once to measure the length and
    # then again with min(length, cap) yields the same ids as capping directly,
    # and avoids asking for max_length=0 on an empty text.
    return tokenizer(
        text,
        return_tensors="np",
        truncation=True,
        max_length=max_length,
    )

In [None]:
filename = 'alpaca_processed_10000.jsonl'  ##This is the saved Dataset.

In [None]:
finetuning_dataset_loaded = load_dataset("json", data_files = filename, split = 'train')  ##Getting our Dataset into the 'datasets' object.

In [None]:
type(finetuning_dataset_loaded)

datasets.arrow_dataset.Dataset

In [None]:
print(finetuning_dataset_loaded)

Dataset({
    features: ['input', 'output'],
    num_rows: 10000
})


In [None]:
finetuning_dataset[0]

{'input': 'Below is the instruction that describes the task, Write a response that appropirately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}

In [None]:
finetuning_dataset_loaded[0]

{'input': 'Below is the instruction that describes the task, Write a response that appropirately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.'}

In [None]:
finetuning_dataset_loaded['input'][0]

'Below is the instruction that describes the task, Write a response that appropirately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:'

## Tokenizing the Dataset

In [None]:
##Now we can apply 'tokenize_function' to our dataset, since it is now a 'datasets' Dataset object.

In [None]:
## One example per batch: tokenize_function only reads index 0 of each column.
map_options = dict(batched = True, batch_size = 1, drop_last_batch = True)
tokenized_dataset = finetuning_dataset_loaded.map(tokenize_function, **map_options)

In [None]:
print(tokenized_dataset)

Dataset({
    features: ['input', 'output', 'input_ids', 'attention_mask'],
    num_rows: 10000
})


In [None]:
tokenized_dataset[0]

{'input': 'Below is the instruction that describes the task, Write a response that appropirately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:',
 'output': '1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule.',
 'input_ids': [30003,
  310,
  253,
  9775,
  326,
  8631,
  253,
  4836,
  13,
  19566,
  247,
  2380,
  326,
  1192,
  5378,
  1523,
  29141,
  253,
  2748,
  15,
  187,
  187,
  4118,
  41959,
  27,
  187,
  19735,
  1264,
  12192,
  323,
  14596,
  5875,
  15,
  187,
  187,
  4118,
  19371,
  27,
  18,
  15,
  38,
  255,
  247,
  16645,
  6196,
  285,
  1056,
  2119,
  281,
  2486,
  9828,
  273,
  18098,
  285,
  15737,
  15,
  209,
  187,
  19,
  15,
  40626,
  11719,
  281,
  1978,
  634,
  2133,
  3939,
  285,
  2266,
  15,
  209,
  187,
  20,
  15,
  5057,
  2217,
  46

In [None]:
## Here we are adding a 'labels' column in our Dataset; it mirrors 'input_ids'.
## The guard keeps this cell safe to re-run: the name `tokenized_dataset` is rebound
## to the result, so a second run would otherwise try to add 'labels' again.
if 'labels' not in tokenized_dataset.column_names:
  tokenized_dataset = tokenized_dataset.add_column('labels', tokenized_dataset['input_ids'])

## Splitting the Dataset into Train and Test Partitions

In [None]:
split_dataset = tokenized_dataset.train_test_split(test_size = 0.1, shuffle = True, seed = 123)

In [None]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 9000
    })
    test: Dataset({
        features: ['input', 'output', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1000
    })
})

## Uploading the Dataset on HuggingFace

In [None]:
!pip install huggingface_hub



In [None]:
!huggingface-cli login  ##You will need to type your Access token, which you can easily get from the Settings in your Huggingface Account.
                        ##Also make sure that the Access token is of the 'write' type.


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|
    
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token: 
Add token as git credential? (Y/n) n
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [None]:
split_dataset.push_to_hub('Small_Alpaca_Instruct')  ##Pushing the Dataset to the HuggingFace hub

### If you want to see what the Dataset looks like, go to the following link:
- https://huggingface.co/datasets/PiyushLavaniya/Small_Alpaca_Instruct