In [1]:
# Preparing the Dataset for Instruction Tuning

In [2]:
!pip install wandb

Defaulting to user installation because normal site-packages is not writeable
Collecting wandb
  Downloading wandb-0.16.2-py3-none-any.whl.metadata (9.8 kB)
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.41-py3-none-any.whl.metadata (14 kB)
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.40.0-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting docker-pycreds>=0.4.0 (from wandb)
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting setproctitle (from wandb)
  Downloading setproctitle-1.3.3-cp310-cp310-win_amd64.whl.metadata (10 kB)
Collecting protobuf!=4.21.0,<5,>=3.19.0 (from wandb)
  Downloading protobuf-4.25.2-cp310-abi3-win_amd64.whl.metadata (541 bytes)
Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb)
  Using cached gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb)
  Using cached smmap-5.0.1-py3-none-any.whl.meta


[notice] A new release of pip is available: 23.3.2 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [8]:
import wandb
# Authenticate this session with Weights & Biases before any run is started.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mutshavpaudel46[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [9]:
#@ Loading data
import json
# Name the encoding explicitly: JSON text is UTF-8, and relying on the platform
# default (e.g. a legacy code page on Windows) can fail or mis-decode the file.
with open("alpaca_gpt4_data.json", "r", encoding="utf-8") as f:
    alpaca = json.load(f)                                # list of record dicts

In [10]:
len(alpaca)                                         # number of records in the Alpaca GPT-4 dataset (~52k)

52002

In [11]:
# Version the raw dataset in W&B: the JSON file plus a browsable table of its rows.
with wandb.init(project="alpaca_finetuning"):                                       # start a run; it is finished when the block exits
    at = wandb.Artifact(                                                            # creating artifact with details
        name="alpaca_gpt4",
        type="dataset",
        description="A gpt4 generated alpaca like dataset for instruction finetuning",
        metadata = {"url":"https://github.com/Instruction-Tuning-with-GPT-4/GPT-4-LLM#how-good-is-the-data"},
        )
    at.add_file("alpaca_gpt4_data.json")                                           # adding files to artifact

    # table
    table_columns = list(alpaca[0].keys())
    table = wandb.Table(columns=table_columns)                                     # turning data into table form
    for row in alpaca:
        # Look values up by column name so a record whose keys are ordered
        # differently from the first one cannot end up in the wrong column.
        table.add_data(*(row[column] for column in table_columns))

    # The artifact and table were previously built but never sent anywhere:
    # attach the table to the artifact and log the artifact to the run.
    at.add(table, "alpaca_gpt4_table")
    wandb.log_artifact(at)


VBox(children=(Label(value='0.001 MB of 0.023 MB uploaded\r'), FloatProgress(value=0.049806082874055926, max=1…

# Dataset preparation and tokenization

In [12]:
# Inspect one raw record; this one has 'instruction', 'input' and 'output' fields.
alpaca[232]

{'instruction': 'Sort the following list in alphabetical order.',
 'input': 'Camouflage, Furniture, Plaster',
 'output': 'Camouflage, Furniture, Plaster sorted in alphabetical order:\nCamouflage, Furniture, Plaster'}

In [13]:
# preparing functions for formatting dataset records into prompts to feed the LLM
def prompt_no_input(row):
    """Render the prompt for a record that carries no additional input.

    `row` is a mapping that provides an "instruction" entry. The returned
    string ends with the "### Response:" header, ready for the answer to be
    appended after it.
    """
    template = (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:\n"
    )
    # format_map only substitutes fields of the template itself, so braces
    # inside the instruction text are copied through untouched.
    return template.format_map(row)
def prompt_input(row):
    """Render the prompt for a record that comes with additional input.

    `row` is a mapping that provides "instruction" and "input" entries. The
    returned string ends with the "### Response:" header, ready for the
    answer to be appended after it.
    """
    template = (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
    )
    # Extra keys in `row` (such as "output") are simply ignored by format_map.
    return template.format_map(row)


In [14]:
# let's check the preprocessing functions on a record that has a non-empty 'input'
row = alpaca[232]

print(row)
print(prompt_input(row))


{'instruction': 'Sort the following list in alphabetical order.', 'input': 'Camouflage, Furniture, Plaster', 'output': 'Camouflage, Furniture, Plaster sorted in alphabetical order:\nCamouflage, Furniture, Plaster'}
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Sort the following list in alphabetical order.

### Input:
Camouflage, Furniture, Plaster

### Response:



In [15]:
# merging both functions
def create_prompt(row):
    """Build the prompt for `row`, choosing the template by its "input" field.

    An "input" that is exactly the empty string selects the instruction-only
    template; anything else selects the instruction-plus-input template.
    """
    if row["input"] == "":
        return prompt_no_input(row)
    return prompt_input(row)
    
prompts = list(map(create_prompt, alpaca))                               # one prompt string per record, in dataset order

In [16]:
# Append an end-of-sequence marker to every target so the model learns when to stop producing text.
EOS = "</s>"                                                             # for llama model
outputs = [
    sample['output'] + EOS
    for sample in alpaca
]

In [17]:
# check that the EOS marker was appended to the end of the output
outputs[0]

'1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.</s>'

In [18]:
# Final dataset: for every record keep the prompt, the target, and their
# concatenation ("example"), which is the text that gets tokenized later.
dataset = [
    dict(prompts=prompt, output=answer, example=prompt + answer)
    for prompt, answer in zip(prompts, outputs)
]

In [19]:
# first record of the final dataset: 'prompts', 'output' and the combined 'example'
dataset[0]

{'prompts': 'Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nGive three tips for staying healthy.\n\n### Response:\n',
 'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.</s>',
 'example': 'Below is an 

In [20]:
# creating train-eval split
import random
import pandas as pd

SPLIT_SEED = 42                                      # fixed seed so every run produces the same split
EVAL_SIZE = 1000                                     # number of examples held out for evaluation

# Shuffle a copy with a dedicated seeded generator: the split is reproducible,
# re-running this cell yields the same result, and `dataset` keeps its order.
shuffled_dataset = list(dataset)
random.Random(SPLIT_SEED).shuffle(shuffled_dataset)

train_dataset = shuffled_dataset[:-EVAL_SIZE]        # creating trainset
eval_dataset = shuffled_dataset[-EVAL_SIZE:]         # creating valset

train_table = wandb.Table(dataframe=pd.DataFrame(train_dataset))       # converting dataset to table format
eval_table = wandb.Table(dataframe=pd.DataFrame(eval_dataset))

with wandb.init(project="alpaca_finetuning", job_type="split_data"):   # log both splits as tables in a dedicated run
    wandb.log({"train_dataset":train_table, "eval_dataset":eval_table})

VBox(children=(Label(value='7.384 MB of 114.471 MB uploaded\r'), FloatProgress(value=0.06450538370829999, max=…

In [4]:
from huggingface_hub import login
# Interactive Hugging Face login (rendered as a widget in the notebook).
# NOTE(review): presumably required because the Llama-2 checkpoint is gated — confirm.
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
import os

from transformers import AutoTokenizer
model_id = 'meta-llama/Llama-2-7b-hf'
# Do not paste an access token into the notebook. Take it from the HF_TOKEN
# environment variable when set; otherwise pass None so the credentials cached
# by the earlier `login()` call are used.
hf_token = os.environ.get("HF_TOKEN") or None
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
tokenizer.pad_token = tokenizer.eos_token                       # reuse EOS as the padding token

tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [21]:
#@ Packing the dataset: tokenized examples are concatenated and cut into fixed-length rows of 1024 tokens so that training is more efficient

max_seq_len = 1024                                   # tokens per packed row (same value as the default of pack() below)


def pack(dataset, max_seq_len=1024):
    """Concatenate the tokenized examples and cut them into fixed-size rows.

    Every item's "example" text is tokenized with the module-level
    `tokenizer`, an EOS token id is appended after each tokenized example,
    and the resulting token stream is split into consecutive windows of
    `max_seq_len + 1` tokens. A trailing window that is shorter than that is
    dropped.

    Returns a list of dicts with two lists of `max_seq_len` ids each:
    "input_ids" (the window without its last token) and "labels" (the same
    window shifted left by one position).

    NOTE(review): the "example" strings already end with the literal "</s>";
    if the tokenizer maps that text to the EOS id, each example ends with two
    EOS tokens — confirm this is intended.
    """
    encoded = tokenizer([sample["example"] for sample in dataset])["input_ids"]

    # Flatten all examples into one long stream, each followed by an EOS id.
    token_stream = []
    for example_ids in encoded:
        token_stream.extend(example_ids + [tokenizer.eos_token_id])

    # One extra token per window so inputs and shifted labels have equal length.
    window = max_seq_len + 1
    chunks = (
        token_stream[start : start + window]
        for start in range(0, len(token_stream), window)
    )
    return [
        {"input_ids": chunk[:-1], "labels": chunk[1:]}
        for chunk in chunks
        if len(chunk) == window
    ]


# Pass the configured length explicitly so the `max_seq_len` constant defined
# above is what actually controls the packing, not pack()'s own default.
train_ds_packed = pack(train_dataset, max_seq_len=max_seq_len)
eval_ds_packed = pack(eval_dataset, max_seq_len=max_seq_len)


In [22]:
#@ storing the packed datasets
import json
def save_jsonl(data, filename):
    """Write `data` to `filename` in JSON Lines format.

    Each element of `data` is serialized with the default `json` settings and
    written on its own line. An existing file is overwritten; an empty `data`
    produces an empty file.
    """
    with open(filename, 'w') as out:
        out.writelines(json.dumps(record) + '\n' for record in data)


# dump everything to jsonl files
train_file = "train_packed_alpaca.jsonl"
eval_file = "eval_packed_alpaca.jsonl"
save_jsonl(train_ds_packed, train_file)
save_jsonl(eval_ds_packed, eval_file)


# Create a W&B artifact
packed_at = wandb.Artifact(
    name="packed_alpaca",
    type="dataset",
    description="Alpaca dataset packed in sequences",
    # Record the configured sequence length instead of a second hard-coded 1024,
    # so the metadata cannot drift from the constant defined with the packing code.
    metadata={"max_seq_len":max_seq_len, "model_id":model_id})


# Attach both packed splits to the artifact.
for packed_file in (train_file, eval_file):
    packed_at.add_file(packed_file)


# log the artifact to the project, we can give this run a job_type like `preprocess`
# NOTE(review): earlier cells log to the "alpaca_finetuning" project while this
# run uses "alpaca_ft" — confirm the different project name is intentional.
with wandb.init(project="alpaca_ft", job_type="preprocess"):
    wandb.log_artifact(packed_at)


VBox(children=(Label(value='0.001 MB of 130.961 MB uploaded\r'), FloatProgress(value=8.89149343540894e-06, max…