Intructional Finetuning: Prepare dataset -> Finetunee -> Evaluate.

In [9]:
import torch 
from torch.utils.data import Dataset

In [1]:
# current dataset has instruction, i/p, respone. I am using Alpaca prompt, hihi
def format_input(entry):
    instruction_text = (
            f"Below is an instruction that describes a task. "
            f"Write a response that appropriately completes the request."
            f"\n\n## Instruction:\n{entry['instruction']}"
            )
    input_text = f"\n\n### Input:\n{entry['input']}" if entry["input"] else ""
    return instruction_text + input_text

In [2]:
import pandas as pd
data= pd.read_json("instruction-data.json")
data.head()

Unnamed: 0,instruction,input,output
0,Evaluate the following phrase by transforming ...,freind --> friend,"The spelling of the given phrase ""freind"" is i..."
1,Edit the following sentence for grammar.,He go to the park every day.,He goes to the park every day.
2,Convert 45 kilometers to meters.,,45 kilometers is 45000 meters.
3,Rewrite this sentence to start with 'Although'...,,"Although it was raining, they went for a walk."
4,What are the first 10 square numbers?,,"1, 4, 9, 16, 25, 36, 49, 64, 81, 100."


In [8]:
model_ip= format_input(data.iloc[1])
desired_output= f"\n\n### Respone:\n{data.iloc[1]['output']}"
print(model_ip+ desired_output)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

## Instruction:
Edit the following sentence for grammar.

### Input:
He go to the park every day.

### Respone:
He goes to the park every day.


In [10]:
class Instruction_dataset(Dataset):
    def __init__(self, data, tokenizer):
        self.data= data
        self.encoded_texts=[]
        for entry in data:
            Instruction_plus_ip= format_input(entry)
            response_txt= f"\n\n### Repsone:\n{entry['output']}"
            full_txt= Instruction_plus_ip+ response_txt
            self.encoded_texts.append(
                tokenizer.encode(full_txt)
            )
            
    def __getitem__(self, index):
         return self.encoded_texts[index]
    
    def __len__(self):
        return len(self.data)

In [11]:
# custom collate function coz why not? hehe, well it is needed to pad training egs to same len
# but diff batches can have diff lens, this minimizes unnecessary padding.
def custom_collate_draft(
        batch,
        pad_token_id= 50256,
        device= "cpu"
):
    batch_max_length= max(len(item)+1 for item in batch)
    input_lst= []

    for item in batch:
        new_item=item.copy()
        new_item+=[pad_token_id]

        padded=(
            new_item+[pad_token_id]* (batch_max_length-len(new_item))
        )

        inputs= torch.tensor(padded[:-1])
        input_lst.append(inputs)
    
    inputs_tensor= torch.stack(input_lst).to(device)
    return inputs_tensor

In [12]:
ip1= [12,131,12,1,122]
ip2= [90,98,90,1]
ip3=[0]
batch=(ip1, ip2, ip3)
print(custom_collate_draft(batch))

tensor([[   12,   131,    12,     1,   122],
        [   90,    98,    90,     1, 50256],
        [    0, 50256, 50256, 50256, 50256]])


In [16]:
# this custom_collate () is to return target token ids with addition to input token ids
def custom_collate(
        batch,
        pad_token_id= 50256,
        device= "cpu"
):
    batch_max_length= max(len(item)+1 for item in batch)
    input_lst, target_lsts= [],[]

    for item in batch:
        new_item=item.copy()
        new_item+=[pad_token_id]

        padded=(
            new_item+[pad_token_id]* (batch_max_length-len(new_item))
        )

        inputs= torch.tensor(padded[:-1])
        targets= torch.tensor(padded[:-1])

        input_lst.append(inputs)
        target_lsts.append(targets)
    
    inputs_tensor= torch.stack(input_lst).to(device)
    target_tensor= torch.stack(target_lsts).to(device)

    return inputs_tensor, target_tensor

In [18]:
ip1= [12,131,12,1,122]
ip2= [90,98,90,1]
ip3=[0]
batch=(ip1, ip2, ip3)
ips, tar= custom_collate(batch)
print(ips)
print(tar)

tensor([[   12,   131,    12,     1,   122],
        [   90,    98,    90,     1, 50256],
        [    0, 50256, 50256, 50256, 50256]])
tensor([[   12,   131,    12,     1,   122],
        [   90,    98,    90,     1, 50256],
        [    0, 50256, 50256, 50256, 50256]])


In [23]:
# this custom_collate_fn () is to assign -100 to all padding tokens
# why? to make sure they don't contribute to train loss calc
# to ensure only meaningful data influence our cutie model 
# why -100? coz cross_entropy function in pytorch ignores target labeled with -100
def custom_collate_fn(
        batch,
        pad_token_id= 50256,
        ignore_idx=-100,
        allowed_max_len=None,
        device= "cpu"
):
    batch_max_length= max(len(item)+1 for item in batch)
    input_lst, target_lsts= [],[]

    for item in batch:
        new_item=item.copy()
        new_item+=[pad_token_id]

        padded=(
            new_item+[pad_token_id]* (batch_max_length-len(new_item))
        )

        inputs= torch.tensor(padded[:-1])
        targets= torch.tensor(padded[:-1])

        mask = targets == pad_token_id
        indices= torch.nonzero(mask).squeeze()
        if indices.numel() > 1:
            targets[indices[1:]]=ignore_idx
        if allowed_max_len is not None:
            inputs= inputs[:allowed_max_len]
            targets= targets[:allowed_max_len]

        input_lst.append(inputs)
        target_lsts.append(targets)
    
    inputs_tensor= torch.stack(input_lst).to(device)
    target_tensor= torch.stack(target_lsts).to(device)

    return inputs_tensor, target_tensor

In [22]:
ip1= [12,131,12,1,122]
ip2= [90,98,90,1]
ip3=[0]
batch=(ip1, ip2, ip3)
ips, tar= custom_collate_fn(batch)
print(ips)
print(tar)

tensor([[   12,   131,    12,     1,   122],
        [   90,    98,    90,     1, 50256],
        [    0, 50256, 50256, 50256, 50256]])
tensor([[   12,   131,    12,     1,   122],
        [   90,    98,    90,     1, 50256],
        [    0, 50256,  -100,  -100,  -100]])
