# Fine-tuning to follow instructions
## Prepare dataset for supervised instruction fine-tuning

In [17]:
import os, json, urllib, torch

def download_and_load_file(file_path, url):
    """Return the parsed JSON content of `file_path`, downloading it first if needed.

    Args:
        file_path: Local path of the JSON file; used as a cache.
        url: Location to fetch the file from when `file_path` does not exist.

    Returns:
        The object produced by `json.load` for the file's content.
    """
    # `import urllib` alone does not guarantee that the `request` submodule is
    # loaded, so import it explicitly where it is used.
    import urllib.request

    # Download only when there is no local copy yet.
    if not os.path.exists(file_path):
        with urllib.request.urlopen(url) as response:
            text_data = response.read().decode("utf-8")
        with open(file_path, "w", encoding="utf-8") as file:
            file.write(text_data)

    # Read back with the same encoding the file was written with, instead of
    # relying on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as file:
        data = json.load(file)
    return data
        
# `file_path` doubles as a local cache: download_and_load_file() fetches `url`
# only when this file does not exist yet.
file_path = "instruction-data.json"
url = "https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch07/01_main-chapter-code/instruction-data.json"
# `data` is the parsed JSON content; later cells index and slice it like a list.
data = download_and_load_file(file_path, url)
print("Number of entries:", len(data))

Number of entries: 1100


In [4]:
# Print one entry as a sanity check.
# Each entry is a dict with the keys "instruction", "input" and "output".
print("Example entry:\n", data[50])

Example entry:
 {'instruction': 'Identify the correct spelling of the following word.', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}


In [5]:
# A second sample; in this entry the "input" field is an empty string.
print("Another example entry:\n", data[999])

Another example entry:
 {'instruction': "What is an antonym of 'complicated'?", 'input': '', 'output': "An antonym of 'complicated' is 'simple'."}


### Convert "data" to Alpaca-style prompt format

In [8]:
def format_input(entry):
    """Build the Alpaca-style prompt (without the response part) for one entry.

    Args:
        entry: Dict with an "instruction" key and an optional "input" key.

    Returns:
        The prompt string: a fixed preamble, an "### Instruction:" section and,
        only when the entry has a non-empty "input", an "### Input:" section.
    """
    instruction_text = (
        "Below is an instruction that describes a task. Write a response that appropriately completes the request."
        f"\n\n### Instruction:\n{entry['instruction']}"
    )
    # Use .get() so an entry without an "input" key is handled like one whose
    # "input" is empty, instead of raising KeyError.
    input_value = entry.get("input")
    input_text = f"\n\n### Input:\n{input_value}" if input_value else ""
    return instruction_text + input_text

# Entry 50 has a non-empty "input", so the prompt contains an "### Input:" section.
model_input = format_input(data[50])
# The response section is appended separately; format_input() only builds the prompt.
desired_response = f"\n\n### Response:\n{data[50]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Identify the correct spelling of the following word.

### Input:
Ocassion

### Response:
The correct spelling is 'Occasion.'


In [9]:
# In this entry the "input" field is present but empty (''), so format_input()
# omits the "### Input:" section entirely.
model_input = format_input(data[999])
desired_response = f"\n\n### Response:\n{data[999]['output']}"
print(model_input + desired_response)

Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
What is an antonym of 'complicated'?

### Response:
An antonym of 'complicated' is 'simple'.


### Partitioning the data into train/val/test

In [12]:
# Split sizes: 85% for training, 10% for testing; validation takes the remainder.
train_portion = int(0.85 * len(data))
test_portion = int(0.1 * len(data))
val_portion = len(data) - (train_portion + test_portion)

# Contiguous slices in the order train | test | val (the data is not shuffled).
train_data, test_data, val_data = (
    data[:train_portion],
    data[train_portion:train_portion + test_portion],
    data[train_portion + test_portion:],
)

print("Training set length:", len(train_data))  # 85%
print("Validation set length:", len(val_data))  # 5%
print("Test set length:", len(test_data))  # 10%

Training set length: 935
Validation set length: 55
Test set length: 110


## Organizing data into training batches

In [None]:
import torch
from torch.utils.data import Dataset

class InstructionDataset(Dataset):
    """Dataset of instruction entries, pre-tokenized once at construction time.

    Each entry is a dict such as
    {'instruction': 'Identify...', 'input': 'Ocassion', 'output': "The correct spelling is 'Occasion.'"}.
    """

    def __init__(self, data, tokenizer):
        """Encode every entry's full text (prompt + response) with `tokenizer`.

        Args:
            data: Iterable of entry dicts (see class docstring).
            tokenizer: Object whose `encode(text)` is called on each full text.
        """
        self.data = data
        # One encoded conversation per entry, e.g. [23, 1, 244, ...].
        self.encoded_texts = [
            tokenizer.encode(
                format_input(record) + f"\n\n### Response:\n{record['output']}"
            )
            for record in data
        ]

    def __getitem__(self, index):
        """Return the encoded text stored at position `index`."""
        return self.encoded_texts[index]

    def __len__(self):
        """Return the number of entries in the underlying data."""
        return len(self.data)
    

In [13]:
# Check the id of the <|endoftext|> token, which the collate functions below use for padding.
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")
# `allowed_special` explicitly permits "<|endoftext|>" in the input text; the
# printed result is the single id 50256.
print(tokenizer.encode("<|endoftext|>", allowed_special={"<|endoftext|>"}))

[50256]


In [21]:
# Padding to the longest length IN THE BATCH ONLY
def custom_collate_draft_1(batch, pad_token_id=50256, device="cpu"):
    """Pad every token-id list to the longest one in the batch and stack them.

    Args:
        batch: Collection of token-id lists.
        pad_token_id: Id appended as padding.
        device: Device the resulting tensor is moved to.

    Returns:
        Tensor of shape (batch, max_len), where max_len is the length of the
        longest list in `batch`.
    """
    # Longest sequence plus one: every row first receives at least one pad token.
    padded_len = max(len(seq) + 1 for seq in batch)

    rows = []
    for seq in batch:
        # Work on a copy so the caller's lists are left untouched.
        padded = seq.copy() + [pad_token_id] * (padded_len - len(seq))
        # Drop the last pad token again => row of length max_len.
        rows.append(torch.tensor(padded[:-1]))

    return torch.stack(rows).to(device)

# Toy batch: three token-id lists of different lengths (5, 2 and 3) to exercise the padding.
inputs_1 = [0,1,2,3,4]
inputs_2 = [5,6]
inputs_3 = [7,8,9]
batch = (inputs_1, inputs_2, inputs_3)
print(custom_collate_draft_1(batch))

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])


In [23]:
# Create TARGET TOKEN IDs
# which is input-token-ids shifted by 1 to the left; then a PADDING-TOKEN at the end
def custom_collate_draft_2(batch, pad_token_id=50256, device="cpu"):
    """Build padded input ids and the matching next-token target ids.

    Args:
        batch: Collection of token-id lists.
        pad_token_id: Id used for padding.
        device: Device both resulting tensors are moved to.

    Returns:
        Tuple (inputs, targets), each of shape (batch, max_len). `targets` is
        `inputs` shifted left by one position with a pad token at the end.
    """
    # Longest sequence plus one, so even the longest row ends with a pad token.
    padded_len = max(len(seq) + 1 for seq in batch)

    # Copy each sequence (the caller's lists stay untouched) and pad to padded_len.
    padded_rows = [
        seq.copy() + [pad_token_id] * (padded_len - len(seq))
        for seq in batch
    ]

    # Inputs drop the last token; targets drop the first => both have length max_len.
    inputs_tensor = torch.stack([torch.tensor(row[:-1]) for row in padded_rows]).to(device)
    targets_tensor = torch.stack([torch.tensor(row[1:]) for row in padded_rows]).to(device)
    return inputs_tensor, targets_tensor

# Run draft 2 on the toy `batch` defined earlier in the notebook.
inputs, targets = custom_collate_draft_2(batch)
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256, 50256, 50256, 50256],
        [    8,     9, 50256, 50256, 50256]])


In [37]:
# Replace padding tokens in the targets with <-100> (EXCEPT the first one, which acts as the end-of-text token)
# batch_max_length: length of the longest sentence in that batch
# allowed_max_length: if a sentence is longer than e.g. 1024 tokens => trim it down
def custom_collate_fn(batch, pad_token_id=50256, ignore_index=-100, allowed_max_length=None, device="cpu"):
    """Collate token-id lists into padded inputs and masked next-token targets.

    Args:
        batch: Collection of token-id lists.
        pad_token_id: Id used for padding.
        ignore_index: Value written over every target position equal to
            `pad_token_id` except the first such position in each row.
        allowed_max_length: If not None, each row is cut to at most this many tokens.
        device: Device both resulting tensors are moved to.

    Returns:
        Tuple (inputs, targets) of stacked tensors with one row per batch item.
    """
    # Longest sequence plus one, so even the longest row ends with a pad token.
    padded_len = max(len(seq) + 1 for seq in batch)

    input_rows, target_rows = [], []
    for seq in batch:
        # Copy so the caller's list is not modified, then pad to padded_len.
        row = seq.copy() + [pad_token_id] * (padded_len - len(seq))
        input_ids = torch.tensor(row[:-1])   # last pad token removed
        target_ids = torch.tensor(row[1:])   # shifted left by one position

        # 1-D tensor with the positions of every pad token in the targets.
        pad_positions = torch.nonzero(target_ids == pad_token_id).flatten()
        # Keep the first pad token as a target; overwrite all later ones.
        target_ids[pad_positions[1:]] = ignore_index

        # Trim rows that are longer than the allowed length.
        if allowed_max_length is not None:
            input_ids = input_ids[:allowed_max_length]
            target_ids = target_ids[:allowed_max_length]

        input_rows.append(input_ids)
        target_rows.append(target_ids)

    inputs_tensor = torch.stack(input_rows).to(device)
    targets_tensor = torch.stack(target_rows).to(device)
    return inputs_tensor, targets_tensor

# Run the final collate function on the toy `batch` defined earlier in the notebook.
inputs, targets = custom_collate_fn(batch)
print(inputs)
print(targets)

tensor([[    0,     1,     2,     3,     4],
        [    5,     6, 50256, 50256, 50256],
        [    7,     8,     9, 50256, 50256]])
tensor([[    1,     2,     3,     4, 50256],
        [    6, 50256,  -100,  -100,  -100],
        [    8,     9, 50256,  -100,  -100]])
