# Data preprocessing for different prompt formats

## Importing packages

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
# from unsloth import FastLanguageModel
# leaving Trainer out for now to use SFTTrainer instead
from trl import SFTTrainer
from datasets import load_dataset, Dataset, DatasetDict
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
import os
import torch


  from .autonotebook import tqdm as notebook_tqdm


## Loading in model

In [2]:
# model_id = "mistralai/Mistral-7B-v0.1"
# model_id = "filipealmeida/Mistral-7B-v0.1-sharded"
# model_id = "imiraoui/OpenHermes-2.5-Mistral-7B-sharded"
# model_id = "filipealmeida/Mistral-7B-Instruct-v0.1-sharded"
# model_id = "teknium/OpenHermes-2.5-Mistral-7B"
# model_id = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# model_id = "mistralai/Mistral-7B-Instruct-v0.2"
model_id = "google/gemma-2b-it"

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4")
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, torch_dtype=torch.bfloat16, device_map='auto')
# model = FastLanguageModel.from_pretrained(model_id, quantization_config=bnb_config, torch_dtype=torch.bfloat16, device_map='auto')

Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  2.15it/s]


## Aphasia Dataset

In [None]:
dataset = load_dataset("TuringsSolutions/Aphasia500")

train_test_valid = dataset['train'].train_test_split(test_size=0.2)
# Split the 10% test + valid in half test, half valid
test_valid = train_test_valid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

print(dataset)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def add_text_dataset_prompt(sample):
    # input = dataset[i]['input']
    input = sample['Prompt']
    # output = dataset[i]['output']
    output = sample['Response']
    # debugging
    # print(f'input is: {input}')
    # print(f'output is: {output}')
    prompt = f"""[INST] Assist a non-verbal autistic individual in communicating their thoughts or needs through selected images.
    Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity.
    - Be empathetic and direct.
    - Look for deeper meanings in the input.
    - Keep the tone practical and straightforward.
    input: {input} [/INST] {output} </s>"""

    # return {'text': prompt}
    return prompt

In [None]:
print(dataset['train'][2])

In [None]:
# Transform each subset of the dataset
# TODO: use Dataset.map() functionality instead for these, if possible
train_texts = [add_text_dataset_prompt(example) for example in dataset['train']]
test_texts = [add_text_dataset_prompt(example) for example in dataset['test']]
validation_texts = [add_text_dataset_prompt(example) for example in dataset['validation']]

# Combine these into a DatasetDict
text_dataset_with_prompt = DatasetDict({
    'train': Dataset.from_dict({"text": train_texts}),
    'test': Dataset.from_dict({"text": test_texts}),
    'validation': Dataset.from_dict({"text": validation_texts})
})

print(text_dataset_with_prompt)

## My dataset

In [3]:
dataset = load_dataset('json', data_files='data/processed_dataset_full.jsonl')

train_test_valid = dataset['train'].train_test_split(test_size=0.2)
# Split the 10% test + valid in half test, half valid
test_valid = train_test_valid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 422
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 53
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 53
    })
})


## Prompt Formatting

### No format

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def add_text_dataset_prompt(sample):
    input = sample['input']
    output = sample['output']
    prompt = f"""[INST] input: {input} [/INST] {output} </s>"""

    return prompt

### \<s> [INST] [/INST] <\/s> Format

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def add_text_dataset_prompt(sample):
    # input = dataset[i]['input']
    input = sample['input']
    # output = dataset[i]['output']
    output = sample['output']
    # debugging
    # print(f'input is: {input}')
    # print(f'output is: {output}')
    prompt = f"""[INST] Assist a non-verbal autistic individual in communicating their thoughts or needs through selected images.
    Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity.
    - Be empathetic and direct.
    - Look for deeper meanings in the input.
    - Keep the tone practical and straightforward.
    input: {input} [/INST] {output} </s>"""

    # return {'text': prompt}
    return prompt

#### Formatting text dataset (for SFTTrainer)

Doesn't actually need to manually tokenizer, SFTTrainer takes care of this

In [None]:
# Transform each subset of the dataset
# TODO: use Dataset.map() functionality instead for these, if possible
train_texts = [add_text_dataset_prompt(example) for example in dataset['train']]
test_texts = [add_text_dataset_prompt(example) for example in dataset['test']]
validation_texts = [add_text_dataset_prompt(example) for example in dataset['validation']]

# Combine these into a DatasetDict
text_dataset_with_prompt = DatasetDict({
    'train': Dataset.from_dict({"text": train_texts}),
    'test': Dataset.from_dict({"text": test_texts}),
    'validation': Dataset.from_dict({"text": validation_texts})
})

print(text_dataset_with_prompt)

#### Sanity check on text dataset with prompts

In [None]:
# Print out the first few examples from the tokenized dataset
for i in range(3):  # Adjust the range to inspect more examples
    print(f"Example {i}:")
    print("Prompt:", text_dataset_with_prompt['train'][i]['text'])
    print()

#### Sanity check on tokenized dataset with prompts

In [None]:
# Print out the first few examples from the tokenized dataset
for i in range(3):  # Adjust the range to inspect more examples
    print(f"Example {i}:")
    print("Prompt:", tokenizer.encode(text_dataset_with_prompt['train'][i]['text']))
    print()

In [None]:
# quick sanity check
print(f'tokenizer.bos_token: {tokenizer.bos_token}')
print(f'tokenizer.bos_token_id: {tokenizer.bos_token_id}')
print(f'tokenizer.pad_token: {tokenizer.pad_token}')
print(f'tokenizer.pad_token_id: {tokenizer.pad_token_id}')
print(f'tokenizer.eos_token: {tokenizer.eos_token}')
print(f'tokenizer.eos_token_id: {tokenizer.eos_token_id}')

### [INST] [/INST] Format

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def add_text_dataset_prompt(sample):
    # input = dataset[i]['input']
    input = sample['input']
    # output = dataset[i]['output']
    output = sample['output']
    # debugging
    # print(f'input is: {input}')
    # print(f'output is: {output}')
    # experimenting with adding BOS <s> token manually
    prompt = f"""[INST]Assist a non-verbal autistic individual in communicating their thoughts or needs through selected images.
    Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity.
    - Be empathetic and direct.
    - Look for deeper meanings in the input.
    - Keep the tone practical and straightforward.[/INST]
    input: {input}
    output: {output}"""

    # return {'text': prompt}
    return prompt

In [None]:
# quick sanity check
print(f'tokenizer.bos_token: {tokenizer.bos_token}')
print(f'tokenizer.bos_token_id: {tokenizer.bos_token_id}')
print(f'tokenizer.pad_token: {tokenizer.pad_token}')
print(f'tokenizer.pad_token_id: {tokenizer.pad_token_id}')
print(f'tokenizer.eos_token: {tokenizer.eos_token}')
print(f'tokenizer.eos_token_id: {tokenizer.eos_token_id}')

#### Formatting text dataset (for SFTTrainer)

Doesn't actually need to manually tokenizer, SFTTrainer takes care of this

In [None]:
# Transform each subset of the dataset
# TODO: use Dataset.map() functionality instead for these, if possible
train_texts = [add_text_dataset_prompt(example) for example in dataset['train']]
test_texts = [add_text_dataset_prompt(example) for example in dataset['test']]
validation_texts = [add_text_dataset_prompt(example) for example in dataset['validation']]

# Combine these into a DatasetDict
text_dataset_with_prompt = DatasetDict({
    'train': Dataset.from_dict({"text": train_texts}),
    'test': Dataset.from_dict({"text": test_texts}),
    'validation': Dataset.from_dict({"text": validation_texts})
})

print(text_dataset_with_prompt)

#### Sanity check on text dataset with prompts

In [None]:
# Print out the first few examples from the tokenized dataset
for i in range(3):  # Adjust the range to inspect more examples
    print(f"Example {i}:")
    print("Prompt:", text_dataset_with_prompt['train'][i]['text'])
    print()

#### Sanity check on tokenized dataset with prompts

In [None]:
# Print out the first few examples from the tokenized dataset
for i in range(3):  # Adjust the range to inspect more examples
    print(f"Example {i}:")
    print("Prompt:", tokenizer.encode(text_dataset_with_prompt['train'][i]['text']))
    print()

In [None]:
# quick sanity check
print(f'tokenizer.bos_token: {tokenizer.bos_token}')
print(f'tokenizer.bos_token_id: {tokenizer.bos_token_id}')
print(f'tokenizer.pad_token: {tokenizer.pad_token}')
print(f'tokenizer.pad_token_id: {tokenizer.pad_token_id}')
print(f'tokenizer.eos_token: {tokenizer.eos_token}')
print(f'tokenizer.eos_token_id: {tokenizer.eos_token_id}')

### ChatML Format

##### Tokenizing dataset for ChatML format

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_chat_ml(sample):
    input_str = ''.join(sample['input'])
    output_str = ''.join(sample['output'])
    
    # debugging
    print(f'input_str: {input_str}')
    print(f'output_str: {output_str}')

    # prompt = [
    #     {"role": "system", "content": f"[INST]Assist a non-verbal autistic individual in communicating their thoughts or needs through selected images. Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity. - Be empathetic and direct. - Look for deeper meanings in the input. - Keep the tone practical and straightforward.[/INST]"},
    #     {"role": "user", "content": input_str},
    #     {"role": "assistant", "content": output_str}
    # ]

    # for Mistral 7B Instruct v0.2 specifically, because apparently chat template has no "system" part
    prompt = [
        {"role": "user", "content": f"Assist a non-verbal autistic individual in communicating their thoughts or needs through selected images. Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity. - Be empathetic and direct. - Look for deeper meanings in the input. - Keep the tone practical and straightforward." + input_str},
        {"role": "assistant", "content": output_str}
    ]

    tokenized_input = tokenizer.apply_chat_template(prompt, tokenize=False, return_tensors="pt", add_generation_prompt=False)
    return {"text": tokenized_input}

chat_ml_text_dataset = dataset.map(preprocess_chat_ml, batched=False, remove_columns=dataset['train'].column_names)

#### Sanity check on text dataset with ChatML format

In [None]:
# Print out the first few examples from chat_ml_text_dataset
for i in range(3):
    print(f"Example {i} from chat_ml_text_dataset:")
    print("Text:", chat_ml_text_dataset['train'][i])
    print()

#### Sanity check on tokenized dataset with ChatML format

In [None]:
# Print out the first few examples from the tokenized dataset
for i in range(3):  # Adjust the range to inspect more examples
    print(f"Example {i}:")
    print("Prompt:", tokenizer.encode(text_dataset_with_prompt['train'][i]['text']))
    print()

In [None]:
# quick sanity check
print(f'tokenizer.bos_token: {tokenizer.bos_token}')
print(f'tokenizer.bos_token_id: {tokenizer.bos_token_id}')
print(f'tokenizer.pad_token: {tokenizer.pad_token}')
print(f'tokenizer.pad_token_id: {tokenizer.pad_token_id}')
print(f'tokenizer.eos_token: {tokenizer.eos_token}')
print(f'tokenizer.eos_token_id: {tokenizer.eos_token_id}')

### Gemma Format

##### Tokenizing dataset for Gemma format

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, add_eos_token=True, add_bos_token=True)

if tokenizer.pad_token is None:
    # debugging
    print('pad token is none')
    tokenizer.pad_token = tokenizer.eos_token

def preprocess_chat_ml(sample):
    input_str = ''.join(sample['input'])
    output_str = ''.join(sample['output'])
    
    # debugging
    # print(f'input_str: {input_str}')
    # print(f'output_str: {output_str}')
    
    prompt = [
        {"role": "user", "content": f"Assist a non-verbal autistic individual in communicating their thoughts or needs through selected images. Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity. - Be empathetic and direct. - Look for deeper meanings in the input. - Keep the tone practical and straightforward." + input_str},
        {"role": "assistant", "content": output_str}
    ]

    tokenized_input = tokenizer.apply_chat_template(prompt, tokenize=False, return_tensors="pt", add_generation_prompt=False)
    tokenized_input = tokenized_input[:-1]
    tokenized_input += "<eos>"
    return {"text": tokenized_input}

chat_ml_text_dataset = dataset.map(preprocess_chat_ml, batched=False, remove_columns=dataset['train'].column_names)

Map: 100%|██████████| 422/422 [00:00<00:00, 16709.59 examples/s]
Map: 100%|██████████| 53/53 [00:00<00:00, 12975.61 examples/s]
Map: 100%|██████████| 53/53 [00:00<00:00, 12963.50 examples/s]


#### Sanity check on text dataset with ChatML format

In [11]:
# Print out the first few examples from chat_ml_text_dataset
for i in range(3):
    print(f"Example {i} from chat_ml_text_dataset:")
    print("Text:", chat_ml_text_dataset['train'][i])
    print()

Example 0 from chat_ml_text_dataset:
Text: {'text': "<bos><start_of_turn>user\nAssist a non-verbal autistic individual in communicating their thoughts or needs through selected images. Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity. - Be empathetic and direct. - Look for deeper meanings in the input. - Keep the tone practical and straightforward.sad, hug<end_of_turn>\n<start_of_turn>model\nI'm sad and need a hug.<end_of_turn><eos>"}

Example 1 from chat_ml_text_dataset:
Text: {'text': "<bos><start_of_turn>user\nAssist a non-verbal autistic individual in communicating their thoughts or needs through selected images. Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity. - Be empathetic and direct. - Look for deeper meanings in the input. - Keep the tone practical and straightforward.X, dinner<end_of_turn>\n<start_of_turn>model\nI don't want to go to dinner.<

#### Sanity check on tokenized dataset with ChatML format

In [None]:
# Print out the first few examples from the tokenized dataset
for i in range(3):  # Adjust the range to inspect more examples
    print(f"Example {i}:")
    print("Prompt:", tokenizer.encode(text_dataset_with_prompt['train'][i]['text']))
    print()

In [None]:
# quick sanity check
print(f'tokenizer.bos_token: {tokenizer.bos_token}')
print(f'tokenizer.bos_token_id: {tokenizer.bos_token_id}')
print(f'tokenizer.pad_token: {tokenizer.pad_token}')
print(f'tokenizer.pad_token_id: {tokenizer.pad_token_id}')
print(f'tokenizer.eos_token: {tokenizer.eos_token}')
print(f'tokenizer.eos_token_id: {tokenizer.eos_token_id}')