In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from datasets import load_dataset, Dataset, DatasetDict

  from .autonotebook import tqdm as notebook_tqdm


# Load in Dataset

In [3]:
dataset = load_dataset('json', data_files='processed_dataset.jsonl')

train_test_valid = dataset['train'].train_test_split(test_size=0.2)
# Split the 10% test + valid in half test, half valid
test_valid = train_test_valid['test'].train_test_split(test_size=0.5)
# gather everyone if you want to have a single DatasetDict
dataset = DatasetDict({
    'train': train_test_valid['train'],
    'test': test_valid['test'],
    'validation': test_valid['train']})

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 223
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 28
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 28
    })
})


In [4]:
model_path = "./Mistral-7B-Instruct-v0.2/"

# Load in Tokenizer and use it to apply chat template

In [5]:
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

def preprocess_chat_ml(sample):
    input_str = ''.join(sample['input'])
    output_str = ''.join(sample['output'])
    
    # debugging
    # print(f'input_str: {input_str}')
    # print(f'output_str: {output_str}')

    # for Mistral 7B Instruct v0.2 specifically, because apparently chat template has no "system" part
    prompt = [
        {"role": "user", "content": f"Assist a non-verbal autistic individual in communicating their thoughts or needs through selected images. Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity. - Be empathetic and direct. - Look for deeper meanings in the input. - Keep the tone practical and straight forward." + input_str},
        {"role": "assistant", "content": output_str}
    ]

    tokenized_input = tokenizer.apply_chat_template(prompt, tokenize=False, return_tensors="pt", add_generation_prompt=False)
    return {"text": tokenized_input}

chat_ml_text_dataset = dataset.map(preprocess_chat_ml, batched=False, remove_columns=dataset['train'].column_names)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map:   0%|          | 0/223 [00:00<?, ? examples/s]
No chat template is defined for this tokenizer - using the default template for the LlamaTokenizer class. If the default is not appropriate for your model, please set `tokenizer.chat_template` to an appropriate template. See https://huggingface.co/docs/transformers/main/chat_templating for more information.

Map: 100%|██████████| 223/223 [00:00<00:00, 6306.20 examples/s]


input_str: sad, music
output_str: I want to listen to sad music
input_str: tree, climb
output_str: I want to go on the tree house.
input_str: stars, fascinating
output_str: Stars fascinate me.
input_str: bubble_bath, relaxing
output_str: I want to relax, can I take a bubble bath?
input_str: hot, water
output_str: I am feeling hot and need some water.
input_str: hungry, pizza
output_str: I want pizza for lunch.
input_str: sleepy, story
output_str: I want to hear a bedtime story please.
input_str: scared, dark
output_str: I am scared of the dark.
input_str: sunshine, warm
output_str: The sunshine makes me feel warm and happy.
input_str: swimming, relaxed
output_str: Swimming makes me feel relaxed.
input_str: Dog, Bird, Water_fountain
output_str: I want to go to a place where I can see a dog, a bird, and a water fountain.
input_str: coffee, drink, morning
output_str: You want to drink coffee in the morning.
input_str: garden, relaxing
output_str: The garden is relaxing
input_str: bird_son

Map: 100%|██████████| 28/28 [00:00<00:00, 4159.84 examples/s]


input_str: swim, pool
output_str: Can we go to the pool? I want to swim.
input_str: wind_chimes, melodic
output_str: The wind chimes sound melodic.
input_str: colors, exciting
output_str: Bright colors are exciting to me.
input_str: Me, Volleyball, Weightlifting
output_str: I like playing volleyball and lifting weights to stay fit.
input_str: movie, happy
output_str: Watching movies makes me really happy.
input_str: loud, headphones
output_str: The noise is too loud. I need my headphones.
input_str: book, story
output_str: I want to hear a story from the book.
input_str: sunshine, warm
output_str: It's too warm, can we get out of the sunshine?
input_str: crowded, uncomfortable
output_str: I feel uncomfortable in crowded places.
input_str: Listen_to_music, Sun
output_str: I feel happy listening to music in the sunshine.
input_str: cooking_class
output_str: I want to cook something.
input_str: bike, ride
output_str: I want to go for a bike ride, please.
input_str: tired, nap
output_str: 

Map: 100%|██████████| 28/28 [00:00<00:00, 5651.88 examples/s]


input_str: fireflies, magical
output_str: These fireflies are so magical!
input_str: lunch, eat, restaurant
output_str: You want to eat lunch at a restaurant.
input_str: train, play
output_str: I want to play with my train set.
input_str: bored, game
output_str: I am bored. Let's play a game!
input_str: cake, bake, birthday
output_str: You want to bake a cake for a birthday.
input_str: listen_to_music, headphones
output_str: I want to listen to music with my headphones on.
input_str: sleepy, bed
output_str: I am feeling sleepy and want to go to bed.
input_str: swimming, pool
output_str: I want to swim in the pool.
input_str: Dog, Moon, Pancakes
output_str: "I want to eat pancakes while looking at the moon with a dog."
input_str: library, quiet
output_str: I like the library because it's quiet.
input_str: rainbow, beautiful
output_str: I think rainbows are beautiful.
input_str: dog, walk
output_str: The dog needs to go for a walk.
input_str: bored, game
output_str: I am bored of playing

Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 416.31ba/s]
Saving the dataset (1/1 shards): 100%|██████████| 223/223 [00:00<00:00, 106190.94 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1125.99ba/s]
Saving the dataset (1/1 shards): 100%|██████████| 28/28 [00:00<00:00, 15477.14 examples/s]
Creating json from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1004.86ba/s]
Saving the dataset (1/1 shards): 100%|██████████| 28/28 [00:00<00:00, 16971.17 examples/s]


# Save datasets to disk as .jsonl

In [None]:
train_file = "train.jsonl"
test_file = "test.jsonl"
valid_file = "validation.jsonl"

for split, dataset in chat_ml_text_dataset.items():
    dataset.to_json(f"data/{split}.jsonl")
    # dataset.save_to_disk(f"{split}.jsonl")

# Sanity check

In [6]:
# Print out the first few examples from chat_ml_text_dataset
for i in range(3):
    print(f"Example {i} from chat_ml_text_dataset:")
    print("Text:", chat_ml_text_dataset['train'][i])
    print()

Example 0 from chat_ml_text_dataset:
Text: {'text': '<s>[INST] Assist a non-verbal autistic individual in communicating their thoughts or needs through selected images. Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity. - Be empathetic and direct. - Look for deeper meanings in the input. - Keep the tone practical and straight forward.sad, music [/INST] I want to listen to sad music </s>'}

Example 1 from chat_ml_text_dataset:
Text: {'text': '<s>[INST] Assist a non-verbal autistic individual in communicating their thoughts or needs through selected images. Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity. - Be empathetic and direct. - Look for deeper meanings in the input. - Keep the tone practical and straight forward.tree, climb [/INST] I want to go on the tree house. </s>'}

Example 2 from chat_ml_text_dataset:
Text: {'text': '<s>[INST] Assist a non-ver