In [3]:
# https://www.databricks.com/blog/efficient-fine-tuning-lora-guide-llms
import pandas as pd
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


## Data Loading and Preprocessing
Convert the similes CSV into an instruction-format dataset for fine-tuning.

In [55]:
# Read the similes CSV through the `datasets` CSV builder, then materialise
# its 'train' split as a pandas DataFrame for the column work that follows.
similes_ds = load_dataset(
    path='csv',
    data_files='./lora_data/textfx/data/similes.csv',
)
similes_df = pd.DataFrame(data=similes_ds['train'])

In [56]:
# Preview the first rows of the freshly loaded frame.
similes_df.head()

Unnamed: 0,input,output
0,Academic Advisor,"A good academic advisor is like a lighthouse, ..."
1,Academic Advisor,"An academic advisor is like a tour guide, lead..."
2,Academic Advisor,"He was like a lighthouse, guiding me through t..."
3,Academic Advisor,"My academic advisor was like a lighthouse, gui..."
4,Academic Advisor,"My academic advisor was like a lighthouse, gui..."


In [57]:
# Turn each concept in the `input` column into an instruction string.
# https://textfx.withgoogle.com/
#
# The frame is rebuilt from the loaded split each time so this cell can be
# re-run safely: previously it read `similes_df['input']` and then rebound
# `similes_df` to a frame without that column, so a second run raised KeyError.
similes_raw = pd.DataFrame(similes_ds['train'])
similes_df = similes_raw.assign(
    instruction='Create a simile that illustrates this concept: ' + similes_raw['input']
)[['instruction', 'output']]

similes_df.head()

Unnamed: 0,instruction,output
0,Create a simile that illustrates this concept:...,"A good academic advisor is like a lighthouse, ..."
1,Create a simile that illustrates this concept:...,"An academic advisor is like a tour guide, lead..."
2,Create a simile that illustrates this concept:...,"He was like a lighthouse, guiding me through t..."
3,Create a simile that illustrates this concept:...,"My academic advisor was like a lighthouse, gui..."
4,Create a simile that illustrates this concept:...,"My academic advisor was like a lighthouse, gui..."


In [58]:
# Work on a small, reproducible subset while iterating on the pipeline.
similes_df_sample = similes_df.sample(n=20, random_state=42) # TODO: change for full fine-tuning

# Prompt template with exactly ONE positional placeholder (the instruction).
# The earlier version had two `{}` fields but was formatted with a single
# argument, which makes `str.format` raise IndexError.
instruction_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:

{}

### Response:\n"""

# Build the new columns without in-place mutation:
#   prompt   - the template filled with each row's instruction
#   response - the original `output` column under its training-time name
similes_df_sample = (
    similes_df_sample
    .assign(prompt=similes_df_sample["instruction"].map(instruction_template.format))
    .rename(columns={'output': 'response'})
)

In [59]:
# Preview the sampled rows together with their generated prompt column.
similes_df_sample.head()

Unnamed: 0,instruction,response,prompt
1752,Create a simile that illustrates this concept:...,She held her principles like a lighthouse in a...,Below is an instruction that describes a task....
748,Create a simile that illustrates this concept:...,"Family dinner was like a three-ring circus, wi...",Below is an instruction that describes a task....
194,Create a simile that illustrates this concept:...,The building rose up from the ground like a ph...,Below is an instruction that describes a task....
1099,Create a simile that illustrates this concept:...,"An idea is like a seed, planted in the fertile...",Below is an instruction that describes a task....
1178,Create a simile that illustrates this concept:...,Karaoke night is like an episode of American I...,Below is an instruction that describes a task....


In [60]:
# Collapse each row into a single training string:
#   prompt + response + explicit "### End" marker.
# The one-column frame is built in a single step instead of adding a column to
# a column-subset slice and dropping in place, which can trigger pandas'
# SettingWithCopyWarning. The row index of the sample is preserved.
similes_df_sample = pd.DataFrame({
    'text': similes_df_sample['prompt'] + similes_df_sample['response'] + "\n### End"
})

In [63]:
# Preview the final single-column (`text`) training frame.
similes_df_sample.head()

Unnamed: 0,text
1752,Below is an instruction that describes a task....
748,Below is an instruction that describes a task....
194,Below is an instruction that describes a task....
1099,Below is an instruction that describes a task....
1178,Below is an instruction that describes a task....


## Train LoRA on Mistral-7B-Instruct

In [64]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Use the same checkpoint revision (v0.2) as the model loaded in the next cell,
# so tokenizer and weights always come from one release.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")

In [66]:
# Load weights and tokenizer from one Hub checkpoint id so the two stay in sync.
checkpoint = "mistralai/Mistral-7B-Instruct-v0.2"
model = AutoModelForCausalLM.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Short multi-turn conversation used to check the tokenizer's chat template.
messages = [
    dict(role="user", content="What is your favourite condiment?"),
    dict(
        role="assistant",
        content="Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!",
    ),
    dict(role="user", content="Do you have mayonnaise recipes?"),
]

# Encode the conversation with the chat template and show the resulting token ids.
encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt")
print(encodeds)

model-00001-of-00003.safetensors:  17%|█▋        | 818M/4.94G [01:52<09:27, 7.27MB/s]
Downloading shards:   0%|          | 0/3 [01:53<?, ?it/s]


KeyboardInterrupt: 