# Evaluating the fine-tuned model

## Importing packages

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
# from unsloth import FastLanguageModel
# leaving Trainer out for now to use SFTTrainer instead
from trl import SFTTrainer
from datasets import load_dataset, Dataset, DatasetDict
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
import os
import torch


In [None]:
# toggles

# chat: when True the chat-template cell (guarded by `if chat:`) runs; when False
# the raw "[INST] ... [/INST]" prompt cell (guarded by `if not chat:`) runs instead.
chat = False
# chat = True
# prompt_format: compared against "mistral" and "llama" in the cells below to pick
# the tokenizer setup and the chat message layout.
# prompt_format = "llama"
prompt_format = "mistral"
# anton: when True the model-loading cell adds any missing pad/eos/unk tokens to the
# tokenizer and resizes the base model's token embeddings to match.
anton = False

In [None]:
# NOTE: restart the kernel before re-running this cell so that we don't run out
# of memory loading the base model a second time.

# Base checkpoint to load. Previously tried checkpoints are kept for reference.
# model_id = "mistralai/Mistral-7B-v0.1"
# model_id = "filipealmeida/Mistral-7B-v0.1-sharded"
# model_id = "filipealmeida/Mistral-7B-Instruct-v0.1-sharded"
# model_id = "imiraoui/OpenHermes-2.5-Mistral-7B-sharded"
# model_id = "teknium/OpenHermes-2.5-Mistral-7B"
# model_id = "mistralai/Mistral-7B-Instruct-v0.2"
model_id = "mistralai/Mistral-7B-Instruct-v0.3"
# model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
# model_id = "google/gemma-7b-it"
# model_id = "google/gemma-2b-it"

# Fine-tuned LoRA adapter to attach to the base model.
# WORKS
# adapter_path = "./models/ft-mistral-7b-instruct-v02-02-11-24-anton"
# adapter_path = "./models/ft-mistral-7b-instruct-v02-final-anton"
# adapter_path = "./models/ft-mistral-7b-instruct-v02-03-02-24-4-breaking-test-2"
# adapter_path = "./models/ft-gemma-7b-it-03-01-24/"
# adapter_path = "./models/ft-mistral-7b-instruct-v02-03-02-24-4-breaking-test-3-50-epochs"
# adapter_path = "./models/llama-3-8b-instruct-1"
# adapter_path = "./models/llama-3-8b-instruct-2"
# adapter_path = "./models/llama-3-8b-instruct-3"
# TESTING
# adapter_path = "./models/gemma-2b-it-03-02-24"
# adapter_path = "./models/kto-ft-mistral-7b-instruct-v02-1"
# adapter_path = "./models/kto-ft-mistral-7b-instruct-v02-2"
# adapter_path = "./models/kto-ft-mistral-7b-instruct-v02-3"
# adapter_path = "./models/llama-3-8b-instruct-KTO-1"
# adapter_path = "./models/llama-3-8b-instruct-KTO-2"
# adapter_path = "./models/ft-mistral-7b-instruct-v03-04"
adapter_path = "./models/ft-mistral-7b-instruct-v03-10"
# GOOD MODEL TO GO WITH
# adapter_path = "./models/ft-mistral-7b-instruct-v02-03-02-24-3"
# adapter_path = "./models/gemma-2b-it-03-03-24-2"
# adapter_path = "./models/ft-mistral-7b-instruct-v02-final"
# adapter_path = "./models/llama-3-8b-instruct-4"

# Placeholder strings used when the tokenizer lacks a given special token.
DEFAULT_PAD_TOKEN = "<|pad|>"
DEFAULT_EOS_TOKEN = "<|endoftext|>"
DEFAULT_UNK_TOKEN = "<|unk|>"


def _add_missing_special_tokens(tokenizer, model, defaults):
    """Add the special tokens from `defaults` that `tokenizer` lacks, then resize `model`.

    Args:
        tokenizer: tokenizer whose special-token attributes are inspected and extended.
        model: model whose token embeddings are resized to `len(tokenizer)`.
        defaults: mapping of tokenizer attribute name (e.g. "pad_token") to the
            placeholder string to register when that attribute is None.
    """
    missing = {
        attr: token
        for attr, token in defaults.items()
        if getattr(tokenizer, attr) is None
    }
    if missing:
        tokenizer.add_special_tokens(missing)
    # Resize even when nothing was added, so the embedding matrix always matches the tokenizer.
    model.resize_token_embeddings(len(tokenizer))


# Load the base model in 4-bit NF4 with float16 compute.
bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_quant_type="nf4")
# bnb_config = BitsAndBytesConfig(load_in_8bit=True)
base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto")

# The adapter can only be attached if the base model's embedding matrix has the
# size the tokenizer implies, so extend the vocabulary before loading the adapter.
if anton or prompt_format == "llama":
    # Load the tokenizer once, even when both conditions hold.
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, add_bos_token=True)

    if anton:
        # For anton adapters, resize the model's token embeddings for the added
        # special-token vocabulary.
        _add_missing_special_tokens(
            tokenizer,
            base_model,
            {
                "pad_token": DEFAULT_PAD_TOKEN,
                "eos_token": DEFAULT_EOS_TOKEN,
                "unk_token": DEFAULT_UNK_TOKEN,
            },
        )

    if prompt_format == "llama" and tokenizer.pad_token is None:
        _add_missing_special_tokens(
            tokenizer,
            base_model,
            {
                "pad_token": DEFAULT_PAD_TOKEN,
                "unk_token": DEFAULT_UNK_TOKEN,
            },
        )

# Attach the fine-tuned LoRA adapter to the base model.
ft_model = PeftModel.from_pretrained(base_model, adapter_path)
# No explicit `.to("cuda")` here: `device_map="auto"` has already placed the
# quantized base model, so moving it again is redundant.
# NOTE(review): it may also fail when accelerate offloaded layers - confirm on the target machine.

### Evaluating for ChatML formatted prompt

In [None]:
if chat:
    # Build the conversation for the selected prompt layout.
    # The conversation deliberately ends on the last non-assistant turn:
    # `add_generation_prompt=True` below asks the template to open the assistant
    # turn itself. A trailing empty assistant message would be rendered as a
    # finished turn, so generation would start after the turn was already closed.
    if prompt_format == "mistral":
        # for mistral: instructions and input go into one user message
        ft_model_input = [
            {"role": "user", "content": """Assist a non-verbal autistic individual in communicating their thoughts or needs through selected words.
             Your task: infer and articulate the message in first-person, i.e. using I and pretending you are the user.
             - Be concise and only provide the answer to the following input.
             - Look for deeper meanings in the input.
             - Keep the tone practical and straightforward.
                input: fins, mask, snorkel, scuba""",
            },
        ]
    elif prompt_format == "llama":
        # for llama: instructions go into the system message, the input into the user message
        ft_model_input = [
            {"role": "system", "content": """
             Assist a non-verbal autistic individual in communicating their thoughts or needs through selected images.
             Your task: infer and articulate the message in first-person, i.e. using I and pretending you are the user.
             - Be concise and only provide the answer to the following input.
             - Be empathetic and direct.
             - Look for deeper meanings in the input.
             - Keep the tone practical and straightforward."""},
            {"role": "user", "content": "fins, mask, snorkel"},
        ]
    else:
        # Fail with a clear message instead of a NameError on `ft_model_input` below.
        raise ValueError(f"Unsupported prompt_format for chat evaluation: {prompt_format!r}")

    # Re-init the tokenizer for inference; mistral pads on the left, llama on the right.
    padding_side = "left" if prompt_format == "mistral" else "right"
    ft_tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, padding_side=padding_side)

    if ft_tokenizer.pad_token is None:
        # Reuse the unk token for padding; fall back to eos when the tokenizer
        # has no unk token, so `pad_token_id` below is never None.
        ft_tokenizer.pad_token = ft_tokenizer.unk_token or ft_tokenizer.eos_token

    # Render the conversation through the tokenizer's chat template and tokenize it.
    ft_model_input_tokenized = ft_tokenizer.apply_chat_template(
        ft_model_input,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")

    # print(f'ft_model_input_tokenized: {ft_model_input_tokenized}')

    generated_output = ft_model.generate(
        ft_model_input_tokenized,
        max_new_tokens=500,
        pad_token_id=ft_tokenizer.pad_token_id
    )

    # The decoded text contains the prompt followed by the model's continuation.
    ft_model_output = ft_tokenizer.decode(generated_output[0], skip_special_tokens=True)
    print(ft_model_output)


### Evaluating for non-ChatML formatted prompt

In [None]:
if not chat:
    # Hand-written Mistral-style instruction prompt; the model continues after "### Output:".
    # The indentation inside the literal is part of the prompt and must not be changed.
    ft_model_input = """[INST] Assist a non-verbal autistic individual in communicating their thoughts or needs through selected text input.
    Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity. Be concise. 
    Only give the output for the input provided. Do not come up with new inputs after. Assume the user is trying to communicate with someone.
    Usually those are wants, desires, needs, etc.

    ### Input: scuba, fins, mask, snorkel, store [/INST]
    ### Output:"""

    # Re-init the tokenizer for inference, padding on the left.
    ft_tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, padding_side="left")

    if ft_tokenizer.pad_token is None:
        # Reuse the unk token for padding; fall back to eos when the tokenizer
        # has no unk token, so `pad_token_id` below is never None.
        ft_tokenizer.pad_token = ft_tokenizer.unk_token or ft_tokenizer.eos_token

    # debugging
    # print(f'tokenizer.pad_token: {ft_tokenizer.pad_token}')
    # print(f'tokenizer.eos_token: {ft_tokenizer.eos_token}')
    # print(f'tokenizer.unk_token: {ft_tokenizer.unk_token}')

    ft_model_input_tokenized = ft_tokenizer(ft_model_input, return_tensors="pt").to("cuda")

    ft_model.eval()

    # Greedy decoding (do_sample=False) so repeated runs are comparable.
    # `output_scores` is intentionally not requested: it only has an effect together
    # with `return_dict_in_generate=True`, and the result below is indexed as a tensor.
    generated_output = ft_model.generate(
        **ft_model_input_tokenized,
        max_new_tokens=100,
        # Use the pad token configured above, matching the chat-template cell.
        pad_token_id=ft_tokenizer.pad_token_id,
        do_sample=False,
    )
    # Special tokens are kept in the decoded text so BOS/EOS placement is visible.
    ft_model_output = ft_tokenizer.decode(generated_output[0], skip_special_tokens=False)
    print(ft_model_output)

### Evaluating Base Model to compare

In [None]:
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments, Trainer
# # from unsloth import FastLanguageModel
# # leaving Trainer out for now to use SFTTrainer instead
# from trl import SFTTrainer
# from datasets import load_dataset, Dataset, DatasetDict
# from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel
# import os
# import torch

# model_type = "mistral"

# if model_type == "chat_ml":
#     model_id = "google/gemma-2b-it"
# else:
#     model_id = "mistralai/Mistral-7B-Instruct-v0.2"
# bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16, bnb_4bit_quant_type="nf4")
# base_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map='auto')
# if model_type == "chat_ml":
#     base_tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, add_bos_token=True)
#     base_model_input = [
#         {
#             "role": "user", "content": """Assist a non-verbal autistic individual in communicating their thoughts or needs through selected images.
#             Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity.
#             - Be empathetic and direct.
#             - Look for deeper meanings in the input.
#             - Keep the tone practical and straightforward.
#             Only give the output for the input provided. Do not come up with new inputs after.
#             input: fins, mask, snorkel""",
#         },
#         # { "role": "assistant", "content": ""}
#         { "role": "model", "content": ""}
#     ]
#     base_model_input_tokenized = base_tokenizer.apply_chat_template(base_model_input, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
#     generated_output = base_model.generate(
#         base_model_input_tokenized,
#         max_new_tokens=200,
#         pad_token_id=base_tokenizer.pad_token_id
#     )
# else:
#     base_tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, padding_side="left")
#     if base_tokenizer.pad_token is None:
#         base_tokenizer.pad_token = base_tokenizer.unk_token
#     base_model_input = f"""[INST] Assist a non-verbal autistic individual in communicating their thoughts or needs through selected text input.
#     Your task: infer and articulate the message in first-person, using simple, direct language with empathy and clarity. Be concise. 
#     Only give the output for the input provided. Do not come up with new inputs after. 

#     ### Input: Porter Robinson, Worlds, Nurture, Look at the Sky [/INST]
#     ### Output:"""
#     base_model_input_tokenized = base_tokenizer(base_model_input, return_tensors="pt").to("cuda")
#     # print(f'base_model_input_tokenized: {base_model_input_tokenized}')
#     # base_model.eval()
#     generated_output = base_model.generate(
#         **base_model_input_tokenized,
#         max_new_tokens=200,
#         pad_token_id=base_tokenizer.pad_token_id
#     )

# base_model_output = base_tokenizer.decode(generated_output[0], skip_special_tokens=False)
# print(base_model_output)

# Using the functions from eval_ft.py

In [None]:
from eval_ft import load_model, generate_tokenizer, merge_model, format_prompt, generate_output, read_user_inputs, eval_base_model
import random

# ---- Run configuration ----
# Both flags are forwarded unchanged to the eval_ft helpers below.
unsloth = True
anton = False

# Presets for other models, kept for reference.
# gemma 2b
# model_id = "unsloth/gemma-2b-it-bnb-4bit"
# model_path = "./models/gemma-2b-it-06-13-24-unsloth-1"
# prompt_format = "gemma"
# chat = True

# phi 3
# model_id = "unsloth/phi-3-instruct-v0.3-bnb-4bit"
# model_path = "./models/phi-3-06-29-24-unsloth-1"
# prompt_format = "chat_ml"
# chat = True

# mistral (active preset)
model_id = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit"
model_path = "./models/ft-mistral-7b-instruct-v03-10"
prompt_format = "mistral"
chat = False

# Input source: a file of inputs to sample from, or a single literal input.
# input_file_path = "./data/processed_dataset_full.jsonl"
input_file_path = None
num_inputs = 1
user_input = "fish, camp, pee"

# ---- Model setup ----
base_model = load_model(model_id, unsloth)
ft_tokenizer = generate_tokenizer(model_id, base_model, anton, prompt_format, unsloth)
ft_model = merge_model(base_model, model_path, unsloth)

# ---- Evaluation ----
# Nothing to evaluate when neither an input file nor a literal input is set.
if not (input_file_path or user_input):
    raise ValueError('No input or input file path provided to evaluate model.')

if input_file_path:
    # Draw `num_inputs` inputs at random (with replacement) and run each one.
    user_inputs = read_user_inputs(input_file_path)
    for _ in range(num_inputs):
        user_input = user_inputs[random.randrange(len(user_inputs))]
        prompt = format_prompt(user_input, chat, prompt_format)
        generate_output(ft_tokenizer, ft_model, prompt, chat, prompt_format)
else:
    # Single literal input.
    prompt = format_prompt(user_input, chat, prompt_format)
    generate_output(ft_tokenizer, ft_model, prompt, chat, prompt_format)

# if eval_base_model:
#     eval_base_model()