In [1]:
"""
    check if gsm8k scores remain the same after the stopping criteria fix
    check what batch outputs
"""

# Autocompletion: disable jedi-based completion in the IPython completer
%config Completer.use_jedi = False

# Autoreload: reload imported modules automatically before executing each cell
%load_ext autoreload
%autoreload 2

import sys

# NOTE(review): hard-coded absolute path to a local llmsearch checkout; this only
# works on a machine where the repository lives at /workspace/llmsearch/.
sys.path.append('/workspace/llmsearch/')

import awq
import torch
import transformers

# Importing llmsearch prints a "Monkey Patching .generate function" message
# (visible in this cell's saved output).
import llmsearch

# Record the library versions used for this run in the cell output
print(awq.__version__, torch.__version__, transformers.__version__, llmsearch.__version__)

Monkey Patching .generate function of `transformers` library
0.2.4 2.2.0+cu121 4.38.2 0.1.0


In [2]:
import re
import textwrap
from pathlib import Path

import datasets

from awq import AutoAWQForCausalLM
from sklearn.model_selection import GridSearchCV
from transformers import StoppingCriteriaList, AutoTokenizer

from llmsearch.tuner import Tuner
from llmsearch.utils.mem_utils import gc_cuda
from llmsearch.utils.common_utils import json_load, json_dump
from llmsearch.utils.model_downloader import download_model_from_hf
from llmsearch.scripts.stopping_criteria import MultiTokenStoppingCriteria

In [3]:
# Run configuration shared by the cells below.
seed = 42  # used for dataset shuffling and passed to the Tuner
batch_size = 1  # batch size of the first (non-batched) Tuner
num_tune_samples = 150  # rows taken from the shuffled train split for scoring
num_test_samples = 500  # rows taken right after the tuning rows as a held-out set
model_id = "TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ"  # Hugging Face model id to download
device = "cuda:0"  # device the quantized model is mapped to

In [4]:
def load_model_and_tokenizer(model_id, temp_model_dir):
    """Download an AWQ-quantized model and load it together with its tokenizer.

    Reads the notebook-level ``device`` to decide where the model is placed.

    Args:
        model_id: Hugging Face model id passed to ``download_model_from_hf``.
        temp_model_dir: ``pathlib.Path`` directory the model files are saved under;
            created if it does not exist.

    Returns:
        tuple: ``(model, tokenizer)``. The tokenizer uses the unk token as pad token
        and pads on the left.
    """
    temp_model_dir.mkdir(parents=True, exist_ok=True)
    local_model_path = download_model_from_hf(
        model_id, save_dir=temp_model_dir, branch="main"
    )

    # Called before loading the model; presumably frees cached CUDA memory - confirm in mem_utils
    gc_cuda()

    awq_load_kwargs = dict(
        quant_path=local_model_path,
        fuse_layers=True,
        device_map={"": device},
        local_files_only=True,
    )
    model = AutoAWQForCausalLM.from_quantized(**awq_load_kwargs)

    tokenizer_load_kwargs = dict(local_files_only=True, legacy=False, use_fast=False)
    tokenizer = AutoTokenizer.from_pretrained(local_model_path, **tokenizer_load_kwargs)

    # Reuse the unk token for padding and pad on the left side of each prompt
    tokenizer.pad_token = tokenizer.unk_token
    tokenizer.padding_side = "left"

    return model, tokenizer

def load_dataset():
    """Build the tuning and test subsets of the GSM8K train split as chat prompts.

    Reads the notebook-level ``tokenizer``, ``seed``, ``num_tune_samples`` and
    ``num_test_samples``.

    Returns:
        tuple: ``(samples_to_tune_on, test_dataset)`` - the first
        ``num_tune_samples`` rows of the shuffled, preprocessed train split and the
        ``num_test_samples`` rows that follow them.
    """

    def preprocess_dataset(
        dataset, tokenizer, pt, pt_cols, system_prompt, add_generation_prompt=True
    ):
        """Map ``dataset`` to add the columns "X" and "actual input"."""

        def wrapper(sample):
            """Fill the prompt template from ``sample`` and render it with the chat template."""
            user_message = {
                "role": "user",
                "content": pt.format(**{col: sample[col] for col in pt_cols}),
            }
            if system_prompt is None:
                messages = [user_message]
            else:
                messages = [{"role": "system", "content": system_prompt}, user_message]
            return tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=add_generation_prompt
            )

        def actual_input(sample):
            """Return the raw value of the first prompt column, without any templating."""
            return sample[pt_cols[0]]

        def add_prompt_columns(sample):
            """Produce the new columns for one row."""
            return {
                "X": wrapper(sample),
                "actual input": actual_input(sample),
            }

        return dataset.map(add_prompt_columns)

    system_prompt = "Solve the following math problems, end with The answer is"
    pt_cols = ["question"]
    # 2-shot prompt template - https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
    pt = textwrap.dedent(
    """\
    Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
    A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

    Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
    A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

    Q: {question}"""
    )

    train_split = datasets.load_dataset("gsm8k", "main")["train"]
    shuffled = preprocess_dataset(
        train_split,
        tokenizer,
        pt=pt,
        pt_cols=pt_cols,
        system_prompt=system_prompt,
        add_generation_prompt=True,
    ).shuffle(seed=seed)

    # Tuning rows come first; the test rows are the block immediately after them
    tune_stop = num_tune_samples
    test_stop = num_tune_samples + num_test_samples
    samples_to_tune_on = shuffled.select(range(tune_stop))
    test_dataset = shuffled.select(range(tune_stop, test_stop))
    return samples_to_tune_on, test_dataset

def get_score(y_true, y_pred):
    """Compute exact-match accuracy of predicted final answers.

    Args:
        y_true: iterable of mappings with an "answer" key; the gold answer is the
            text after the last "####" in that value.
        y_pred: iterable of generated strings; the predicted answer is taken from
            the first "The answer is <number>" occurrence in each string.

    Returns:
        float: fraction of pairs whose standardized answers are equal. Returns
        0.0 when there are no pairs to score (instead of raising
        ZeroDivisionError).
    """
    # Compiled once per call rather than once per prediction
    answer_pattern = re.compile(r"The answer is ((\d|\-)((\d|\,|\.)+)?\d?)")

    def standardize(s):
        """Drop thousands separators and one trailing period; pass None through."""
        if s is None:
            return s
        s = s.replace(",", "")
        if s.endswith("."):
            s = s[:-1]
        return s.strip()

    def extract_answer_from_out(s):
        """Return the number following "The answer is", or None if absent."""
        match = answer_pattern.search(s)
        return match.group(1).strip() if match else None

    scores = []
    for y_t, y_p in zip(y_true, y_pred):
        y_t_answer = standardize(y_t["answer"].split("####")[-1].strip())
        y_p_answer = standardize(extract_answer_from_out(y_p))
        # A prediction without an extractable answer is None and never matches
        scores.append(1 if y_t_answer == y_p_answer else 0)

    if not scores:
        # Nothing to score: avoid dividing by zero
        return 0.0
    return sum(scores) / len(scores)


In [5]:
from pathlib import Path

# Load Model, Tokenizer, Dataset
# Plain string literal: the original f-string had no placeholders
temp_model_dir = Path("./temp_dir/")
temp_model_dir.mkdir(exist_ok=True, parents=True)

model, tokenizer = load_model_and_tokenizer(model_id, temp_model_dir)

# Dataset we will use to find the best generation parameters
samples_to_tune_on, test_dataset = load_dataset()

# Stop generation on token id 32000.
# NOTE(review): 32000 is presumably the end-of-turn token id of this tokenizer - confirm
multi_token_stop_criteria_ob = MultiTokenStoppingCriteria(sequence_ids=[32000])
stopping_criteria = StoppingCriteriaList([multi_token_stop_criteria_ob])
# The criteria object's reset method is handed to the Tuner as a post-inference callback
callbacks_after_inference = [multi_token_stop_criteria_ob.reset]

Model already exists in temp_dir/TheBloke_CapybaraHermes-2.5-Mistral-7B-AWQ. Checking the model files...
Checksum validated: model.safetensors  645dfc7f09074aaf25e642f3c6a4f7ea399a0ff2605fa650e4e74078832546de
Checksum validated: tokenizer.model  dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
[+] Validated checksums of all model files!


Replacing layers...: 100%|██████████| 32/32 [00:03<00:00,  9.21it/s]
Fusing layers...: 100%|██████████| 32/32 [00:02<00:00, 14.23it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using the latest cached version of the dataset since gsm8k couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'main' at /root/.cache/huggingface/datasets/gsm8k/main/0.0.0/e53f048856ff4f594e959d75785d2c2d37b678ee (last modified on Fri May 24 19:05:38 2024).


In [8]:
# Tuner over the full tuning subset, using the batch size from the config cell.
tuner_ob = Tuner(
    model=model,
    tokenizer=tokenizer,
    dataset=samples_to_tune_on,
    # Use the config constant the model was loaded with instead of repeating "cuda:0"
    device=device,
    batch_size=batch_size,
    tokenizer_encode_args={"padding": "longest", "add_special_tokens": False},
    tokenizer_decode_args={"spaces_between_special_tokens": False, 'skip_special_tokens' : True},
    scorer=get_score,
    prompt_template="{X}",
    seed=seed,
    column_mapping={"input_cols": ["X"], "eval_cols": ["answer"]},
    callbacks_after_inference=callbacks_after_inference,
)

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [9]:
# Baseline generation settings: only a token budget, the stopping criteria and a
# generation seed are set; no sampling options are given.
gen_params1 = {
    "max_new_tokens": 500,
    "stopping_criteria": stopping_criteria,
    "generation_seed": 42,
}

# Score the tuner's dataset with these settings; returns the score and the outputs
scores_before, outputs_before = tuner_ob.get_score(gen_params1)

  0%|          | 0/150 [00:00<?, ?it/s]

In [10]:
# Score obtained with the baseline generation settings
print("Before tuning - ", scores_before)

Before tuning -  0.72


In [11]:
# Sampling-based generation settings.
# NOTE(review): these values are hard-coded here; presumably they come from an
# earlier hyperparameter search - confirm.
best_params = {
    'do_sample' : True,
    'top_k' : 10,
    'top_p' : 0.8,

    'generation_seed' : 42,
    'max_new_tokens' : 500,
    'no_repeat_ngram_size' : 0,
    'stopping_criteria' : stopping_criteria,
}

# Score the same dataset with the sampling settings
scores_after, outputs_after = tuner_ob.get_score(best_params)

print("After tuning - ", scores_after)

  0%|          | 0/150 [00:00<?, ?it/s]

After tuning -  0.7266666666666667


In [16]:
# Drop the reference to the first tuner before a new one is built below
del tuner_ob

In [20]:
# TODO
# clear memory and do a batch run - done, worked fine
# understand what the other sequences in the batch produce once the stopping criteria is reached -
# after the eos token is reached, the other sequences in the batch keep producing the <unk> token (id 0), which is cleaned up by skip_special_tokens

from llmsearch.utils.mem_utils import gc_cuda

# Called before building the batched tuner; presumably frees cached CUDA memory - confirm in mem_utils
gc_cuda()

# Batched tuner: the first 4 tuning samples processed as a single batch of 4.
tuner_ob = Tuner(
    model=model,
    tokenizer=tokenizer,
    dataset=datasets.Dataset.from_dict(samples_to_tune_on[:4]),
    # Use the config constant the model was loaded with instead of repeating "cuda:0"
    device=device,
    batch_size=4,
    tokenizer_encode_args={"padding": "longest", "add_special_tokens": False},
    tokenizer_decode_args={"spaces_between_special_tokens": False, 'skip_special_tokens' : True},
    scorer=get_score,
    prompt_template="{X}",
    seed=seed,
    column_mapping={"input_cols": ["X"], "eval_cols": ["answer"]},
    callbacks_after_inference=callbacks_after_inference,
)

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [21]:
# Same baseline settings as the earlier run, now applied to the batched tuner.
# NOTE: this reassigns gen_params1, scores_before and outputs_before, so the
# values from the earlier baseline run are no longer available after this cell.
gen_params1 = {
    "max_new_tokens": 500,
    "stopping_criteria": stopping_criteria,
    "generation_seed": 42,
}

scores_before, outputs_before = tuner_ob.get_score(gen_params1)

  0%|          | 0/1 [00:00<?, ?it/s]

Currrent token idx - 4
Current batch idx - 0
Done tracker - tensor([False], device='cuda:0')
Current token - 28741
Current batch idx - 1
Done tracker - tensor([False], device='cuda:0')
Current token - 28741
Current batch idx - 2
Done tracker - tensor([False], device='cuda:0')
Current token - 28741
Current batch idx - 3
Done tracker - tensor([False], device='cuda:0')
Current token - 28741


Currrent token idx - 4
Current batch idx - 0
Done tracker - tensor([False], device='cuda:0')
Current token - 28747
Current batch idx - 1
Done tracker - tensor([False], device='cuda:0')
Current token - 28747
Current batch idx - 2
Done tracker - tensor([False], device='cuda:0')
Current token - 28747
Current batch idx - 3
Done tracker - tensor([False], device='cuda:0')
Current token - 28747


Currrent token idx - 4
Current batch idx - 0
Done tracker - tensor([False], device='cuda:0')
Current token - 351
Current batch idx - 1
Done tracker - tensor([False], device='cuda:0')
Current token - 3169
Current ba

In [22]:
# Look up the text for a single token id (the saved output shows '.')
tokenizer.decode([28723])

'.'

In [23]:
# Display the generations returned by the batched run
outputs_before

['A: Mimi picked up 2 dozen seashells, which is 2 * 12 = 24 seashells. Kyle found twice as many shells as Mimi, so he found 24 * 2 = 48 seashells. Leigh grabbed one-third of the shells that Kyle found, so she grabbed 48 / 3 = 16 seashells. The answer is 16.',
 "A: Let's break down the information given:\n\n1. Frankie has x number of cats.\n2. Frankie has x + 6 number of snakes.\n3. Frankie has x - 1 number of parrots.\n4. Frankie has 6 pets with 4 legs (which can be any kind of pets).\n5. Frankie has 2 dogs.\n\nNow, we can use this information to find out the total number of pets:\n\nFrom 4, we know that 6 pets have 4 legs, which means that these 6 pets are the 4-legged pets. So, the remaining pets must be the 4-legged pets (cats and snakes). Since there are 2 dogs, there must be 2 cats as well (because Frankie has 6 snakes and 2 dogs more than cats). So, the total number of 4-legged pets is 6 snakes + 2 cats = 8 pets.\n\nFrom 5, we know that Frankie has 2 dogs. Since dogs are 4-legged