In [18]:
"""

Check how the model pads input - done
Check how different encoding & decoding params affect the encoding & decoding - done
TODO: check performance of the model on a sample set at different batch sizes (this incidentally also checks how the model performs when padding is present) - so far a batch size of 1 gives the best performance

"""

# Autocompletion
%config Completer.use_jedi = False

# Autoreload
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('/workspace/llmsearch')

import awq
import torch
import transformers

import llmsearch

print(awq.__version__, torch.__version__, transformers.__version__, llmsearch.__version__)

import re
import textwrap
from pathlib import Path

import datasets

from awq import AutoAWQForCausalLM
from sklearn.model_selection import GridSearchCV
from transformers import StoppingCriteriaList, AutoTokenizer

from llmsearch.tuner import Tuner
from llmsearch.utils.mem_utils import gc_cuda
from llmsearch.utils.common_utils import json_load, json_dump
from llmsearch.utils.model_downloader import download_model_from_hf
from llmsearch.scripts.stopping_criteria import MultiTokenStoppingCriteria

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
0.2.4 2.2.0+cu121 4.38.2 0.1.0


In [2]:
seed = 42
num_samples = 100
model_id = "TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ"
device = "cuda:0"

In [10]:
def load_model_and_tokenizer(model_id, temp_model_dir):
    """Download the checkpoint for `model_id` and build the model/tokenizer pair.

    Args:
        model_id: Hugging Face repo id forwarded to `download_model_from_hf`.
        temp_model_dir: `pathlib.Path`-like directory passed to the downloader as
            `save_dir`; it is created (including parents) when missing.

    Returns:
        A `(model, tokenizer)` tuple. The model is mapped onto the module-level
        `device`; the tokenizer pads on the left and reuses its unknown token as
        the padding token.
    """
    temp_model_dir.mkdir(parents=True, exist_ok=True)
    checkpoint_dir = download_model_from_hf(
        model_id, save_dir=temp_model_dir, branch="main"
    )

    # Presumably frees cached CUDA memory before the weights are loaded - see
    # llmsearch.utils.mem_utils for the exact behaviour.
    gc_cuda()

    quantized_load_kwargs = {
        "quant_path": checkpoint_dir,
        "fuse_layers": True,
        "device_map": {"": device},
        "local_files_only": True,
    }
    model = AutoAWQForCausalLM.from_quantized(**quantized_load_kwargs)

    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint_dir, use_fast=False, legacy=False, local_files_only=True
    )
    # Batched generation in this notebook relies on left padding with the unk token.
    tokenizer.padding_side = "left"
    tokenizer.pad_token = tokenizer.unk_token

    return model, tokenizer

def load_dataset():
    """Build the GSM8K sample set used for the batch-size experiments.

    Reads the module-level `tokenizer`, `seed` and `num_samples`. Every row of the
    GSM8K "main" train split gains two columns:

    * "X" - the 2-shot prompt filled with the row's question and rendered through
      the tokenizer's chat template (system message, user message, generation
      prompt appended).
    * "actual input" - the raw question text.

    Returns:
        The first `num_samples` rows of the processed split after shuffling it
        with `seed`.
    """

    def preprocess_dataset(
        dataset, tokenizer, pt, pt_cols, system_prompt, add_generation_prompt=True
    ):
        """Return `dataset` with the "X" and "actual input" columns added."""

        def render_prompt(sample):
            """Fill the prompt template from `sample` and apply the chat template."""
            messages = []
            if system_prompt is not None:
                messages.append({"role": "system", "content": system_prompt})

            user_content = pt.format(**{col: sample[col] for col in pt_cols})
            messages.append({"role": "user", "content": user_content})

            return tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=add_generation_prompt
            )

        def add_columns(sample):
            """Compute both new columns for one row; the raw input is the first prompt column."""
            return {
                "X": render_prompt(sample),
                "actual input": sample[pt_cols[0]],
            }

        return dataset.map(add_columns)

    # 2-shot prompt template - https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
    pt = textwrap.dedent(
    """\
    Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
    A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

    Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
    A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

    Q: {question}"""
    )
    question_cols = ["question"]
    system_prompt = "Solve the following math problems, end with The answer is"

    train_split = datasets.load_dataset("gsm8k", "main")["train"]
    with_prompts = preprocess_dataset(
        train_split,
        tokenizer,
        pt=pt,
        pt_cols=question_cols,
        system_prompt=system_prompt,
        add_generation_prompt=True,
    )

    # Shuffle deterministically, then keep the leading `num_samples` rows.
    return with_prompts.shuffle(seed=seed).select(range(num_samples))

def get_score(y_true, y_pred):
    """Exact-match accuracy of generated answers against GSM8K references.

    Args:
        y_true: iterable of mappings; each must contain an "answer" string whose
            final answer follows the last "####" marker.
        y_pred: iterable of generated strings aligned with `y_true`. The answer is
            taken from the first "The answer is <number>" occurrence.

    Returns:
        Fraction (float) of pairs whose normalised answers are equal. Returns 0.0
        when there are no pairs to score instead of dividing by zero.
    """
    # Compiled once per call rather than once per prediction.
    answer_pattern = re.compile(r"The answer is ((\d|\-)((\d|\,|\.)+)?\d?)")

    def standardize(s):
        """Drop thousands separators and one trailing period; `None` passes through."""
        if s is None:
            return s
        s = s.replace(",", "")
        if s.endswith("."):
            s = s[:-1]
        return s.strip()

    def extract_answer_from_out(s):
        """Return the number following "The answer is", or None when absent."""
        match = answer_pattern.search(s)
        return match.group(1).strip() if match else None

    scores = []
    for y_t, y_p in zip(y_true, y_pred):
        y_t_answer = standardize(y_t["answer"].split("####")[-1].strip())
        y_p_answer = standardize(extract_answer_from_out(y_p))
        # A missing prediction (None) never equals a reference string, so it scores 0.
        scores.append(1 if y_t_answer == y_p_answer else 0)

    if not scores:
        # Empty input: no accuracy to report; avoid ZeroDivisionError.
        return 0.0
    return sum(scores) / len(scores)


In [6]:
# Load Model, Tokenizer, Dataset
# Download/cache directory, relative to the notebook's working directory.
temp_model_dir = Path("./temp_dir/")
temp_model_dir.mkdir(parents=True, exist_ok=True)

model, tokenizer = load_model_and_tokenizer(model_id, temp_model_dir)

# Dataset we will use to find the best generation parameters.
# load_dataset() reads the module-level `tokenizer`, so it has to run after the line above.
samples = load_dataset()

# Stop generation once token id 32000 is produced - presumably this model's
# end-of-turn token; TODO confirm against the tokenizer's added tokens.
multi_token_stop_criteria_ob = MultiTokenStoppingCriteria(sequence_ids=[32000])
stopping_criteria = StoppingCriteriaList([multi_token_stop_criteria_ob])
# The criteria object's reset() is handed to Tuner as a post-inference callback.
callbacks_after_inference = [multi_token_stop_criteria_ob.reset]

Model already exists in temp_dir/TheBloke_CapybaraHermes-2.5-Mistral-7B-AWQ. Checking the model files...
Checksum validated: model.safetensors  645dfc7f09074aaf25e642f3c6a4f7ea399a0ff2605fa650e4e74078832546de
Checksum validated: tokenizer.model  dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
[+] Validated checksums of all model files!


Replacing layers...: 100%|██████████| 32/32 [00:04<00:00,  6.87it/s]
Fusing layers...: 100%|██████████| 32/32 [00:02<00:00, 14.41it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# `time` is only needed for the latency measurement in this cell; Tuner and gc_cuda
# are already imported in the notebook's first cell.
import time

# Requested batch sizes to evaluate; results are keyed by the requested size.
batch_size_list = [1, 2, 4, 8, 16, 32]
score_at_diff_batch_sizes = {}

for batch_size in batch_size_list:
    print(f"Batch Size - {batch_size}")

    gc_cuda()

    tuner_ob = Tuner(
        model=model,
        tokenizer=tokenizer,
        dataset=samples,
        # Use the notebook-level constant so the Tuner and the loaded model agree.
        device=device,
        batch_size=batch_size,
        tokenizer_encode_args={"padding": "longest", "add_special_tokens": False},
        tokenizer_decode_args={"spaces_between_special_tokens": False, 'skip_special_tokens' : True},
        scorer=get_score,
        prompt_template="{X}",
        seed=seed,
        column_mapping={"input_cols": ["X"], "eval_cols": ["answer"]},
        callbacks_after_inference=callbacks_after_inference,
    )

    # Open questions:
    # - is the stopping criteria cache not being reset properly?
    # - check if the reset condition is working properly

    # Rebuilt on every iteration so each run starts from an untouched dict.
    gen_params1 = {
        'max_new_tokens' : 500,
        # max_new_tokens takes precedence over stopping criteria
        'stopping_criteria' : stopping_criteria,
        'generation_seed' : 42,
    }

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during a multi-minute run.
    start = time.perf_counter()
    scores, outputs = tuner_ob.get_score(gen_params1)
    end = time.perf_counter()

    # NOTE(review): in the recorded run the 16 and 32 settings both finished with
    # 13 batches, the same count as batch size 8, so the effective batch size can
    # differ from the requested one. `optimal_batch_size` is stored alongside the
    # score for that reason - confirm its exact meaning in llmsearch.
    score_at_diff_batch_sizes[batch_size] = {
        'score' : scores,
        'outputs' : outputs,
        'optimal_batch_size' : tuner_ob.estimator._optimal_batch_size,
        'latency_mins' : (end  - start) / 60,
    }

    print(f'score at batch size - {batch_size} - {scores}')


Batch Size - 1


100%|██████████| 100/100 [04:09<00:00,  2.50s/it]


score at batch size - 1 - 0.72
Batch Size - 2


100%|██████████| 50/50 [02:42<00:00,  3.25s/it]


score at batch size - 2 - 0.65
Batch Size - 4


100%|██████████| 25/25 [01:41<00:00,  4.07s/it]


score at batch size - 4 - 0.58
Batch Size - 8


100%|██████████| 13/13 [01:08<00:00,  5.27s/it]


score at batch size - 8 - 0.51
Batch Size - 16


  0%|          | 0/7 [00:02<?, ?it/s]
100%|██████████| 13/13 [01:08<00:00,  5.26s/it]


score at batch size - 16 - 0.51
Batch Size - 32


  0%|          | 0/4 [00:01<?, ?it/s]
  0%|          | 0/7 [00:01<?, ?it/s]
100%|██████████| 13/13 [01:08<00:00,  5.26s/it]

score at batch size - 32 - 0.51





In [12]:
# One short report per requested batch size: the size, then its score.
for batch_size, result in score_at_diff_batch_sizes.items():
    print(f"Batch Size - {batch_size}")
    print(f"Score - {result['score']}")
    print('\n\n')

Batch Size - 1
Score - 0.72



Batch Size - 2
Score - 0.65



Batch Size - 4
Score - 0.58



Batch Size - 8
Score - 0.51



Batch Size - 16
Score - 0.51



Batch Size - 32
Score - 0.51





In [17]:
# Show every sample whose generated text differs between batch size 1 and batch size 2.
outputs_at_bs1 = score_at_diff_batch_sizes[1]['outputs']
outputs_at_bs2 = score_at_diff_batch_sizes[2]['outputs']

for bs_1, bs_2 in zip(outputs_at_bs1, outputs_at_bs2):
    if bs_1 == bs_2:
        # Identical generations are not interesting here.
        continue
    print(f"Batch Size - 1 - {bs_1}")
    print(f"Batch Size - 2 - {bs_2}")
    print('\n\n')

Batch Size - 1 - A: Let's break down the information given:

1. Frankie has 6 more snakes than cats. Let's represent the number of cats as C. So, the number of snakes is C + 6.
2. Frankie has one less parrot than cats. So, the number of parrots is C - 1.
3. Six of his pets have four legs, which are the dogs.
4. He has 2 dogs.

Now, let's find the total number of pets:

1. C (cats) + (C + 6) (snakes) + (C - 1) (parrots) + 2 (dogs) + 6 (four-legged pets)
2. C + C + 6 + C - 1 + 6 + 2
3. 4C + 5

Since we know that Frankie has 2 dogs, we can substitute 2 for 6 in the equation:

4C + 5 = total number of pets
4C + 5 = C + 6 + (C - 1) + 2

Now, let's solve for C:

4C + 5 = 2C + 5 + C + 6 + 2
4C - 2C = 13
2C = 13
C = 13/2
C = 6.5

However, since we can't have a fraction of a pet, we made an assumption that not all pets were accounted for in the given information. In this case, we can conclude that Frankie has at least 6 cats, which means he has 6 + 6 = 12 pets in total (6 snakes and 1 parrot). 

In [None]:
# pad token = eos token
# padding side - left
# NOTE(review): load_model_and_tokenizer assigns the unk token as pad token, not
# eos - confirm which configuration the two notes above describe.

# Dump every generation produced at batch size 1 (one print per sample).
bs1_outputs = score_at_diff_batch_sizes[1]['outputs']
for output in bs1_outputs:
    print(output)

# Then repeat the per-batch-size score summary.
for batch_size, result in score_at_diff_batch_sizes.items():
    print(f"Batch Size - {batch_size}")
    print(f"Score - {result['score']}")
    print('\n\n')

In [None]:
# Print the results recorded for each requested batch size: score, latency in
# minutes and the `optimal_batch_size` value read from the tuner's estimator.

# Observation from the recorded run: the score was highest at batch size 1
# (0.72) and dropped as the batch size grew, which suggests llmsearch should be
# run with a batch size of 1 for this model.

for batch_size, score_dict in score_at_diff_batch_sizes.items():
    print(f"Batch Size - {batch_size}")
    print(f"Score - {score_dict['score']}")
    print(f"Latency - {score_dict['latency_mins']} mins")
    print(f"Optimal Batch Size - {score_dict['optimal_batch_size']}")
    print("\n\n")

In [None]:
# Display the whole results dict. Each entry also holds every generated output,
# so the rendered repr is large.
score_at_diff_batch_sizes

In [None]:
# NOTE(review): stray copy of the assignment made inside the batch-size loop.
# It only works with `batch_size`, `scores`, `outputs`, `tuner_ob`, `start` and
# `end` left over from a previous run of that loop, and it overwrites the entry
# for whatever `batch_size` currently holds.
score_at_diff_batch_sizes[batch_size] = {
        'score' : scores,
        'outputs' : outputs,
        'optimal_batch_size' : tuner_ob.estimator._optimal_batch_size,
        'latency_mins' : (end  - start) / 60,
    }

In [None]:






# NOTE(review): `scores_before` is only assigned in a later cell (from
# tuner_ob.get_score), so this raises NameError on a fresh top-to-bottom run.
print(scores_before)

In [None]:
# Scratch cell: builds a single Tuner with batch size 1.
# NOTE(review): `cm`, `bm_samples` and `langchain` are not defined or imported in
# the visible part of this notebook, so this cell fails on a fresh kernel unless
# they come from elsewhere - confirm or update to `gc_cuda`, `samples` and a plain
# "{X}" template as used in the batch-size loop.
from llmsearch.tuner import Tuner

cm()
batch_size = 1

tuner_ob = Tuner(
    model = model,
    tokenizer = tokenizer,
    dataset = bm_samples,
    device = 'cuda:0',
    batch_size = batch_size,
    tokenizer_encode_args={'padding': 'longest', 'add_special_tokens' : False},
    # Unlike the batch-size loop, 'skip_special_tokens' is not set here.
    tokenizer_decode_args={'spaces_between_special_tokens' : False},
    scorer = get_score,
    prompt_template = langchain.PromptTemplate.from_template("{X}"),
    # NOTE(review): `is_encoder_decoder` is not passed in the batch-size loop;
    # presumably from an older Tuner signature - verify it is still accepted.
    is_encoder_decoder = False,
    seed = seed,
    column_mapping = {'input_cols' : ["X"],'eval_cols' : ['answer']},
)

In [None]:
# NOTE(review): the batch-size loop reads this value from
# `tuner_ob.estimator._optimal_batch_size`; confirm the attribute also exists
# directly on the Tuner object.
tuner_ob._optimal_batch_size

In [None]:






# NOTE(review): duplicate of an earlier scratch cell; `scores_before` is assigned
# only in the final cell of this notebook, so run that cell first.
print(scores_before)

In [None]:
# Display the score from the single-Tuner run; requires the cell that assigns
# `scores_before` (via tuner_ob.get_score) to have been executed.
scores_before

In [None]:
# decoder parser is working as expected
# TODO : check scores at different bs then llmsearch

# Use the stopping-criteria class this notebook actually imports from
# llmsearch.scripts.stopping_criteria; the previously referenced
# `MultiTokenEOSCriteria` is never imported or defined here and raised NameError.
# Note: this rebinds the module-level `stopping_criteria` to a fresh criteria object.
stopping_criteria = StoppingCriteriaList([MultiTokenStoppingCriteria(sequence_ids = [32000])])

# Same generation settings as the batch-size experiment.
gen_params1 = {
    'max_new_tokens' : 500,
    'stopping_criteria' : stopping_criteria,
    'generation_seed' : 42,
}

# Score and raw generations for the current `tuner_ob`.
scores_before, outputs_before = tuner_ob.get_score(gen_params1)

scores_before