In [1]:
"""

Check how the model pads input - done
Check how different encoding & decoding params affect the encoding & decoding - done
Check performance of the model on a sample set at different batch sizes (this also checks how the model performs when padding is present) - so far a batch size of 1 gives the best performance

"""

# Autocompletion
%config Completer.use_jedi = False

# Autoreload
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('/workspace/llmsearch')

import gc
import torch
import ctypes
import json
import nltk
import math
import torch
import random
import evaluate
import datasets
import langchain
import numpy as np
import pandas as pd
import transformers
from transformers import GPTQConfig, BitsAndBytesConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers import PreTrainedModel, PretrainedConfig, GenerationConfig, StoppingCriteria, AutoTokenizer, StoppingCriteriaList, AutoModel, AutoModelForCausalLM

import os
import gc
import ctypes
import traceback
from pathlib import Path
from typing import Any, Dict, Optional, Union, List

import time
import textwrap
from tqdm.auto import tqdm

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Config
)

from datasets import load_dataset
from llmsearch.model_downloader import download_model_from_hf
from llmsearch.utils.model_utils import batcher, decoder_parser

import awq

from awq import AutoAWQForCausalLM

def pretty_print_dict(d, indent = 4):
    """Print ``d`` as indented JSON.

    Values that JSON cannot encode natively are rendered with ``str``.
    """
    rendered = json.dumps(d, default = str, indent = indent)
    print(rendered)

Monkey Patching .generate function of `transformers` library


In [2]:
# Load the GSM8K dataset using its 'main' configuration.
gsm8k_dataset = load_dataset("gsm8k", 'main')

# Display the torch / awq versions this notebook was run with.
torch.__version__, awq.__version__

('2.2.0+cu121', '0.2.4')

In [3]:

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs so repeated runs are reproducible."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    # Ask cuDNN for deterministic behaviour and switch off its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


class SingleTokenStoppingCriteria(StoppingCriteria):
    """Stop generation once the most recent token equals ``token_id``.

    Only the first sequence of the batch is inspected, so batched generation
    is not supported.
    """

    def __init__(self, token_id):
        """Store the id of the token that ends generation."""
        super().__init__()
        self.token_id = token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the last token of the first sequence is the stop token.

        Args:
            input_ids: token ids so far; only ``input_ids[0][-1]`` is read.
            scores: unused.
            **kwargs: accepted and ignored, matching the signature of the other
                stopping criteria classes in this notebook.
        """
        return bool(input_ids[0][-1] == self.token_id)


def cm():
    """Free memory: run the garbage collector, trim the libc heap, then empty the CUDA cache."""
    gc.collect()
    libc = ctypes.CDLL("libc.so.6")
    # malloc_trim(0) asks the allocator to release free heap memory.
    libc.malloc_trim(0)
    torch.cuda.empty_cache()

def seed_everything(seed):
    """Make runs repeatable by seeding every RNG the notebook relies on.

    NOTE(review): ``seed_everything`` is also defined earlier in this same cell;
    this later definition is the one that stays bound.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    has_cuda = torch.cuda.is_available()
    if has_cuda:
        for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seed_fn(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False



def perform_single_example_inference(example, model, tokenizer,gen_kwargs, device = 'cuda:0'):
    """Generate a completion for a single prompt and report token counts.

    Args:
        example: prompt text passed to ``tokenizer``.
        model: object exposing ``generate(**inputs, **gen_kwargs)``.
        tokenizer: callable tokenizer that also exposes ``decode``.
        gen_kwargs: keyword arguments forwarded to ``model.generate``.
        device: device the input tensors are moved to before generation.
            Defaults to ``'cuda:0'``, the value that used to be hard-coded.

    Returns:
        ``(decoded_output, prompt_tokens, completion_tokens)`` where
        ``decoded_output`` is the decoded first returned sequence and the
        completion count is the length of that sequence minus the prompt length.
    """
    tokenized_input = tokenizer(example, return_tensors = "pt", add_special_tokens = False)
    for tensor_key in ('input_ids', 'attention_mask'):
        tokenized_input[tensor_key] = tokenized_input[tensor_key].to(device)

    model_out = model.generate(**tokenized_input, **gen_kwargs)
    prompt_tokens = len(tokenized_input['input_ids'][0])
    print(f"Prompt tokens - {prompt_tokens}")

    # Only the first sequence of the output is decoded (single-example helper).
    output_token_ids = model_out.tolist()[0]
    decoded_output = tokenizer.decode(output_token_ids, spaces_between_special_tokens = False)

    print(decoded_output)
    completion_tokens = len(output_token_ids) - prompt_tokens

    print(f"Completion Tokens - {completion_tokens}")

    return decoded_output, prompt_tokens, completion_tokens

In [4]:
# loaders

class MultiTokenEOSCriteria(transformers.StoppingCriteria):
    """Criteria to stop on the specified multi-token sequence.

    The object is stateful: the first call for a batch records the batch size and
    prompt length, and each row is marked done once its generated tokens end with
    the stop sequence. Generation stops when every row is done, at which point the
    state resets itself. If generation ends for any other reason (for example
    ``max_new_tokens`` is reached) the state is NOT reset automatically; call
    :meth:`reset` before reusing the object for another batch.

    This code is not thread safe. The same object cannot be used simultaneously in multiple threads.
    """

    def __init__(
        self,
        sequence_ids : List[int],
        device = "cuda:0",
    ) -> None:
        """
        Args:
            sequence_ids: token ids that make up the stop sequence.
            device: device on which the stop-sequence tensor is created
                (defaults to ``"cuda:0"``, the previously hard-coded value).
        """
        self.sequence_ids = torch.tensor(sequence_ids, dtype = torch.int32, device = device)
        # Kept for backward compatibility: the stop-sequence length plus 2 tokens of
        # extra lookback. Matching is done on exact token ids against the last
        # `len(sequence_ids)` generated tokens, so the extra lookback does not
        # affect the comparison.
        self.sequence_id_len = self.sequence_ids.shape[0] + 2
        self.batch_size = None
        self.input_length = None
        self.done_tracker = []
        self.state_initialized = False

    def set_state(self, batch_size, input_length):
        """Start tracking a new batch of ``batch_size`` rows with the given prompt length."""
        self.batch_size = batch_size
        self.input_length = input_length
        self.done_tracker = [False] * batch_size
        self.state_initialized = True

    def reset(self):
        """Forget the current batch so the next call re-initialises the state."""
        self.batch_size = None
        self.input_length = None
        self.done_tracker = []
        self.state_initialized = False


    def __call__(self, input_ids, scores, **kwargs) -> bool:
        """Return True once every row has generated the stop sequence.

        Args:
            input_ids: prompt plus generated token ids, one row per batch element.
            scores: unused.
        """
        if not self.state_initialized:
            # 1st call to __call__ for this batch. It happens after the first new
            # token has been appended, so the prompt is one token shorter than the
            # current sequence (same convention as MultiTokenEOSCriteria2).
            self.set_state(input_ids.shape[0], input_ids.shape[1] - 1)

        stop_len = self.sequence_ids.shape[0]

        # IDs of all the tokens except the prompt, so we never match inside the inputs
        generated_ids = input_ids[:, self.input_length :]

        # not enough generated tokens yet to contain the stop sequence
        if generated_ids.shape[1] < stop_len:
            return False

        tail_ids = generated_ids[:, -stop_len:]
        stop_ids = self.sequence_ids.to(tail_ids.device)

        for i, done in enumerate(self.done_tracker):
            if not done:
                # Reduce the elementwise comparison to one Python bool per row so
                # stop sequences longer than one token are handled correctly.
                self.done_tracker[i] = bool((tail_ids[i] == stop_ids).all())

        all_done = all(self.done_tracker)
        if all_done:
            self.reset()
        return all_done


def load_model_with_awq_backend(model_id, model_loader_kwargs, tokenizer_kwargs,temp_model_dir, model_branch = "main"):
    """Download an AWQ-quantized checkpoint and load its model and tokenizer.

    Args:
        model_id: model identifier passed to ``download_model_from_hf``.
        model_loader_kwargs: extra keyword arguments for ``AutoAWQForCausalLM.from_quantized``.
        tokenizer_kwargs: extra keyword arguments for ``AutoTokenizer.from_pretrained``.
        temp_model_dir: directory handed to ``download_model_from_hf`` as ``save_dir``.
        model_branch: branch handed to ``download_model_from_hf``.

    Returns:
        ``(model, tokenizer)``; the tokenizer's pad token is set to its EOS token.
    """
    output_folder = download_model_from_hf(model_id, save_dir = temp_model_dir, branch = model_branch)

    path_key = 'pretrained_model_name_or_path'
    # Build fresh dicts instead of mutating the caller's arguments (and instead of
    # reaching for a same-named global). The downloaded folder always replaces any
    # path the caller may have supplied.
    awq_kwargs = {key: value for key, value in model_loader_kwargs.items() if key != path_key}
    tokenizer_load_kwargs = {**tokenizer_kwargs, path_key: output_folder}

    model = AutoAWQForCausalLM.from_quantized(
        quant_path=output_folder,
        **awq_kwargs
    )
    tokenizer = AutoTokenizer.from_pretrained(**tokenizer_load_kwargs, local_files_only=True)

    # pad token is null in config -https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ/blob/eb64c310c44905321d012962db9ac0d47c3a64fa/tokenizer_config.json#L53
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

# Maps a backend name to the function that loads a model with that backend.
# Only the AWQ entry is active; the other backends are commented out.
model_loader_backend_map = {
    # "exllama_2_hf": load_model_with_exllama_2_hf_backend,
    # "hf": load_model_with_hf_backend,
    # 'auto_gptq' : load_model_with_autogptq_backend,
    'awq' : load_model_with_awq_backend,
}

In [5]:
# https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ
model_id = "TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ"

# Directory passed to the loader as the download location; created if missing.
temp_model_dir = Path(f"/workspace/temp_model_dir/")
temp_model_dir.mkdir(exist_ok = True, parents = True)

# Keyword arguments passed to the AWQ model loader.
model_loader_kwargs = {
    'device_map' : {'' : 0},
    'fuse_layers' : True,
}

# Keyword arguments passed to the tokenizer loader (padding is applied on the left).
tokenizer_loader_kwargs = {
    'use_fast' : False,
    'legacy' : False,
    'padding_side' : 'left',
}

# Download (or reuse) the checkpoint and load the model and tokenizer.
model, tokenizer = load_model_with_awq_backend(model_id, model_loader_kwargs, tokenizer_loader_kwargs,temp_model_dir, model_branch = "main")

Model already exists in /workspace/temp_model_dir/TheBloke_CapybaraHermes-2.5-Mistral-7B-AWQ. Checking the model files...
Checksum validated: model.safetensors  645dfc7f09074aaf25e642f3c6a4f7ea399a0ff2605fa650e4e74078832546de
Checksum validated: tokenizer.model  dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
[+] Validated checksums of all model files!


Replacing layers...: 100%|██████████| 32/32 [00:04<00:00,  6.48it/s]
Fusing layers...: 100%|██████████| 32/32 [00:00<00:00, 50.63it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
def preprocess_dataset(dataset, tokenizer, pt, pt_cols, system_prompt, add_generation_prompt = True):
    """Map every row of ``dataset`` to two prompt columns.

    For each row the mapping function produces:
      * ``"X"`` - the prompt template ``pt`` filled from the ``pt_cols`` fields and sent
        as a user message (preceded by ``system_prompt`` as a system message when it is
        not None), rendered to a string with the tokenizer's chat template.
      * ``'actual input'`` - the raw value of the first column listed in ``pt_cols``.

    Returns:
        Whatever ``dataset.map`` returns for that mapping function.
    """

    def build_chat_prompt(row):
        """Render one row into the chat-formatted prompt string."""
        user_message = {
            "role": "user",
            "content": pt.format(**{col : row[col] for col in pt_cols}),
        }
        if system_prompt is None:
            conversation = [user_message]
        else:
            conversation = [{"role": "system", "content": system_prompt}, user_message]
        return tokenizer.apply_chat_template(conversation, tokenize = False, add_generation_prompt=add_generation_prompt)

    def add_prompt_columns(row):
        """Compute the two new columns for one row."""
        return {
            "X" : build_chat_prompt(row),
            'actual input' : row[pt_cols[0]],
        }

    return dataset.map(add_prompt_columns)

In [7]:
# Few-shot prompt template: two worked Q/A examples followed by the question to solve.
pt = textwrap.dedent("""\
    Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
    A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

    Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
    A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

    Q: {question}""")
# Dataset columns substituted into the template's placeholders.
pt_cols = ['question']
# The scorer looks for the phrase "The answer is", which this system prompt asks the model to end with.
system_prompt = "Solve the following math problems, end with The answer is"

# Add prompt template
processed_dataset = preprocess_dataset(gsm8k_dataset['train'], tokenizer,pt = pt, pt_cols = pt_cols, system_prompt = system_prompt, add_generation_prompt = True)

In [8]:
import re

# "The answer is <number>", allowing an optional leading "$", a minus sign,
# thousands separators and a decimal part.
_ANSWER_PATTERN = re.compile(r"The answer is \$?(-?\d[\d,]*(?:\.\d+)?)")


def extract_answer_from_out(s):
    """Return the first number following "The answer is" in ``s``, or None.

    Thousands separators are removed, so "The answer is 1,000." yields "1000"
    (previously the match stopped at the comma and yielded "1"). A leading "$"
    is skipped and a leading minus sign is kept.
    """
    match = _ANSWER_PATTERN.search(s)
    if match is None:
        return None
    return match.group(1).replace(",", "").strip()


def get_score(y_true, y_pred):
    """Fraction of predictions whose extracted answer equals the reference answer.

    Args:
        y_true: iterable of dicts whose ``'answer'`` text ends with ``#### <answer>``.
        y_pred: iterable of model output strings, aligned with ``y_true``.

    Returns:
        Exact-match accuracy as a float; ``0.0`` when there is nothing to score.
    """
    scores = []

    for y_t, y_p in zip(y_true, y_pred):
        # Strip thousands separators from the reference too, so both sides are
        # normalised the same way before the string comparison.
        y_t_answer = y_t['answer'].split("####")[-1].strip().replace(",", "")
        y_p_answer = extract_answer_from_out(y_p)
        scores.append(1 if y_t_answer == y_p_answer else 0)

    if not scores:
        return 0.0
    return sum(scores)/len(scores)

In [9]:
from llmsearch.utils.logging_utils import set_verbosity_info, set_verbosity_debug, set_verbosity_warning
set_verbosity_info()

In [10]:
# Benchmark subset: a seeded shuffle of the processed dataset, then its first `bm_sample_size` rows.
seed = 42
bm_sample_size = 50
bm_samples = processed_dataset.shuffle(seed = seed).select(range(bm_sample_size))

In [11]:

from llmsearch.tuner import Tuner

# TODO : bs = 16 error, figure this out
# TODO : run llmsearch

class MultiTokenEOSCriteria2(transformers.StoppingCriteria):
    """Criteria to stop on the specified multi-token sequence.

    The object is stateful: the first call for a batch records the batch size and
    prompt length, and each row is marked done once its generated tokens end with
    the stop sequence. Generation stops when every row is done, at which point the
    state resets itself. If generation ends for any other reason (for example
    ``max_new_tokens`` is reached) the state is NOT reset automatically; call
    :meth:`reset` before reusing the object for another batch.

    This code is not thread safe. The same object cannot be used simultaneously in multiple threads.
    """

    def __init__(
        self,
        sequence_ids : List[int],
        device = "cuda:0",
    ) -> None:
        """
        Args:
            sequence_ids: token ids that make up the stop sequence.
            device: device on which the stop-sequence tensor is created
                (defaults to ``"cuda:0"``, the previously hard-coded value).
        """
        self.sequence_ids = torch.tensor(sequence_ids, dtype = torch.int32, device = device)
        # Kept for backward compatibility: the stop-sequence length plus 2 tokens of
        # extra lookback. Matching is done on exact token ids against the last
        # `len(sequence_ids)` generated tokens, so the extra lookback does not
        # affect the comparison.
        self.sequence_id_len = self.sequence_ids.shape[0] + 2
        self.batch_size = None
        self.prompt_length = None
        self.done_tracker = []
        self.state_initialized = False

    def set_state(self, batch_size, prompt_length):
        """Start tracking a new batch of ``batch_size`` rows with the given prompt length."""
        self.batch_size = batch_size
        self.prompt_length = prompt_length
        self.done_tracker = [False] * batch_size
        self.state_initialized = True

    def reset(self):
        """Forget the current batch so the next call re-initialises the state."""
        self.batch_size = None
        self.prompt_length = None
        self.done_tracker = []
        self.state_initialized = False


    def __call__(self, input_ids, scores, **kwargs) -> bool:
        """
        This is called after a new token is generated

        Args:
            input_ids: prompt plus generated token ids, one row per batch element.
            scores: unused.

        Returns:
            True once every row has generated the stop sequence.
        """
        if not self.state_initialized:
            # Every batch should set this state. One token has already been
            # generated at this point, hence the `- 1` for the prompt length.
            self.set_state(input_ids.shape[0], input_ids.shape[1] - 1)

        stop_len = self.sequence_ids.shape[0]

        # IDs of all the tokens except the prompt, so we never match inside the inputs
        generated_ids = input_ids[:, self.prompt_length :]

        # not enough generated tokens yet to contain the stop sequence
        if generated_ids.shape[1] < stop_len:
            return False

        tail_ids = generated_ids[:, -stop_len:]
        stop_ids = self.sequence_ids.to(tail_ids.device)

        for i, done in enumerate(self.done_tracker):
            if not done:
                # Reduce the elementwise comparison to one Python bool per row so
                # stop sequences longer than one token are handled correctly.
                self.done_tracker[i] = bool((tail_ids[i] == stop_ids).all())

        all_done = all(self.done_tracker)
        if all_done:
            # ASSUMPTION: Relies on the assumption that generation will only stop when the stop token is generated
            self.reset()
        return all_done


# Batch sizes to benchmark; the result for each is stored in `score_at_diff_batch_sizes`.
batch_size_list = [1, 2, 4, 8, 16, 32]
score_at_diff_batch_sizes = {}

for batch_size in batch_size_list:
    print(f"Batch Size - {batch_size}")

    # Free memory left over from the previous iteration.
    cm()

    # Fresh stateful stopping criteria for each batch size. 32000 is the stop token id
    # (presumably `<|im_end|>`, which ends the recorded outputs - confirm against the tokenizer).
    multi_token_stop_criteria_ob = MultiTokenEOSCriteria2(sequence_ids = [32000])
    stopping_criteria = StoppingCriteriaList([multi_token_stop_criteria_ob])

    # The criteria's `reset` is registered as an after-inference callback so its state
    # is cleared even when generation ended without the stop token (see the note below the loop).
    tuner_ob = Tuner(
        model = model,
        tokenizer = tokenizer,
        dataset = bm_samples,
        device = 'cuda:0',
        batch_size = batch_size,
        tokenizer_encoding_kwargs={'padding': 'longest', 'add_special_tokens' : False},
        tokenizer_decoding_kwargs={'spaces_between_special_tokens' : False},
        scorer = get_score,
        prompt_template = langchain.PromptTemplate.from_template("{X}"),
        is_encoder_decoder = False,
        seed = seed,
        column_mapping = {'input_cols' : ["X"],'eval_cols' : ['answer']},
        callbacks_after_inference = [multi_token_stop_criteria_ob.reset],
    )


    # Is the stopping criteria state not being reset properly?
    # Check whether the reset condition is working properly.


    gen_params1 = {
        'max_new_tokens' : 500,
        # max_new_tokens takes precedence over the stopping criteria
        'stopping_criteria' : stopping_criteria,
        'generation_seed' : 42,
    }

    # Wall-clock time of the full scoring pass for this batch size.
    start = time.time()
    scores, outputs = tuner_ob.get_score(gen_params1)
    end = time.time()

    # NOTE(review): in the recorded run the log shows batch size 32 falling back to 16,
    # yet 'optimal_batch_size' printed 32 - confirm this attribute reflects the size actually used.
    score_at_diff_batch_sizes[batch_size] = {
        'score' : scores,
        'outputs' : outputs,
        'optimal_batch_size' : tuner_ob.estimator._optimal_batch_size,
        'latency_mins' : (end  - start) / 60,
    }

# Done - Problem: the stopping criteria does not reset itself when generation ends because
# max_new_tokens was reached, since in that case no stop token is generated.
# Generation stops when either max_new_tokens is reached or the stop token is generated.

# 2m44s for 50 samples
# 100 samples - 5m28s

2024-03-31 09:42:34.449 - llmsearch.utils.mem_utils:153 - INFO - Starting inference with generation parameters - {'max_new_tokens': 500, 'stopping_criteria': [<__main__.MultiTokenEOSCriteria2 object at 0x7f10ac13d390>], 'generation_seed': 42}
2024-03-31 09:42:34.450 - llmsearch.utils.mem_utils:157 - INFO - Performing inference with batch_size - 1
2024-03-31 09:42:34.451 - llmsearch.utils.model_utils:99 - INFO - Detected generation type - Greedy Decoding


Batch Size - 1


  0%|          | 0/50 [00:00<?, ?it/s]

Batch 1/50
Batch 2/50
Batch 3/50
Batch 4/50
Batch 5/50
Batch 6/50
Batch 7/50
Batch 8/50
Batch 9/50
Batch 10/50
Batch 11/50
Batch 12/50
Batch 13/50
Batch 14/50
Batch 15/50
Batch 16/50
Batch 17/50
Batch 18/50
Batch 19/50
Batch 20/50
Batch 21/50
Batch 22/50
Batch 23/50
Batch 24/50
Batch 25/50
Batch 26/50
Batch 27/50
Batch 28/50
Batch 29/50
Batch 30/50
Batch 31/50
Batch 32/50
Batch 33/50
Batch 34/50
Batch 35/50
Batch 36/50
Batch 37/50
Batch 38/50
Batch 39/50
Batch 40/50
Batch 41/50
Batch 42/50
Batch 43/50
Batch 44/50
Batch 45/50
Batch 46/50
Batch 47/50
Batch 48/50
Batch 49/50
Batch 50/50


2024-03-31 09:45:19.481 - llmsearch.utils.mem_utils:187 - INFO - Finished running inference, took 165.030281 secs
2024-03-31 09:45:19.646 - llmsearch.utils.mem_utils:153 - INFO - Starting inference with generation parameters - {'max_new_tokens': 500, 'stopping_criteria': [<__main__.MultiTokenEOSCriteria2 object at 0x7f10ac13c490>], 'generation_seed': 42}
2024-03-31 09:45:19.648 - llmsearch.utils.mem_utils:157 - INFO - Performing inference with batch_size - 2
2024-03-31 09:45:19.649 - llmsearch.utils.model_utils:99 - INFO - Detected generation type - Greedy Decoding


Batch Size - 2


  0%|          | 0/25 [00:00<?, ?it/s]

Batch 1/25
Batch 2/25
Batch 3/25
Batch 4/25
Batch 5/25
Batch 6/25
Batch 7/25
Batch 8/25
Batch 9/25
Batch 10/25
Batch 11/25
Batch 12/25
Batch 13/25
Batch 14/25
Batch 15/25
Batch 16/25
Batch 17/25
Batch 18/25
Batch 19/25
Batch 20/25
Batch 21/25
Batch 22/25
Batch 23/25
Batch 24/25
Batch 25/25


2024-03-31 09:47:09.802 - llmsearch.utils.mem_utils:187 - INFO - Finished running inference, took 110.153773 secs
2024-03-31 09:47:09.972 - llmsearch.utils.mem_utils:153 - INFO - Starting inference with generation parameters - {'max_new_tokens': 500, 'stopping_criteria': [<__main__.MultiTokenEOSCriteria2 object at 0x7f10ac0f2170>], 'generation_seed': 42}
2024-03-31 09:47:09.974 - llmsearch.utils.mem_utils:157 - INFO - Performing inference with batch_size - 4
2024-03-31 09:47:09.974 - llmsearch.utils.model_utils:99 - INFO - Detected generation type - Greedy Decoding


Batch Size - 4


  0%|          | 0/13 [00:00<?, ?it/s]

Batch 1/13
Batch 2/13
Batch 3/13
Batch 4/13
Batch 5/13
Batch 6/13
Batch 7/13
Batch 8/13
Batch 9/13
Batch 10/13
Batch 11/13
Batch 12/13
Batch 13/13


2024-03-31 09:48:18.874 - llmsearch.utils.mem_utils:187 - INFO - Finished running inference, took 68.899233 secs
2024-03-31 09:48:19.082 - llmsearch.utils.mem_utils:153 - INFO - Starting inference with generation parameters - {'max_new_tokens': 500, 'stopping_criteria': [<__main__.MultiTokenEOSCriteria2 object at 0x7f10ac13cb80>], 'generation_seed': 42}
2024-03-31 09:48:19.083 - llmsearch.utils.mem_utils:157 - INFO - Performing inference with batch_size - 8
2024-03-31 09:48:19.083 - llmsearch.utils.model_utils:99 - INFO - Detected generation type - Greedy Decoding


Batch Size - 8


  0%|          | 0/7 [00:00<?, ?it/s]

Batch 1/7
Batch 2/7
Batch 3/7
Batch 4/7
Batch 5/7
Batch 6/7
Batch 7/7


2024-03-31 09:48:56.765 - llmsearch.utils.mem_utils:187 - INFO - Finished running inference, took 37.682039 secs
2024-03-31 09:48:56.960 - llmsearch.utils.mem_utils:153 - INFO - Starting inference with generation parameters - {'max_new_tokens': 500, 'stopping_criteria': [<__main__.MultiTokenEOSCriteria2 object at 0x7f10ac0f1cf0>], 'generation_seed': 42}
2024-03-31 09:48:56.961 - llmsearch.utils.mem_utils:157 - INFO - Performing inference with batch_size - 16
2024-03-31 09:48:56.962 - llmsearch.utils.model_utils:99 - INFO - Detected generation type - Greedy Decoding


Batch Size - 16


  0%|          | 0/4 [00:00<?, ?it/s]

Batch 1/4
Batch 2/4
Batch 3/4
Batch 4/4


2024-03-31 09:49:24.041 - llmsearch.utils.mem_utils:187 - INFO - Finished running inference, took 27.078649 secs
2024-03-31 09:49:24.217 - llmsearch.utils.mem_utils:153 - INFO - Starting inference with generation parameters - {'max_new_tokens': 500, 'stopping_criteria': [<__main__.MultiTokenEOSCriteria2 object at 0x7f10ac13c340>], 'generation_seed': 42}
2024-03-31 09:49:24.217 - llmsearch.utils.mem_utils:157 - INFO - Performing inference with batch_size - 32
2024-03-31 09:49:24.218 - llmsearch.utils.model_utils:99 - INFO - Detected generation type - Greedy Decoding


Batch Size - 32


  0%|          | 0/2 [00:00<?, ?it/s]

Batch 1/2


2024-03-31 09:49:27.279 - llmsearch.utils.mem_utils:191 - INFO - Unable to fit batch size - 32, Reducing batch size to - 16
2024-03-31 09:49:27.461 - llmsearch.utils.model_utils:99 - INFO - Detected generation type - Greedy Decoding


  0%|          | 0/4 [00:00<?, ?it/s]

Batch 1/4
Batch 2/4
Batch 3/4
Batch 4/4


2024-03-31 09:49:54.694 - llmsearch.utils.mem_utils:187 - INFO - Finished running inference, took 30.476015 secs


In [14]:
# Score obtained at each benchmarked batch size.
for batch_size, result in score_at_diff_batch_sizes.items():
    print(f"Batch Size - {batch_size}")
    print(f"Score - {result['score']}")
    print('\n\n')

Batch Size - 1
Score - 0.64



Batch Size - 2
Score - 0.5



Batch Size - 4
Score - 0.32



Batch Size - 8
Score - 0.22



Batch Size - 16
Score - 0.1



Batch Size - 32
Score - 0.1





In [None]:
# Token count of the first benchmark prompt, using the same encoding kwargs that are passed to the Tuner.
len(tokenizer(bm_samples[0]['X'], **{'padding': 'longest', 'add_special_tokens' : False})['input_ids'])

In [16]:
# Completions recorded for the batch-size-1 run.
score_at_diff_batch_sizes[1]['outputs']

['A: Mimi picked up 2 dozen seashells, which is 2 * 12 = 24 seashells. Kyle found twice as many shells as Mimi, so he found 24 * 2 = 48 seashells. Leigh grabbed one-third of the shells that Kyle found, so Leigh had 48 / 3 = 16 seashells. The answer is 16.<|im_end|>',
 "A: Let's break down the information given:\n\n1. Frankie has 6 more snakes than cats. Let's represent the number of cats as C. So, the number of snakes is C + 6.\n2. Frankie has one less parrot than cats. So, the number of parrots is C - 1.\n3. Six of his pets have four legs, which are the dogs.\n4. He has 2 dogs.\n\nNow, let's find the total number of pets:\n\n1. C (cats) + (C + 6) (snakes) + (C - 1) (parrots) + 2 (dogs) + 6 (four-legged pets)\n2. C + C + 6 + C - 1 + 6 + 2\n3. 4C + 5\n\nSince we know that Frankie has 2 dogs, we can substitute 2 for 6 in the equation:\n\n4C + 5 = total number of pets\n4C + 5 = C + 6 + (C - 1) + 2\n\nNow, let's solve for C:\n\n4C + 5 = 2C + 5 + C + 6 + 2\n4C - 2C = 13\n2C = 13\nC = 13/2\n

In [15]:
# Print score, latency and reported batch size for every benchmarked batch size.

# In the recorded run the score falls from 0.64 at batch size 1 to 0.1 at batch
# sizes 16 and 32, so llmsearch should be run with a batch size of 1.

for batch_size, score_dict in score_at_diff_batch_sizes.items():
    summary_lines = [
        f"Batch Size - {batch_size}",
        f"Score - {score_dict['score']}",
        f"Latency - {score_dict['latency_mins']} mins",
        f"Optimal Batch Size - {score_dict['optimal_batch_size']}",
        "\n\n",
    ]
    print("\n".join(summary_lines))

Batch Size - 1
Score - 0.64
Latency - 2.7505770603815716 mins
Optimal Batch Size - 1



Batch Size - 2
Score - 0.5
Latency - 1.8359658439954123 mins
Optimal Batch Size - 2



Batch Size - 4
Score - 0.32
Latency - 1.1483861525853476 mins
Optimal Batch Size - 4



Batch Size - 8
Score - 0.22
Latency - 0.6280998746554057 mins
Optimal Batch Size - 8



Batch Size - 16
Score - 0.1
Latency - 0.45137432018915813 mins
Optimal Batch Size - 16



Batch Size - 32
Score - 0.1
Latency - 0.507991349697113 mins
Optimal Batch Size - 32





In [None]:
# NOTE(review): displays the whole results dict, including every stored output string;
# inspecting individual keys keeps the cell output readable.
score_at_diff_batch_sizes

In [None]:
# NOTE(review): stray copy of the assignment made inside the benchmarking loop. Run on its own,
# it overwrites the entry for whatever `batch_size` currently holds with the latest
# `scores` / `outputs` / `start` / `end` left in the kernel.
score_at_diff_batch_sizes[batch_size] = {
        'score' : scores,
        'outputs' : outputs,
        'optimal_batch_size' : tuner_ob.estimator._optimal_batch_size,
        'latency_mins' : (end  - start) / 60,
    }

In [None]:






# NOTE(review): in the visible notebook `scores_before` is only assigned in a later cell,
# so this fails on a fresh Restart & Run All.
print(scores_before)

In [None]:
from llmsearch.tuner import Tuner

# Free memory, then rebuild the tuner with a batch size of 1.
# Unlike the benchmarking loop, no after-inference reset callback is registered here.
cm()
batch_size = 1

tuner_ob = Tuner(
    model = model,
    tokenizer = tokenizer,
    dataset = bm_samples,
    device = 'cuda:0',
    batch_size = batch_size,
    tokenizer_encoding_kwargs={'padding': 'longest', 'add_special_tokens' : False},
    tokenizer_decoding_kwargs={'spaces_between_special_tokens' : False},
    scorer = get_score,
    prompt_template = langchain.PromptTemplate.from_template("{X}"),
    is_encoder_decoder = False,
    seed = seed,
    column_mapping = {'input_cols' : ["X"],'eval_cols' : ['answer']},
)

In [None]:
# NOTE(review): the benchmarking loop reads this value as `tuner_ob.estimator._optimal_batch_size`;
# confirm the attribute also exists directly on the Tuner.
tuner_ob._optimal_batch_size

In [None]:






# NOTE(review): `scores_before` is only assigned in a later cell of the visible notebook;
# run that cell first or move this one below it.
print(scores_before)

In [None]:
# NOTE(review): depends on `scores_before`, which is assigned in a later cell of the visible notebook.
scores_before

In [None]:
# decoder parser is working as expected
# TODO : check scores at different batch sizes, then run llmsearch

# NOTE(review): this cell uses MultiTokenEOSCriteria, whereas the benchmarking loop uses
# MultiTokenEOSCriteria2; the criteria object is not kept in a variable, so its `reset`
# cannot be called after inference.
stopping_criteria = StoppingCriteriaList([MultiTokenEOSCriteria(sequence_ids = [32000])])

gen_params1 = {
    'max_new_tokens' : 500,
    'stopping_criteria' : stopping_criteria,
    'generation_seed' : 42,
}

# Score the benchmark samples with the tuner currently bound to `tuner_ob`.
scores_before, outputs_before = tuner_ob.get_score(gen_params1)

scores_before