In [1]:
"""

record inf time of different model loaders

exllama
wget https://github.com/turboderp/exllamav2/releases/download/v0.0.14/exllamav2-0.0.14+cu121-cp310-cp310-linux_x86_64.whl
pip install -q exllamav2-0.0.14+cu121-cp310-cp310-linux_x86_64.whl


pip install auto-gptq

"""

# Autocompletion
%config Completer.use_jedi = False

# Autoreload
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('/workspace/llmsearch')

import gc
import torch
import ctypes
import json
import nltk
import math
import torch
import random
import evaluate
import datasets
import langchain
import numpy as np
import pandas as pd
import transformers
from transformers import GPTQConfig, BitsAndBytesConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers import PreTrainedModel, PretrainedConfig, GenerationConfig, StoppingCriteria, AutoTokenizer, StoppingCriteriaList, AutoModel, AutoModelForCausalLM

import os
import gc
import ctypes
import traceback
from pathlib import Path
from typing import Any, Dict, Optional, Union, List

import time
import textwrap
from tqdm.auto import tqdm

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Config
)

from datasets import load_dataset
from llmsearch.model_downloader import download_model_from_hf
from llmsearch.utils.model_utils import batcher, decoder_parser
from auto_gptq.modeling._base import BaseGPTQForCausalLM

def pretty_print_dict(d, indent = 4):
    print(json.dumps(d, indent = indent, default = str))


Monkey Patching .generate function of `transformers` library


In [2]:
gsm8k_dataset = load_dataset("gsm8k", 'main')

torch.__version__

'2.2.0+cu121'

In [3]:
gsm8k_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [4]:
import exllamav2

In [5]:
version_dict = {
    'exllama_v2' : exllamav2.__version__,
    'torch' : torch.__version__,
    'transformers' : transformers.__version__,
}

version_dict

{'exllama_v2': '0.0.14', 'torch': '2.2.0+cu121', 'transformers': '4.38.2'}

In [6]:

def seed_everything(seed):
    """Seed for reproducibilty"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False


class SingleTokenStoppingCriteria(StoppingCriteria):
    """End generation if end token is encountered
    does not support batched implementation yet"""

    def __init__(self, token_id):
      super().__init__()
      self.token_id =  token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        res = []

        last_token_id = input_ids[0][-1]
        if last_token_id == self.token_id:
            return True
        return False


def cm():
    gc.collect()
    ctypes.CDLL("libc.so.6").malloc_trim(0)
    torch.cuda.empty_cache()

def seed_everything(seed):
    """Seed for reproducibilty"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False



def perform_single_example_inference(example, model, tokenizer,gen_kwargs):

    tokenized_input = tokenizer(example, return_tensors = "pt", add_special_tokens = False)
    tokenized_input['input_ids'] = tokenized_input['input_ids'].to('cuda:0')

    tokenized_input['attention_mask'] = tokenized_input['attention_mask'].to('cuda:0')
    # tokenized_input.to(device)
    # print(tokenized_input)

    model_out = model.generate(**tokenized_input, **gen_kwargs)
    prompt_tokens = len(tokenized_input['input_ids'][0])
    print(f"Prompt tokens - {prompt_tokens}")
    # print(model_out.tolist()[0])

    output_token_ids = model_out.tolist()[0]
    decoded_output = tokenizer.decode(output_token_ids, spaces_between_special_tokens = False)

    print(decoded_output)
    completion_tokens = len(output_token_ids) - prompt_tokens

    print(f"Completion Tokens - {completion_tokens}")

    return decoded_output, prompt_tokens, completion_tokens

In [7]:
# loaders

class MultiTokenEOSCriteria(transformers.StoppingCriteria):
    """Criteria to stop on the specified multi-token sequence.

    This code is not thread safe. The same object cannot be used simultaneously in multiple threads.
    """

    def __init__(
        self,
        sequence_ids : List[int],
    ) -> None:
        self.sequence_ids = torch.tensor(sequence_ids, dtype = torch.int32, device = "cuda:0")
        # we look back for 2 more tokens than it takes to encode our stop sequence
        # because tokenizers suck, and a model might generate `['\n', '\n']` but our `sequence` is `['\n\n']`
        # and we don't want to mistakenly not stop a generation because our
        # (string) stop sequence was output in a different tokenization
        # NOTE: there is a minor danger that this will end up looking back 2 tokens into the past, into the inputs to the model,
        # and stopping generation immediately as a result. With only 2 extra tokens of lookback, this risk is minimized
        # Additionally, in lookback_ids_batch we should prevent ever looking back into the inputs as described.
        self.sequence_id_len = self.sequence_ids.shape[0] + 2
        self.state_initialized = False
        self.input_length = None
        self.state_initialized = False

    def set_state(self, batch_size, input_length):
        self.batch_size = batch_size
        self.input_length = input_length
        self.done_tracker = [False] * batch_size
        self.state_initialized = True

    def reset(self):
        self.batch_size = None
        self.input_length = None
        self.state_initialized = False


    def __call__(self, input_ids, scores, **kwargs) -> bool:
        # For efficiency, we compare the last n tokens where n is the number of tokens in the stop_sequence

        ret_val = False

        if not self.state_initialized:
            # 1st call to __call__ for this batch
            self.set_state(input_ids.shape[0], input_ids.shape[1])

        # IDs of all the tokens except the prompt
        lookback_ids_batch = input_ids[:, self.input_length :]
        # look back for 2 more tokens than it takes to encode our stop sequence
        lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :]

        # no elements yet to look back
        if lookback_ids_batch.nelement() == 0:
            return False

        for i, done in enumerate(self.done_tracker):
            if not done:
                # look back only as far as the last token of the stop sequence
                self.done_tracker[i] = self.sequence_ids == lookback_ids_batch[i][-(self.sequence_ids.shape[0]):]
        ret_val = False not in self.done_tracker
        if ret_val:
            # print(f"finish, ", self.sequence_ids, lookback_ids_batch)
            self.reset()
        return ret_val


# exllama 2 backend loader
class Exllamav2HF(PreTrainedModel):
    def __init__(self, config: ExLlamaV2Config):
        super().__init__(PretrainedConfig())
        self.ex_config = config
        self.ex_model = ExLlamaV2(config)
        split = None
        if shared.args.gpu_split:
            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

        self.ex_model.load(split)
        self.generation_config = GenerationConfig()
        self.loras = None

        if shared.args.cache_8bit:
            self.ex_cache = ExLlamaV2Cache_8bit(self.ex_model)
        else:
            self.ex_cache = ExLlamaV2Cache(self.ex_model)

        self.past_seq = None
        if shared.args.cfg_cache:
            if shared.args.cache_8bit:
                self.ex_cache_negative = ExLlamaV2Cache_8bit(self.ex_model)
            else:
                self.ex_cache_negative = ExLlamaV2Cache(self.ex_model)

            self.past_seq_negative = None

    def _validate_model_class(self):
        pass

    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
        pass

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        return {'input_ids': input_ids, **kwargs}

    @property
    def device(self) -> torch.device:
        return torch.device(0)

    def __call__(self, *args, **kwargs):
        use_cache = kwargs.get('use_cache', True)
        labels = kwargs.get('labels', None)
        past_key_values = kwargs.get('past_key_values', None)

        if len(args) > 0:
            if not shared.args.cfg_cache:
                print("Please enable the cfg-cache option to use CFG with ExLlamav2_HF.")
                return

            input_ids = args[0]
            is_negative = True
            past_seq = self.past_seq_negative
            ex_cache = self.ex_cache_negative
        else:
            input_ids = kwargs['input_ids']
            is_negative = False
            past_seq = self.past_seq
            ex_cache = self.ex_cache

        seq = input_ids[0].tolist()
        if is_negative and past_key_values is not None:
            seq = past_key_values + seq

        seq_tensor = torch.tensor(seq)
        reset = True

        # Make the forward call
        if labels is None:
            if past_seq is not None:
                min_length = min(past_seq.shape[0], seq_tensor.shape[0])
                indices = torch.nonzero(~torch.eq(past_seq[:min_length], seq_tensor[:min_length]))
                if len(indices) > 0:
                    longest_prefix = indices[0].item()
                else:
                    longest_prefix = min_length

                if longest_prefix > 0:
                    reset = False
                    ex_cache.current_seq_len = longest_prefix
                    if len(seq_tensor) - longest_prefix > 1:
                        self.ex_model.forward(seq_tensor[longest_prefix:-1].view(1, -1), ex_cache, preprocess_only=True, loras=self.loras)
                    elif len(seq_tensor) == longest_prefix:
                        # Very tricky: if the prefix we are reusing *is* the input_ids, then we have to back up the cache pointer by one,
                        # because we feed input_ids[-1] to forward() below, but that last token is already in the cache!
                        ex_cache.current_seq_len -= 1

            if reset:
                ex_cache.current_seq_len = 0
                if len(seq_tensor) > 1:
                    self.ex_model.forward(seq_tensor[:-1].view(1, -1), ex_cache, preprocess_only=True, loras=self.loras)

            logits = self.ex_model.forward(seq_tensor[-1:].view(1, -1), ex_cache, loras=self.loras).to(input_ids.device).float()
        else:
            ex_cache.current_seq_len = 0
            logits = self.ex_model.forward(seq_tensor.view(1, -1), ex_cache, last_id_only=False, loras=self.loras).float()

        if is_negative:
            self.past_seq_negative = seq_tensor
        else:
            self.past_seq = seq_tensor

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, logits.shape[-1])
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

        return CausalLMOutputWithPast(logits=logits, past_key_values=seq if use_cache else None, loss=loss)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
        assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported"
        if isinstance(pretrained_model_name_or_path, str):
            pretrained_model_name_or_path = Path(pretrained_model_name_or_path)


        config = ExLlamaV2Config()
        config.model_dir = str(pretrained_model_name_or_path)
        config.prepare()

        config.max_seq_len = shared.args.max_seq_len
        config.scale_pos_emb = shared.args.compress_pos_emb
        config.scale_alpha_value = shared.args.alpha_value
        config.no_flash_attn = shared.args.no_flash_attn

        return Exllamav2HF(config)

# not working as expected, current_seq_len is somehow linked to past_len
# look at batch inference of exllama and undestand

# TODO : understand & incorprorate inputs from this issue
class Exllamav2HFBatched(PreTrainedModel):
    # TODO : incorporate code f
    def __init__(self, config: ExLlamaV2Config):
        super().__init__(PretrainedConfig())
        self.ex_config = config
        self.ex_model = ExLlamaV2(config)
        split = None
        if shared.args.gpu_split:
            split = [float(alloc) for alloc in shared.args.gpu_split.split(",")]

        self.ex_model.load(split)
        self.generation_config = GenerationConfig()
        self.loras = None

        self.past_seq = None

    def _validate_model_class(self):
        pass

    def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
        pass

    def prepare_inputs_for_generation(self, input_ids, **kwargs):
        return {'input_ids': input_ids, **kwargs}

    @property
    def device(self) -> torch.device:
        return torch.device(0)

    def __call__(self, *args, **kwargs):
        input_ids = kwargs['input_ids']
        past_key_values = kwargs.get('past_key_values')
        attention_mask = kwargs.get('attention_mask')
        use_cache = kwargs.get('return_dict')
        return_dict = kwargs.get('use_cache')
        loss = None

        if past_key_values is None:
            past_key_values = ExLlamaV2Cache(self.ex_model, input_ids.shape[0],-1)
            # process prompt
            self.ex_model.forward(input_ids[..., :-1], past_key_values, input_mask = attention_mask)

        logits = self.ex_model.forward(input_ids[..., -1:], past_key_values,input_mask = attention_mask).to(input_ids.device)

        if not return_dict:
            output = (logits, past_key_values if use_cache else None)
            return (loss, ) + output if loss is not None else output

        return CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values if use_cache else None, loss=loss)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs):
        assert len(model_args) == 0 and len(kwargs) == 0, "extra args is currently not supported"
        if isinstance(pretrained_model_name_or_path, str):
            pretrained_model_name_or_path = Path(pretrained_model_name_or_path)


        config = ExLlamaV2Config()
        config.model_dir = str(pretrained_model_name_or_path)
        config.prepare()

        config.max_seq_len = shared.args.max_seq_len
        config.scale_pos_emb = shared.args.compress_pos_emb
        config.scale_alpha_value = shared.args.alpha_value
        config.no_flash_attn = shared.args.no_flash_attn

        return Exllamav2HFBatched(config)



class Shared:
    class Args:
        def __init__(self):
            self.gpu_split = None

    def __init__(self):
        self.args = Shared.Args()

shared = Shared()
shared.args.gpu_split = None
shared.args.cache_8bit = None
shared.args.cfg_cache = None
# shared.args.model_dir = "/kaggle/input/"
shared.args.max_seq_len = 2048
shared.args.compress_pos_emb = 1
shared.args.alpha_value = 1
shared.args.no_flash_attn = 1


def load_model_with_exllama_2_hf_backend(model_loader_kwargs, tokenizer_kwargs):
    model = Exllamav2HFBatched.from_pretrained(model_loader_kwargs['pretrained_model_name_or_path'])

    tokenizer = AutoTokenizer.from_pretrained(**tokenizer_kwargs, local_files_only=True)

    # make this dynamic
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

def load_model_with_hf_backend(model_loader_kwargs, tokenizer_kwargs):
    model = AutoModelForCausalLM.from_pretrained(**model_loader_kwargs, local_files_only=True)
    tokenizer = AutoTokenizer.from_pretrained(**tokenizer_kwargs, local_files_only=True)

    # make this dynamic
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

from auto_gptq import AutoGPTQForCausalLM


def load_model_with_autogptq_backend(model_loader_kwargs, tokenizer_kwargs):
    model_name_or_path = model_loader_kwargs.pop('pretrained_model_name_or_path')
    model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,**model_loader_kwargs, local_files_only=True)
    tokenizer = AutoTokenizer.from_pretrained(**tokenizer_kwargs, local_files_only=True)

    # make this dynamic
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

model_loader_backend_map = {
    "exllama_2_hf": load_model_with_exllama_2_hf_backend,
    "hf": load_model_with_hf_backend,
    'auto_gptq' : load_model_with_autogptq_backend,
}

def preprocess_dataset(dataset, tokenizer, encoding_kwargs, decoding_kwargs, pt, pt_cols, system_prompt, add_generation_prompt = True):

    def wrapper(sample):
        """Takes in a sample, formats it using prompt template, applies chat template and returns the formatted string"""
        messages = [] if system_prompt is None else [{"role": "system", "content": system_prompt}]
        formatted_pt = pt.format(**{pt_col : sample[pt_col] for pt_col in pt_cols})
        messages.append(
            {
                "role": "user",
                "content": formatted_pt,
            }
        )
        formatted_pt_with_ct = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt=add_generation_prompt)
        return formatted_pt_with_ct

    def actual_input(sample):
        """Takes in a sample, formats it using prompt template, applies chat template and returns the formatted string"""
        return sample[pt_cols[0]]



    pt_dataset = dataset.map(
        lambda sample : {
            "X" : wrapper(sample),
            'actual input' : actual_input(sample),
        }
    )

    return pt_dataset



def perform_inference_batched(model, tokenizer, model_inputs, gen_kwargs, tokenizer_encoding_kwargs,tokenizer_decoding_kwargs, batch_size, seed):

    batch_latency = []
    outputs = []

    batch_prompt_tokens = []
    batch_completion_tokens = []
    batch_tps = []
    batch_inputs = []



    seed_everything(seed)

    for batch in tqdm(batcher(iterable = model_inputs, batch_size = batch_size), total = math.ceil(len(model_inputs) / batch_size)):
        model_input = [f"{x['X']}" for x in batch]
        batch_inputs.extend(model_input)

        encoded_input = tokenizer(text = model_input, **tokenizer_encoding_kwargs, return_tensors = "pt")

        prompt_tokens = encoded_input['input_ids'].shape[1]

        input_ids = encoded_input['input_ids'].to('cuda:0')
        attention_mask = encoded_input['attention_mask'].to('cuda:0')

        start = time.time()
        if isinstance(model, BaseGPTQForCausalLM):
            # https://github.com/huggingface/optimum/blob/fd47a73267c3a71ea4e3c02f92260ae61c5ae372/tests/benchmark/benchmark_gptq.py#L183C9-L183C51
            output_ids = model.model.generate(input_ids, attention_mask = attention_mask, **gen_kwargs)
        else:
            output_ids = model.generate(input_ids, attention_mask = attention_mask, **gen_kwargs)

        # print(f"output ids - {output_ids.shape}")

        end = time.time()

        latency = end - start

        decoded_output = tokenizer.batch_decode(output_ids, **tokenizer_decoding_kwargs)

        # remove prompt
        decoded_output = decoder_parser(outputs = decoded_output, formatted_prompts = model_inputs, prepoc = lambda x : x.strip())
        output_tokens = output_ids.shape[1]

        batch_latency.append(latency)



        completion_tokens = output_tokens - prompt_tokens

        tps = completion_tokens / latency

        batch_prompt_tokens.append(prompt_tokens)
        batch_completion_tokens.append(completion_tokens)

        batch_tps.append(tps)

        print(f"latency - {latency}, prompt tokens - {prompt_tokens}, output tokens - {output_tokens}, completion tokens - {completion_tokens}, batch size - {batch_size}, tps - {tps}\n")

        outputs.extend(decoded_output)

    output_dict = {
        'inputs' : batch_inputs,
        'outputs' : outputs,
        'batch_latency' : batch_latency,
        'batch_prompt_tokens' : batch_prompt_tokens,
        'batch_completion_tokens' : batch_completion_tokens,
        'batch_tps' : batch_tps,
        'avg_tps' : sum(batch_tps) / len(batch_tps),
        'avg_latency' : sum(batch_latency) / len(batch_latency),
        'avg_prompt_tokens' : sum(batch_prompt_tokens) / len(batch_prompt_tokens),
        'avg_completion_tokens' : sum(batch_completion_tokens) / len(batch_completion_tokens),
        'total_time' : sum(batch_latency),
    }

    return output_dict

from auto_gptq import exllama_set_max_input_length



def benchmark_model(model_loader_kwargs, tokenizer_loader_kwargs, model_id, model_backend, tokenizer_encoding_kwargs,
                    tokenizer_decoding_kwargs, dataset, pt, pt_cols, system_prompt, gen_kwargs,exp_name, add_generation_prompt = True, batch_size = 1, seed = 42, bm_sample_size = 20, model_branch = "main"):

    cm()

    artifacts = {}

    benchmark_dict = {
        'model_id' : model_id,
        'model_loader_kwargs' : model_loader_kwargs,
        'tokenizer_loader_kwargs' : tokenizer_loader_kwargs,
        'model_backend' : model_backend,
        'model_branch' : model_branch,
        'tokenizer_encoding_kwargs' : tokenizer_encoding_kwargs,
        'tokenizer_decoding_kwargs' : tokenizer_decoding_kwargs,
        'pt' : pt,
        'pt_cols' : pt_cols,
        'system_prompt' : system_prompt,
        'batch_size' : batch_size,
        'seed' : seed,
        'bm_sample_size' : bm_sample_size,
        'gen_kwargs' : gen_kwargs,
        'exp_name' : exp_name
    }


    # 1. download model
    temp_model_dir = Path(f"/workspace/temp_model_dir/")
    temp_model_dir.mkdir(exist_ok = True, parents = True)
    output_folder = download_model_from_hf(model_id, save_dir = temp_model_dir, branch = model_branch)
    model_loader_kwargs['pretrained_model_name_or_path'] = output_folder
    tokenizer_loader_kwargs['pretrained_model_name_or_path'] = output_folder

    # 2. load model
    start = time.time()
    model, tokenizer = model_loader_backend_map[model_backend](model_loader_kwargs, tokenizer_loader_kwargs)

    # required for hf backend(uses exllama internally),gptq model with batch size - 8, has buffer issue
    # model = exllama_set_max_input_length(model, max_input_length=2500)

    end = time.time()

    artifacts['model'] = model
    artifacts['tokenizer'] = tokenizer

    # 3. process datset
    processed_dataset = preprocess_dataset(dataset, tokenizer, tokenizer_encoding_kwargs, tokenizer_decoding_kwargs, pt, pt_cols, system_prompt = system_prompt, add_generation_prompt = add_generation_prompt)

    # show 2 samples of processed dataset
    print("Processed Dataset:\n")
    for i in range(2):
        print(processed_dataset[i]['X'])
        print('\n')
        print('---' * 10)
        print('\n')

    model_loading_time = end - start
    benchmark_dict['model_loading_time'] = model_loading_time

    bm_samples = processed_dataset.shuffle(seed = seed).select(range(bm_sample_size))

    # 4. perform inference
    output_dict = perform_inference_batched(model, tokenizer, bm_samples, gen_kwargs, tokenizer_encoding_kwargs, tokenizer_decoding_kwargs, batch_size, seed)

    benchmark_dict['model_out'] = output_dict

    return artifacts, benchmark_dict


def append_dict_to_csv(data_dict, file_path):
    """
    Appends a dictionary as a new row to a CSV file. Creates the file with headers if it doesn't exist,
    or appends to it without headers if it does.

    Parameters:
    - data_dict: Dict. A dictionary representing a single row of data.
    - file_path: str. The path to the CSV file.
    """

    data_to_dump = data_dict.pop('model_out')

    data_to_dump = {
        **data_to_dump,
        **data_dict,
    }

    # Convert the dictionary to a DataFrame
    df = pd.DataFrame([data_to_dump])

    # Check if the file exists to determine if headers should be written
    file_exists = os.path.isfile(file_path)

    # Write or append the DataFrame to the CSV file
    df.to_csv(file_path, mode='a', header=not file_exists, index=False)

    data_dict = data_to_dump

    return df




model_loader_kwargs = {
    'device_map' : {'' : 0},
    'quantization_config' : BitsAndBytesConfig(
           load_in_4bit=True,
           bnb_4bit_quant_type="nf4",
           bnb_4bit_use_double_quant=True,
           bnb_4bit_compute_dtype=torch.bfloat16
    ),
}
tokenizer_loader_kwargs = {
    'use_fast' : False,
    'legacy' : False,
    'padding_side' : 'left',
}
tokenizer_encoding_kwargs = {
    # pad to longest seq in batch
    'padding' : True,
}
tokenizer_decoding_kwargs = {}

# model_id = "TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ"
model_id = "argilla/CapybaraHermes-2.5-Mistral-7B"
model_backend = "hf"
dataset = gsm8k_dataset['train']
pt = textwrap.dedent("""\
    Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
    A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

    Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
    A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

    Q: {question}""")
pt_cols = ['question']
system_prompt = "Solve the following math problems, end with The answer is"


# stopping_criteria = StoppingCriteriaList([SingleTokenStoppingCriteria(token_id=32000)])
stopping_criteria = StoppingCriteriaList([MultiTokenEOSCriteria(sequence_ids = [32000])])

gen_kwargs = {
    'max_new_tokens' : 500,
    'stopping_criteria' : stopping_criteria
}
bm_sample_size = 100
batch_size = 1

exp_name = f"model with hf backend and bnb quant - 1"

artifacts, benchmark_dict = benchmark_model(
    model_loader_kwargs=model_loader_kwargs,
    tokenizer_loader_kwargs=tokenizer_loader_kwargs,
    model_id=model_id,
    model_backend=model_backend,
    dataset = dataset,
    tokenizer_encoding_kwargs=tokenizer_encoding_kwargs,
    tokenizer_decoding_kwargs=tokenizer_decoding_kwargs,
    pt=pt,
    pt_cols=pt_cols,
    system_prompt=system_prompt,
    gen_kwargs=gen_kwargs,
    batch_size=batch_size,
    bm_sample_size=bm_sample_size,
    exp_name = exp_name,
)
file_path = Path('./model-benchmark.csv')

benchmark_dict = {
    **version_dict,
    **benchmark_dict,
}

df = append_dict_to_csv(benchmark_dict, file_path)


Downloading the model to /workspace/temp_model_dir/argilla_CapybaraHermes-2.5-Mistral-7B


100%|██████████| 6.75k /6.75k  25.5MiB/s
100%|██████████| 115   /115    476kiB/s
100%|██████████| 654   /654    1.63MiB/s
100%|██████████| 51.0  /51.0   206kiB/s
100%|██████████| 23.9k /23.9k  36.0MiB/s
100%|██████████| 420   /420    1.49MiB/s
  0%|          | 1.05M /4.94G  9.65MiB/s
[A

  0%|          | 7.34M /4.94G  34.6MiB/s
[A

  0%|          | 13.6M /4.94G  44.8MiB/s
[A

  0%|          | 22.0M /4.94G  58.0MiB/s
[A

  1%|          | 31.5M /4.94G  68.9MiB/s
[A

100%|██████████| 1.80M /1.80M  3.16MiB/s
  1%|          | 38.8M /4.94G  64.1MiB/s
[A

[A[A
  1%|          | 46.1M /4.94G  58.7MiB/s

[A[A
  1%|          | 53.5M /4.94G  58.3MiB/s

[A[A
  1%|          | 60.8M /4.94G  58.2MiB/s

100%|██████████| 493k  /493k   8.45MiB/s

[A

  1%|▏         | 73.4M /4.94G  59.5MiB/s
[A

100%|██████████| 1.60k /1.60k  3.28MiB/s
  2%|▏         | 79.7M /4.94G  58.3MiB/s
[A

  2%|▏         | 86.0M /4.94G  53.7MiB/s
[A

  2%|▏         | 93.3M /4.94G  57.4MiB/s
[A

[A[A
  2%|▏       

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Processed Dataset:

<|im_start|>system
Solve the following math problems, end with The answer is<|im_end|>
<|im_start|>user
Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

Q: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>
<|im_start|>assistant



------------------------------


<|im_start|>system
Solve the following math problems, end with The answer is<|im_end|>
<|im_start|>user
Q: There are 15 trees in the grove

  0%|          | 0/100 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.231867551803589, prompt tokens - 245, output tokens - 344, completion tokens - 99, batch size - 1, tps - 15.886088588540048



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 13.114059448242188, prompt tokens - 242, output tokens - 501, completion tokens - 259, batch size - 1, tps - 19.749796088862205



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 11.79963755607605, prompt tokens - 290, output tokens - 517, completion tokens - 227, batch size - 1, tps - 19.2378790383362



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.028794288635254, prompt tokens - 256, output tokens - 374, completion tokens - 118, batch size - 1, tps - 19.572736164250816



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.587373733520508, prompt tokens - 276, output tokens - 408, completion tokens - 132, batch size - 1, tps - 20.038334750661686



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 10.82160472869873, prompt tokens - 298, output tokens - 514, completion tokens - 216, batch size - 1, tps - 19.960071118396268



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.413543939590454, prompt tokens - 245, output tokens - 344, completion tokens - 99, batch size - 1, tps - 18.287465864272555



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 10.77283239364624, prompt tokens - 294, output tokens - 507, completion tokens - 213, batch size - 1, tps - 19.771958962772526



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 4.217836141586304, prompt tokens - 243, output tokens - 324, completion tokens - 81, batch size - 1, tps - 19.204159972306645



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 9.48109769821167, prompt tokens - 272, output tokens - 463, completion tokens - 191, batch size - 1, tps - 20.145346676052768



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.498809337615967, prompt tokens - 247, output tokens - 354, completion tokens - 107, batch size - 1, tps - 16.464554419325687



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.466162443161011, prompt tokens - 225, output tokens - 371, completion tokens - 146, batch size - 1, tps - 19.554891969131436



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 3.258819103240967, prompt tokens - 227, output tokens - 291, completion tokens - 64, batch size - 1, tps - 19.63901584360746



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.882099390029907, prompt tokens - 281, output tokens - 426, completion tokens - 145, batch size - 1, tps - 18.396114134694997



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 8.479504585266113, prompt tokens - 262, output tokens - 422, completion tokens - 160, batch size - 1, tps - 18.86902688607706



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 15.717321395874023, prompt tokens - 249, output tokens - 561, completion tokens - 312, batch size - 1, tps - 19.850710699464578



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.448943614959717, prompt tokens - 237, output tokens - 364, completion tokens - 127, batch size - 1, tps - 19.693147836708647



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.9901816844940186, prompt tokens - 239, output tokens - 377, completion tokens - 138, batch size - 1, tps - 19.741976135773225



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 10.553161144256592, prompt tokens - 296, output tokens - 505, completion tokens - 209, batch size - 1, tps - 19.80449243056857



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.273419618606567, prompt tokens - 274, output tokens - 376, completion tokens - 102, batch size - 1, tps - 19.34228780886437



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.3793535232543945, prompt tokens - 250, output tokens - 386, completion tokens - 136, batch size - 1, tps - 18.42979870410412



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.612804174423218, prompt tokens - 260, output tokens - 381, completion tokens - 121, batch size - 1, tps - 18.29783504976599



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.551408052444458, prompt tokens - 235, output tokens - 382, completion tokens - 147, batch size - 1, tps - 19.466568218680063



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 23.774986743927002, prompt tokens - 243, output tokens - 727, completion tokens - 484, batch size - 1, tps - 20.357529752298653



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 4.727686166763306, prompt tokens - 258, output tokens - 345, completion tokens - 87, batch size - 1, tps - 18.402236724516428



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.8085081577301025, prompt tokens - 236, output tokens - 347, completion tokens - 111, batch size - 1, tps - 19.109898270914627



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.0274317264556885, prompt tokens - 242, output tokens - 325, completion tokens - 83, batch size - 1, tps - 16.509423601564162



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.564543008804321, prompt tokens - 240, output tokens - 380, completion tokens - 140, batch size - 1, tps - 18.507396922332905



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.57642674446106, prompt tokens - 263, output tokens - 371, completion tokens - 108, batch size - 1, tps - 19.367240878269225



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 17.438334226608276, prompt tokens - 269, output tokens - 600, completion tokens - 331, batch size - 1, tps - 18.981170775758144



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 3.403900384902954, prompt tokens - 240, output tokens - 306, completion tokens - 66, batch size - 1, tps - 19.38952158903489



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.398918390274048, prompt tokens - 267, output tokens - 385, completion tokens - 118, batch size - 1, tps - 18.44061649221102



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 4.955189943313599, prompt tokens - 252, output tokens - 336, completion tokens - 84, batch size - 1, tps - 16.951923329063774



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.9259631633758545, prompt tokens - 275, output tokens - 410, completion tokens - 135, batch size - 1, tps - 19.4918738109774



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.24393630027771, prompt tokens - 241, output tokens - 363, completion tokens - 122, batch size - 1, tps - 19.53895653845377



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.56210732460022, prompt tokens - 245, output tokens - 354, completion tokens - 109, batch size - 1, tps - 19.59688902763746



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 12.522862911224365, prompt tokens - 256, output tokens - 487, completion tokens - 231, batch size - 1, tps - 18.446261181454954



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.587297677993774, prompt tokens - 265, output tokens - 369, completion tokens - 104, batch size - 1, tps - 18.61364938718339



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.706825256347656, prompt tokens - 248, output tokens - 348, completion tokens - 100, batch size - 1, tps - 17.522877520872186



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.24220085144043, prompt tokens - 255, output tokens - 355, completion tokens - 100, batch size - 1, tps - 19.07595737628451



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 4.975024700164795, prompt tokens - 235, output tokens - 330, completion tokens - 95, batch size - 1, tps - 19.095382581086117



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.599429130554199, prompt tokens - 252, output tokens - 377, completion tokens - 125, batch size - 1, tps - 18.94103225099758



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.855531930923462, prompt tokens - 258, output tokens - 370, completion tokens - 112, batch size - 1, tps - 19.12721189487848



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.016924619674683, prompt tokens - 239, output tokens - 335, completion tokens - 96, batch size - 1, tps - 19.135228706351388



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 3.9478390216827393, prompt tokens - 251, output tokens - 323, completion tokens - 72, batch size - 1, tps - 18.23782570782496



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.436506509780884, prompt tokens - 270, output tokens - 412, completion tokens - 142, batch size - 1, tps - 19.094987654920242



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.929550409317017, prompt tokens - 284, output tokens - 396, completion tokens - 112, batch size - 1, tps - 18.888447229323834



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.6468048095703125, prompt tokens - 219, output tokens - 365, completion tokens - 146, batch size - 1, tps - 19.09294190656921



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 4.921313285827637, prompt tokens - 256, output tokens - 344, completion tokens - 88, batch size - 1, tps - 17.88140581365177



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.2699995040893555, prompt tokens - 254, output tokens - 382, completion tokens - 128, batch size - 1, tps - 17.6066036769329



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.820496559143066, prompt tokens - 237, output tokens - 349, completion tokens - 112, batch size - 1, tps - 19.24234450823031



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 13.454994201660156, prompt tokens - 348, output tokens - 597, completion tokens - 249, batch size - 1, tps - 18.506139524703542



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 10.783504724502563, prompt tokens - 238, output tokens - 429, completion tokens - 191, batch size - 1, tps - 17.71223780020282



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.514626741409302, prompt tokens - 263, output tokens - 387, completion tokens - 124, batch size - 1, tps - 19.034091272154026



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 11.869104146957397, prompt tokens - 262, output tokens - 475, completion tokens - 213, batch size - 1, tps - 17.945752043518954



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 4.419710159301758, prompt tokens - 236, output tokens - 319, completion tokens - 83, batch size - 1, tps - 18.77951200607975



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 10.236937999725342, prompt tokens - 231, output tokens - 411, completion tokens - 180, batch size - 1, tps - 17.58338284405253



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 4.255904197692871, prompt tokens - 234, output tokens - 314, completion tokens - 80, batch size - 1, tps - 18.797415609911535



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 4.893036603927612, prompt tokens - 221, output tokens - 314, completion tokens - 93, batch size - 1, tps - 19.00660214259371



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 3.1719722747802734, prompt tokens - 246, output tokens - 304, completion tokens - 58, batch size - 1, tps - 18.28515351825316



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 15.064690589904785, prompt tokens - 271, output tokens - 538, completion tokens - 267, batch size - 1, tps - 17.723563481544268



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.191914796829224, prompt tokens - 257, output tokens - 347, completion tokens - 90, batch size - 1, tps - 17.33464502440685



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.305077791213989, prompt tokens - 227, output tokens - 355, completion tokens - 128, batch size - 1, tps - 17.522058444599864



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.02809476852417, prompt tokens - 242, output tokens - 375, completion tokens - 133, batch size - 1, tps - 18.924047608983606



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 3.343905448913574, prompt tokens - 229, output tokens - 292, completion tokens - 63, batch size - 1, tps - 18.840245623711798



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 4.693711042404175, prompt tokens - 232, output tokens - 322, completion tokens - 90, batch size - 1, tps - 19.174593234844924



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.052724838256836, prompt tokens - 256, output tokens - 393, completion tokens - 137, batch size - 1, tps - 19.425116269509978



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.497284173965454, prompt tokens - 219, output tokens - 345, completion tokens - 126, batch size - 1, tps - 19.392718038235206



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.154847621917725, prompt tokens - 250, output tokens - 387, completion tokens - 137, batch size - 1, tps - 19.147857122815942



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 4.648231506347656, prompt tokens - 230, output tokens - 311, completion tokens - 81, batch size - 1, tps - 17.42598230948391



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 9.53200387954712, prompt tokens - 231, output tokens - 393, completion tokens - 162, batch size - 1, tps - 16.995377052626303



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.46451735496521, prompt tokens - 255, output tokens - 357, completion tokens - 102, batch size - 1, tps - 15.778440121543913



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 8.651639938354492, prompt tokens - 240, output tokens - 391, completion tokens - 151, batch size - 1, tps - 17.453338450966513



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.3434998989105225, prompt tokens - 232, output tokens - 333, completion tokens - 101, batch size - 1, tps - 18.901469432158635



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.9507904052734375, prompt tokens - 225, output tokens - 324, completion tokens - 99, batch size - 1, tps - 16.63644545643361



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.557014465332031, prompt tokens - 237, output tokens - 344, completion tokens - 107, batch size - 1, tps - 19.25494357942197



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 10.549602270126343, prompt tokens - 272, output tokens - 460, completion tokens - 188, batch size - 1, tps - 17.82057704036538



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 8.249440431594849, prompt tokens - 284, output tokens - 430, completion tokens - 146, batch size - 1, tps - 17.698170101432456



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 16.51624083518982, prompt tokens - 285, output tokens - 604, completion tokens - 319, batch size - 1, tps - 19.314322380207273



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 8.174191236495972, prompt tokens - 251, output tokens - 410, completion tokens - 159, batch size - 1, tps - 19.451465643487744



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.78066873550415, prompt tokens - 236, output tokens - 330, completion tokens - 94, batch size - 1, tps - 16.26109439945307



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.152137756347656, prompt tokens - 216, output tokens - 351, completion tokens - 135, batch size - 1, tps - 18.87547536122119



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 11.004028081893921, prompt tokens - 252, output tokens - 462, completion tokens - 210, batch size - 1, tps - 19.083920764027763



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 3.837355613708496, prompt tokens - 224, output tokens - 292, completion tokens - 68, batch size - 1, tps - 17.720536443658776



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 8.52479100227356, prompt tokens - 276, output tokens - 418, completion tokens - 142, batch size - 1, tps - 16.65729986367157



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.688625335693359, prompt tokens - 253, output tokens - 379, completion tokens - 126, batch size - 1, tps - 16.387844965609492



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 9.759671926498413, prompt tokens - 250, output tokens - 409, completion tokens - 159, batch size - 1, tps - 16.291531231526367



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 10.12687611579895, prompt tokens - 232, output tokens - 373, completion tokens - 141, batch size - 1, tps - 13.92334599413394



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 10.195412874221802, prompt tokens - 273, output tokens - 458, completion tokens - 185, batch size - 1, tps - 18.14541522568018



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 4.496978282928467, prompt tokens - 249, output tokens - 323, completion tokens - 74, batch size - 1, tps - 16.455494188379898



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.047774791717529, prompt tokens - 266, output tokens - 378, completion tokens - 112, batch size - 1, tps - 18.519208114922666



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.260363578796387, prompt tokens - 264, output tokens - 383, completion tokens - 119, batch size - 1, tps - 19.00848065806409



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.25749397277832, prompt tokens - 231, output tokens - 333, completion tokens - 102, batch size - 1, tps - 16.30045517322522



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 5.85966944694519, prompt tokens - 240, output tokens - 335, completion tokens - 95, batch size - 1, tps - 16.212518617330225



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 4.116188287734985, prompt tokens - 239, output tokens - 314, completion tokens - 75, batch size - 1, tps - 18.22074083041285



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.940778732299805, prompt tokens - 257, output tokens - 384, completion tokens - 127, batch size - 1, tps - 18.297658648731908



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 6.809351921081543, prompt tokens - 258, output tokens - 385, completion tokens - 127, batch size - 1, tps - 18.650820441048424



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 2.6747162342071533, prompt tokens - 239, output tokens - 287, completion tokens - 48, batch size - 1, tps - 17.94582893920644



Setting `pad_token_id` to `eos_token_id`:32000 for open-end generation.


latency - 7.3159499168396, prompt tokens - 243, output tokens - 367, completion tokens - 124, batch size - 1, tps - 16.949268572025225

latency - 4.54368782043457, prompt tokens - 225, output tokens - 309, completion tokens - 84, batch size - 1, tps - 18.48718559013282



In [None]:
benchmark_dict['model_out']

KeyError: 'model_out'

In [None]:
isinstance(, BaseGPTQForCausalLM)

In [8]:
import pandas as pd
import os



file_path = './model-benchmark.csv'

append_dict_to_csv(benchmark_dict, file_path)

KeyError: 'model_out'

In [36]:
pretty_print_dict(benchmark_dict, indent = 4)

{
    "model_id": "TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ",
    "model_loader_kwargs": {
        "device_map": {
            "": 0
        },
        "pretrained_model_name_or_path": "/workspace/temp_model_dir/TheBloke_CapybaraHermes-2.5-Mistral-7B-GPTQ"
    },
    "tokenizer_loader_kwargs": {
        "use_fast": false,
        "legacy": false,
        "pretrained_model_name_or_path": "/workspace/temp_model_dir/TheBloke_CapybaraHermes-2.5-Mistral-7B-GPTQ"
    },
    "model_backend": "exllama_2_hf",
    "model_branch": "main",
    "tokenizer_encoding_kwargs": {
        "padding": true
    },
    "tokenizer_decoding_kwargs": {},
    "pt": "Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.\n\nQ: If there are 3 ca

In [10]:
for out in res[1]['model_out']['outputs']:
    print(out)
    print('\n\n', '---' * 10, '\n\n')

<|im_start|> system
Solve the following math problems, end with The answer is<|im_end|>
 <|im_start|> user
Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

Q: Mimi picked up 2 dozen seashells on the beach.  Kyle found twice as many shells as Mimi and put them in his pocket. Leigh grabbed one-third of the shells that Kyle found.  How many seashells did Leigh have?<|im_end|>
 <|im_start|> assistant
<|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end|><|im_end

In [18]:
gsm8k_dataset

NameError: name 'gsm8k_dataset' is not defined

In [5]:
def load_dataset(samples = 10)
    sampled_dataset = gsm8k_dataset['train'].shuffle(seed=42).select(range(samples))
    return sampled_dataset



In [7]:







def load_model_with_hf_backend(model_id, loading_args, tokenizer_args):
    # model = AutoModelForCausalLM.from_pretrained(model_id, **loading_args)
    # tokenizer = AutoTokenizer.from_pretrained(model_id, *tokenizer_args)

    model = Exllamav2HF.from_pretrained(pretrained_model_name_or_path = model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False, legacy=False)

    return model, tokenizer

def prepare_input(tokenizer,question):
    text = textwrap.dedent("""\
    Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
    A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

    Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
    A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

    Q: {question}""")

    pt = langchain.PromptTemplate.from_template(text)
    formatted_pt = pt.format(question=question)

    messages = [
        {
            "role": "system",
            "content": "You are a friendly assistant who can solve math problems",
        },
        {"role": "user", "content": formatted_pt},
    ]

    ct_sample = tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt=True)
    return ct_sample


def driver(model_id, loading_args, tokenizer_args, gen_params1):

    cm()

    print(model_id)
    print(f"Model loading args - {loading_args}")
    print(f"tokenizer args - {tokenizer_args}\n")

    start = time.time()
    model, tokenizer = load_model_with_hf_backend(model_id, loading_args, tokenizer_args)
    loading_time = (time.time() - start)


    avg_tps = []

    seed = 42
    seed_everything(seed)

    detail_dict = {
        'model_loading_time' : loading_time,
        'input_args' : {
            'model_id' : model_id,
            'loading_args' : loading_args,
            'tokenizer_args' : tokenizer_args,
            'seed' : seed,
        },
        'out' : [],
        'inf_latency' : [],
        'p_tokens' : [],
        'c_tokens' : [],
        'tps' : [],
    }


    for idx, item in tqdm(enumerate(sampled_dataset)):
        prepared_example = prepare_input(tokenizer,item['question'])
        start = time.time()
        out, prompt_tokens, c_tokens = perform_single_example_inference(prepared_example, model,tokenizer, gen_params1)
        inf_latency = (time.time() - start)

        detail_dict['out'].append(out)
        detail_dict['inf_latency'].append(inf_latency)
        detail_dict['p_tokens'].append(prompt_tokens)
        detail_dict['c_tokens'].append(c_tokens)

        tps = c_tokens/inf_latency
        detail_dict['tps'].append(tps)

        # record post warmup (3 eg)
        if idx > 2:
            avg_tps.append(tps)
            print(f"idx  - {idx} , tps - {tps}\n")

            print('\n\n')
            print('----' * 10)
            print('\n\n')


    a_tps = sum(avg_tps)/len(avg_tps)
    print(f"Average tps - {a_tps}")

    print('\n', '----' * 15, '\n\n')

    detail_dict['average_tps'] = a_tps

    return detail_dict



In [8]:
stopping_criteria = StoppingCriteriaList([SingleTokenStoppingCriteria(token_id=32000)])
model_id = "/workspace/capybarahermes-2.5-gptq/TheBloke_CapybaraHermes-2.5-Mistral-7B-GPTQ"
loading_args = {
    'device_map' : {"" : 0},
}
tokenizer_args = {

}
gen_params1 = {
    'max_new_tokens' : 500,
    'stopping_criteria' : stopping_criteria,
    'do_sample' : False,
}
detail_dict = driver(model_id, loading_args, tokenizer_args, gen_params1)

print('\n\n')
print('****' * 20)



/workspace/capybarahermes-2.5-gptq/TheBloke_CapybaraHermes-2.5-Mistral-7B-GPTQ
Model loading args - {'device_map': {'': 0}}
tokenizer args - {}



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


0it [00:00, ?it/s]

Prompt tokens - 242
<|im_start|>system
You are a friendly assistant who can solve math problems<|im_end|>
<|im_start|>user
Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

Q: Mimi picked up 2 dozen seashells on the beach.  Kyle found twice as many shells as Mimi and put them in his pocket. Leigh grabbed one-third of the shells that Kyle found.  How many seashells did Leigh have?<|im_end|>
<|im_start|>assistant
A: Mimi picked up 2 dozen seashells, which is 2 * 12 = 24 seashells.
Kyle found twice as many shells as Mimi, so he found 24

In [9]:
detail_dict

{'model_loading_time': 1.8519587516784668,
 'input_args': {'model_id': '/workspace/capybarahermes-2.5-gptq/TheBloke_CapybaraHermes-2.5-Mistral-7B-GPTQ',
  'loading_args': {'device_map': {'': 0}},
  'tokenizer_args': {},
  'seed': 42},
 'out': ['<|im_start|>system\nYou are a friendly assistant who can solve math problems<|im_end|>\n<|im_start|>user\nQ: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?\nA: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.\n\nQ: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?\nA: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.\n\nQ: Mimi picked up 2 dozen seashells on the beach.  Kyle found twice as many shells as Mimi and put them in his pocket. Leigh grabbed

In [10]:
import json

def json_dump(ob : dict, file_path: Path):
    with open(file_path, 'w', encoding="utf-8") as json_file:
        json.dump(ob, json_file, indent=4)

idx = 1
file_path = f"./{model_id.split('/')[-1]}-{idx}-inf-time.json"
json_dump(detail_dict, file_path)