In [1]:
# Autocompletion: switch off the Jedi-based completer for this IPython session.
%config Completer.use_jedi = False

# Autoreload: load the extension and select mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from ezlife.ml.benchmarker.loaders.loader import Loader
from ezlife.ml.benchmarker.utils.mem_utils import gc_cuda
from exllamav2 import ExLlamaV2, ExLlamaV2Config, ExLlamaV2Cache, ExLlamaV2Tokenizer, Timer
from exllamav2.generator import ExLlamaV2DynamicGenerator


In [3]:
class ExllamaV2Loader(Loader):
    """Benchmark loader that runs a model through ExLlamaV2.

    Construction is inherited unchanged from ``Loader``. ``load`` builds the
    ExLlamaV2 config, model, cache, tokenizer and dynamic generator;
    ``warmup_model`` runs one sample generation; ``run_inference`` is still
    work in progress and currently only counts the prompt's tokens.
    """

    # Sequence length passed to the cache as ``max_seq_len``.
    MAX_SEQ_LEN = 8192

    @property
    def relevant_pkgs(self):
        """Return the names of the packages relevant to this loader."""
        return ['transformers', 'torch', 'exllamav2', 'auto-gptq']

    def load(self):
        """Load the model and build the ExLlamaV2 generation objects.

        Calls ``Loader.load`` first, then sets ``self.config``, ``self.model``,
        ``self.cache``, ``self.tokenizer`` and ``self.generator``.
        ``self.model_dir`` is replaced by its ``str()`` form.
        """
        # NOTE(review): presumably the base class is what sets self.model_dir
        # (and downloads/validates the files) - confirm against Loader.load.
        super().load()
        self.model_dir = str(self.model_dir)
        self.config = ExLlamaV2Config(self.model_dir)
        self.model = ExLlamaV2(self.config)
        # The cache is created with lazy = True and handed to load_autosplit,
        # so the statement order here matters.
        self.cache = ExLlamaV2Cache(
            self.model, max_seq_len = self.MAX_SEQ_LEN, lazy = True
        )
        self.model.load_autosplit(self.cache, progress = False)
        self.tokenizer = ExLlamaV2Tokenizer(self.config)
        self.generator = ExLlamaV2DynamicGenerator(
            model = self.model,
            cache = self.cache,
            tokenizer = self.tokenizer,
        )

    def warmup_model(self):
        """Warm up the generator and print one sample completion.

        Requires ``load`` to have been called, since it uses
        ``self.generator``. The sample generation is run with
        ``self.generate_args``.
        """
        print("Warming up model")

        self.generator.warmup()

        example = "What is the meaning of life?"

        output = self.generator.generate(
            prompt = example,
            **self.generate_args,
        )

        print(output)

        print("model warmed up")

    def run_inference(self, example):
        """Prepare a benchmark run for ``example`` (work in progress).

        Args:
            example: Prompt text to tokenize.

        Returns:
            None. The generation/timing loop has not been written yet.
        """
        gc_cuda()

        # TODO: fill these from the timed generation runs and return them;
        # nothing below uses them yet.
        num_output_tokens = []
        latencies = []
        # The original read an undefined ``inputs`` dict here (NameError).
        # Count the prompt tokens with the loader's own tokenizer instead;
        # [0] selects the first row of the encode() result (presumably one
        # row per input text - TODO confirm).
        num_input_tokens = len(self.tokenizer.encode(text = example)[0])

In [4]:
# Loader instance used by the cells below. Generation is capped at 50 new
# tokens; `runs` and `warmup` are forwarded to the Loader constructor
# (presumably iteration counts for the benchmark - TODO confirm).
loader_ob = ExllamaV2Loader(
    model_id="TheBloke/CapybaraHermes-2.5-Mistral-7B-GPTQ",
    model_loader_args=dict(),
    generate_args=dict(max_new_tokens=50),
    runs=20,
    warmup=20,
)

In [5]:
# Build the model, cache, tokenizer and generator on the loader instance.
loader_ob.load()

downloading model....
Model already exists in /workspace/ezlife/ezlife/ml/benchmarker/loaders/models/TheBloke_CapybaraHermes-2.5-Mistral-7B-GPTQ. Checking the model files...
Checksum validated: model.safetensors  fc7d5419e6d124db8bd07a4c3332f867819dbde179db39e83611f4f7fcf23c3a
Checksum validated: tokenizer.model  dadfd56d766715c61d2ef780a525ab43b8e6da4de6865bda3d95fdef5e134055
[+] Validated checksums of all model files!
downloaded model


In [17]:
# Token count for a short prompt. [0] takes the first row of what encode()
# returns (presumably one row of token ids per input text - TODO confirm).
len(loader_ob.tokenizer.encode(text = "my name is praful")[0])

5

In [9]:
# Development-only introspection: `??` shows the tokenizer's source.
# Remove this cell (and its output) before sharing the notebook.
loader_ob.tokenizer??

Type:           ExLlamaV2Tokenizer
String form:    <exllamav2.tokenizer.tokenizer.ExLlamaV2Tokenizer object at 0x7fcf50f27dc0>
File:           /usr/local/lib/python3.10/dist-packages/exllamav2/tokenizer/tokenizer.py
Source:        
class ExLlamaV2Tokenizer:

    class Trie:

        children: dict
        leaf: list

        def __init__(self, children = None, leaf = None):
            self.children = children if children is not … [output truncated]

In [7]:
# Warm up the generator, run one sample generation and print its output.
loader_ob.warmup_model()

Warming up model
What is the meaning of life?

The meaning of life is to live life to the fullest, to be happy, to experience joy, to find peace and contentment, to make a meaningful contribution to the world, and to love and be loved. For many people,
model warmed up
