In [3]:
"""

Check how the model pads input
Check how different encoding & decoding params affect the encoding & decoding

"""

# Autocompletion
%config Completer.use_jedi = False

# Autoreload
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('/workspace/llmsearch')

import gc
import torch
import ctypes
import json
import nltk
import math
import torch
import random
import evaluate
import datasets
import langchain
import numpy as np
import pandas as pd
import transformers
from transformers import GPTQConfig, BitsAndBytesConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers import PreTrainedModel, PretrainedConfig, GenerationConfig, StoppingCriteria, AutoTokenizer, StoppingCriteriaList, AutoModel, AutoModelForCausalLM

import os
import gc
import ctypes
import traceback
from pathlib import Path
from typing import Any, Dict, Optional, Union, List

import time
import textwrap
from tqdm.auto import tqdm

from exllamav2 import (
    ExLlamaV2,
    ExLlamaV2Cache,
    ExLlamaV2Cache_8bit,
    ExLlamaV2Config
)

from datasets import load_dataset
from llmsearch.model_downloader import download_model_from_hf
from llmsearch.utils.model_utils import batcher, decoder_parser

import awq

from awq import AutoAWQForCausalLM

def pretty_print_dict(d, indent = 4):
    """Print `d` as indented JSON.

    Values that `json` cannot serialise are converted with `str`.
    """
    rendered = json.dumps(d, default = str, indent = indent)
    print(rendered)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Load the 'main' configuration of the "gsm8k" dataset.
gsm8k_dataset = load_dataset("gsm8k", 'main')

# Display the installed torch and awq versions as the cell output.
torch.__version__, awq.__version__

('2.2.0+cu121', '0.2.3')

In [7]:

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random generators with `seed`.

    When CUDA is available the CUDA generators are seeded too, and cuDNN is
    switched to deterministic mode with benchmarking disabled.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


class SingleTokenStoppingCriteria(StoppingCriteria):
    """Stop generation once the most recent token equals `token_id`.

    Only the first sequence of the batch is inspected, so this criterion does
    not support batched generation.
    """

    def __init__(self, token_id):
        """Store the token id that ends generation."""
        super().__init__()
        self.token_id = token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the last token of the first sequence is the stop token.

        Args:
            input_ids: Token ids generated so far, indexed as [batch, position].
            scores: Unused; accepted to match the stopping-criteria call signature.
            **kwargs: Ignored; accepted so callers that pass extra keyword
                arguments do not fail.
        """
        # Nothing to inspect yet; indexing [-1] on an empty sequence would raise.
        if input_ids.shape[-1] == 0:
            return False

        # Convert the 0-d tensor comparison into a plain Python bool.
        return bool(input_ids[0][-1] == self.token_id)


def cm():
    """Free memory: run the garbage collector, call libc's `malloc_trim(0)`, then empty the CUDA cache."""
    gc.collect()
    libc = ctypes.CDLL("libc.so.6")
    libc.malloc_trim(0)
    torch.cuda.empty_cache()

def seed_everything(seed):
    """Seed `random`, NumPy and PyTorch (CPU and, if present, CUDA) with `seed`."""
    # NOTE(review): this repeats the `seed_everything` defined earlier in the same cell;
    # being the later definition, it is the one that stays bound.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    cuda_present = torch.cuda.is_available()
    if cuda_present:
        for cuda_seeder in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seeder(seed)
        # Deterministic cuDNN with benchmarking turned off.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False



def perform_single_example_inference(example, model, tokenizer,gen_kwargs, device = 'cuda:0'):
    """Run generation for a single prompt string and report token counts.

    Args:
        example: Prompt text; tokenized with `add_special_tokens = False`.
        model: Object exposing `generate(**inputs, **gen_kwargs)`.
        tokenizer: Callable tokenizer that also provides `decode`.
        gen_kwargs: Extra keyword arguments forwarded to `model.generate`.
        device: Device the `input_ids` and `attention_mask` tensors are moved
            to before generation. Defaults to 'cuda:0', the value that used to
            be hard-coded.

    Returns:
        Tuple `(decoded_output, prompt_tokens, completion_tokens)` where
        `decoded_output` is the decoding of the first returned sequence,
        `prompt_tokens` is the length of the tokenized prompt and
        `completion_tokens` is the returned sequence length minus
        `prompt_tokens`.
    """
    tokenized_input = tokenizer(example, return_tensors = "pt", add_special_tokens = False)
    for key in ('input_ids', 'attention_mask'):
        tokenized_input[key] = tokenized_input[key].to(device)

    model_out = model.generate(**tokenized_input, **gen_kwargs)
    prompt_tokens = len(tokenized_input['input_ids'][0])
    print(f"Prompt tokens - {prompt_tokens}")

    # Only the first sequence of the generation output is decoded.
    output_token_ids = model_out.tolist()[0]
    decoded_output = tokenizer.decode(output_token_ids, spaces_between_special_tokens = False)
    print(decoded_output)

    completion_tokens = len(output_token_ids) - prompt_tokens
    print(f"Completion Tokens - {completion_tokens}")

    return decoded_output, prompt_tokens, completion_tokens

In [8]:
# loaders

class MultiTokenEOSCriteria(transformers.StoppingCriteria):
    """Stop generation once every sequence in the batch has ended with a given token sequence.

    The object keeps per-batch state between calls (`input_length`, `done_tracker`).
    That state is cleared only when the criterion returns True; if generation ends
    for any other reason, call `reset()` before reusing the object.

    This code is not thread safe. The same object cannot be used simultaneously in multiple threads.
    """

    def __init__(
        self,
        sequence_ids : List[int],
        device : Union[str, torch.device] = "cuda:0",
    ) -> None:
        """
        Args:
            sequence_ids: Token ids of the stop sequence.
            device: Device on which the stop-sequence tensor is created.
                Defaults to "cuda:0", the value that used to be hard-coded;
                `__call__` moves it to the device of `input_ids` if they differ.
        """
        super().__init__()
        self.sequence_ids = torch.tensor(sequence_ids, dtype = torch.int32, device = device)
        # The lookback window is 2 tokens longer than the stop sequence. The original
        # intent was to tolerate a stop string being emitted under a different
        # tokenization (e.g. `['\n', '\n']` instead of `['\n\n']`).
        # NOTE(review): `__call__` currently compares only the last
        # `len(sequence_ids)` tokens of that window, so the 2 extra tokens are not used.
        self.sequence_id_len = self.sequence_ids.shape[0] + 2
        self.batch_size = None
        self.input_length = None
        self.done_tracker = []
        self.state_initialized = False

    def set_state(self, batch_size, input_length):
        """Start tracking a new batch of `batch_size` sequences whose first `input_length` tokens are ignored."""
        self.batch_size = batch_size
        self.input_length = input_length
        self.done_tracker = [False] * batch_size
        self.state_initialized = True

    def reset(self):
        """Forget the current batch so the next `__call__` re-initialises the state."""
        self.batch_size = None
        self.input_length = None
        self.state_initialized = False

    def __call__(self, input_ids, scores, **kwargs) -> bool:
        """Return True once every sequence in the batch has produced the stop sequence.

        On the first call for a batch, the width of `input_ids` at that moment is
        recorded as `input_length`; tokens up to that position are never inspected.
        """
        if not self.state_initialized:
            # 1st call to __call__ for this batch
            self.set_state(input_ids.shape[0], input_ids.shape[1])

        # IDs of all the tokens after the recorded input length, limited to the lookback window
        lookback_ids_batch = input_ids[:, self.input_length :]
        lookback_ids_batch = lookback_ids_batch[:, -self.sequence_id_len :]

        # no elements yet to look back
        if lookback_ids_batch.nelement() == 0:
            return False

        # Compare on the device the generated ids live on.
        stop_ids = self.sequence_ids.to(lookback_ids_batch.device)
        stop_len = stop_ids.shape[0]

        for i, done in enumerate(self.done_tracker):
            if done:
                continue
            tail = lookback_ids_batch[i][-stop_len:]
            # Store a plain bool: an element-wise tensor here would make the
            # aggregate check below ambiguous for multi-token stop sequences.
            # A tail shorter than the stop sequence cannot match yet.
            self.done_tracker[i] = tail.shape[0] == stop_len and bool((tail == stop_ids).all())

        ret_val = all(self.done_tracker)
        if ret_val:
            self.reset()
        return ret_val


def load_model_with_awq_backend(model_id, model_loader_kwargs, tokenizer_kwargs,temp_model_dir, model_branch = "main"):
    """Download a model from the Hugging Face hub and load it together with its tokenizer via AutoAWQ.

    Args:
        model_id: Hub model id passed to `download_model_from_hf`.
        model_loader_kwargs: Keyword arguments forwarded to
            `AutoAWQForCausalLM.from_quantized`. Any
            'pretrained_model_name_or_path' entry is ignored; the downloaded
            folder is always used as `quant_path`.
        tokenizer_kwargs: Keyword arguments forwarded to
            `AutoTokenizer.from_pretrained`; 'pretrained_model_name_or_path' is
            set to the downloaded folder.
        temp_model_dir: Directory the model is downloaded into.
        model_branch: Hub branch to download.

    Returns:
        Tuple `(model, tokenizer)`. The tokenizer's pad token is set to its EOS token.
    """
    output_folder = download_model_from_hf(model_id, save_dir = temp_model_dir, branch = model_branch)

    # Work on copies so the caller's dicts are not modified.
    model_loader_kwargs = dict(model_loader_kwargs)
    tokenizer_kwargs = dict(tokenizer_kwargs)

    # The model path is passed as `quant_path`, so it must not also appear in the kwargs.
    model_loader_kwargs.pop('pretrained_model_name_or_path', None)
    # Use the function's own parameter rather than a notebook-level global.
    tokenizer_kwargs['pretrained_model_name_or_path'] = output_folder

    model = AutoAWQForCausalLM.from_quantized(
        quant_path=output_folder,
        **model_loader_kwargs
    )
    tokenizer = AutoTokenizer.from_pretrained(**tokenizer_kwargs, local_files_only=True)

    # pad token is null in config -https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ/blob/eb64c310c44905321d012962db9ac0d47c3a64fa/tokenizer_config.json#L53
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer

# Maps a backend name to the function that loads a (model, tokenizer) pair for it.
# Only the AWQ loader is enabled; the commented-out entries name loaders that are
# not defined in the visible part of this notebook.
model_loader_backend_map = {
    # "exllama_2_hf": load_model_with_exllama_2_hf_backend,
    # "hf": load_model_with_hf_backend,
    # 'auto_gptq' : load_model_with_autogptq_backend,
    'awq' : load_model_with_awq_backend,
}

In [12]:
# Model under test: https://huggingface.co/TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ
model_id = "TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ"

# Download target; created (with parents) if it does not exist yet.
# NOTE(review): the f-prefix has no placeholders; the path is an absolute, machine-specific location.
temp_model_dir = Path(f"/workspace/temp_model_dir/")
temp_model_dir.mkdir(exist_ok = True, parents = True)

# Keyword arguments forwarded to the AWQ model loader.
model_loader_kwargs = {
    # presumably places the whole model on device 0 — TODO confirm
    'device_map' : {'' : 0},
    'fuse_layers' : True,
}

# Keyword arguments forwarded to the tokenizer loader; note the left padding side,
# which matters for the padding checks later in the notebook.
tokenizer_loader_kwargs = {
    'use_fast' : False,
    'legacy' : False,
    'padding_side' : 'left',
}

# Download the model and load it together with its tokenizer.
model, tokenizer = load_model_with_awq_backend(model_id, model_loader_kwargs, tokenizer_loader_kwargs,temp_model_dir, model_branch = "main")

Downloading the model to /workspace/temp_model_dir/TheBloke_CapybaraHermes-2.5-Mistral-7B-AWQ


100%|██████████| 51.0  /51.0   243kiB/s
100%|██████████| 17.9k /17.9k  25.7MiB/s
100%|██████████| 115   /115    756kiB/s
100%|██████████| 911   /911    3.30MiB/s
100%|██████████| 126   /126    285kiB/s
100%|██████████| 420   /420    1.61MiB/s
100%|██████████| 1.60k /1.60k  3.90MiB/s

100%|██████████| 493k  /493k   5.98MiB/s
100%|██████████| 1.80M /1.80M  3.17MiB/s
100%|██████████| 4.15G /4.15G  56.7MiB/s
Replacing layers...: 100%|██████████| 32/32 [00:05<00:00,  6.28it/s]
Fusing layers...: 100%|██████████| 32/32 [00:02<00:00, 10.85it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
def preprocess_dataset(dataset, tokenizer, pt, pt_cols, system_prompt, add_generation_prompt = True):
    """Add a chat-formatted prompt column and a raw-input column to `dataset`.

    Args:
        dataset: Object exposing `map(fn)`; `fn` receives one sample (a mapping).
        tokenizer: Object exposing `apply_chat_template`.
        pt: Prompt template, filled with `str.format` from the sample's `pt_cols`.
        pt_cols: Names of the sample fields used to fill `pt`.
        system_prompt: Optional system message; omitted from the chat when None.
        add_generation_prompt: Forwarded to `apply_chat_template`.

    Returns:
        The result of `dataset.map`, where each sample gains "X" (the templated
        prompt rendered as a string) and 'actual input' (the value of the first
        column in `pt_cols`).
    """

    def build_prompt(sample):
        """Fill the prompt template from `sample` and render it with the chat template."""
        user_turn = {
            "role": "user",
            "content": pt.format(**{col : sample[col] for col in pt_cols}),
        }
        if system_prompt is None:
            messages = [user_turn]
        else:
            messages = [{"role": "system", "content": system_prompt}, user_turn]
        return tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt=add_generation_prompt)

    def add_columns(sample):
        """Return the two new columns for one sample."""
        return {
            "X" : build_prompt(sample),
            # Raw value of the first template column, before any formatting.
            'actual input' : sample[pt_cols[0]],
        }

    return dataset.map(add_columns)

In [15]:


# Few-shot prompt template: two worked Q/A examples followed by the `{question}`
# placeholder. `textwrap.dedent` strips the common leading indentation.
pt = textwrap.dedent("""\
    Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
    A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

    Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
    A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

    Q: {question}""")
# Sample fields used to fill the template.
pt_cols = ['question']
system_prompt = "Solve the following math problems, end with The answer is"

# Apply the prompt template and chat template to every training sample.
processed_dataset = preprocess_dataset(gsm8k_dataset['train'], tokenizer,pt = pt, pt_cols = pt_cols, system_prompt = system_prompt, add_generation_prompt = True)

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [16]:
# Fixed seed so the shuffled benchmark subset is the same on every run.
seed = 42
# Number of samples kept for benchmarking.
bm_sample_size = 100
# Shuffle the processed dataset and keep the first `bm_sample_size` samples.
bm_samples = processed_dataset.shuffle(seed = seed).select(range(bm_sample_size))

In [19]:
# Inspect the tokenizer's special tokens and its clean_up_tokenization_spaces setting.
tokenizer.special_tokens_map, tokenizer.clean_up_tokenization_spaces

({'bos_token': '<s>',
  'eos_token': '<|im_end|>',
  'unk_token': '<unk>',
  'pad_token': '<|im_end|>'},
 False)

In [17]:
# Print the fully formatted prompt ("X") of the first five samples, separated by a dashed rule.
print("Processed Dataset:\n")
for i in range(5):
    print(processed_dataset[i]['X'])
    print('\n')
    print('---' * 10)
    print('\n')

Processed Dataset:

<|im_start|>system
Solve the following math problems, end with The answer is<|im_end|>
<|im_start|>user
Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

Q: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?<|im_end|>
<|im_start|>assistant



------------------------------


<|im_start|>system
Solve the following math problems, end with The answer is<|im_end|>
<|im_start|>user
Q: There are 15 trees in the grove

In [20]:
# Peek at the first two benchmark samples (returned as a dict of column lists).
bm_samples[:2]

{'question': ['Mimi picked up 2 dozen seashells on the beach.  Kyle found twice as many shells as Mimi and put them in his pocket. Leigh grabbed one-third of the shells that Kyle found.  How many seashells did Leigh have?',
  "Frankie's parents let him have many pets. He has six more snakes than he has cats. He has one less parrot than cats. Six of his pets have four legs. He has 2 dogs. How many pets does he have in total?"],
 'answer': ['Mimi has 2 x 12 = <<2*12=24>>24 sea shells.\nKyle has 24 x 2 = <<24*2=48>>48 sea shells.\nLeigh has 48 / 3 = <<48/3=16>>16 sea shells.\n#### 16',
  'He has 6 - 2 = <<6-2=4>>4 cats.\nHe has 4 - 1 = <<4-1=3>>3 parrots.\nHe has 4 + 6 = <<4+6=10>>10 snakes.\nHe has a total of 2 + 4 + 3 + 10 = <<2+4+3+10=19>>19 pets.\n#### 19'],
 'X': ['<|im_start|>system\nSolve the following math problems, end with The answer is<|im_end|>\n<|im_start|>user\nQ: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there w

In [21]:
from llmsearch.utils.model_utils import decoder_parser

def perform_single_example_inference(example, model, tokenizer, gen_kwargs, device = 'cuda:0'):
    """Run generation for a single prompt string and return the parsed output.

    Args:
        example: Prompt text; tokenized with `add_special_tokens = False`.
        model: Object exposing `generate(**inputs, **gen_kwargs)`.
        tokenizer: Callable tokenizer that also provides `decode`.
        gen_kwargs: Extra keyword arguments forwarded to `model.generate`.
        device: Device the `input_ids` and `attention_mask` tensors are moved
            to before generation. Defaults to 'cuda:0', the value that used to
            be hard-coded.

    Returns:
        Whatever `decoder_parser` returns for this single decoded output and
        its prompt (presumably the completion with the prompt removed — TODO
        confirm against `llmsearch.utils.model_utils`).
    """
    tokenized_input = tokenizer(example, return_tensors = "pt", add_special_tokens = False)
    for key in ('input_ids', 'attention_mask'):
        tokenized_input[key] = tokenized_input[key].to(device)

    model_out = model.generate(**tokenized_input, **gen_kwargs)
    prompt_tokens = len(tokenized_input['input_ids'][0])
    print(f"Prompt tokens - {prompt_tokens}")

    # Only the first sequence of the generation output is decoded.
    output_token_ids = model_out.tolist()[0]
    decoded_output = tokenizer.decode(output_token_ids, spaces_between_special_tokens = False)

    print(decoded_output)
    completion_tokens = len(output_token_ids) - prompt_tokens

    # Each decoded string is stripped by the `prepoc` callable.
    out = decoder_parser(outputs = [decoded_output], formatted_prompts = [example], prepoc = lambda x : x.strip())

    print(f"Completion Tokens - {completion_tokens}")

    return out

In [30]:
# Peek at the first two benchmark samples again (same expression as an earlier cell).
bm_samples[:2]

{'question': ['Mimi picked up 2 dozen seashells on the beach.  Kyle found twice as many shells as Mimi and put them in his pocket. Leigh grabbed one-third of the shells that Kyle found.  How many seashells did Leigh have?',
  "Frankie's parents let him have many pets. He has six more snakes than he has cats. He has one less parrot than cats. Six of his pets have four legs. He has 2 dogs. How many pets does he have in total?"],
 'answer': ['Mimi has 2 x 12 = <<2*12=24>>24 sea shells.\nKyle has 24 x 2 = <<24*2=48>>48 sea shells.\nLeigh has 48 / 3 = <<48/3=16>>16 sea shells.\n#### 16',
  'He has 6 - 2 = <<6-2=4>>4 cats.\nHe has 4 - 1 = <<4-1=3>>3 parrots.\nHe has 4 + 6 = <<4+6=10>>10 snakes.\nHe has a total of 2 + 4 + 3 + 10 = <<2+4+3+10=19>>19 pets.\n#### 19'],
 'X': ['<|im_start|>system\nSolve the following math problems, end with The answer is<|im_end|>\n<|im_start|>user\nQ: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there w

In [31]:
# NOTE(review): in the visible notebook `sample_size` is only assigned in a later cell,
# so this cell depends on out-of-order execution and would fail on a fresh top-to-bottom run.
sample_size

2

In [None]:
# NOTE(review): stray, unexecuted scratch cell; `pre` is not defined anywhere in the
# visible notebook and would raise NameError if run.
pre

In [41]:
# NOTE(review): scratch cell trying out `pprint`. In the visible notebook this is the
# only import of `pprint`, which the batching/decoding cell further down uses.
from pprint import pprint

my_list = ['apple', 'banana', 'cherry', 'date', 'elderberry']

pprint(my_list)

['apple', 'banana', 'cherry', 'date', 'elderberry']


In [64]:
from llmsearch.utils.model_utils import batcher

def batch_inputs(inputs, tokenizer,batch_size, tokenizer_encoding_kwargs):
    """Tokenize `inputs` one batch at a time and return the list of encodings.

    Args:
        inputs: Items with an 'X' entry holding the text to encode.
        tokenizer: Callable tokenizer; called with `text=...`, the given
            encoding kwargs and `return_tensors = "pt"`.
        batch_size: Forwarded to `batcher` together with `inputs`.
        tokenizer_encoding_kwargs: Extra keyword arguments for the tokenizer call.

    Returns:
        A list with one tokenizer output per batch yielded by `batcher`.
    """

    def encode(batch):
        """Encode the 'X' texts of one batch and print the resulting input_ids shape."""
        texts = [item['X'] for item in batch]
        encoding = tokenizer(text = texts, **tokenizer_encoding_kwargs, return_tensors = "pt")

        # TODO : When a batch is encoded an item in the list could be a batch, use torch.chunk to split and extend the list
        # final objective is to get a list of items how it would be encoded if batch size was some value
        print(encoding['input_ids'].shape)
        return encoding

    return [encode(batch) for batch in tqdm(batcher(inputs, batch_size))]


# Encode the first `sample_size` benchmark samples in batches of `batch_size`,
# then decode each batch again to see what padding / special tokens were added.
batch_size = 2
sample_size = 2
tokenizer_encoding_kwargs = {
    # pad to longest seq in batch
    'padding' : 'longest',
    # Adds special tokens such as <s> when encoding.
    # NOTE(review): the original note said <s> is only added to the longest sequence.
    # The tokenizer was loaded with padding_side 'left', so on shorter rows the pad
    # tokens presumably sit in front of <s> rather than <s> being missing — TODO confirm.
    'add_special_tokens' : True
}
# convert the column-oriented slice into a list of per-sample dicts
bm_sample_dicts = [{k: v[i] for k, v in bm_samples[:sample_size].items()} for i in range(sample_size)]

# `batch_inputs` returns a list with one encoding per batch.
batched_input = batch_inputs(bm_sample_dicts, tokenizer, batch_size,tokenizer_encoding_kwargs)

# Decoding options under test; both are currently disabled, so defaults apply.
tokenizer_decoding_kwargs = {
    # 'skip_special_tokens' : True,
    # 'clean_up_tokenization_spaces' : True,
}


# Print each encoded batch followed by its decoded text.
for idx, encoded_batch in enumerate(batched_input):
    print(encoded_batch)
    decoded_input = tokenizer.batch_decode(encoded_batch['input_ids'], **tokenizer_decoding_kwargs)
    print(idx)
    pprint(decoded_input)
    print('\n\n', '---' * 10, '\n\n')

# for idx, item in enumerate(tokenizer.batch_decode(batched_input['input_ids'])):
#     print(idx)
#     print(item)
#     print('\n\n', '---' * 10, '\n\n')

0it [00:00, ?it/s]

torch.Size([2, 245])
{'input_ids': tensor([[    1, 28705, 32001,  6574,    13, 28735,  5303,   272,  2296, 11049,
          4418, 28725,   948,   395,   415,  4372,   349, 32000,    13, 32001,
          1838,    13, 28824, 28747,  1387,   460, 28705, 28740, 28782,  7099,
           297,   272,  5977,   333, 28723,  8697,   333,  7433,   622,  5100,
          7099,   297,   272,  5977,   333,  3154, 28723,  2530,   590,   460,
          2203, 28725,   736,   622,   347, 28705, 28750, 28740,  7099, 28723,
          1602,  1287,  7099,   863,   272,  5977,   333,  7433,  5100,  3154,
         28804,    13, 28741, 28747,  1387,   460, 28705, 28740, 28782,  7099,
         10806, 28723,  2479,   736,   654, 28705, 28750, 28740,  7099,  1024,
           741,   680,   654, 24571, 28723,  1537,   736,  1580,   506,   750,
         28705, 28750, 28740,   387, 28705, 28740, 28782,   327, 28705, 28784,
         28723,   415,  4372,   349, 28705, 28784, 28723,    13,    13, 28824,
         28747,  

In [54]:
# Stop generation on token id 32000 (appears to be <|im_end|>, the tokenizer's EOS token,
# judging by the encoded batch printed above — TODO confirm).
stopping_criteria = StoppingCriteriaList([MultiTokenEOSCriteria(sequence_ids = [32000])])
gen_kwargs = {
    'max_new_tokens' : 500,
    'stopping_criteria' : stopping_criteria
}

# Output changes based on skip_special_tokens value
# padding tokens influencing output

# NOTE(review): `batch_inputs` returns a list of encodings, so `batched_input['input_ids']`
# does not match the value assigned in the batching cell; this line was presumably run when
# `batched_input` was a single encoding holding at least three samples — confirm and
# update the indexing before re-running.
out = perform_single_example_inference(tokenizer.decode(batched_input['input_ids'][2], skip_special_tokens=True), model, tokenizer, gen_kwargs)

Prompt tokens - 288
<|im_start|>system
Solve the following math problems, end with The answer is
 <|im_start|>user
Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

Q: Olaf collects colorful toy cars. At first, his collection consisted of 150 cars. His family, knowing his hobby, decided to give him some toy cars. Grandpa gave Olaf twice as many toy cars as the uncle. Dad gave Olaf 10 toy cars, 5 less than Mum. Auntie gave Olaf 6 toy cars, 1 more than the uncle. How many toy cars does Olaf have in total, after receiving all these gift

In [35]:
# Reference answers of the first four benchmark samples.
bm_samples['answer'][:4]

['Mimi has 2 x 12 = <<2*12=24>>24 sea shells.\nKyle has 24 x 2 = <<24*2=48>>48 sea shells.\nLeigh has 48 / 3 = <<48/3=16>>16 sea shells.\n#### 16',
 'He has 6 - 2 = <<6-2=4>>4 cats.\nHe has 4 - 1 = <<4-1=3>>3 parrots.\nHe has 4 + 6 = <<4+6=10>>10 snakes.\nHe has a total of 2 + 4 + 3 + 10 = <<2+4+3+10=19>>19 pets.\n#### 19',
 "Dad gave Olaf 10 toy cars,\nMom has given Olaf 5 more toy cars than Dad, so 10 + 5 = <<10+5=15>>15 toy cars\nAuntie gave Olaf 6 toy cars,\nUncle has given 1 less toy than Auntie, so 6 - 1 = <<6-1=5>>5 toy cars\nGrandpa gave Olaf 2 * 5 = <<2*5=10>>10 toy cars.\nAll the family together gave Olaf 10 +15 + 6 + 5 + 10 = <<10+15+6+5+10=46>>46.\nAdding the cars Olaf already had, Olaf's collection has 150 + 46 = <<150+46=196>>196 cars.\n#### 196",
 'She spend $56 because 7 x 8 = <<7*8=56>>56\nShe has $44 left in the bank because 100 - 56 = <<100-56=44>>44\nShe can get 8 five dollar bills because 44 / 5 = <<44/5=8.8>>8.8\nThis is equal to $40 because 8 x 5 = <<8*5=40>>40\n

In [None]:
# NOTE(review): unexecuted cell; `batch_inputs` returns a list, so a string key would
# fail against the `batched_input` assigned in the batching cell — confirm intent.
batched_input['input_ids']

In [None]:
# NOTE(review): unexecuted cell; `batch_inputs` returns a list, so a string key would
# fail against the `batched_input` assigned in the batching cell — confirm intent.
tokenizer.batch_decode(batched_input['input_ids'])