In [1]:
# Autocompletion: turn off the Jedi backend so tab completion falls back to
# IPython's default completer
%config Completer.use_jedi = False

# Autoreload: load the extension and use mode 2, which reloads imported modules
# before each cell runs, so edits to project modules are picked up without
# restarting the kernel
%load_ext autoreload
%autoreload 2

In [2]:
import re
import textwrap
from pathlib import Path

import datasets

from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

from awq import AutoAWQForCausalLM
from transformers import StoppingCriteriaList, AutoTokenizer

from llmsearch.tuner import Tuner
from llmsearch.utils.mem_utils import gc_cuda
from llmsearch.utils.model_downloader import download_model_from_hf
from llmsearch.scripts.stopping_criteria import MultiTokenStoppingCriteria

  from .autonotebook import tqdm as notebook_tqdm


Monkey Patching .generate function of `transformers` library


In [3]:
def preprocess_dataset(dataset, tokenizer, pt, pt_cols, system_prompt, add_generation_prompt = True):
    """Map every sample of `dataset` to a chat-formatted prompt plus its raw input.

    Args:
        dataset: object exposing `.map(fn)`; `fn` receives one sample (a mapping) at a time.
        tokenizer: object exposing `apply_chat_template(messages, tokenize=..., add_generation_prompt=...)`.
        pt: prompt template string, filled with `str.format` using the columns in `pt_cols`.
        pt_cols: names of the sample columns substituted into `pt`.
        system_prompt: text of the system message, or None to send no system message.
        add_generation_prompt: forwarded unchanged to `apply_chat_template`.

    Returns:
        Whatever `dataset.map` returns for a function producing the keys
        "X" (the chat-templated prompt string) and "actual input"
        (the value of the first column listed in `pt_cols`).
    """

    def build_prompt(row):
        """Fill the prompt template from `row` and render it with the tokenizer's chat template."""
        template_fields = {col : row[col] for col in pt_cols}
        user_turn = {"role": "user", "content": pt.format(**template_fields)}
        if system_prompt is None:
            conversation = [user_turn]
        else:
            conversation = [{"role": "system", "content": system_prompt}, user_turn]
        return tokenizer.apply_chat_template(conversation, tokenize = False, add_generation_prompt=add_generation_prompt)

    def add_columns(row):
        """Produce the two derived fields for one sample."""
        return {
            "X" : build_prompt(row),
            # Unformatted value of the first prompt column (no template, no chat markup)
            'actual input' : row[pt_cols[0]],
        }

    return dataset.map(add_columns)

# Number following "The answer is": optional minus sign, digits with optional
# comma thousands separators, optional decimal part (e.g. "-3", "1,000", "2.5").
# Compiled once instead of on every extraction.
_ANSWER_PATTERN = re.compile(r"The answer is (-?\d[\d,]*(?:\.\d+)?)")


def _parse_number(text):
    """Convert a numeric string such as "1,000" or "5.0" to a float; None if it is not a number."""
    if text is None:
        return None
    try:
        return float(text.replace(",", ""))
    except ValueError:
        return None


def get_score(y_true, y_pred):
    """Return the fraction of predictions whose final answer matches the reference.

    Args:
        y_true: iterable of mappings with an 'answer' string; the reference answer is
            the text after the last "####" marker.
        y_pred: iterable of generated strings; the predicted answer is the first number
            following the phrase "The answer is".

    Returns:
        Accuracy in [0, 1] as a float. Returns 0.0 when there are no samples
        (instead of raising ZeroDivisionError).

    Answers are compared numerically after removing comma separators, so
    "1,000" == "1000" and "5.0" == "5". A prediction with no extractable
    answer always counts as wrong.
    """
    results = []

    for y_t, y_p in zip(y_true, y_pred):
        gold_text = y_t['answer'].split("####")[-1].strip()
        match = _ANSWER_PATTERN.search(y_p)
        pred_text = match.group(1) if match else None

        gold_num = _parse_number(gold_text)
        pred_num = _parse_number(pred_text)

        if gold_num is not None and pred_num is not None:
            is_correct = gold_num == pred_num
        else:
            # Non-numeric reference: fall back to exact text comparison
            is_correct = pred_text is not None and gold_text == pred_text

        results.append(1 if is_correct else 0)

    if not results:
        return 0.0
    return sum(results)/len(results)

In [4]:
# Seed, dataset and checkpoint identifier used throughout the notebook
seed = 42
gsm8k_dataset = datasets.load_dataset("gsm8k", 'main')
model_id = "TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ"

# Download the checkpoint into a local scratch directory (created if missing)
temp_model_dir = Path("./temp_dir/")
temp_model_dir.mkdir(parents=True, exist_ok=True)
output_folder = download_model_from_hf(
    model_id,
    save_dir=temp_model_dir,
    branch='main',
)

# NOTE(review): presumably frees cached GPU memory before the model is loaded — confirm in llmsearch.utils.mem_utils
gc_cuda()

# Load the AWQ-quantized weights from the downloaded folder;
# device_map {'': 0} presumably places the whole model on device 0 — confirm
model = AutoAWQForCausalLM.from_quantized(
    quant_path=output_folder,
    fuse_layers=True,
    device_map={'': 0},
)

# Slow (non-fast) tokenizer loaded from local files only
tokenizer = AutoTokenizer.from_pretrained(
    output_folder,
    local_files_only=True,
    legacy=False,
    use_fast=False,
)
# Pad on the left and reuse the unknown token as the padding token
tokenizer.padding_side = 'left'
tokenizer.pad_token = tokenizer.unk_token

Downloading the model to temp_dir/TheBloke_CapybaraHermes-2.5-Mistral-7B-AWQ


100%|██████████| 17.9k /17.9k  35.9MiB/s
100%|██████████| 51.0  /51.0   152kiB/s
100%|██████████| 911   /911    2.85MiB/s
100%|██████████| 115   /115    279kiB/s
100%|██████████| 126   /126    432kiB/s
100%|██████████| 420   /420    1.73MiB/s
  0%|          | 0.00  /4.15G  ?iB/s 
100%|██████████| 1.60k /1.60k  5.12MiB/s
  1%|          | 21.0M /4.15G  206MiB/s
100%|██████████| 493k  /493k   10.4MiB/s
100%|██████████| 1.80M /1.80M  4.22MiB/s
100%|██████████| 4.15G /4.15G  367MiB/s
Replacing layers...: 100%|██████████| 32/32 [00:04<00:00,  7.30it/s]
Fusing layers...: 100%|██████████| 32/32 [00:02<00:00, 11.17it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
def preprocess_dataset(
    dataset, tokenizer, pt, pt_cols, system_prompt, add_generation_prompt=True
):
    """Attach a chat-templated prompt and the raw input to each sample of `dataset`.

    NOTE(review): a function with the same name and behavior is already defined
    in an earlier cell; this definition shadows it. Keep a single copy.

    Args:
        dataset: object exposing `.map(fn)`, where `fn` is called with one sample (a mapping).
        tokenizer: object exposing `apply_chat_template`.
        pt: prompt template, filled via `str.format` from the columns named in `pt_cols`.
        pt_cols: sample columns substituted into `pt`.
        system_prompt: system message text; None omits the system message.
        add_generation_prompt: passed through to `apply_chat_template`.

    Returns:
        The result of `dataset.map` for a function yielding "X" (templated
        prompt string) and "actual input" (value of the first column in `pt_cols`).
    """

    def render_prompt(example):
        """Format the template for `example` and wrap it in the chat template."""
        fields = {}
        for name in pt_cols:
            fields[name] = example[name]

        chat = []
        if system_prompt is not None:
            chat.append({"role": "system", "content": system_prompt})
        chat.append({"role": "user", "content": pt.format(**fields)})

        return tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=add_generation_prompt
        )

    def first_prompt_column(example):
        """Return the untemplated value of the first column listed in `pt_cols`."""
        return example[pt_cols[0]]

    return dataset.map(
        lambda example: {
            "X": render_prompt(example),
            "actual input": first_prompt_column(example),
        }
    )

# Few-shot prompt template; `{question}` is filled from the dataset column of the same name
pt_cols = ['question']
system_prompt = "Solve the following math problems, end with The answer is"
pt = textwrap.dedent("""\
    Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
    A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

    Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
    A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

    Q: {question}""")

# Apply the prompt template and chat template to the training split
processed_dataset = preprocess_dataset(
    gsm8k_dataset['train'],
    tokenizer,
    pt=pt,
    pt_cols=pt_cols,
    system_prompt=system_prompt,
    add_generation_prompt=True,
)

# Small seeded random subset used for benchmarking
bm_sample_size = 10
bm_samples = (
    processed_dataset
    .shuffle(seed=seed)
    .select(range(bm_sample_size))
)

# Stopping criteria setup
# NOTE(review): 32000 is presumably the id of the `<|im_end|>` end-of-turn token for this tokenizer — confirm
multi_token_stop_criteria_ob = MultiTokenStoppingCriteria(sequence_ids=[32000])
stopping_criteria = StoppingCriteriaList([multi_token_stop_criteria_ob])


Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map: 100%|██████████| 7473/7473 [00:00<00:00, 10711.84 examples/s]


In [6]:
# Build the tuner around the loaded model and the benchmark subset
tuner_ob = Tuner(
    model=model,
    tokenizer=tokenizer,
    dataset=bm_samples,
    device='cuda:0',
    batch_size=1,
    tokenizer_encode_args={'padding': 'longest', 'add_special_tokens': False},
    tokenizer_decode_args={'spaces_between_special_tokens': False},
    scorer=get_score,
    # "X" already holds the fully formatted prompt, so the template just passes it through
    prompt_template="{X}",
    is_encoder_decoder=False,
    seed=seed,
    # "X" is the model input column; 'answer' is what `get_score` reads from each reference
    column_mapping={'input_cols': ["X"], 'eval_cols': ['answer']},
    # NOTE(review): presumably clears the stopping-criteria state after each inference call — confirm
    callbacks_after_inference=[multi_token_stop_criteria_ob.reset],
)

Map: 100%|██████████| 10/10 [00:00<00:00, 1487.50 examples/s]


In [7]:
# Baseline generation settings (no sampling options set)
gen_params_normal = dict(
    max_new_tokens=500,
    generation_seed=42,
    stopping_criteria=stopping_criteria,
)

# Sampling with `tfs`
gen_params_tfs = dict(
    max_new_tokens=500,
    # max_new_tokens takes precedence over the stopping criteria
    stopping_criteria=stopping_criteria,
    generation_seed=42,
    tfs=0.99,
    do_sample=True,
)

# Sampling with `top_a`
gen_params_top_a = dict(
    max_new_tokens=500,
    # max_new_tokens takes precedence over the stopping criteria
    stopping_criteria=stopping_criteria,
    generation_seed=42,
    top_a=0.1,
    do_sample=True,
)

# Sampling with `mirostat_mode`
gen_params_mirostat = dict(
    max_new_tokens=500,
    stopping_criteria=stopping_criteria,
    generation_seed=42,
    mirostat_mode=2,
    do_sample=True,
)

# Score the baseline settings on the benchmark samples
scores, outputs = tuner_ob.get_score(gen_params_normal)

print(scores, '\n\n')
for output in outputs:
    # One call emits the same text as the three separate prints:
    # the output, a blank gap, then a 30-dash separator line
    print(output, '\n\n', '---' * 10, sep='\n')

100%|██████████| 10/10 [00:34<00:00,  3.48s/it]


0.7 


A: Mimi picked up 2 dozen seashells, which is 2 * 12 = 24 seashells. Kyle found twice as many shells as Mimi, so he found 24 * 2 = 48 seashells. Leigh grabbed one-third of the shells that Kyle found, so Leigh had 48 / 3 = 16 seashells. The answer is 16.<|im_end|>



------------------------------
A: Let's use variables to represent the number of each type of pet:

Let C = number of cats
Let S = number of snakes
Let P = number of parrots

We are given the following information:

1. S = C + 6 (Frankie has six more snakes than cats)
2. P = C - 1 (Frankie has one less parrot than cats)
3. There are 6 pets with 4 legs, and Frankie has 2 dogs, so there are S + P + 2 dogs with 4 legs.
4. The total number of pets is C + S + P + 2 dogs.

Now, we can use the given information to solve for the total number of pets:

From (1), we have S = C + 6.
From (2), we have P = C - 1.

Substitute these expressions into the total number of pets equation:

C + (C + 6) + (C - 1) + 2 = total number of pet