In [1]:
# Requires accelerate==0.27.2 py7zr==0.21.0 evaluate==0.4.0 rouge_score==0.1.2

# Autocompletion
%config Completer.use_jedi = False

# Autoreload
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('/workspace/llmsearch')

import torch
import evaluate
import datasets
import numpy as np

from llmsearch.tuner import Tuner
from sklearn.model_selection import GridSearchCV
from llmsearch.scripts.stopping_criteria import MultiTokenStoppingCriteria
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteriaList

Monkey-patching the `.generate` function of the `transformers` library


### Running a generation param search

In [2]:
# Notebook-wide settings that the cells below rely on
num_samples = 4  # number of rows kept from the shuffled dataset
batch_size = 2   # inference batch size handed to the Tuner
seed = 42        # used for the dataset shuffle and passed to the Tuner

In [3]:
# load model & tokenizer
model_id = "cognitivecomputations/dolphin-2.9-llama3-8b"
# the tokenizer is configured to pad on the left side
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side = "left")
# weights are loaded as float16; device placement is delegated via `device_map = "auto"`
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype = torch.float16, device_map = "auto")

# load the dataset on which to run the search: the `train` split of samsum,
# shuffled with the fixed `seed` and cut down to the first `num_samples` rows
dataset = datasets.load_dataset("samsum")['train']
sample_dataset = dataset.shuffle(seed = seed).select(range(num_samples))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
# Optional : Define stopping criteria for the generation, here we stop a generation of a sequence when `<|im_end|>` is reached
# NOTE(review): 128256 is presumably the token id of `<|im_end|>` in this tokenizer's vocabulary - confirm against the tokenizer
multi_token_stop_criteria_ob = MultiTokenStoppingCriteria(sequence_ids=[128256])
stopping_criteria = StoppingCriteriaList([multi_token_stop_criteria_ob])
# useful when batching to reset state variables for the stopping criteria
# (the bound `reset` method is handed to the Tuner as a post-inference callback)
callbacks_after_inference = [multi_token_stop_criteria_ob.reset]

# load the ROUGE metric that the evaluation function below relies on
rouge = evaluate.load('rouge')
def get_rouge_score(y_true, y_pred):
    """Return the mean per-sample ROUGE-2 value of the predictions.

    Args:
        y_true: iterable of mappings; the `'summary'` entry of each one is
            used as the reference text.
        y_pred: predicted texts, passed to `rouge.compute` as `predictions`.

    Returns:
        `np.mean` over the `'rouge2'` values returned by `rouge.compute`
        (computed with stemming and without aggregation).
    """
    reference_texts = []
    for sample in y_true:
        reference_texts.append(sample['summary'])

    per_sample = rouge.compute(
        predictions=y_pred,
        references=reference_texts,
        use_stemmer=True,
        use_aggregator=False,
    )
    return np.mean(per_sample['rouge2'])

# Define a dataset preprocessor - Should take in tokenizer & kwargs and return a string that can be input directly to the model, here we apply chat template which most decoder models use
def sample_to_chat_format(tokenizer, **kwargs):
    """Render one dataset sample as a chat-formatted prompt.

    Args:
        tokenizer: object exposing `apply_chat_template`.
        **kwargs: fields of the sample; only `dialogue` is read.

    Returns:
        The result of `tokenizer.apply_chat_template` for a system + user
        conversation, requested untokenized and with the generation prompt
        appended.
    """
    system_turn = dict(role="system", content="You are Dolphin, a helpful AI assistant.")
    user_turn = dict(role="user", content=f"Summarize the following text: {kwargs['dialogue']}")
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

In [5]:
# define tuner object, this preprocesses the dataset and creates an LLMEstimator to run with scikit-learn
tuner_ob = Tuner(
    model=model,
    tokenizer=tokenizer,
    dataset=sample_dataset,
    device="cuda:0",
    # the tuner module automatically reduces the batch size while running inference if it goes OOM
    batch_size=batch_size,
    # encoding: pad to the longest prompt in the batch and truncate at 1024 tokens
    # NOTE(review): `add_special_tokens` is False presumably because the chat template already inserts them - confirm
    tokenizer_encode_args={"padding": "longest",'truncation' : True, "add_special_tokens": False, 'max_length' : 1024},
    # decoding: special tokens are dropped from the decoded text
    tokenizer_decode_args={"spaces_between_special_tokens": False, 'skip_special_tokens' : True},
    # pass in the scorer
    scorer=get_rouge_score,
    # pass in `dataset` preprocessor
    sample_preprocessor=sample_to_chat_format,
    seed=seed,
    # column mapping used to identify input and evaluation columns (these columns are passed in to the evaluation function & the dataset preprocessor)
    column_mapping={"input_cols": ["dialogue"], "eval_cols": ["summary"]},
    # callbacks if any to run after each inference
    callbacks_after_inference=callbacks_after_inference,
)

In [6]:
# Sanity check of the preprocessing: Tuner stores the processed input in `_X` and the `column_mapping.eval_cols` values in `_y`
print("Inputs: ")
divider = '---' * 15
preview = zip(tuner_ob.dataset['_X'][:3], tuner_ob.dataset['_y'][:3])
for prompt_text, target in preview:
    print(f"Input: {prompt_text}")
    print('\n')
    print(f"Output: {target}")

    print('\n\n')
    print(divider, '\n\n')

Inputs: 
Input: <|im_start|>system
You are Dolphin, a helpful AI assistant.<|im_end|>
<|im_start|>user
Summarize the following text: Lucy: omg did you see JK this morning?
Sue: I try to avoid it lol
Lucy: you should have seen it it was disgusting
Sue: I cant do it anymore i try to listen to the radio in the mornings.. jk makes you think the whole world is full of idiots lol
Lucy: you may be right I dont know how some of them can go on there in public for the world to see
Sue: I would die if I got a call to go on there lol
Sue: could you imagine ha ha 
Lucy: I would piss myself If I saw you and Andy up there
Sue: over my dead body !<|im_end|>
<|im_start|>assistant



Output: {'summary': "Sue doesn't watch JK any more as it's disgusting."}



--------------------------------------------- 


Input: <|im_start|>system
You are Dolphin, a helpful AI assistant.<|im_end|>
<|im_start|>user
Summarize the following text: Wendy: What's up?
Simon: Nothing much. I'm painting my cupboards. 
Angela: C

In [7]:
# Reference score from a previous run of this cell: 0.09896745444141972
# Get score & outputs using some generation parameters
gen_params = {
    'max_new_tokens' : 70,
    'stopping_criteria' : stopping_criteria,
    'generation_seed' : 42,
}

# `get_score` returns the score together with the generated outputs
score, outputs = tuner_ob.get_score(gen_params)

print(f"Score - {score}")


  0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Score - 0.09896745444141972


In [8]:
# Define your hyperparameter space here for the search
# every entry below lists a single value, so this grid contains exactly one candidate
hyp_space = {
    'max_new_tokens' : [70],
    'stopping_criteria' : [stopping_criteria],
    'generation_seed' : [42],
    'do_sample' : [True],

    'temperature': [0.1],
    'top_k': [50],
    'no_repeat_ngram_size': [0],
}

# Pass in estimator & scorer as you do with the scikit-learn API
# `cv = 2` evaluates each candidate on two folds of the data passed to `fit`
clf = GridSearchCV(
    estimator = tuner_ob.estimator,
    param_grid=hyp_space,
    scoring = tuner_ob.scorer,
    cv = 2,
    n_jobs = None,
    verbose=3,
)

In [9]:
# fit on the dataset: `_X` holds the processed inputs and `_y` the evaluation columns prepared by the Tuner
clf.fit(X=tuner_ob.dataset["_X"], y=tuner_ob.dataset['_y'])

Fitting 2 folds for each of 1 candidates, totalling 2 fits


  0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[CV 1/2] END do_sample=True, generation_seed=42, max_new_tokens=70, no_repeat_ngram_size=0, stopping_criteria=[<llmsearch.scripts.stopping_criteria.MultiTokenStoppingCriteria object at 0x7feed80eb8b0>], temperature=0.1, top_k=50;, score=0.144 total time= 1.0min


  0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


[CV 2/2] END do_sample=True, generation_seed=42, max_new_tokens=70, no_repeat_ngram_size=0, stopping_criteria=[<llmsearch.scripts.stopping_criteria.MultiTokenStoppingCriteria object at 0x7feed80eb8b0>], temperature=0.1, top_k=50;, score=0.128 total time=  59.9s


In [10]:
# print out the best parameters found by the fitted search
print(clf.best_params_)

{'do_sample': True, 'generation_seed': 42, 'max_new_tokens': 70, 'no_repeat_ngram_size': 0, 'stopping_criteria': [<llmsearch.scripts.stopping_criteria.MultiTokenStoppingCriteria object at 0x7feed80eb8b0>], 'temperature': 0.1, 'top_k': 50}


In [11]:
# evaluate on the tuned params
# you can also get a score on another dataset by passing it as the `dataset` argument of `get_score`; note that it is processed the same way as the `dataset` passed to the `Tuner` class
# NOTE: this rebinds `outputs` from the earlier `get_score` call
scores, outputs = tuner_ob.get_score(clf.best_params_)

  0%|          | 0/2 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


In [12]:
# despite the plural name, `scores` holds a single aggregate value here
print(f"Scores - {scores}")

Scores - 0.12349276113207769


### Logging Utils

In [13]:
from llmsearch.utils.logging_utils import set_verbosity_info, set_verbosity_warning, set_verbosity_debug

# set verbosity to debug, useful to debug model outputs
# (at this level the inference logs include the input prompt and the model output)
set_verbosity_debug()

In [14]:
# Calculate score on a different dataset
# here the "different" dataset is simply the first two rows of `sample_dataset`, rebuilt as a new `datasets.Dataset`
scores, outputs = tuner_ob.get_score(gen_params, dataset = datasets.Dataset.from_dict(sample_dataset[:2]))



Map:   0%|          | 0/2 [00:00<?, ? examples/s]

2024-05-23 19:55:39.202 - llmsearch.utils.mem_utils:154 - INFO - Starting inference with generation parameters - {'max_new_tokens': 70, 'stopping_criteria': [<llmsearch.scripts.stopping_criteria.MultiTokenStoppingCriteria object at 0x7feed80eb8b0>], 'generation_seed': 42}
2024-05-23 19:55:39.203 - llmsearch.utils.mem_utils:158 - INFO - Performing inference with batch_size - 2
2024-05-23 19:55:39.204 - llmsearch.utils.model_utils:98 - INFO - Detected generation type - Greedy Decoding


  0%|          | 0/1 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
2024-05-23 19:56:40.565 - llmsearch.utils.model_utils:149 - DEBUG - Input - '<|im_start|>system\nYou are Dolphin, a helpful AI assistant.<|im_end|>\n<|im_start|>user\nSummarize the following text: Lucy: omg did you see JK this morning?\r\nSue: I try to avoid it lol\r\nLucy: you should have seen it it was disgusting\r\nSue: I cant do it anymore i try to listen to the radio in the mornings.. jk makes you think the whole world is full of idiots lol\r\nLucy: you may be right I dont know how some of them can go on there in public for the world to see\r\nSue: I would die if I got a call to go on there lol\r\nSue: could you imagine ha ha \r\nLucy: I would piss myself If I saw you and Andy up there\r\nSue: over my dead body !<|im_end|>\n<|im_start|>assistant\n'
2024-05-23 19:56:40.567 - llmsearch.utils.model_utils:150 - DEBUG - Model Output - "Lucy and Sue discuss JK's morning show, which Lucy finds disgusting. Sue prefer