In [1]:
import torch
import evaluate
import datasets
import numpy as np

from llmsearch.tuner import Tuner
from sklearn.model_selection import GridSearchCV
from llmsearch.scripts.stopping_criteria import MultiTokenStoppingCriteria
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteriaList

# Run configuration
seed = 42         # used for the dataset shuffle and passed to the Tuner
batch_size = 2    # batch size handed to the Tuner
num_samples = 10  # number of shuffled training samples kept for this run

# Load model & tokenizer (tokenizer pads on the left; weights are loaded in float16)
model_id = "cognitivecomputations/dolphin-2.9-llama3-8b"
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side="left")
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

# Load dataset & metric
dataset = datasets.load_dataset("samsum")['train']
sample_dataset = dataset.shuffle(seed=seed).select(range(num_samples))
rouge = evaluate.load('rouge')

# Optional : Define stopping criteria, here we stop a generation of a sequence when `<|im_end|>` is reached.
# The token id is resolved from the tokenizer instead of being hard-coded (it is 128256 for this checkpoint),
# so the cell keeps working if the checkpoint or its vocabulary changes.
stop_token = "<|im_end|>"
stop_token_id = tokenizer.convert_tokens_to_ids(stop_token)
if stop_token_id is None or stop_token_id == tokenizer.unk_token_id:
    raise ValueError(f"Stop token {stop_token!r} is not in the tokenizer vocabulary")
multi_token_stop_criteria_ob = MultiTokenStoppingCriteria(sequence_ids=[stop_token_id])
stopping_criteria = StoppingCriteriaList([multi_token_stop_criteria_ob])
# useful when batching: the reset method is handed to the Tuner as a post-inference callback
callbacks_after_inference = [multi_token_stop_criteria_ob.reset]

# Scorer
def get_rouge_score(y_true, y_pred):
    """Score generated summaries against reference summaries with ROUGE-2.

    Args:
        y_true: iterable of mapping-like items; the 'summary' entry of each is used as the reference.
        y_pred: generated summaries, passed to the metric as predictions.

    Returns:
        The mean of the 'rouge2' values returned by the metric (computed with
        stemming enabled and aggregation disabled).
    """
    reference_summaries = [sample['summary'] for sample in y_true]
    rouge_result = rouge.compute(
        predictions=y_pred,
        references=reference_summaries,
        use_stemmer=True,
        use_aggregator=False,
    )
    return np.mean(rouge_result['rouge2'])

# To process dataset to chat format
def sample_to_chat_format(tokenizer, **kwargs):
    """Turn one dataset sample into a chat prompt via the tokenizer's chat template.

    Args:
        tokenizer: object exposing `apply_chat_template`.
        **kwargs: fields of the sample; only `dialogue` is read.

    Returns:
        Whatever `tokenizer.apply_chat_template` returns for a system + user
        conversation, requested untokenized and with the generation prompt appended.

    Raises:
        KeyError: if no `dialogue` field is supplied.
    """
    system_turn = {'role': "system", 'content': "You are Dolphin, a helpful AI assistant."}
    user_turn = {'role': "user", 'content': f"Summarize the following text: {kwargs['dialogue']}"}
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

# Build the tuner from the model, tokenizer, sampled dataset, scorer and prompt preprocessor
tuner_ob = Tuner(
    model=model,
    tokenizer=tokenizer,
    dataset=sample_dataset,
    device="cuda:0",
    batch_size=batch_size,
    # Encoding: pad to the longest item, truncate at 1024 tokens, add no special tokens
    tokenizer_encode_args=dict(
        padding="longest",
        truncation=True,
        add_special_tokens=False,
        max_length=1024,
    ),
    # Decoding: drop special tokens from the decoded text
    tokenizer_decode_args=dict(
        spaces_between_special_tokens=False,
        skip_special_tokens=True,
    ),
    scorer=get_rouge_score,
    sample_preprocessor=sample_to_chat_format,
    seed=seed,
    # `dialogue` is the input column, `summary` the column used for evaluation
    column_mapping=dict(input_cols=["dialogue"], eval_cols=["summary"]),
    callbacks_after_inference=callbacks_after_inference,
)

# Sanity check: show the first two processed inputs held by the tuner
print(tuner_ob.dataset['_X'][:2])

# Baseline: score and outputs with arbitrary generation parameters, to check performance before tuning
gen_params = dict(
    max_new_tokens=70,
    stopping_criteria=stopping_criteria,
    generation_seed=42,
)
score, outputs = tuner_ob.get_score(gen_params)

print(score)

# Define your hyperparameter space here for the search (each key maps to a list of candidate values)
hyp_space = dict(
    max_new_tokens=[70],
    stopping_criteria=[stopping_criteria],
    generation_seed=[42],
    do_sample=[True],
    temperature=[0.1],
    top_k=[50],
    no_repeat_ngram_size=[0],
)

# Pass in estimator & scorer as you do with the scikit-learn API
clf = GridSearchCV(
    estimator=tuner_ob.estimator,
    param_grid=hyp_space,
    scoring=tuner_ob.scorer,
    cv=2,
    n_jobs=None,
    verbose=3,
)

# Run the search on the tuner's '_X' / '_y' columns
clf.fit(X=tuner_ob.dataset["_X"], y=tuner_ob.dataset['_y'])

# Get the best params
print(clf.best_params_)

  from .autonotebook import tqdm as notebook_tqdm


Monkey Patching .generate function of `transformers` library


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:22<00:00,  5.52s/it]


['<|im_start|>system\nYou are Dolphin, a helpful AI assistant.<|im_end|>\n<|im_start|>user\nSummarize the following text: Lucy: omg did you see JK this morning?\r\nSue: I try to avoid it lol\r\nLucy: you should have seen it it was disgusting\r\nSue: I cant do it anymore i try to listen to the radio in the mornings.. jk makes you think the whole world is full of idiots lol\r\nLucy: you may be right I dont know how some of them can go on there in public for the world to see\r\nSue: I would die if I got a call to go on there lol\r\nSue: could you imagine ha ha \r\nLucy: I would piss myself If I saw you and Andy up there\r\nSue: over my dead body !<|im_end|>\n<|im_start|>assistant\n', "<|im_start|>system\nYou are Dolphin, a helpful AI assistant.<|im_end|>\n<|im_start|>user\nSummarize the following text: Wendy: What's up?\r\nSimon: Nothing much. I'm painting my cupboards. \r\nAngela: Cool what colour?\r\nSimon: Green.\r\nBen: I'm just chilling in the garden. \r\nAngela: Nice weekend! I'm ab

  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 20%|██        | 1/5 [00:59<03:57, 59.36s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 40%|████      | 2/5 [01:54<02:50, 56.80s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 60%|██████    | 3/5 [02:43<01:46, 53.09s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 80%|████████  | 4/5 [03:30<00:50, 50.97s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 5/5 [04:18<00:00, 51.74s/it]


0.12281033543101956
Fitting 2 folds for each of 1 candidates, totalling 2 fits


  0%|          | 0/3 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 0/5 [00:16<?, ?it/s]


KeyboardInterrupt: 

In [22]:
# Requires accelerate==0.27.2 py7zr==0.21.0 evaluate==0.4.0 rouge_score==0.1.2


import torch
import evaluate
import datasets
import numpy as np

from llmsearch.tuner import Tuner
from sklearn.model_selection import GridSearchCV
from llmsearch.scripts.stopping_criteria import MultiTokenStoppingCriteria
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteriaList

In [2]:
# Checkpoint under test; the tokenizer pads on the left and the weights load in float16
# with automatic device placement.
model_id = "cognitivecomputations/dolphin-2.9-llama3-8b"
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    padding_side="left",
)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    device_map="auto",
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Loading checkpoint shards: 100%|██████████| 4/4 [00:19<00:00,  4.85s/it]


In [3]:
# Load the 'train' split of the "samsum" dataset
dataset = datasets.load_dataset("samsum")['train']

In [12]:

# Run configuration: one place for the values a reader may want to tune.
seed = 42         # used for the dataset shuffle and passed to the Tuner
batch_size = 2    # matches the value the Tuner cell actually runs with
num_samples = 10  # number of shuffled training samples kept for this run

# Deterministic subsample: shuffle with the fixed seed, then keep the first `num_samples` rows
sample_dataset = dataset.shuffle(seed=seed).select(range(num_samples))

In [13]:
# Stop generating a sequence once the end-of-turn token `<|im_end|>` is produced.
# The token id is resolved from the tokenizer instead of being hard-coded (it is 128256 for this checkpoint),
# so the cell keeps working if the checkpoint or its vocabulary changes.
stop_token = "<|im_end|>"
stop_token_id = tokenizer.convert_tokens_to_ids(stop_token)
if stop_token_id is None or stop_token_id == tokenizer.unk_token_id:
    raise ValueError(f"Stop token {stop_token!r} is not in the tokenizer vocabulary")
multi_token_stop_criteria_ob = MultiTokenStoppingCriteria(sequence_ids=[stop_token_id])
stopping_criteria = StoppingCriteriaList([multi_token_stop_criteria_ob])
# The criteria object's reset method is handed to the Tuner as a post-inference callback
callbacks_after_inference = [multi_token_stop_criteria_ob.reset]

In [14]:


# Load the ROUGE metric; get_rouge_score reads its 'rouge2' values
rouge = evaluate.load('rouge')



In [15]:

def get_rouge_score(y_true, y_pred):
    """Score generated summaries against reference summaries with ROUGE-2.

    Args:
        y_true: iterable of mapping-like items; the 'summary' entry of each is used as the reference.
        y_pred: generated summaries, passed to the metric as predictions.

    Returns:
        The mean of the 'rouge2' values returned by the metric (computed with
        stemming enabled and aggregation disabled).
    """
    reference_summaries = [sample['summary'] for sample in y_true]
    rouge_result = rouge.compute(
        predictions=y_pred,
        references=reference_summaries,
        use_stemmer=True,
        use_aggregator=False,
    )
    return np.mean(rouge_result['rouge2'])

In [16]:
def sample_to_chat_format(tokenizer, **kwargs):
    """Turn one dataset sample into a chat prompt via the tokenizer's chat template.

    Args:
        tokenizer: object exposing `apply_chat_template`.
        **kwargs: fields of the sample; only `dialogue` is read.

    Returns:
        Whatever `tokenizer.apply_chat_template` returns for a system + user
        conversation, requested untokenized and with the generation prompt appended.

    Raises:
        KeyError: if no `dialogue` field is supplied.
    """
    system_turn = {'role': "system", 'content': "You are Dolphin, a helpful AI assistant."}
    user_turn = {'role': "user", 'content': f"Summarize the following text: {kwargs['dialogue']}"}
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )


In [17]:
# Batch size the tuner runs with (overrides any value assigned in an earlier cell)
batch_size = 2

# Build the tuner from the model, tokenizer, sampled dataset, scorer and prompt preprocessor
tuner_ob = Tuner(
    model=model,
    tokenizer=tokenizer,
    dataset=sample_dataset,
    device="cuda:0",
    batch_size=batch_size,
    # Encoding: pad to the longest item, truncate at 1024 tokens, add no special tokens
    tokenizer_encode_args=dict(
        padding="longest",
        truncation=True,
        add_special_tokens=False,
        max_length=1024,
    ),
    # Decoding: drop special tokens from the decoded text
    tokenizer_decode_args=dict(
        spaces_between_special_tokens=False,
        skip_special_tokens=True,
    ),
    scorer=get_rouge_score,
    sample_preprocessor=sample_to_chat_format,
    seed=seed,
    # `dialogue` is the input column, `summary` the column used for evaluation
    column_mapping=dict(input_cols=["dialogue"], eval_cols=["summary"]),
    callbacks_after_inference=callbacks_after_inference,
)

In [18]:
# Baseline generation parameters, used to score the model before any search
gen_params1 = dict(
    max_new_tokens=70,
    stopping_criteria=stopping_criteria,
    generation_seed=42,
)

# get_score returns the score together with the generated outputs
scores_before, outputs_before = tuner_ob.get_score(gen_params1)

  0%|          | 0/5 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 20%|██        | 1/5 [00:48<03:15, 48.87s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 40%|████      | 2/5 [01:37<02:26, 48.73s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 60%|██████    | 3/5 [02:29<01:40, 50.31s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 80%|████████  | 4/5 [03:13<00:47, 47.81s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 5/5 [04:02<00:00, 48.47s/it]


In [29]:
# Sanity check: print the first two entries of the tuner's '_X' column (the chat-formatted prompts)
print(tuner_ob.dataset['_X'][:2])

['<|im_start|>system\nYou are Dolphin, a helpful AI assistant.<|im_end|>\n<|im_start|>user\nSummarize the following text: Lucy: omg did you see JK this morning?\r\nSue: I try to avoid it lol\r\nLucy: you should have seen it it was disgusting\r\nSue: I cant do it anymore i try to listen to the radio in the mornings.. jk makes you think the whole world is full of idiots lol\r\nLucy: you may be right I dont know how some of them can go on there in public for the world to see\r\nSue: I would die if I got a call to go on there lol\r\nSue: could you imagine ha ha \r\nLucy: I would piss myself If I saw you and Andy up there\r\nSue: over my dead body !<|im_end|>\n<|im_start|>assistant\n', "<|im_start|>system\nYou are Dolphin, a helpful AI assistant.<|im_end|>\n<|im_start|>user\nSummarize the following text: Wendy: What's up?\r\nSimon: Nothing much. I'm painting my cupboards. \r\nAngela: Cool what colour?\r\nSimon: Green.\r\nBen: I'm just chilling in the garden. \r\nAngela: Nice weekend! I'm ab

In [19]:
# Baseline score returned by tuner_ob.get_score, before any hyperparameter search
print(scores_before)

0.12281033543101956


In [20]:
# Inspect the generations returned by the baseline run
outputs_before

["Lucy and Sue discuss JK's morning show, which Lucy finds disgusting. Sue prefers listening to the radio instead. They agree that some guests on the show make them feel like the world is full of idiots. The idea of Sue and Andy being on the show makes Lucy laugh, but Sue firmly rejects the idea.\n\n\nSorry",
 'Wendy, Simon, Angela, and Ben are having a conversation. Simon is painting his cupboards green. Angela is about to meet Chris and is spending her weekend doing activities outdoors. Wendy is enjoying a lazy weekend after a hard week at work. Ben suggests coming to visit Simon in his new apartment, and he plans to be in Bournemouth',
 "Petra is unable to answer Zack's call because she is attending lectures. Zack suggests calling during a break, but Petra is unsure about the exact break time and if the lecturer will adhere to it. As a result, Petra offers to call back during lunch or alternatively, Zack can send a message about the matter since Petra can type and read but cannot",


In [33]:
# NOTE(review): same expression as the earlier display cell, but the recorded output holds a
# single item, so it appears to come from a different run. Re-run top to bottom to confirm.
outputs_before

["Lucy and Sue discuss JK's morning show, which Lucy finds disgusting. Sue prefers listening to the radio instead. They agree that some guests on the show make them feel like the world is full of idiots. The idea of Sue and Andy being on the show makes Lucy laugh, but Sue firmly rejects the idea."]

In [23]:
# Hyperparameter space for the search (each key maps to a list of candidate values)
hyp_space = dict(
    max_new_tokens=[70],
    stopping_criteria=[stopping_criteria],
    generation_seed=[42],
    do_sample=[True],
    temperature=[0.1],
    top_k=[50],
    no_repeat_ngram_size=[0],
)

# Grid search driven by the tuner's estimator and scorer, with 2-fold cross-validation
clf = GridSearchCV(
    estimator=tuner_ob.estimator,
    param_grid=hyp_space,
    scoring=tuner_ob.scorer,
    cv=2,
    n_jobs=None,
    verbose=3,
)

In [24]:
# Run the grid search on the tuner's '_X' (inputs) and '_y' (targets) columns
clf.fit(X=tuner_ob.dataset["_X"], y=tuner_ob.dataset['_y'])

Fitting 2 folds for each of 1 candidates, totalling 2 fits


  0%|          | 0/3 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 33%|███▎      | 1/3 [00:50<01:41, 50.58s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 67%|██████▋   | 2/3 [01:38<00:49, 49.21s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 3/3 [02:08<00:00, 42.78s/it]


[CV 1/2] END do_sample=True, generation_seed=42, max_new_tokens=70, no_repeat_ngram_size=0, stopping_criteria=[<llmsearch.scripts.stopping_criteria.MultiTokenStoppingCriteria object at 0x7ff72c3a2bc0>], temperature=0.1, top_k=50;, score=0.099 total time= 2.1min


  0%|          | 0/3 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 33%|███▎      | 1/3 [00:47<01:34, 47.02s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
 67%|██████▋   | 2/3 [01:37<00:48, 48.88s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 3/3 [01:48<00:00, 36.15s/it]


[CV 2/2] END do_sample=True, generation_seed=42, max_new_tokens=70, no_repeat_ngram_size=0, stopping_criteria=[<llmsearch.scripts.stopping_criteria.MultiTokenStoppingCriteria object at 0x7ff72c3a2bc0>], temperature=0.1, top_k=50;, score=0.136 total time= 1.8min


In [28]:
# Check which token id 128256 decodes to (the stopping criteria is meant to stop on `<|im_end|>`)
tokenizer.decode([128256])

'<|im_end|>'

In [25]:
# Best parameter combination found by the grid search
clf.best_params_

{'do_sample': True,
 'generation_seed': 42,
 'max_new_tokens': 70,
 'no_repeat_ngram_size': 0,
 'stopping_criteria': [<llmsearch.scripts.stopping_criteria.MultiTokenStoppingCriteria at 0x7ff72c3a2bc0>],
 'temperature': 0.1,
 'top_k': 50}