In [1]:
# Autocompletion
%config Completer.use_jedi = False

# Autoreload
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('/workspace/llmsearch/')

import awq
import torch
import transformers
import llmsearch
import evaluate
import datasets
import numpy as np

from awq import AutoAWQForCausalLM
from sklearn.model_selection import GridSearchCV
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteriaList


from llmsearch.tuner import Tuner
from llmsearch.scripts.stopping_criteria import MultiTokenStoppingCriteria

  from .autonotebook import tqdm as notebook_tqdm


Monkey Patching .generate function of `transformers` library


In [2]:
seed = 42
batch_size = 2
num_samples = 10
device = "cuda:0"

In [3]:
import copy

In [4]:
model_id = "Praful932/dolphin-2.2.1-mistral-7b-samsum-ft-v1-GPTQ"
revision = "main"
tokenizer = AutoTokenizer.from_pretrained(model_id,revision = revision)
tokenizer.padding_side = "left"
# model = AutoAWQForCausalLM.from_quantized(
#         model_id, fuse_layers=True, device_map={"": device}, revision = revision
#     )
model = AutoModelForCausalLM.from_pretrained(model_id, device_map={"": device})

dataset = datasets.load_dataset("samsum")['train']
sample_dataset = dataset.shuffle(seed = seed).select(range(num_samples))
test_dataset = copy.deepcopy(datasets.Dataset.from_dict(sample_dataset[:2]))

# These are required to make the model end the sequence correctly - https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct#transformers-automodelforcausallm
terminators = [
    128001,
    128009,
]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at Praful932/dolphin-2.2.1-mistral-7b-samsum-ft-v1-GPTQ were not used when initializing MistralForCausalLM: ['model.layers.0.mlp.down_proj.bias', 'model.layers.0.mlp.gate_proj.bias', 'model.layers.0.mlp.up_proj.bias', 'model.layers.0.self_attn.k_proj.bias', 'model.layers.0.self_attn.o_proj.bias', 'model.layers.0.self_attn.q_proj.bias', 'model.layers.0.self_attn.v_proj.bias', 'model.layers.1.mlp.down_proj.bias', 'model.layers.1.mlp.gate_proj.bias', 'model.layers.1.mlp.up_proj.bias', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.1.self_attn.o_proj.bias', 'model.layers.1.self_attn.q_proj.bias', 'model.layers.1.self_attn.v_proj.bias', 'model.layers.10.mlp.down_proj.bias', 'model.layers.10.mlp.gate_proj.bias', 'model.layers.10.mlp.up_proj.bias', 'model.layers.10.self_attn.k_proj.bias', 'model.layers.10.self_attn.o_proj.bias', 'mod

In [5]:
sample_dataset[:2]

{'id': ['13681220', '13716809'],
 'dialogue': ['Lucy: omg did you see JK this morning?\r\nSue: I try to avoid it lol\r\nLucy: you should have seen it it was disgusting\r\nSue: I cant do it anymore i try to listen to the radio in the mornings.. jk makes you think the whole world is full of idiots lol\r\nLucy: you may be right I dont know how some of them can go on there in public for the world to see\r\nSue: I would die if I got a call to go on there lol\r\nSue: could you imagine ha ha \r\nLucy: I would piss myself If I saw you and Andy up there\r\nSue: over my dead body !',
  "Wendy: What's up?\r\nSimon: Nothing much. I'm painting my cupboards. \r\nAngela: Cool what colour?\r\nSimon: Green.\r\nBen: I'm just chilling in the garden. \r\nAngela: Nice weekend! I'm about to meet Chris.\r\nWendy: Say hello from me!\r\nAngela: Will do! And how is your weekend, Wendy?\r\nWendy: Very lazy... The week was hard at work, I really needed some rest. \r\nBen: We should all come and visit Simon in his

In [6]:
test_dataset = datasets.Dataset.from_dict(sample_dataset[:2])

In [7]:
# create a function that can be used for evaluation, should take in y_true (list[dict]), y_pred (list) and return a single value
rouge = evaluate.load('rouge')
def get_rouge_score(y_true : list, y_pred : list):
    return np.mean(rouge.compute(predictions=y_pred, references=[item['summary'] for item in y_true], use_stemmer=True, use_aggregator=False)['rouge2'])

# Define a dataset preprocessor that is called for every example in the dataset separately - Should take in tokenizer & kwargs and return a string that can be input directly to the model, here we apply chat template which most decoder models use
def sample_to_chat_format(tokenizer, **kwargs):
    messages = [
        {
            'role' : "system",
            'content' : "You are a helpful AI assistant."
        },
        {
            'role' : "user",
            'content' : f"Summarize the following text in less than 50 words: {kwargs['dialogue']}"
        }
    ]
    return tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)

In [8]:
# define tuner object, this preprocesses the dataset and creates an LLMEstimator that can be run with GridSearchCV / RandomizedSearchCV of scikit-learn
tuner_ob = Tuner(
    model=model,
    tokenizer=tokenizer,
    dataset=sample_dataset,
    device="cuda:0",
    # the tuner module automatically reduces the batch size while running inference if it goes OOM
    batch_size=batch_size,
    tokenizer_encode_args={"padding": "longest",'truncation' : True, "add_special_tokens": False, 'max_length' : 1024},
    tokenizer_decode_args={"spaces_between_special_tokens": False, 'skip_special_tokens' : True},
    # pass in the scorer that we will be used to evaluate (input to this function is a batch)
    scorer=get_rouge_score,
    # pass in `dataset` preprocessor, this is run on the passed in dataset before feeding into the model, input of this function is a single example
    sample_preprocessor=sample_to_chat_format,
    seed=seed,
    # column mapping used to identify input and evaluation columns (these columns are passed in to the evaluation function (scorer) & the dataset preprocessor(sample_preprocessor))
    column_mapping={"input_cols": ["dialogue"], "eval_cols": ["summary"]},
)

In [10]:
print(test_dataset)

Dataset({
    features: ['id', 'dialogue', 'summary'],
    num_rows: 2
})


In [15]:
tuner_ob.dataset['_X']

['<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n<|im_start|>user\nSummarize the following text in less than 50 words: Lucy: omg did you see JK this morning?\r\nSue: I try to avoid it lol\r\nLucy: you should have seen it it was disgusting\r\nSue: I cant do it anymore i try to listen to the radio in the mornings.. jk makes you think the whole world is full of idiots lol\r\nLucy: you may be right I dont know how some of them can go on there in public for the world to see\r\nSue: I would die if I got a call to go on there lol\r\nSue: could you imagine ha ha \r\nLucy: I would piss myself If I saw you and Andy up there\r\nSue: over my dead body !<|im_end|>\n<|im_start|>assistant\n',
 "<|im_start|>system\nYou are a helpful AI assistant.<|im_end|>\n<|im_start|>user\nSummarize the following text in less than 50 words: Wendy: What's up?\r\nSimon: Nothing much. I'm painting my cupboards. \r\nAngela: Cool what colour?\r\nSimon: Green.\r\nBen: I'm just chilling in the garden. \r\nA

In [12]:
# Example Logs from the get score function - Calculate score on a different dataset

tokenizer.pad_token = tokenizer.eos_token

gen_params = {
    'max_new_tokens' : 70,
    'generation_seed' : 42,
    'eos_token_id' : terminators,
}

scores, outputs = tuner_ob.get_score(gen_params, dataset = test_dataset)

  0%|          | 0/1 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
100%|██████████| 1/1 [00:17<00:00, 17.57s/it]


In [11]:
from llmsearch.utils.mem_utils import gc_cuda
gc_cuda()

In [16]:
input_cols = tuner_ob.input_cols
eval_cols = tuner_ob.eval_cols


processed_dataset = test_dataset.map(
                lambda sample: {
                    "_X": sample_to_chat_format(tokenizer, **{col: sample[col] for col in input_cols + eval_cols}),
                },
            )

Map: 100%|██████████| 2/2 [00:00<00:00, 289.96 examples/s]
