In [1]:
# Autocompletion
%config Completer.use_jedi = False

# Autoreload  
%load_ext autoreload
%autoreload 2

In [2]:
# Autoreload
import sys
from typing import List

sys.path.append('../')

import numpy as np
from IPython.display import Audio, display


import nltk
import torch
import numpy as np
import datasets
import pandas as pd
import transformers

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, T5ForConditionalGeneration, AutoModelForSeq2SeqLM

In [3]:
import llmsearch

Monkey Patching .generate function of `transformers` library


In [4]:
device = "cpu"

if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"

from llmsearch.utils.mem_utils import gc_cuda

print(f"Device - {device}")

def beep(duration = 1, frequency=440, rhythm=1):
    sample_rate = 44100  # Standard audio sample rate
    t = np.linspace(0, duration, int(duration * sample_rate), endpoint=False)
    audio_data = np.sin(2*np.pi*frequency*t)  # Generate a sine wave
    audio_data *= np.where(np.arange(len(audio_data)) % rhythm == 0, 1, 0)  # Apply rhythm
    display(Audio(audio_data, rate=sample_rate, autoplay=True))

Device - mps


In [5]:
dataset = datasets.load_dataset("samsum")

Found cached dataset samsum (/Users/praful932/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
sample_size = 100
samples_to_tune_on = datasets.Dataset.from_dict(dataset["train"][:sample_size])
samples_to_tune_on = samples_to_tune_on.rename_columns(column_mapping = {'dialogue' : 'X', 'summary' : "y"})

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast = False)
model =  AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)



In [8]:
import langchain

X = samples_to_tune_on[0]['X']

pt = langchain.PromptTemplate.from_template("Conversation: {X}\nSummary:")

print(pt.format_prompt(X = X).to_string())

Conversation: Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)
Summary:


In [9]:
import evaluate

rouge_metric = evaluate.load("rouge")

def postprocess_text(preds, labels):
    preds = [pred.strip() for pred in preds]
    labels = [label.strip() for label in labels]
    

    # rougeLSum expects newline after each sentence
    preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

    return preds, labels


def get_rouge_score(y_true: List, y_pred: List):
    preds, gts = postprocess_text(preds=y_pred, labels=y_true)

    result = rouge_metric.compute(predictions=preds, references=gts, use_stemmer=True)
    return result['rouge2']

In [10]:
from llmsearch.tuner import Tuner
from llmsearch.utils.mem_utils import get_total_available_ram, get_gpu_information
from llmsearch.utils.logging_utils import set_verbosity_info, set_verbosity_debug
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

seed = 42

set_verbosity_info()


In [11]:
tuner_ob = Tuner(model = model,tokenizer = tokenizer,dataset = samples_to_tune_on,
                 device = device, batch_size = 512,
                 tokenizer_encoding_kwargs={'padding': True, 'truncation': True, 'max_length': 512},
                 tokenizer_decoding_kwargs = {'skip_special_tokens' : True,  'spaces_between_special_tokens' : False}, 
                 scorer = get_rouge_score, prompt_template = pt, is_encoder_decoder = True, seed = seed, column_mapping = {"text_column_name": "X", "label_column_name": "y"})


Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [12]:
len(samples_to_tune_on['X'])

100

In [None]:
# Earlier
from llmsearch.utils.model_utils import seed_everything


"""
parameters and how they affect do_sample == False
1. temperature - output does not change - greedy decoding
2. top_k - output does not change - greedy decoding
3. repetition_penalty - output changes
4. no_repeat_ngram_size - output changes
"""

# seed_everything(seed)

initial_generation_params1 = {
    'max_new_tokens' : 120,
    'repetition_penalty'  : 0.6,
    'repetition_penalty_range'  : 5,
#     'temperature' : 0.7,
#     'do_sample' : True,
#     'generation_seed' : 42
#     'mirostat_mode': 2,
#     'mirostat_tau': 8,
}
score, outputs1 = tuner_ob.get_score(initial_generation_params1)


2023-08-21 23:35:01.231 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'max_new_tokens': 120, 'repetition_penalty': 0.6, 'repetition_penalty_range': 5}
2023-08-21 23:35:01.231 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-08-21 23:35:01.231 - llmsearch.utils.model_utils:101 - INFO - Detected generation type - Greedy Decoding


  0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
for y_t, y_p in zip(tuner_ob.dataset['y'], outputs1):
    print(y_p)
    print(y_t,'\n\n')

Amanda baked cookies. Jerry will bring Amanda tomorrow.
Amanda baked cookies and will bring Jerry some tomorrow. 


Olivia is voting for Liberals.
Olivia and Olivier are voting for liberals in this election.  


Kim is going to do a lot of stuff but he's procrastinating. Kim will do everything he wants to do tomorrow. Kim will eat some defrosted veggies. Tim will do a Pomodoro technique where he uses breaks for doing chores.
Kim may try the pomodoro technique recommended by Tim to get more stuff done. 


Edward is in ove with Rachel. Rachel is outside.
Edward thinks he is in love with Bella. Rachel wants Edward to open his door. Rachel is outside.  


Sam is worried about rick saying something to her roommate. Sam is worried about her roommate.
Sam is confused, because he overheard Rick complaining about him as a roommate. Naomi thinks Sam should talk to Rick. Sam is not sure what to do. 


Neville got married on September 17. Wyatt will ask her wife to check on her wedding anniversary

In [19]:
print(outputs1)

['Amanda baked cookies. Jerry will bring Amanda tomorrow.', 'Olivia is voting for Liberals.', "Kim is going to do a lot of stuff but she's procrastin", 'Edward is in ove with Rachel.', 'Sam is worried about rick saying something to her roommate. He told them that he', 'Neville got married on September 17. Wyatt will ask her wife to check on her wedding', 'Cassandra will check the homework for John in 20 minutes. John will check it', "James found a song on youtube. He doesn't like it. He listens", 'Noah quit his job and quit his job. Madison thinks Noah liked it.', 'Matt and Agnes are going to the Georgian restaurant in Kazimierz on Saturday at', 'Lucas is going to Death & Co tonight at 10 pm.', 'Mark will send George the tracking number tomorrow.', 'Anita is at the station in Bologna. Jenny is not able to get', 'Arthur is looking for a junior project manager at his company. He has an offer as', 'Macca is in Iceland. She has never done ice climbing before.', 'Isabella is not able to get

In [17]:
print(outputs1)

['Amanda will bring Jerry some cookies tomorrow.', 'Olivia and Oliver are voting for Liberals.', "Kim is going to do some toy stuff he's going to do tomorrow. Kim will use Pomodoro technique. Kim and Kim want to do something other than to do things.", 'Rachel is invited to a party outside.', 'Sam is being bullied by rick. Naomi is not going to talk to him. Sam is not sure if he is going to talk with him.', 'Neville got married on September 17. Wyatt will ask her wife about the issue.', "Cassandra had some homework for her next week. John will check the homework for her tomorrow. Cassandra has a boss with him because he doesn't want to fix everything for everyone. John will drive her home and go to a bar. Cassandra and John will drive her home.", 'James has found a song on your playlist. James loves the song with music. He is upset that the music has no lyrics.', 'Noah has quit his job and has a new boss.', 'Matt will pick Agnes up on Saturday at 6 pm for a date.', 'Lucas and Demi will 

In [13]:
hyp_param_grid = {
    "max_new_tokens": [120],
    "temperature": list(np.linspace(start=0.1, stop=1.0,num=10)),
    'top_k' : list(map(int,np.linspace(start=10, stop=50,num=5))),
    "top_p": [0.75, 0.8, 0.9, 1.0],
    'do_sample' : [True, False],
    'generation_seed' : [42],
    'repetition_penalty' : [1.0, 1.2],
    'no_repeat_ngram_size' : [0,2,3],
}

hyp_param_grid_2= {
    "max_new_tokens": [120],
    "temperature": list(np.linspace(start=0.1, stop=1.0,num=10000)),
    'top_k' : list(map(int,np.linspace(start=10, stop=50,num=5000))),
    "top_p": list(map(int,np.linspace(start=10, stop=50,num=5000))),
    'do_sample' : [True],
    'generation_seed' : [42],
    'num_beams' : [1],
#     'repetition_penalty' : [1.0, 1.2],
#     'no_repeat_ngram_size' : [0,2,3],
}

scorer = make_scorer(score_func=get_rouge_score, greater_is_better=True)


clf = RandomizedSearchCV(
    estimator=tuner_ob.estimator,
    param_distributions=hyp_param_grid_2,
    n_iter = 2,
    scoring=scorer,
    cv=5,
    random_state = 42,
    n_jobs=None,
)

"""
5 fold means, whole sample set of 100 examples will be split into 80:20 ratio
for each hyper_parameter set we have a model f(hyper_params)
    - we will evaluate this model and get the cross val score (test on each 20 samples 5 times, while training on the rest 80 each time)
    - we get the score on the quality of hyperparams by evaluating the model with the hyperparams on the unseen 1 fold
"""


'\n5 fold means, whole sample set of 100 examples will be split into 80:20 ratio\nfor each hyper_parameter set we have a model f(hyper_params)\n    - we will evaluate this model and get the cross val score (test on each 20 samples 5 times, while training on the rest 80 each time)\n    - we get the score on the quality of hyperparams by evaluating the model with the hyperparams on the unseen 1 fold\n'

In [1]:
1 = 1/0

SyntaxError: cannot assign to literal (826511956.py, line 1)

In [14]:
clf.fit(X=tuner_ob.dataset["X"], y=tuner_ob.dataset["y"])

2023-08-13 16:09:42.692 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'top_p': 36, 'top_k': 17, 'temperature': 0.6999099909990999, 'num_beams': 1, 'max_new_tokens': 120, 'generation_seed': 42, 'do_sample': True}
2023-08-13 16:09:42.692 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-08-13 16:09:42.693 - llmsearch.utils.model_utils:101 - INFO - Detected generation type - Sampling


  0%|          | 0/1 [00:00<?, ?it/s]

  input_ids = input_ids.repeat_interleave(expand_size, dim=0)
  if unfinished_sequences.max() == 0:
2023-08-13 16:09:56.295 - llmsearch.utils.mem_utils:179 - INFO - Finished running inference, took 13.602621 secs
2023-08-13 16:09:56.384 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'top_p': 36, 'top_k': 17, 'temperature': 0.6999099909990999, 'num_beams': 1, 'max_new_tokens': 120, 'generation_seed': 42, 'do_sample': True}
2023-08-13 16:09:56.385 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-08-13 16:09:56.385 - llmsearch.utils.model_utils:101 - INFO - Detected generation type - Sampling


  0%|          | 0/1 [00:00<?, ?it/s]

2023-08-13 16:10:00.067 - llmsearch.utils.mem_utils:179 - INFO - Finished running inference, took 3.681773 secs
2023-08-13 16:10:00.154 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'top_p': 36, 'top_k': 17, 'temperature': 0.6999099909990999, 'num_beams': 1, 'max_new_tokens': 120, 'generation_seed': 42, 'do_sample': True}
2023-08-13 16:10:00.155 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-08-13 16:10:00.155 - llmsearch.utils.model_utils:101 - INFO - Detected generation type - Sampling


  0%|          | 0/1 [00:00<?, ?it/s]

2023-08-13 16:10:03.307 - llmsearch.utils.mem_utils:179 - INFO - Finished running inference, took 3.152030 secs
2023-08-13 16:10:03.395 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'top_p': 36, 'top_k': 17, 'temperature': 0.6999099909990999, 'num_beams': 1, 'max_new_tokens': 120, 'generation_seed': 42, 'do_sample': True}
2023-08-13 16:10:03.395 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-08-13 16:10:03.395 - llmsearch.utils.model_utils:101 - INFO - Detected generation type - Sampling


  0%|          | 0/1 [00:00<?, ?it/s]

2023-08-13 16:10:10.357 - llmsearch.utils.mem_utils:179 - INFO - Finished running inference, took 6.961224 secs
2023-08-13 16:10:10.445 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'top_p': 36, 'top_k': 17, 'temperature': 0.6999099909990999, 'num_beams': 1, 'max_new_tokens': 120, 'generation_seed': 42, 'do_sample': True}
2023-08-13 16:10:10.445 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-08-13 16:10:10.446 - llmsearch.utils.model_utils:101 - INFO - Detected generation type - Sampling


  0%|          | 0/1 [00:00<?, ?it/s]

2023-08-13 16:10:12.834 - llmsearch.utils.mem_utils:179 - INFO - Finished running inference, took 2.388126 secs
2023-08-13 16:10:12.918 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'top_p': 15, 'top_k': 45, 'temperature': 0.5357335733573357, 'num_beams': 1, 'max_new_tokens': 120, 'generation_seed': 42, 'do_sample': True}
2023-08-13 16:10:12.919 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-08-13 16:10:12.919 - llmsearch.utils.model_utils:101 - INFO - Detected generation type - Sampling


  0%|          | 0/1 [00:00<?, ?it/s]

2023-08-13 16:10:15.993 - llmsearch.utils.mem_utils:179 - INFO - Finished running inference, took 3.073820 secs
2023-08-13 16:10:16.076 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'top_p': 15, 'top_k': 45, 'temperature': 0.5357335733573357, 'num_beams': 1, 'max_new_tokens': 120, 'generation_seed': 42, 'do_sample': True}
2023-08-13 16:10:16.076 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-08-13 16:10:16.077 - llmsearch.utils.model_utils:101 - INFO - Detected generation type - Sampling


  0%|          | 0/1 [00:00<?, ?it/s]

2023-08-13 16:10:21.293 - llmsearch.utils.mem_utils:179 - INFO - Finished running inference, took 5.215927 secs
2023-08-13 16:10:21.380 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'top_p': 15, 'top_k': 45, 'temperature': 0.5357335733573357, 'num_beams': 1, 'max_new_tokens': 120, 'generation_seed': 42, 'do_sample': True}
2023-08-13 16:10:21.381 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-08-13 16:10:21.381 - llmsearch.utils.model_utils:101 - INFO - Detected generation type - Sampling


  0%|          | 0/1 [00:00<?, ?it/s]

2023-08-13 16:10:23.803 - llmsearch.utils.mem_utils:179 - INFO - Finished running inference, took 2.421913 secs
2023-08-13 16:10:23.889 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'top_p': 15, 'top_k': 45, 'temperature': 0.5357335733573357, 'num_beams': 1, 'max_new_tokens': 120, 'generation_seed': 42, 'do_sample': True}
2023-08-13 16:10:23.889 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-08-13 16:10:23.889 - llmsearch.utils.model_utils:101 - INFO - Detected generation type - Sampling


  0%|          | 0/1 [00:00<?, ?it/s]

2023-08-13 16:10:26.772 - llmsearch.utils.mem_utils:179 - INFO - Finished running inference, took 2.882948 secs
2023-08-13 16:10:26.860 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'top_p': 15, 'top_k': 45, 'temperature': 0.5357335733573357, 'num_beams': 1, 'max_new_tokens': 120, 'generation_seed': 42, 'do_sample': True}
2023-08-13 16:10:26.861 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-08-13 16:10:26.861 - llmsearch.utils.model_utils:101 - INFO - Detected generation type - Sampling


  0%|          | 0/1 [00:00<?, ?it/s]

2023-08-13 16:10:29.732 - llmsearch.utils.mem_utils:179 - INFO - Finished running inference, took 2.871265 secs


In [15]:
clf.best_params_

{'top_p': 15,
 'top_k': 45,
 'temperature': 0.5357335733573357,
 'max_new_tokens': 120,
 'generation_seed': 42,
 'do_sample': True}

In [24]:
clf.best_estimator_

In [25]:
clf.best_estimator_.set_params(**clf.best_params_).get_params()


{'model': T5ForConditionalGeneration(
   (shared): Embedding(32128, 512)
   (encoder): T5Stack(
     (embed_tokens): Embedding(32128, 512)
     (block): ModuleList(
       (0): T5Block(
         (layer): ModuleList(
           (0): T5LayerSelfAttention(
             (SelfAttention): T5Attention(
               (q): Linear(in_features=512, out_features=384, bias=False)
               (k): Linear(in_features=512, out_features=384, bias=False)
               (v): Linear(in_features=512, out_features=384, bias=False)
               (o): Linear(in_features=384, out_features=512, bias=False)
               (relative_attention_bias): Embedding(32, 6)
             )
             (layer_norm): T5LayerNorm()
             (dropout): Dropout(p=0.1, inplace=False)
           )
           (1): T5LayerFF(
             (DenseReluDense): T5DenseGatedActDense(
               (wi_0): Linear(in_features=512, out_features=1024, bias=False)
               (wi_1): Linear(in_features=512, out_features=1024, b

In [None]:
clf.fi

In [None]:
from sklearn.base import clone

clone(1)

In [None]:
dir(clf)

In [None]:
clf.best_estimator_.get_params()

In [None]:
clf.best_estimator_.get_params()

In [None]:
clf.cv_results_

In [None]:
clf.best_estimator_.get_params()