In [1]:
# Turn off Jedi-based completion in the IPython completer.
%config Completer.use_jedi = False

# Load the autoreload extension and enable mode 2 so edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Imports (the parent directory is also appended to sys.path below)
import sys
from typing import List

sys.path.append('../')

import numpy as np
from IPython.display import Audio, display


import nltk
import torch
import numpy as np
import datasets
import pandas as pd
import transformers

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, T5ForConditionalGeneration, AutoModelForSeq2SeqLM

In [3]:
import llmsearch

Monkey-patching the `.generate` function of the `transformers` library


In [4]:
# Pick the compute backend: Apple MPS when it is both built and available,
# otherwise CUDA when available, otherwise fall back to the CPU.
if torch.backends.mps.is_built() and torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

from llmsearch.utils.mem_utils import gc_cuda

print(f"Device - {device}")

def beep(duration = 1, frequency=440, rhythm=1):
    """Play a sine-wave tone inline in the notebook (autoplaying audio widget).

    Args:
        duration: Length of the tone in seconds.
        frequency: Frequency of the sine wave in Hz.
        rhythm: Only samples whose index is a multiple of this value are kept;
            all other samples are zeroed. With the default of 1 every sample
            is kept.
    """
    sample_rate = 44100  # samples per second
    n_samples = int(duration * sample_rate)
    timeline = np.linspace(0, duration, n_samples, endpoint=False)
    waveform = np.sin(2 * np.pi * frequency * timeline)
    # 1 where the sample index is a multiple of `rhythm`, 0 elsewhere.
    keep_mask = np.where(np.arange(len(waveform)) % rhythm == 0, 1, 0)
    waveform *= keep_mask
    display(Audio(waveform, rate=sample_rate, autoplay=True))

Device - mps


In [5]:
# Load the "samsum" dataset through the Hugging Face `datasets` library;
# its "train" split is sampled below for tuning.
dataset = datasets.load_dataset("samsum")

Found cached dataset samsum (/Users/praful932/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Tune on a small slice: the first `sample_size` training rows, rebuilt as a
# standalone Dataset with the columns renamed to "X" (dialogue) and "y" (summary).
sample_size = 5
samples_to_tune_on = datasets.Dataset.from_dict(
    dataset["train"][:sample_size]
).rename_columns(column_mapping={"dialogue": "X", "summary": "y"})

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Checkpoint used for both the tokenizer and the seq2seq model.
model_id = "google/flan-t5-small"

# Request the non-fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)

# Load the weights, then move the model onto the selected device.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
model = model.to(device)



In [8]:
import langchain

# First dialogue of the tuning sample, used to preview the rendered prompt.
X = samples_to_tune_on[0]['X']

# Prompt template with a single `{X}` placeholder for the dialogue.
pt = langchain.PromptTemplate.from_template("Conversation: {X}\nSummary:")

# Render the template for the sample dialogue and show the resulting string.
prompt_preview = pt.format_prompt(X=X)
print(prompt_preview.to_string())

Conversation: Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)
Summary:


In [9]:
import evaluate

# Load the ROUGE metric through the `evaluate` library; get_rouge_score below
# calls its `compute` method.
rouge_metric = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Prepare predictions and references for ROUGE scoring.

    Every string is stripped of surrounding whitespace, split into sentences
    with ``nltk.sent_tokenize`` and re-joined with one sentence per line,
    because rougeLSum expects a newline after each sentence.

    Args:
        preds: Iterable of predicted summary strings.
        labels: Iterable of reference summary strings.

    Returns:
        Tuple ``(preds, labels)`` of lists of newline-joined strings.
    """
    cleaned_preds = [text.strip() for text in preds]
    cleaned_labels = [text.strip() for text in labels]

    def _one_sentence_per_line(text):
        # One sentence per line, as rougeLSum expects.
        return "\n".join(nltk.sent_tokenize(text))

    return (
        [_one_sentence_per_line(text) for text in cleaned_preds],
        [_one_sentence_per_line(text) for text in cleaned_labels],
    )


def get_rouge_score(y_true: List, y_pred: List):
    """Return the ROUGE-2 score of predictions against references.

    Args:
        y_true: Reference summaries.
        y_pred: Predicted summaries.

    Returns:
        The ``'rouge2'`` entry of the result computed by ``rouge_metric``
        (with stemming enabled).
    """
    predictions, references = postprocess_text(preds=y_pred, labels=y_true)
    scores = rouge_metric.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True,
    )
    return scores['rouge2']

In [10]:
from llmsearch.tuner import Tuner
from llmsearch.utils.mem_utils import get_total_available_ram, get_gpu_information
from llmsearch.utils.logging_utils import set_verbosity_info, set_verbosity_debug
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Seed handed to the Tuner constructed in the next cell.
seed = 42

# Presumably switches llmsearch logging to INFO level (helper imported from
# llmsearch.utils.logging_utils) — confirm against the library.
set_verbosity_info()


In [11]:
# Build the llmsearch Tuner around the model, tokenizer and tuning samples.
tuner_ob = Tuner(
    model=model,
    tokenizer=tokenizer,
    dataset=samples_to_tune_on,
    device=device,
    batch_size=512,
    tokenizer_encoding_kwargs={'padding': True, 'truncation': True, 'max_length': 512},
    tokenizer_decoding_kwargs={'skip_special_tokens': True, 'spaces_between_special_tokens': False},
    scorer=get_rouge_score,
    prompt_template=pt,
    is_encoder_decoder=True,
    seed=seed,
    # The tuning sample's columns are named "X" (text) and "y" (label).
    column_mapping={"text_column_name": "X", "label_column_name": "y"},
)


Map:   0%|          | 0/5 [00:00<?, ? examples/s]

In [15]:
# Earlier
from llmsearch.utils.model_utils import seed_everything


"""
parameters and how they affect do_sample == False
1. temperature - output does not change - greedy decoding
2. top_k - output does not change - greedy decoding
3. repetition_penalty - output changes
4. no_repeat_ngram_size - output changes
"""

# seed_everything(seed)

# Generation parameters for a single sampling run with the mirostat options set.
initial_generation_params1 = dict(
    max_new_tokens=120,
    do_sample=True,
    generation_seed=42,
    mirostat_mode=2,
    mirostat_tau=5,
)
# Options tried earlier and currently disabled:
#     'repetition_penalty'  : 0.6,
#     'repetition_penalty_range'  : 5,
#     'temperature' : 0.7,
#     'top_a' : 0.1,

# Run inference with these parameters; returns the score and the generated outputs.
score, outputs1 = tuner_ob.get_score(initial_generation_params1)



2023-09-03 19:55:51.443 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'max_new_tokens': 120, 'do_sample': True, 'generation_seed': 42, 'mirostat_mode': 2, 'mirostat_tau': 5}
2023-09-03 19:55:51.444 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-09-03 19:55:51.444 - llmsearch.utils.model_utils:101 - INFO - Detected generation type - Sampling


  0%|          | 0/1 [00:00<?, ?it/s]

here - 2
Iteration - 1
[21542, 16637, 24571, 24244, 13390]
sorted logits tensor sum - -580818.875
[0.9838887453079224, 0.011557672172784805, 0.0011503256391733885, 0.00023728670203126967]
row logit tensor sum - 8.781203269958496
Break index i - 3
torch.Size([3])
row sum - 8.781203269958496
softmaxed candidate - tensor([0.9872, 0.0116, 0.0012], device='mps:0')
previous index - tensor([0], device='mps:0')
logit val - tensor([0.9872], device='mps:0')
obs surprise value - 0.018514678748829252
indices to remove - 516098713
sum value - 6.658944606781006
Iteration - 2
[13635, 56, 11, 65, 19]
sorted logits tensor sum - -582465.6875
[0.6717743277549744, 0.13498978316783905, 0.04175282269716263, 0.027842435985803604]
row logit tensor sum - -47.788448333740234
Break index i - 27
torch.Size([27])
row sum - -47.788448333740234
softmaxed candidate - tensor([0.6907, 0.1388, 0.0429, 0.0286, 0.0254, 0.0109, 0.0096, 0.0075, 0.0058,
        0.0048, 0.0045, 0.0037, 0.0035, 0.0033, 0.0029, 0.0026, 0.0021, 

indices to remove - 516120190
sum value - 0.0
Iteration - 9
[12, 3, 29, 150, 641]
sorted logits tensor sum - -497524.15625
[0.11059970408678055, 0.09525997191667557, 0.06570249795913696, 0.056367941200733185]
row logit tensor sum - -1048.8671875
Break index i - 254
torch.Size([254])
row sum - -1048.8671875
softmaxed candidate - tensor([0.1177, 0.1014, 0.0699, 0.0600, 0.0505, 0.0455, 0.0426, 0.0307, 0.0212,
        0.0211, 0.0171, 0.0156, 0.0155, 0.0153, 0.0151, 0.0148, 0.0145, 0.0136,
        0.0134, 0.0121, 0.0120, 0.0108, 0.0082, 0.0081, 0.0077, 0.0074, 0.0064,
        0.0064, 0.0062, 0.0060, 0.0059, 0.0056, 0.0054, 0.0047, 0.0046, 0.0046,
        0.0044, 0.0044, 0.0043, 0.0036, 0.0035, 0.0034, 0.0034, 0.0032, 0.0032,
        0.0031, 0.0031, 0.0029, 0.0027, 0.0026, 0.0025, 0.0024, 0.0022, 0.0021,
        0.0020, 0.0019, 0.0019, 0.0018, 0.0018, 0.0018, 0.0017, 0.0017, 0.0016,
        0.0016, 0.0016, 0.0016, 0.0016, 0.0015, 0.0015, 0.0014, 0.0014, 0.0014,
        0.0014, 0.0014, 0.0014

indices to remove - 516098713
sum value - 3.6618006229400635
Iteration - 13
[56, 11, 19, 31, 54]
sorted logits tensor sum - -596377.0625
[0.8217237591743469, 0.03975258022546768, 0.0353764072060585, 0.026279160752892494]
row logit tensor sum - -287.9407653808594
Break index i - 61
torch.Size([61])
row sum - -287.9407653808594
softmaxed candidate - tensor([8.3318e-01, 4.0307e-02, 3.5870e-02, 2.6646e-02, 8.7015e-03, 7.4266e-03,
        4.7972e-03, 4.7897e-03, 3.9082e-03, 2.7695e-03, 2.6826e-03, 2.0602e-03,
        1.5882e-03, 1.4827e-03, 1.2633e-03, 1.2559e-03, 1.2427e-03, 1.0963e-03,
        1.0764e-03, 1.0539e-03, 9.7906e-04, 9.4039e-04, 9.2795e-04, 8.9948e-04,
        7.4698e-04, 7.3594e-04, 6.7558e-04, 6.6796e-04, 6.3352e-04, 5.0185e-04,
        4.7789e-04, 4.6120e-04, 4.4782e-04, 3.9630e-04, 3.8877e-04, 3.8650e-04,
        3.8485e-04, 3.8202e-04, 3.5818e-04, 3.5742e-04, 3.5226e-04, 3.5032e-04,
        3.4628e-04, 3.0449e-04, 3.0232e-04, 2.8955e-04, 2.3487e-04, 2.3455e-04,
        2.

indices to remove - 516114534
sum value - 3.166081190109253
Iteration - 17
[5, 12, 6, 21, 11]
sorted logits tensor sum - -649105.625
[0.9183662533760071, 0.024043764919042587, 0.00776395620778203, 0.00671361293643713]
row logit tensor sum - -317.45343017578125
Break index i - 57
torch.Size([57])
row sum - -317.45343017578125
softmaxed candidate - tensor([9.2087e-01, 2.4109e-02, 7.7851e-03, 6.7319e-03, 6.4656e-03, 5.0768e-03,
        3.4032e-03, 3.3779e-03, 2.3401e-03, 2.3397e-03, 1.7622e-03, 1.6850e-03,
        1.4899e-03, 1.2083e-03, 1.0474e-03, 8.9872e-04, 8.9374e-04, 8.6616e-04,
        6.8932e-04, 6.4189e-04, 5.7750e-04, 4.3713e-04, 3.9857e-04, 3.5012e-04,
        3.4723e-04, 3.3526e-04, 3.2649e-04, 3.0299e-04, 2.9745e-04, 2.8192e-04,
        2.1310e-04, 2.1102e-04, 1.7872e-04, 1.4330e-04, 1.3627e-04, 1.3218e-04,
        1.2765e-04, 1.2158e-04, 1.2089e-04, 1.0333e-04, 9.7672e-05, 9.6463e-05,
        9.4214e-05, 9.3742e-05, 9.2275e-05, 8.4826e-05, 7.5789e-05, 7.3791e-05,
        6.4

2023-09-03 19:56:29.280 - llmsearch.utils.mem_utils:179 - INFO - Finished running inference, took 37.836405 secs


indices to remove - 516120254
sum value - 0.8747960329055786


In [15]:
outputs1

['Amanda baked cookies with Jerry. She has another brother. Amanda will bring her tomorrow.',
 'Amanda baked cookies with Jerry. She has another brother. Amanda will bring her tomorrow.',
 'Amanda baked cookies with Jerry. She has another brother. Amanda will bring her tomorrow.',
 'Amanda baked cookies with Jerry. She has another brother. Amanda will bring her tomorrow.',
 'Amanda baked cookies with Jerry. She has another brother. Amanda will bring her tomorrow.']

In [15]:
outputs1

['Amanda baked cookies with Jerry. She has another brother. Amanda will bring her tomorrow.']

In [21]:
outputs1

['Kim will drop some off of her - for later. Tim will use Pomodoro technique for doing something for themselves and she suggests poetry.',
 'Kim will drop some off of her - for later. Tim will use Pomodoro technique for doing something for themselves and she suggests poetry.',
 'Kim will drop some off of her - for later. Tim will use Pomodoro technique for doing something for themselves and she suggests poetry.',
 'Kim will drop some off of her - for later. Tim will use Pomodoro technique for doing something for themselves and she suggests poetry.',
 'Kim will drop some off of her - for later. Tim will use Pomodoro technique for doing something for themselves and she suggests poetry.']

In [None]:
# Print each generated summary followed by its reference summary.
for reference, prediction in zip(tuner_ob.dataset['y'], outputs1):
    print(prediction)
    print(reference, '\n\n')

In [None]:
print(outputs1)

In [None]:
print(outputs1)

In [None]:
# Small discrete search space covering both greedy (do_sample=False) and
# sampling (do_sample=True) generation.
hyp_param_grid = dict(
    max_new_tokens=[120],
    temperature=list(np.linspace(0.1, 1.0, 10)),
    top_k=[int(k) for k in np.linspace(10, 50, 5)],
    top_p=[0.75, 0.8, 0.9, 1.0],
    do_sample=[True, False],
    generation_seed=[42],
    repetition_penalty=[1.0, 1.2],
    no_repeat_ngram_size=[0, 2, 3],
)

# Dense search space used by the randomized search below (sampling only,
# single beam). Each list is a pool of candidate values for one parameter.
hyp_param_grid_2= {
    "max_new_tokens": [120],
    "temperature": list(np.linspace(start=0.1, stop=1.0,num=10000)),
    'top_k' : list(map(int,np.linspace(start=10, stop=50,num=5000))),
    # top_p is a cumulative-probability threshold and must be a float in (0, 1].
    # The previous value was a copy of the top_k expression and produced the
    # integers 10..50, which are not valid top_p values. Use floats spanning
    # the same 0.75-1.0 range as hyp_param_grid instead.
    "top_p": list(np.linspace(start=0.75, stop=1.0, num=5000)),
    'do_sample' : [True],
    'generation_seed' : [42],
    'num_beams' : [1],
#     'repetition_penalty' : [1.0, 1.2],
#     'no_repeat_ngram_size' : [0,2,3],
}

# Wrap the ROUGE-2 function as a scikit-learn scorer; higher scores are better.
scorer = make_scorer(get_rouge_score, greater_is_better=True)

# Randomized search: draw n_iter=2 candidate parameter sets from
# hyp_param_grid_2 and evaluate each with 5-fold cross-validation.
clf = RandomizedSearchCV(
    tuner_ob.estimator,
    hyp_param_grid_2,
    n_iter=2,
    scoring=scorer,
    n_jobs=None,
    cv=5,
    random_state=42,
)

"""
5 fold means, whole sample set of 100 examples will be split into 80:20 ratio
for each hyper_parameter set we have a model f(hyper_params)
    - we will evaluate this model and get the cross val score (test on each 20 samples 5 times, while training on the rest 80 each time)
    - we get the score on the quality of hyperparams by evaluating the model with the hyperparams on the unseen 1 fold
"""


In [None]:
# NOTE(review): assigning to the literal `1` is a SyntaxError, so this cell
# always fails. Presumably a deliberate guard to halt "Run All" before the
# search cells below — confirm, and consider an explicit `raise` with a message.
1 = 1/0

In [None]:
# Run the randomized search on the tuning samples: the "X" column (dialogues)
# as inputs and the "y" column (reference summaries) as targets.
clf.fit(X=tuner_ob.dataset["X"], y=tuner_ob.dataset["y"])

In [None]:
clf.best_params_

In [None]:
clf.best_estimator_

In [None]:
# Apply the best parameter set found by the search to the best estimator and
# display the estimator's resulting parameters.
clf.best_estimator_.set_params(**clf.best_params_).get_params()


In [None]:
# NOTE(review): `clf.fi` looks like an unfinished attribute lookup (possibly
# `clf.fit`); as written it presumably raises AttributeError — complete or delete.
clf.fi

In [None]:
from sklearn.base import clone

# NOTE(review): calls sklearn's `clone` on the integer 1, which is not an
# estimator; looks like a scratch experiment — confirm whether this cell is needed.
clone(1)

In [None]:
dir(clf)

In [None]:
clf.best_estimator_.get_params()

In [None]:
clf.best_estimator_.get_params()

In [None]:
clf.cv_results_

In [None]:
clf.best_estimator_.get_params()