In [1]:
# Autocompletion: turn off Jedi-based tab completion in IPython
%config Completer.use_jedi = False

# Autoreload: re-import edited modules automatically before each cell runs
%load_ext autoreload
%autoreload 2

In [2]:
# Imports
import sys
from typing import List

# Put the parent directory on the import path
# (presumably so the local `llmsearch` checkout is importable from here - confirm).
sys.path.append('../')

import numpy as np
from IPython.display import Audio, display


import nltk
import torch
import numpy as np
import datasets
import pandas as pd
import transformers

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel, T5ForConditionalGeneration, AutoModelForSeq2SeqLM

In [3]:
import llmsearch

Monkey-patching the `.generate` function of the `transformers` library


In [4]:
# Pick the compute backend: Apple MPS first, then CUDA, otherwise fall back to CPU.
device = (
    "mps" if torch.backends.mps.is_built() and torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

from llmsearch.utils.mem_utils import gc_cuda

print(f"Device - {device}")

def beep(duration = 1, frequency=440, rhythm=1):
    """Play a sine-wave tone inline in the notebook.

    Args:
        duration: Length of the tone in seconds.
        frequency: Frequency of the sine wave in Hz.
        rhythm: Only samples whose index is a multiple of `rhythm` are kept,
            all others are zeroed. The default of 1 keeps every sample.
    """
    sample_rate = 44100  # Standard audio sample rate
    n_samples = int(duration * sample_rate)
    timeline = np.linspace(0, duration, n_samples, endpoint=False)
    tone = np.sin(2 * np.pi * frequency * timeline)  # Generate a sine wave
    keep = np.arange(len(tone)) % rhythm == 0
    tone *= np.where(keep, 1, 0)  # Apply rhythm
    display(Audio(tone, rate=sample_rate, autoplay=True))

Device - mps


In [5]:
# Load the "samsum" dataset; its "train" split is subsampled in a later cell
dataset = datasets.load_dataset("samsum")

Found cached dataset samsum (/Users/praful932/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)


  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Number of training examples used for tuning the generation parameters
sample_size = 2

# Take the first `sample_size` training rows and rename the text / label
# columns to the X / y names used by the rest of the notebook.
samples_to_tune_on = datasets.Dataset.from_dict(
    dataset["train"][:sample_size]
).rename_columns(column_mapping={"dialogue": "X", "summary": "y"})

In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face model id of the checkpoint used throughout the notebook
model_id = "google/flan-t5-small"
# use_fast = False asks for the non-fast tokenizer implementation
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast = False)
# Load the seq2seq model and move it to the device selected earlier
model =  AutoModelForSeq2SeqLM.from_pretrained(model_id).to(device)



In [8]:
import langchain

# First dialogue of the tuning sample, used only to preview the prompt
X = samples_to_tune_on[0]['X']

# Prompt template with a single `{X}` placeholder for the conversation text
pt = langchain.PromptTemplate.from_template("Conversation: {X}\nSummary:")

# Render the template for the sample dialogue to sanity-check the final prompt
print(pt.format_prompt(X = X).to_string())

Conversation: Amanda: I baked  cookies. Do you want some?
Jerry: Sure!
Amanda: I'll bring you tomorrow :-)
Summary:


In [9]:
import evaluate

# Load the ROUGE metric once; `get_rouge_score` below reuses this object on every call
rouge_metric = evaluate.load("rouge")

def postprocess_text(preds, labels):
    """Normalise predictions and references before ROUGE scoring.

    Each text is stripped of surrounding whitespace and then re-joined with
    one sentence per line, because rougeLSum expects a newline after each
    sentence.

    Args:
        preds: Iterable of predicted texts.
        labels: Iterable of reference texts.

    Returns:
        A `(preds, labels)` tuple of lists holding the processed texts.
    """
    def _strip_all(texts):
        return [text.strip() for text in texts]

    def _one_sentence_per_line(texts):
        return ["\n".join(nltk.sent_tokenize(text)) for text in texts]

    stripped_preds = _strip_all(preds)
    stripped_labels = _strip_all(labels)

    return _one_sentence_per_line(stripped_preds), _one_sentence_per_line(stripped_labels)


def get_rouge_score(y_true: List, y_pred: List):
    """Score generated texts against references and return the ROUGE-2 value.

    Args:
        y_true: Reference texts.
        y_pred: Generated texts, aligned with `y_true`.

    Returns:
        The `rouge2` entry of the result computed by `rouge_metric`.
    """
    predictions, references = postprocess_text(preds=y_pred, labels=y_true)
    scores = rouge_metric.compute(
        predictions=predictions,
        references=references,
        use_stemmer=True,
    )
    return scores['rouge2']

In [10]:
from llmsearch.tuner import Tuner
from llmsearch.utils.mem_utils import get_total_available_ram, get_gpu_information
from llmsearch.utils.logging_utils import set_verbosity_info, set_verbosity_debug
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Seed handed to the Tuner in the next cell
seed = 42

# Presumably switches llmsearch logging to INFO level (INFO lines show up in later outputs) - confirm
set_verbosity_info()


In [11]:
# Wire the model, tokenizer, tuning data, prompt template and scorer into the Tuner.
tuner_ob = Tuner(
    model=model,
    tokenizer=tokenizer,
    dataset=samples_to_tune_on,
    device=device,
    batch_size=512,
    tokenizer_encoding_kwargs={"padding": True, "truncation": True, "max_length": 512},
    tokenizer_decoding_kwargs={
        "skip_special_tokens": True,
        "spaces_between_special_tokens": False,
    },
    scorer=get_rouge_score,
    prompt_template=pt,
    is_encoder_decoder=True,
    seed=seed,
    column_mapping={"text_column_name": "X", "label_column_name": "y"},
)


Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [12]:
# Earlier
from llmsearch.utils.model_utils import seed_everything

"""
parameters and how they affect do_sample == False
1. temperature - output does not change - greedy decoding
2. top_k - output does not change - greedy decoding
3. repetition_penalty - output changes
4. no_repeat_ngram_size - output changes
"""

# seed_everything(seed)

# Baseline generation parameters, scored once before any hyper-parameter search.
# NOTE(review): `generation_seed` and the `mirostat_*` keys do not look like stock
# `transformers` generate() arguments - presumably they are handled by the
# monkey-patched `.generate`; confirm.
initial_generation_params1 = {
    'max_new_tokens' : 120,
#     'repetition_penalty'  : 0.6,
#     'repetition_penalty_range'  : 5,
#     'temperature' : 0.7,
    'do_sample' : True,
    'generation_seed' : 42,
    'mirostat_mode': 2,
    'mirostat_tau' : 5,
#     'top_a' : 0.1,
}
# Run inference with these parameters; unpacks the score and the generated outputs
score, outputs1 = tuner_ob.get_score(initial_generation_params1)


2023-09-14 00:52:26.803 - llmsearch.utils.mem_utils:145 - INFO - Starting inference with generation parameters - {'max_new_tokens': 120, 'do_sample': True, 'generation_seed': 42, 'mirostat_mode': 2, 'mirostat_tau': 5}
2023-09-14 00:52:26.803 - llmsearch.utils.mem_utils:149 - INFO - Performing inference with batch_size - 512
2023-09-14 00:52:26.804 - llmsearch.utils.model_utils:102 - INFO - Detected generation type - Sampling


  0%|          | 0/1 [00:00<?, ?it/s]

here - 2


  input_ids = input_ids.repeat_interleave(expand_size, dim=0)


Iteration - 1
sorted logits 1st example - tensor([ 6.6590,  2.2148, -0.0925, -1.6710], device='mps:0')
sorted indices - [21542, 16637, 24571, 24244, 13390]
Probability 1st example - tensor([9.8389e-01, 1.1558e-02, 1.1503e-03, 2.3729e-04], device='mps:0')
tensor(0.9839, device='mps:0')
Row logit tensor sum - 8.781220436096191
Row sum - 8.781220436096191
softmaxed vals - tensor([0.9872, 0.0116, 0.0012], device='mps:0')
Seeding with 42
previous index - tensor([0], device='mps:0')
logit val - tensor([0.9872], device='mps:0')
obs surprise value - 0.018514678748829252
indices to remove - 516098713
sum value - 6.658950328826904




  if unfinished_sequences.max() == 0:


Iteration - 2
sorted logits 1st example - tensor([3.2360, 1.6313, 0.4579, 0.0527], device='mps:0')
sorted indices - [13635, 56, 11, 65, 19]
Probability 1st example - tensor([0.6718, 0.1350, 0.0418, 0.0278], device='mps:0')
tensor(0.6718, device='mps:0')
Row logit tensor sum - -47.7884521484375
Row sum - -47.7884521484375
softmaxed vals - tensor([0.6907, 0.1388, 0.0429, 0.0286], device='mps:0')
Seeding with 42
previous index - tensor([7], device='mps:0')
logit val - tensor([0.0075], device='mps:0')
obs surprise value - 7.067882950159778
indices to remove - 516117509
sum value - 0.0


Iteration - 3
sorted logits 1st example - tensor([ 2.1572,  0.8451,  0.0723, -1.0812], device='mps:0')
sorted indices - [12, 128, 5081, 7364, 3]
Probability 1st example - tensor([0.5925, 0.1595, 0.0737, 0.0232], device='mps:0')
tensor(0.5925, device='mps:0')
Row logit tensor sum - -96.61461639404297
Row sum - -96.61461639404297
softmaxed vals - tensor([0.6300, 0.1696, 0.0783, 0.0247], device='mps:0')
Seedin

indices to remove - 516119974
sum value - 0.0


Iteration - 16
sorted logits 1st example - tensor([1.5522, 1.0980, 0.8778, 0.6058], device='mps:0')
sorted indices - [12, 28, 168, 2177, 5721]
Probability 1st example - tensor([0.2490, 0.1581, 0.1268, 0.0966], device='mps:0')
tensor(0.2490, device='mps:0')
Row logit tensor sum - -28.028358459472656
Row sum - -28.028358459472656
softmaxed vals - tensor([0.2835, 0.1800, 0.1444, 0.1100], device='mps:0')
Seeding with 42
previous index - tensor([7], device='mps:0')
logit val - tensor([0.0275], device='mps:0')
obs surprise value - 5.185305672740188
indices to remove - 516120239
sum value - 0.0


Iteration - 17
sorted logits 1st example - tensor([2.0087, 1.1472, 0.6497, 0.3960], device='mps:0')
sorted indices - [8, 5721, 3, 12, 160]
Probability 1st example - tensor([0.2922, 0.1235, 0.0751, 0.0583], device='mps:0')
tensor(0.2922, device='mps:0')
Row logit tensor sum - -33.124168395996094
Row sum - -33.124168395996094
softmaxed vals - tensor([0.36

indices to remove - 516120250
sum value - 1.9058787822723389


Iteration - 30
sorted logits 1st example - tensor([ 1.9662, -1.2483, -1.6524, -2.7703], device='mps:0')
sorted indices - [1, 16637, 21542, 451, 328]
Probability 1st example - tensor([0.9099, 0.0366, 0.0244, 0.0080], device='mps:0')
tensor(0.9099, device='mps:0')
Row logit tensor sum - -3.7048492431640625
Row sum - -3.7048492431640625
softmaxed vals - tensor([0.9296, 0.0373, 0.0249, 0.0082], device='mps:0')
Seeding with 42
previous index - tensor([0], device='mps:0')
logit val - tensor([0.9296], device='mps:0')
obs surprise value - 0.10536677576175119
indices to remove - 516120254
sum value - 1.9661556482315063




2023-09-14 00:53:45.985 - llmsearch.utils.mem_utils:179 - INFO - Finished running inference, took 79.181386 secs


In [None]:
print(outputs1)

In [13]:
print(outputs1)

['Amanda wants Jerry cookies so Amanda mnad. It should go in one sitting or one of these ones to see them again tomorrow.', 'Amanda wants Jerry cookies so Amanda mnad. It should go in one sitting or one of these ones to see them again tomorrow.']


In [13]:
outputs1

['Amanda baked cookies with Jerry. She has another brother. Amanda will bring her tomorrow.']

In [13]:
outputs1

['Amanda baked cookies with him. Jerry will bring Amanda',
 'Olivia is voting for the Liberals so she will',
 'Kim is going to do some things now and won',
 "Edward's not in ovl me",
 'Sam and Naomi have made a point of asking',
 'Neville was married on September 17th as she',
 'Cassandra has completed the homework tomorrow',
 'Sarah sent James lyrics to the song she loved.',
 'Noah is meeting Madison with her boss at the Man',
 'Matt and Agnes both know each other in']

In [21]:
outputs1

['Kim will drop some off of her - for later. Tim will use Pomodoro technique for doing something for themselves and she suggests poetry.',
 'Kim will drop some off of her - for later. Tim will use Pomodoro technique for doing something for themselves and she suggests poetry.',
 'Kim will drop some off of her - for later. Tim will use Pomodoro technique for doing something for themselves and she suggests poetry.',
 'Kim will drop some off of her - for later. Tim will use Pomodoro technique for doing something for themselves and she suggests poetry.',
 'Kim will drop some off of her - for later. Tim will use Pomodoro technique for doing something for themselves and she suggests poetry.']

In [None]:
# Print each generated output followed by its reference label
for reference, generated in zip(tuner_ob.dataset['y'], outputs1):
    print(generated)
    print(reference, '\n\n')

In [None]:
print(outputs1)

In [None]:
print(outputs1)

In [None]:
# Small hand-picked grid, kept for reference; the search below uses `hyp_param_grid_2`.
hyp_param_grid = {
    "max_new_tokens": [120],
    "temperature": list(np.linspace(start=0.1, stop=1.0, num=10)),
    "top_k": list(map(int, np.linspace(start=10, stop=50, num=5))),
    "top_p": [0.75, 0.8, 0.9, 1.0],
    "do_sample": [True, False],
    "generation_seed": [42],
    "repetition_penalty": [1.0, 1.2],
    "no_repeat_ngram_size": [0, 2, 3],
}

# Dense candidate lists for the randomized search over sampling parameters.
hyp_param_grid_2 = {
    "max_new_tokens": [120],
    "temperature": list(np.linspace(start=0.1, stop=1.0, num=10000)),
    "top_k": list(map(int, np.linspace(start=10, stop=50, num=5000))),
    # top_p is a cumulative-probability threshold and has to be a float in (0, 1],
    # so it gets its own float range (same 0.75-1.0 span as `hyp_param_grid`)
    # instead of reusing the integer 10..50 range of top_k.
    "top_p": list(np.linspace(start=0.75, stop=1.0, num=5000)),
    "do_sample": [True],
    "generation_seed": [42],
    "num_beams": [1],
#     'repetition_penalty' : [1.0, 1.2],
#     'no_repeat_ngram_size' : [0,2,3],
}

# Wrap the ROUGE-2 function so scikit-learn can use it; higher scores are better.
scorer = make_scorer(score_func=get_rouge_score, greater_is_better=True)

# K-fold needs at least 2 folds and cannot use more folds than there are
# examples, so cap the usual 5 folds by the size of the tuning set.
n_examples = len(tuner_ob.dataset["y"])
n_folds = max(2, min(5, n_examples))

clf = RandomizedSearchCV(
    estimator=tuner_ob.estimator,
    param_distributions=hyp_param_grid_2,
    n_iter=2,
    scoring=scorer,
    cv=n_folds,
    random_state=42,
    n_jobs=None,
)

# How to read the cross-validation (example: 100 samples, 5 folds):
# 5 fold means the whole sample set of 100 examples is split in an 80:20 ratio.
# For each hyper-parameter set we have a model f(hyper_params):
#   - we evaluate this model and get the cross-val score (test on each block of
#     20 samples, 5 times, while "training" on the remaining 80 each time)
#   - the quality of the hyper-parameters is the score of the model with those
#     hyper-parameters on the unseen fold


In [None]:
# Deliberate stop: halts "Run All" at this point so the search cells below
# only run when executed by hand.
# NOTE(review): this replaces `1 = 1/0`, which is not valid Python (assignment
# to a literal) and was presumably used as a tripwire - confirm the intent.
raise RuntimeError("Intentional stop: run the hyper-parameter search cells below manually.")

In [None]:
# Run the randomized search over the tuning examples (X = input texts, y = reference labels)
clf.fit(X=tuner_ob.dataset["X"], y=tuner_ob.dataset["y"])

In [None]:
clf.best_params_

In [None]:
clf.best_estimator_

In [None]:
clf.best_estimator_.set_params(**clf.best_params_).get_params()


In [None]:
# NOTE(review): `clf.fi` looks like an unfinished tab-completion (e.g. of `clf.fit`);
# as written it is a bare attribute lookup that will presumably raise AttributeError - finish or remove.
clf.fi

In [None]:
from sklearn.base import clone

# NOTE(review): scratch experiment - `clone` is meant for scikit-learn estimators,
# so calling it on the integer 1 is expected to raise; confirm and remove.
clone(1)

In [None]:
dir(clf)

In [None]:
clf.best_estimator_.get_params()

In [None]:
clf.best_estimator_.get_params()

In [None]:
clf.cv_results_

In [None]:
clf.best_estimator_.get_params()