In [1]:
# Autocompletion
%config Completer.use_jedi = False

# Autoreload
%load_ext autoreload
%autoreload 2

import sys
from typing import List


import nltk
import torch
import datasets
import evaluate
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


sys.path.append("../")

In [2]:
1

1

In [9]:
# Scratch check: the division below always fails, so control reaches the handler.
try:
    a = 2 / 0
except ZeroDivisionError:
    # Catch only the expected error; a bare `except:` would also swallow
    # KeyboardInterrupt/SystemExit and hide unrelated failures.
    print("im here")


im here


In [3]:
# Use the MPS backend when this torch build includes it and it is available
# at runtime; otherwise fall back to the CPU.
device = (
    "mps"
    if torch.backends.mps.is_built() and torch.backends.mps.is_available()
    else "cpu"
)

print(f"Device - {device}")

Device - mps


In [None]:
# Load the ROUGE scorer and the dataset that you want to tune your model on
rouge_metric = evaluate.load(path="rouge")
dataset = datasets.load_dataset(path="c-s-ale/dolly-15k-instruction-alpaca-format")

In [None]:
# Values of the "category" column to keep for tuning. The membership test below
# is an exact match, so the full label is required; the previous value "open_"
# is not a complete category label and would select no rows.
task_types = ["open_qa"]

samples = [sample for sample in dataset["train"] if sample["category"] in task_types]

# Keep the first 100 matching rows. Slicing the Dataset yields a mapping of
# column name -> list of values, which is how the following cells index it.
samples_to_tune_on = datasets.Dataset.from_list(samples)[:100]

In [None]:
samples_to_tune_on.keys()

In [None]:
samples_to_tune_on["input"]

In [None]:
# Define your model: one checkpoint name drives both the tokenizer and the weights

model_id = "google/flan-t5-small"

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Load the seq2seq weights, then move them to the device selected earlier
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
model = model.to(device)

In [None]:
type(model)

In [11]:
# Define the metric that you want to minimize or maximize on - it takes in
# y_true & y_pred and, in this notebook, returns a single score (the ROUGE-2 value)


def my_metric(y_true: List, y_pred: List) -> float:
    """Get the ROUGE-2 score of ``y_pred`` measured against ``y_true``.

    Relies on the module-level ``rouge_metric`` object and on ``nltk`` for
    sentence splitting.

    Args:
        y_true: Reference (ground-truth) texts.
        y_pred: Predicted texts, passed to the scorer alongside ``y_true``.

    Returns:
        The ``"rouge2"`` entry of the result computed by ``rouge_metric``.
    """

    def postprocess_text(preds: List, labels: List):
        """Postprocess text(preds & labels) to rouge format for evaluation"""
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]
        return preds, labels

    preds, gts = postprocess_text(preds=y_pred, labels=y_true)

    result = rouge_metric.compute(predictions=preds, references=gts, use_stemmer=True)

    # TODO : fix metric
    return result["rouge2"]


# Example run on some random samples
my_metric(y_true=samples_to_tune_on["output"], y_pred=samples_to_tune_on["output"])

0.96

In [7]:
samples_to_tune_on

{'instruction': ['Why can camels survive for long without water?',
  "Alice's parents have three daughters: Amy, Jessy, and what’s the name of the third daughter?",
  'Who gave the UN the land in NY to build their HQ',
  'What is a polygon?',
  'Which episodes of season four of Game of Thrones did Michelle MacLaren direct?',
  'What individual has won the most Olympic gold medals in the history of the games?',
  'Which Dutch artist painted “Girl with a Pearl Earring”?',
  'What happens when the sun goes down?',
  'What is a verb?',
  'Who became king of Holland in 1806?',
  'Who played Billy the Kid in The Left Handed Gun',
  'why did Syd Barrett left the Pink Floyd?',
  'What is underwriting?',
  'Who saved Andromeda from the sea monster',
  'Who are \'\'The Lumières" ?',
  'When was the wheel invented?',
  'Are lilies safe for cats?',
  'What is Sunshine Recession?',
  'What is the currency in use in the Netherlands?',
  'In the series A Song of Ice and Fire, who is the founder of Ho

In [8]:
# Check dataset specific numbers to define tokenization limits

# For input: token count of each instruction, tokenized without truncation or padding
instruction_token_counts = [
    len(tokenizer(text, max_length=None, truncation=False, padding=False)["input_ids"])
    for text in samples_to_tune_on["instruction"]
]

# Upper quantiles of the counts; the last expression is the cell's displayed output
pd.Series(instruction_token_counts).quantile([0.7, 0.8, 0.85, 0.9, 0.95])

0.70    13.00
0.80    16.00
0.85    17.00
0.90    19.00
0.95    21.05
dtype: float64

In [9]:
# For label: token count of each reference output, tokenized without truncation or padding
output_token_counts = [
    len(tokenizer(text, max_length=None, truncation=False, padding=False)["input_ids"])
    for text in samples_to_tune_on["output"]
]

# Upper quantiles of the counts; the last expression is the cell's displayed output
pd.Series(output_token_counts).quantile([0.7, 0.8, 0.85, 0.9, 0.95])

Token indices sequence length is longer than the specified maximum sequence length for this model (564 > 512). Running this sequence through the model will result in indexing errors


0.70     77.30
0.80    106.20
0.85    131.35
0.90    169.40
0.95    237.30
dtype: float64

In [10]:
# Check metrics on dataset without any optimization

from llmsearch.utils.model_utils import infer_batches

# Tokenizer arguments for the model inputs
model_input_tokenizer_kwargs = dict(max_length=20, truncation=True, padding=True)

# Generation arguments used for the untuned baseline
generation_kwargs = dict(temperature=1, max_new_tokens=200, num_beams=3)


# Baseline run over the instructions; the call's two results are unpacked
# into the generated outputs and `latency`
model_output_before_tuning, latency = infer_batches(
    model_inputs=samples_to_tune_on["instruction"],
    model=model,
    tokenizer=tokenizer,
    device=device,
    batch_size=32,
    model_input_tokenizer_kwargs=model_input_tokenizer_kwargs,
    generation_kwargs=generation_kwargs,
)

  input_ids = input_ids.repeat_interleave(expand_size, dim=0)
  sent_lengths_max = sent_lengths.max().item() + 1


NameError: name 'my_metric' is not defined

In [12]:
my_metric(y_true=samples_to_tune_on["output"], y_pred=model_output_before_tuning)

0.013292305726151015

In [13]:
# Run llm search

from llmsearch.wrappers.estimator import EstimatorWrapper
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV

seed = 42


# Candidate values per generation argument; single-element lists pin that argument.
# NOTE(review): every repetition_penalty candidate is <= 1.0 - confirm this range
# is intended, since 1.0 is documented as "no penalty".
hyp_param_grid = dict(
    max_new_tokens=[200],
    num_beams=[3],
    temperature=[1],
    epsilon_cutoff=[3e-4, 6e-4, 9e-4, 0],
    repetition_penalty=[0.3, 0.7, 1.0],
    no_repeat_ngram_size=[0, 2, 3, 4],
)

# Wrap the model and tokenizer so they can be passed as the search's estimator
model_estimator = EstimatorWrapper(
    model=model,
    tokenizer=tokenizer,
    device=device,
    batch_size=32,
    model_input_tokenizer_kwargs=model_input_tokenizer_kwargs,
)

# Higher metric values are treated as better
scorer = make_scorer(score_func=my_metric, greater_is_better=True)

# 30 sampled candidates, each scored with 5-fold cross-validation, run serially
clf = RandomizedSearchCV(
    estimator=model_estimator,
    param_distributions=hyp_param_grid,
    scoring=scorer,
    n_iter=30,
    cv=5,
    n_jobs=1,
    random_state=seed,
    verbose=1,
)

In [14]:
# Run the randomized search: instructions are passed as X, reference outputs as y
clf.fit(X=samples_to_tune_on["instruction"], y=samples_to_tune_on["output"])

Fitting 5 folds for each of 30 candidates, totalling 150 fits


0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [16]:
# Repeat inference over the same instructions, this time generating with the
# best parameters found by the search. Note that `latency` is reassigned here.
model_output_after_tuning, latency = infer_batches(
    model_inputs=samples_to_tune_on["instruction"],
    model=model,
    tokenizer=tokenizer,
    device=device,
    batch_size=32,
    model_input_tokenizer_kwargs=model_input_tokenizer_kwargs,
    generation_kwargs=clf.best_params_,
)

In [17]:
my_metric(y_true=samples_to_tune_on["output"], y_pred=model_output_before_tuning)

0.013292305726151015

In [18]:
my_metric(y_true=samples_to_tune_on["output"], y_pred=model_output_after_tuning)

0.03367519423588454

In [19]:
clf.best_params_

{'temperature': 1,
 'repetition_penalty': 0.7,
 'num_beams': 3,
 'no_repeat_ngram_size': 4,
 'max_new_tokens': 200,
 'epsilon_cutoff': 0}

In [20]:
model_output_after_tuning

['water is a source of a fungus and fungi that can survive for long without water and fungus that can live for a long time.',
 "Alice's parents have three daughters: Amy, Jessy, and what's Alice's mom's name?",
 'the United Nations',
 'polygon',
 'tv series vs. st. john s s tv vs st s n s r n tv',
 'samuel edwards',
 'samuel edwards',
 'the sun rises',
 'irrational adolescent adiolescence adilescence a dilescent dilad adlescent blunder ad ludicrous adalescence adicional adriolescent',
 'henry iii',
 'samuel edwards',
 'he was a saxophonist',
 'underwriting',
 'andromeda',
 "''The Lumières' ''",
 '1903–03–03 and 1904–03–04 and 1903–04–04 and 1906–04–03 and 1906–07–04 and 1911–04–07 and 1911–07–19 and 1911–1919–19 and 1912–1912–1914 and 1911–14–1914–1915–1915 and 1911–15–1914, 1911–1915, 1915–1918, 1915, 1912–1815, 1921–1918 and 1911–1819–1918–1921–1921, 1911–1818, 1911–21–1919, 1911–15, 1911–12, 1911–14, 1912, 1921, 1914–15, 1914–14, 1914–18, 1914–1919 and 1914–14–14–18–18–19–19–14–15–1

In [21]:
model_output_before_tuning

['water is a source of heat and moisture.',
 "Alice's parents have three daughters: Amy, Jessy, and what's Alice's parents have?",
 'the United Nations',
 'polygon',
 'tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series tv series',
 'samuel wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson wilson

In [22]:
# Print each reference answer next to the generations from before and after
# tuning, with a dashed rule between samples
for ground_truth, before_tuning, after_tuning in zip(
    samples_to_tune_on["output"], model_output_before_tuning, model_output_after_tuning
):
    print(
        f"Ground Truth - {ground_truth}",
        f"Model Output Before Tuning - {before_tuning}",
        f"Model Output After Tuning  - {after_tuning}",
        "",
        "---" * 20,
        "",
        sep="\n",
    )

Ground Truth - Camels use the fat in their humps to keep them filled with energy and hydration for long periods of time.
Model Output Before Tuning - water is a source of heat and moisture.
Model Output After Tuning  - water is a source of a fungus and fungi that can survive for long without water and fungus that can live for a long time.

------------------------------------------------------------

Ground Truth - The name of the third daughter is Alice
Model Output Before Tuning - Alice's parents have three daughters: Amy, Jessy, and what's Alice's parents have?
Model Output After Tuning  - Alice's parents have three daughters: Amy, Jessy, and what's Alice's mom's name?

------------------------------------------------------------

Ground Truth - John D Rockerfeller
Model Output Before Tuning - the United Nations
Model Output After Tuning  - the United Nations

------------------------------------------------------------

Ground Truth - A polygon is a form in Geometry.  It is a singl

In [24]:
samples_to_tune_on["output"]

['Tope',
 'Gudok is string, Cantaro is percussion.',
 'The useful objects for transportation in the list are train, plane, bicyle and scooter.',
 'Zurna is woodwind, Panduri is string.',
 'Shamisen is string, Kpanlogo is percussion.',
 'blue, yellow, green, red, purple, orange',
 'Eminem, 50 Cent',
 'The classification is as follows:\n- Soccer: goalie gloves, corner flag\n- Basketball: shooting sleeve\n- Football: penalty flag, kicking tee',
 'Giant Tortoise is alive, Palaeophis is extinct.',
 'Viola toeria is string, Samphor is percussion.',
 'Inside a house you can find a chair, a table and a microwave. Cars, clouds and parks are found outside of houses.',
 'Dark-colored beers: porter, stout, amber\nLight-colored beers: pilsner, lager',
 'avocados, kiwi, pairs, blueberries, green beans, kale, cabbage',
 'Guitar - String\nViolin - String\npiano - keyboard\nharmonium - keyboard\ncello - string\naccordion - keyboard\nbanjo - string',
 'Bacon, eggs, sausages, beans, toast, mushrooms, tom