In [1]:
import sys

sys.path.append('/workspace/llmsearch/')

from llmsearch.utils.common_utils import json_load

gsm8k_res = json_load('/workspace/llmsearch/examples/gsm-8k-best-params-150s-capybara-7b.json')

  from .autonotebook import tqdm as notebook_tqdm


Monkey Patching .generate function of `transformers` library


In [2]:
import re

from transformers import AutoTokenizer

In [4]:
seed = 42
batch_size = 1
num_tune_samples = 150
num_test_samples = 500
model_id = "TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ"

tokenizer = AutoTokenizer.from_pretrained(
        model_id, local_files_only=False, legacy=False, use_fast=False
    )
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "left"

def standardize(s):
    if s is None:
        return s
    s = s.replace(",", "")
    s = s.replace("$", "")
    s = s.replace(".00", "")
    if s.endswith("."):
        s = s[:-1]
    return s.strip()

def extract_answer_from_out(s):
    pattern = re.compile(r"The answer is ((\d|\-)((\d|\,|\.)+)?\d?)")
    pattern = re.compile(r"The answer is ((\d|\-|\$)((\d|\,|\.)+)?\d?)")
    match = pattern.search(s)
    if match:
        ret = match.group(1).strip()
        return standardize(ret)
    else:
        return None

import textwrap
import datasets


def load_dataset():

    def preprocess_dataset(
        dataset, tokenizer, pt, pt_cols, system_prompt, add_generation_prompt=True
    ):

        def wrapper(sample):
            """Takes in a sample, formats it using prompt template, applies chat template and returns the formatted string"""
            messages = (
                []
                if system_prompt is None
                else [{"role": "system", "content": system_prompt}]
            )
            formatted_pt = pt.format(**{pt_col: sample[pt_col] for pt_col in pt_cols})
            messages.append(
                {
                    "role": "user",
                    "content": formatted_pt,
                }
            )
            formatted_pt_with_ct = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=add_generation_prompt
            )
            return formatted_pt_with_ct

        def actual_input(sample):
            """Takes in a sample, formats it using prompt template, applies chat template and returns the formatted string"""
            return sample[pt_cols[0]]

        pt_dataset = dataset.map(
            lambda sample: {
                "X": wrapper(sample),
                "actual input": actual_input(sample),
            }
        )

        return pt_dataset


    # 2-shot prompt template - https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/gsm8k/gsm8k-cot.yaml
    pt = textwrap.dedent(
    """\
    Q: There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?
    A: There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6.

    Q: If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?
    A: There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5.

    Q: {question}"""
    )
    pt_cols = ["question"]
    system_prompt = "Solve the following math problems, end with The answer is"
    gsm8k_dataset = datasets.load_dataset("gsm8k", "main")


    processed_dataset = preprocess_dataset(
        gsm8k_dataset["train"],
        tokenizer,
        pt=pt,
        pt_cols=pt_cols,
        system_prompt=system_prompt,
        add_generation_prompt=True,
    )

    shuffled_dataset = processed_dataset.shuffle(seed=seed)

    samples_to_tune_on = shuffled_dataset.select(range(num_tune_samples))
    remaining_indices = range(num_tune_samples, num_tune_samples + num_test_samples)
    test_dataset = shuffled_dataset.select(remaining_indices)
    return samples_to_tune_on, test_dataset



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
samples_to_tune_on

NameError: name 'samples_to_tune_on' is not defined

In [None]:
gsm8k_res.keys()

dict_keys(['scores_before', 'scores_after', 'outputs_before', 'outputs_after', 'oos_scores_before', 'oos_scores_after', 'oos_outputs_before', 'oos_outputs_after', 'best_params'])

In [None]:
samples_to_tune_on,test_dataset = load_dataset()

In [12]:
examples = []

# oos b4 44, after - 51
# s b4 14, after - 10




for oos_out_before, oos_out_after, y_true in zip(gsm8k_res['outputs_before'], gsm8k_res['outputs_after'], samples_to_tune_on['answer']):

    y_t = standardize(y_true.split("####")[-1].strip())
    before = extract_answer_from_out(oos_out_before)
    after = extract_answer_from_out(oos_out_after)

    if y_t == before and y_t != after:
        print(before, after, y_t)
        examples.append({
            'out_before' : oos_out_before,
            'out_after' : oos_out_after,
            'y_true' : y_t,

            'extracted_before' : before,
            'extracted_after' : after

        })

print(len(examples))


4 40 4
160 None 160
1440 1360 1440
1500 None 1500
50 600 50
40 28 40
22 None 22
21 12 21
4000 1000 4000
9800 None 9800
10 2 10
1 0 1
3 8 3
442 - 442
14


In [15]:
for item in examples:
    print(item['out_before'])

    print('---' * 10)
    print(item['out_after'])

    print(item['extracted_before'], item['extracted_after'])

    print('\n\n')

    print('***' * 10)
    print('\n\n')

A: Emma starts with $100 in her account. She spends $8 each day for a week, which is 7 days. So she spends 8 * 7 = $56 during the week. She has $100 - $56 = $44 left in her account. She asks for as many $5 bills as her account can give her, which is $44 / $5 = 8 $5 bills. So she takes out 8 * $5 = $40 from her account. She leaves the remaining $44 - $40 = $4 in her account. The answer is $4.
------------------------------
A: Emma starts with $100 in her account. She spends $8 each day for a week, which is 8 * 7 = $56. After spending, she has $100 - 56 = $44 left in her account. She receives $5 bills, and each $5 bill is 5 dollars. To find out how many $5 bills she can get, divide the remaining amount by 5: 44 / 5 = 8.8. Since she can only receive whole $5 bills, she gets 8 of them. 8 * 5 = $40. The answer is $40 remains in her account.
4 40



******************************



A: Let's denote the total number of books as x. We know that 35% of the books are for children, so 65% of the 

In [2]:
import plotly.graph_objects as go
import pandas as pd

def plot_model_performance(data):
    # Convert the data dictionary to a DataFrame for easier handling
    df = pd.DataFrame(data)

    for index, row in df.iterrows():
        # Convert 'before' and 'after' scores to percentages if the metric is accuracy
        if row['metric'] == 'accuracy':
            before_value = row['before'] * 100
            after_value = row['after'] * 100
            value_format = "{:.2f}%"
            delta_format = "Δ = {:.2f}%"
        else:
            before_value = row['before']
            after_value = row['after']
            value_format = "{:.5f}"
            delta_format = "Δ = {:.5f}"

        # Calculate the delta (difference between after and before)
        delta = after_value - before_value

        # Determine the y-axis range for better visibility of the delta
        y_min = min(before_value, after_value) - abs(delta) * 1.5  # Adjust the range to make delta visible
        y_max = max(before_value, after_value) + abs(delta) * 1.5  # Adjust the range to make delta visible

        # Create a plot for each model
        fig = go.Figure()

        # Add 'before' and 'after' bars
        fig.add_trace(go.Bar(
            x=['Before', 'After'],
            y=[before_value, after_value],
            text=[value_format.format(before_value), value_format.format(after_value)],
            textposition='auto',
            name=row['model'],
            marker_color=['#1f77b4', '#ff7f0e'],  # Blue and orange for better contrast
            hoverinfo='y+text'
        ))

        # Add a line indicating the delta without text
        fig.add_trace(go.Scatter(
            x=['Before', 'After'],
            y=[before_value, after_value],
            mode='lines+markers',
            line=dict(color='#2ca02c', width=2),  # Green line
            marker=dict(size=12, color='#d62728'),  # Red markers
            hoverinfo='none'
        ))

        # Add annotations for delta
        fig.add_annotation(
            x=1,
            y=after_value,
            text=delta_format.format(delta),
            showarrow=True,
            arrowhead=2,
            ax=0,
            ay=-40,
            font=dict(size=14, color="black"),
            bgcolor="rgba(255, 255, 255, 0.9)",
            bordercolor="black",
            borderwidth=1
        )

        # Update layout for better readability and appearance
        fig.update_layout(
            title=dict(
                text=f"{row['model']}",
                x=0.5,
                xanchor='center',
                font=dict(size=16)  # Smaller font size for the title
            ),
            xaxis_title='Metric',
            yaxis_title='Value',
            yaxis=dict(range=[y_min, y_max], tickformat=".2f" if row['metric'] == 'accuracy' else ".5f"),
            margin=dict(l=40, r=40, t=40, b=40),
            template='plotly_white',
            showlegend=False,
            font=dict(size=14)
        )

        fig.show()

# Example usage with your data
data = {
    'model': ['TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ (dataset - gsm8k, metric - accuracy)', 'Praful932/dolphin-2.2.1-mistral-7b-samsum-ft-v1-awq (dataset - samsum, metric - rouge2)'],
    'dataset': ['gsm8k', 'samsum'],
    'before': [0.545, 0.2543],
    'after': [0.5525, 0.2564],
    'samples': [500, 500],
    'metric': ['accuracy', 'rouge_2'],
    'best_params': [
        {'do_sample': True, 'generation_seed': 42, 'max_new_tokens': 500, 'no_repeat_ngram_size': 0, 'stopping_criteria': ['MultiTokenStoppingCriteria'], 'top_k': 10, 'top_p': 0.7},
        {'do_sample': True, 'generation_seed': 42, 'max_new_tokens': 70, 'no_repeat_ngram_size': 0, 'stopping_criteria': ['MultiTokenStoppingCriteria'], 'temperature': 0.1, 'top_k': 50}
    ]
}

plot_model_performance(data)


In [5]:
# Requires j

import sys
sys.path.append('/Users/praful932/myfiles/code/llmsearch')

import torch
import evaluate
import datasets
import numpy as np

from llmsearch.tuner import Tuner
from sklearn.model_selection import GridSearchCV
from llmsearch.scripts.stopping_criteria import MultiTokenStoppingCriteria
from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteriaList

seed = 42
batch_size = 2
num_samples = 10

# Load model & tokenizer
model_id = "cognitivecomputations/dolphin-2.9-llama3-8b"
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side = "left")
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype = torch.float16, device_map = "auto")

# Load dataset & metric used to evaluate the model during the search
dataset = datasets.load_dataset("samsum")['train']
sample_dataset = dataset.shuffle(seed = seed).select(range(num_samples))
rouge = evaluate.load('rouge')

# Optional : Define stopping criteria for the generation, here we stop a generation of a sequence when `<|im_end|>` is reached
multi_token_stop_criteria_ob = MultiTokenStoppingCriteria(sequence_ids=[128256])
stopping_criteria = StoppingCriteriaList([multi_token_stop_criteria_ob])
# useful when batching
callbacks_after_inference = [multi_token_stop_criteria_ob.reset]

# Scorer
def get_rouge_score(y_true, y_pred):
    return np.mean(rouge.compute(predictions=y_pred, references=[item['summary'] for item in y_true], use_stemmer=True, use_aggregator=False)['rouge2'])

# Process dataset to chat format
def sample_to_chat_format(tokenizer, **kwargs):
    messages = [
        {
            'role' : "system",
            'content' : "You are Dolphin, a helpful AI assistant."
        },
        {
            'role' : "user",
            'content' : f"Summarize the following text: {kwargs['dialogue']}"
        }
    ]
    return tokenizer.apply_chat_template(messages, tokenize = False, add_generation_prompt = True)

# Define tuner object and pass in model, metric & dataset
tuner_ob = Tuner(
    model=model,
    tokenizer=tokenizer,
    dataset=sample_dataset,
    device="cuda:0",
    batch_size=batch_size,
    tokenizer_encode_args={"padding": "longest",'truncation' : True, "add_special_tokens": False, 'max_length' : 1024},
    tokenizer_decode_args={"spaces_between_special_tokens": False, 'skip_special_tokens' : True},
    scorer=get_rouge_score,
    sample_preprocessor=sample_to_chat_format,
    seed=seed,
    column_mapping={"input_cols": ["dialogue"], "eval_cols": ["summary"]},
    callbacks_after_inference=callbacks_after_inference,
)

  _torch_pytree._register_pytree_node(


Monkey Patching .generate function of `transformers` library


  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  Additional headers to be sent with the request.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  _torch_pytree._register_pytree_node(
Downloading shards:  25%|██▌       | 1/4 [14:52<44:38, 892.68s/it]

In [19]:
from IPython.lib.display import Audio
import numpy as np

framerate = 4410
play_time_seconds = 2

t = np.linspace(0, play_time_seconds, framerate*play_time_seconds)
audio_data = np.sin(2*np.pi*300*t) + np.sin(2*np.pi*240*t)
Audio(audio_data, rate=framerate, autoplay=True)

In [20]:
audio_data = np.sin(4*np.pi*100*t) + np.sin(2*np.pi*240*t)
Audio(audio_data, rate=framerate, autoplay=True)

In [22]:
audio_data = np.sin(5*np.pi*100*t) + np.sin(2*np.pi*240*t)
Audio(audio_data, rate=framerate, autoplay=True)

In [1]:
# check if beam search works
# has issues with recent transformers version

from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer, TextStreamer

quant_path = "TheBloke/CapybaraHermes-2.5-Mistral-7B-AWQ"

# Load model
model = AutoAWQForCausalLM.from_quantized(quant_path, fuse_layers=True)
tokenizer = AutoTokenizer.from_pretrained(quant_path, trust_remote_code=True)

config.json:   0%|          | 0.00/911 [00:00<?, ?B/s]

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/17.9k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/51.0 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/115 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/420 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

quant_config.json:   0%|          | 0.00/126 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/4.15G [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# play sound

In [2]:


# Convert prompt to tokens
prompt_template = """\
<|system|>
</s>
<|user|>
{prompt}</s>
<|assistant|>"""

prompt = "You're standing on the surface of the Earth. "\
        "You walk one mile south, one mile west and one mile north. "\
        "You end up exactly where you started. Where are you?"

tokens = tokenizer(
    prompt_template.format(prompt=prompt),
    return_tensors='pt'
).input_ids.cuda()

In [3]:
%%time

# Generate output
generation_output = model.generate(
    tokens,
    max_new_tokens = 20,
)

print(generation_output)

tensor([[    1,   523, 28766,  6574, 28766, 28767,    13,     2, 28705,    13,
         28789, 28766,  1838, 28766, 28767,    13,  1976, 28742,   267,  6328,
           356,   272,  5439,   302,   272,  8599, 28723,   995,  2338,   624,
         13677,  6287, 28725,   624, 13677,  7635,   304,   624, 13677,  6120,
         28723,   995,   948,   582,  4668,   970,   368,  2774, 28723,  6926,
           460,   368, 28804,     2, 28705,    13, 28789, 28766,   489, 11143,
         28766, 28767,    13,  1976,   460,   438,   272,  4982,   302,   264,
          9661,   395,   264,   624, 28733, 23881, 13630, 28723,   851,   349,
          1096,   739]], device='cuda:0')
CPU times: user 722 ms, sys: 82.4 ms, total: 804 ms
Wall time: 800 ms


In [4]:
%%time

# Generate output
generation_output = model.generate(
    tokens,
    max_new_tokens = 20,
    num_beams = 2,
)

print(generation_output)

tensor([[    1,   523, 28766,  6574, 28766, 28767,    13,     2, 28705,    13,
         28789, 28766,  1838, 28766, 28767,    13,  1976, 28742,   267,  6328,
           356,   272,  5439,   302,   272,  8599, 28723,   995,  2338,   624,
         13677,  6287, 28725,   624, 13677,  7635,   304,   624, 13677,  6120,
         28723,   995,   948,   582,  4668,   970,   368,  2774, 28723,  6926,
           460,   368, 28804,     2, 28705,    13, 28789, 28766,   489, 11143,
         28766, 28767,    13,  1976,   460,   438,   272,  4982,   302,   264,
          9661,   395,   264,   624, 28733, 23881, 13630, 28723,   851,   349,
          1096,   739]], device='cuda:0')
CPU times: user 459 ms, sys: 27 ms, total: 486 ms
Wall time: 485 ms
