# Embedding Search Benchmarking

In this study we check the AI's ability to understand a natural-language description of an experiment and to select the corresponding experiment from the code base.

We write 10 different prompts for each experiment and check whether the AI finds the matching experiment.

We report the top-1, top-3 and top-5 hit accuracy, as well as the hit accuracy after filtering and revision.

In [1]:
import leeq
from leeq.experiments.builtin import *

In [2]:
# Benchmark inputs. Maps a short experiment label to a 2-tuple:
#   (expected experiment class, list of 10 natural-language prompts).
# Each prompt is a description for which the expected class should be found.
# NOTE(review): a few prompts contain misspellings ('expeirment', 'prameter',
# 'amptiludes', 'piongpong'). It is unclear whether these are intentional;
# confirm before correcting, since editing a prompt changes the benchmark input.
experiment_prompt = {
    'GMM': (MeasurementCalibrationMultilevelGMM,[
        'Run GMM measurement calibration',
        'Implement GMM model calibration for measurement',
        'Calibrate the measurement GMM model',
        'Calibrate the measurement GMM model with amplitude=0.2 and drive frequency = 9876 MHz',
        'Run measurement calibration with amp=0.1 and frequency = 8790 MHz',
        'Implement calibration for measurement',
        'Calibrate measurement parameters',
        'Run measurement calibration',
        'Implement measurement calibration for state discrimination',
        'Do measurement calibration'
    ]),
    'Ramsey': (SimpleRamseyMultilevel,[
        'Run ramsey experiment with default parameters',
        'Run ramsey experiment to calibrate qubit frequency with default parameters',
        'Implement ramsey expeirment for qubit frequency calibration',
        'Calibrate qubit frequency with ramsey experiment',
        'Qubit frequency calibration with ramsey experiment',
        'Implement ramsey experiments to estimate qubit frequency.',
        'Qubit frequency estimation with ramsey experiment',
        'Do ramsey experiment to calibrate qubit frequency',
        'Do Ramsey interferometry experiment',
        'Calibrate qubit frequency using the ramsey experiment'
    ]),
    'Rabi': (NormalisedRabi,[
        'Run rabi experiment to calibrate single qubit gate driving amplitudes',
        'Measure Rabi oscillations to determine single qubit gate driving amplitudes',
        'Implement Rabi experiment to find pi pulse duration',
        'Calibrate Rabi rate for single qubit gate',
        'Determine single qubit gate prameter using Rabi experiment',
        'Run Rabi experiment with default parameters',
        'Single qubit gate amptiludes estimation using Rabi experiments',
        'Do Rabi experiment to measure qubit drive amplitudes',
        'Run Rabi experiment',
        'Calibrate qubit drive amplitudes using Rabi experiment'
    ]),
    'Pingpong': (AmpPingpongCalibrationSingleQubitMultilevel,[
        'Fine-tune single qubit gate driving amplitude settings using pingpong method',
        'Adjust single qubit gate driving amplitude for optimal fidelity',
        'Calibrate signal amplitude with pingpong feedback loop',
        'Run single qubit gate amplitude tuning using pingpong experiment',
        'Implement pingpong feedback calibration for fine tuning single qubit gate amplitude',
        'Optimize driving parameters with the pingpong method',
        'Pingpong tuning of amplitudes settings',
        'Amplitude fine-tuning with pingpong approach',
        'Iterative tuning of amplitude using piongpong experiment',
        'Calibrate amplitude settings using iterative pingpong method'
    ]),
    'Resonator spectroscopy': (ResonatorSweepTransmissionWithExtraInitialLPB,[
        'Run resonator spectroscopy to determine resonant frequencies',
        'Implement spectroscopy on resonator',
        'Calibrate resonator using spectroscopic techniques',
        'Measure quality factor of resonators with spectroscopy',
        'Determine resonator location via resonator spectroscopy',
        'Spectroscopic analysis of resonator bandwidth using resonator spectroscopy',
        'Run full spectroscopic scan on resonator',
        'Discover resonators with spectroscopy',
        'Resonator frequency mapping using spectroscopy',
        'Measure resonator frequency response using spectroscopy'
    ]),
    'T1': (SimpleT1,[
        'Run T1 experiment to measure relaxation time',
        'Implement T1 relaxation time measurement',
        'Determine qubit T1 relaxation times',
        'Measure T1 relaxation time of qubit',
        'Do T1 experiment',
        'Measure T1 Relaxation time',
        'Do T1 experiment for qubit decay',
        'T1 relaxation profiling of qubits',
        'Experimentally determine qubit T1 times',
        'Measure decay characteristics with T1 experiment'
    ]),
    'T2': (SpinEchoMultiLevel,[
        'Run T2 experiment to measure dephasing time',
        'Implement T2 echo experiment',
        'Determine qubit T2 dephasing times',
        'Measure coherence time using Hahn echo',
        'Calibrate for T2 dephasing time measurements',
        'Quantify qubit coherence with echo experiments',
        'Do T2 echo to observe qubit dephasing',
        'T2 coherence profiling of qubits',
        'Experimentally determine qubit T2 times',
        'Measure qubit coherence decay using T2 experiment'
    ]),
    'RB1Q': (SingleQubitRandomizedBenchmarking,[
        'Run randomized benchmarking to measure single-qubit gate fidelity',
        'Implement 1Q randomized benchmarking',
        'Measure qubit gate errors using randomized benchmarking',
        'Characterize single qubit gates performance using randomized benchmarking',
        'Determine fidelity of qubit operations with randomized benchmarking',
        'Randomized benchmarking for error characterization on single qubit gate',
        'Perform single-qubit benchmarking for error rates',
        'Implement randomized benchmarking for single qubits',
        'Calibrate and measure single-qubit errors with randomized benchmarking',
        'Quantify qubit performance with randomized benchmarking'
    ]),
    'Drag': (DragCalibrationSingleQubitMultilevel,[
        'Implement DRAG calibration to reduce gate errors',
        'Calibrate DRAG parameters to optimize qubit control',
        'Run DRAG calibration for improved qubit gate fidelity',
        'Optimize DRAG coefficients for quantum gates',
        'Measure and adjust DRAG parameters for qubit gates',
        'Calibrate pulse shaping using DRAG technique',
        'DRAG parameter tuning for gate error reduction',
        'Implement DRAG calibration routines',
        'Optimize qubit gate performance with DRAG calibration',
        'DRAG calibration to minimize leakage errors'
    ])
}


In [3]:
from leeq.utils.ai.code_indexer import build_leeq_code_ltm
from leeq.utils.ai.staging.stage_execution import get_codegen_wm, CodegenModel
from leeq.utils.ai.variable_table import VariableTable

from mllm.config import default_models
# Model selection. The first pair of assignments is immediately overwritten by
# the second pair on the same keys, so (assuming default_models behaves like a
# plain mapping) the Llama-3-70b entries are the ones left in effect.
default_models["normal"] = "gpt-4o-mini"
default_models["expensive"] = "gpt-4o"
default_models["normal"] = "replicate/meta/meta-llama-3-70b-instruct"
default_models["expensive"] = "replicate/meta/meta-llama-3-70b-instruct"
# Build the code index; exps_var_table is not used by the cells visible here.
leeq_code_ltm, exps_var_table = build_leeq_code_ltm()
code_cog_model = CodegenModel()
# presumably the number of candidates returned per recall — TODO confirm
code_cog_model.n_recall_items = 3
# Copy every indexed idea into the model's long-term memory.
for idea in leeq_code_ltm.ideas:
    code_cog_model.lt_memory.add_idea(idea)

ModuleNotFoundError: No module named 'fibers.tree.node_attr.code_node'

In [None]:
def benchmark_single(key,exp_class,description, codegen = False):
    """Benchmark one prompt against the global ``code_cog_model``.

    Args:
        key: Experiment label; accepted but not used by this function.
        exp_class: The experiment class the prompt is expected to resolve to.
        description: Natural-language prompt passed to ``get_codegen_wm``.
        codegen: When false, success means ``exp_class`` is among the recalled
            classes. When true, code is also generated and success means
            ``exp_class.__name__`` is contained in the generated output.

    Returns:
        ``(success, additional_info)`` where ``additional_info`` is the list
        of recalled experiment classes, with the generated output appended as
        the last element when ``codegen`` is true.
    """
    wm = get_codegen_wm(description, VariableTable())
    recalled = code_cog_model.recall(wm)
    additional_info = [hit.idea.exp_cls for hit in recalled.idea_results]

    # Recall-only mode: a hit is the expected class appearing in the recall list.
    if not codegen:
        return exp_class in additional_info, additional_info

    # Codegen mode: a hit is the expected class name appearing in the output.
    codes = code_cog_model.codegen(wm, recalled)
    success = exp_class.__name__ in codes
    additional_info.append(codes)
    return success, additional_info


def benchmark_all():
    """Run ``benchmark_single`` (with ``codegen=True``) on every prompt.

    Iterates the global ``experiment_prompt`` table, printing each outcome as
    it is produced.

    Returns:
        A dict mapping each experiment label to a list of
        ``(prompt, success, additional_info)`` tuples, one per prompt.
    """
    results_list = {}

    for exp_name, (exp_class, exp_prompts) in experiment_prompt.items():
        per_prompt = []
        for prompt in exp_prompts:
            outcome = benchmark_single(exp_name, exp_class, prompt, codegen=True)
            success, additional_info = outcome
            print(success, additional_info)
            per_prompt.append((prompt, success, additional_info))
        results_list[exp_name] = per_prompt

    return results_list

In [None]:
# Run the benchmark over every experiment/prompt pair; `results` is consumed
# by the later cells that pickle it and print per-experiment success rates.
results = benchmark_all()

In [None]:
import pickle
# Persist the raw benchmark results to the current working directory.
with open('embedding_search_benchmark.pkl','wb') as f:
    pickle.dump(results,f)

In [None]:
# No explicit NumPy import is visible elsewhere in this notebook, so import it
# here rather than relying on `np` leaking in through a star import.
import numpy as np

# Print, for each experiment, the fraction of its prompts flagged successful
# (element 1 of each result tuple is the success flag).
for k,vs in results.items():
    success_list = np.asarray([v[1] for v in vs]).astype(float)
    success_rate = success_list.mean()
    print(k,success_rate)
    

In [None]:
# TODO: Also report the top-1 hit rate.