<!-- ## Install dependencies -->

In [1]:


# For Notebook:
# %pip install ollama
# %pip install openai
# %pip install anthropic
# %pip install transformers
# %pip install tiktoken
# %pip install python-dotenv
# %pip install colorama

# For packages: 
# %pip install scikit-learn
# %pip install matplotlib
# %pip install numpy
# %pip install pandas
# %pip install iterative-stratification
# %pip install langchain
# %pip install "langsmith<0.2.0"

# For WebUI testing:
# %pip install open-webui

%load_ext autoreload

In [2]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.prompting.few_shot import transform_code_data
from my_packages.utils.file_utils import read_dataset_to_json

main_dataset_folder = '../../data/MBPP_Midio_50/'

train_data = transform_code_data(read_dataset_to_json(main_dataset_folder + 'splits/hold_out/train_dataset.json'))
val_data = transform_code_data(read_dataset_to_json(main_dataset_folder + 'splits/hold_out/validation_dataset.json'))
test_data = transform_code_data(read_dataset_to_json(main_dataset_folder + 'splits/hold_out/test_dataset.json'))

print(f"Train data: {len(train_data)}")
print(f"Val data: {len(val_data)}")
print(f"Test data: {len(test_data)}")

Train data: 32
Val data: 9
Test data: 9


# Model configurations

In [3]:
import os
import sys
from dotenv import load_dotenv
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.utils.tokens_utils import models_not_in_file, write_models_tokens_to_file
from my_packages.utils.server_utils import server_diagnostics, is_remote_server_reachable
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_anthropic import ChatAnthropic
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

model_provider = 'openai'

load_dotenv("../../.env")
all_responses = [sample["response"] for sample in train_data] + [sample["response"] for sample in val_data] + [sample["response"] for sample in test_data]
print(f"Number of all responses: {len(all_responses)}")

match model_provider:
    case 'ollama':
        host = 'http://localhost:11434'
        if is_remote_server_reachable(url = host + "/api/tags"):
            print("Server is reachable.")
        else:
            server_diagnostics()
            if not is_remote_server_reachable(url = host + "/api/tags"):
                print("Ollama server is not reachable. Batch job might have finished. Try running bash script again.")

        client = ChatOllama
   
        models = [
            # 14b models:
            # "deepseek-r1:14b-qwen-distill-fp16",
            "phi4:14b-fp16", #16k context length
            # "qwen2.5:14b-instruct-fp16", #128 k

            #32b models:
            # "qwq:32b-fp16", #ctx: 32,768 tokens
            # "qwen2.5-coder:32b-instruct-fp16", #32,768 tokens
 
            # #70b models:
            # "llama3.3:70b-instruct-fp16", #ctx: 130k
            # "qwen2.5:72b-instruct-fp16", #ctx: 139k
        ]
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')

    case 'openai':
        openai_token = os.getenv('OPENAI_API_KEY')
        if not openai_token:
            raise Exception("OpenAI API key not found in .env file")
        client = ChatOpenAI
        models = [
            "gpt-4o",
            # "o1-preview", 
        ]
        # few_shot_messages = create_few_shot_messages(explained_used_libraries, train_prompts, train_responses, "NODE_GENERATOR_TEMPLATE", "developer")
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')

    case 'anthropic':
        anthropic_token = os.getenv('ANTHROPIC_API_KEY')
        if not anthropic_token:
            raise Exception("Anthropic API key not found in .env file")
        client = ChatAnthropic
        # embed_client = AnthropicEmbeddings
        models = [
            "claude-3-5-sonnet-latest"
        ]
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')
    case _:
        raise Exception("Model provider not supported")




Number of all responses: 50


# Task configurations

<!-- ## Experiments settings -->

In [4]:
import json
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.common.classes import PromptType
from my_packages.data_processing.attributes_processing import used_functions_to_string
from my_packages.utils.file_utils import read_dataset_to_json


prompt_prefix = "Create a function" # "e.g., "Create a flow"
NUM_SHOTS = 5
semantic_selector = False

dataset = read_dataset_to_json(main_dataset_folder + "MBPP-Midio-50.json")
used_functions_json = read_dataset_to_json(main_dataset_folder + "/metadata/used_external_functions.json")
available_nodes = used_functions_to_string(used_functions_json)
print(used_functions_json)


[{'type': 'function', 'function_name': 'Std.If', 'module_path': 'Std', 'doc': 'If `input` is true, the `then` trigger is executed, otherwise the `else` trigger is executed. `value` contains the value of `input`.', 'body': 'extern func(doc: "If `input` is true, the `then` trigger is executed, otherwise the `else` trigger is executed. `value` contains the value of `input`.") If {\n\n        in(x: 0, y: 0, name: "execute") trigger() execute\n\n        in(x: 0, y: 0, name: "input") property(Bool) predicate\n\n        out(x: 0, y: 0, name: "then") trigger() then\n\n        out(x: 0, y: 0, name: "else") trigger() else\n\n        out(x: 0, y: 0, name: "value") property(Bool) value\n\n    }', 'types': 'type CountContext Number\ntype AnyContext Any'}, {'type': 'function', 'function_name': 'Std.IfExpression', 'module_path': 'Std', 'doc': 'IfExpression is a function that allows you to create a conditional expression using a simple expression language provided through the `expression` parameter. E

# Init Example selector

In [5]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.prompting.example_selectors import get_coverage_example_selector, get_semantic_similarity_example_selector

example_pool = train_data
example_pool.sort(key=lambda x: int(x['task_id']))
print(f"Number of examples in the pool: {len(example_pool)}")

if semantic_selector:
    selector = get_semantic_similarity_example_selector(
        example_pool, 
        OllamaEmbeddings(model="nomic-embed-text"),
        shots=NUM_SHOTS,
        input_keys=["task"],
    )
else:
    selector = get_coverage_example_selector(
        example_pool, 
        label = "external_functions",
        shots=NUM_SHOTS,
        seed=9
    )
used_ids = []
for task in train_data:
    examples = selector.select_examples({"task": task["task"]})
    used_ids += [e['task_id'] for e in examples]
    print(f"Task: {task['task']}")


Number of examples in the pool: 32
Task: Create a function to find the smallest number in a list.
Task: Create a function to find the kth element in the given array using 1-based indexing.
Task: Create a function to find the minimum of three numbers.
Task: Create a function that takes two lists and returns true if they have at least one common element.
Task: Create a function to check whether the entered number is greater than the elements of the given array.
Task: Create a function to find the last digit of a given number.
Task: Create a function that matches a word containing 'z'.
Task: Create a function to check whether the given list contains consecutive numbers or not.
Task: Create a function to get the sum of the digits of a non-negative integer.
Task: Create a function to check whether a specified list is sorted or not.
Task: Create a function to check whether a list is a sublist of another list.
Task: Create a function which takes a list and returns a list with the same element

## Evaluation

In [6]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
%autoreload 2
%reload_ext autoreload
from my_packages.db_service.best_params_service import get_db_best_params
from my_packages.db_service.experiment_service import confirm_rerun, experiment_exists, pretty_print_experiment_collections, run_experiment_quality_checks, setup_experiment_collection
from my_packages.evaluation.code_evaluation import run_validation, run_testing
from my_packages.utils.tokens_utils import get_model_code_tokens_from_file
from colorama import Fore, Back, Style
import os
os.environ['EXPERIMENT_DB_NAME'] = "few_shot_experiments"


metrics = ['syntax', 'semantic'] #['syntax', 'semantic', 'tests'] OR ['syntax', 'semantic']
prompt_type = PromptType.SIGNATURE
example_selector = f"{'semantic' if semantic_selector else 'coverage'}"
experiment_name = f"{prompt_type.value}_{example_selector}_{NUM_SHOTS}_shot"

# Create separate database for this experiment
if experiment_exists(experiment_name):
    # delete_experiment(experiment_name)
    print(f"📂 Experiemnt '{experiment_name}' already exists.")
    pretty_print_experiment_collections(
        experiment_name,
        exclude_columns=["stderr", "stdout", "code_candidate", "test_result", "error_msg"]
    )
    if not run_experiment_quality_checks(experiment_name):
        print("\n❌ Experiment quality checks failed. Exiting.")
        raise Exception("Experiment quality checks failed.")
else:
    setup_experiment_collection(experiment_name)

results = {}

for model_name in models:
    print("\n\n")
    model = get_model_code_tokens_from_file(model_name, 'code_max_tokens.json')
    results[model_name] = (None, None)

    if confirm_rerun(experiment_name, model_name, eval_method="hold_out", phase="validation"): # If best params exists for model, prompt user to rerun validation. If not exists, return true to run validation.

        validation_result = run_validation(
            client,
            model,
            available_nodes,
            val_data[:1],
            selector,
            temperatures=[0.2, 0.6, 0.9],
            top_ps=[0.2, 0.6, 0.9],
            top_ks=[10, 50, 100],
            ks=[1],
            seed=9,
            debug=True,
            optimizer_metric=metrics[-1],
            experiment_name=experiment_name,
            env="DEV", #'dev' for file storage and 'prod' for database storage, for errors, and best params
            prompt_type=prompt_type,
        )
        exit()
        validation_result.print()
        test_results, final_result = run_testing( # Must rerun testing if validation is rerun
            client, 
            model,
            available_nodes,
            test_data,
            selector,
            temperature=validation_result.temperature,
            top_p=validation_result.top_p,
            top_k=validation_result.top_k,
            ks=[1, 2, 5, 10], # max(ks) generations per task. Thus, 10 generations per task.
            seeds=[3, 75, 346],
            debug=True,
            metrics=metrics, # current metrics: ["syntax", "semantic"] OR ["syntax", "semantic", "tests"]
            experiment_name=experiment_name,
            env="DEV", #'dev' for file storage and 'prod' for database storage, for errors and final result
            prompt_type=prompt_type,
            )
        results[model_name] = (test_results, final_result)

    else:
        validation_result = get_db_best_params( 
            experiment_name, 
            model["name"], 
            [metrics[-1]], # optimizer_metric
            k = 1
        )
        # validation_result = Run(
        #     phase="validation",
        #     temperature=0.6,
        #     top_p=0.9,
        #     top_k=10,
        #     metric_results={
        #         "pass@1": 0.2,
        #     },
        #     seed=None
        # )
    
        validation_result.print()
        if confirm_rerun(experiment_name, model_name, eval_method="hold_out", phase="testing"): # If results exists for model, prompt user to rerun testing. If not exists, return true to run testing first time.
                test_results, final_result = run_testing(
                    client,
                    model,
                    available_nodes,
                    test_data,
                    selector,
                    temperature=validation_result.temperature,
                    top_p=validation_result.top_p,
                    top_k=validation_result.top_k,
                    ks=[1, 2, 5, 10], # max(ks) generations per task. Thus, 10 generations per task.
                    seeds=[3, 75, 346],
                    debug=True,
                    metrics=metrics, # current metrics: ["syntax", "semantic"] OR ["syntax", "semantic", "tests"]
                    experiment_name=experiment_name,
                    env="DEV", #'dev' for file storage and 'prod' for database storage, for errors and final result
                    prompt_type = prompt_type,
                )
                results[model_name] = (test_results, final_result)
    
        
    print(f"======\n{model_name} FINAL RESULTS: ===============================")
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    test_runs = results[model_name][0]
    final_result = results[model_name][1]
    if test_runs:
        for run in test_runs:
            run.print()
    if final_result:
        final_result.print()

      
# Print the final results
print("\nALL MODELS FINAL RESULTS:")
for model_name, (test_runs, final_result) in results.items():
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    for test_run in test_runs:
        test_run.print()
    final_result.print()



📂 Experiemnt 'signature_coverage_5_shot' already exists.

🔍 Existing collections for signature_coverage_5_shot: 

📂 Collection: signature_coverage_5_shot_best_params 
--------------------------------------------------
                     model_name optimizer_metric  temperature  top_p  top_k  seed              created_at  syntax@1 eval_method  semantic@1
qwen2.5-coder:32b-instruct-fp16           syntax          0.2    0.2     10     9 2025-03-05 08:23:48.047  1.000000    hold_out         NaN
           qwq:32b-preview-fp16           syntax          0.2    0.6     50     9 2025-03-05 08:23:47.165  0.888889    hold_out         NaN
     llama3.3:70b-instruct-fp16           syntax          0.2    0.9     50     9 2025-03-05 08:23:47.720  0.888889    hold_out         NaN
                         gpt-4o           syntax          0.6    0.6     -1     9 2025-03-05 08:22:29.627  1.000000    hold_out         NaN
                         gpt-4o         semantic          0.2    0.6     -1     9 

TypeError: get_db_best_params() missing 1 required positional argument: 'eval_method'

<!-- ## Langsmith evaluate -->

# Pretty print

In [None]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.db_service.experiment_service import experiment_exists, pretty_print_experiment_collections, run_experiment_quality_checks, setup_experiment_collection
experiment_name = f"regular_coverage_1_shot"
pretty_print_experiment_collections(experiment_name, exclude_columns=["stderr", "stdout", "code_candidate", "test_result", "error_msg"])


🔍 Existing collections for regular_coverage_1_shot: 

📂 Collection: regular_coverage_1_shot_results 
--------------------------------------------------
                     model_name         seed  temperature  top_p  top_k            ks                   metrics              created_at                    syntax@1                    syntax@2                    syntax@5                   syntax@10                  semantic@1                  semantic@2                  semantic@5                 semantic@10                   tests@1                   tests@2                   tests@5                  tests@10
                  phi4:14b-fp16 [3, 75, 346]          0.2    0.6     10 [1, 2, 5, 10] [syntax, semantic, tests] 2025-03-05 11:17:16.609 {'mean': 0.53, 'std': 0.01} {'mean': 0.63, 'std': 0.02} {'mean': 0.74, 'std': 0.02}  {'mean': 0.78, 'std': 0.0} {'mean': 0.43, 'std': 0.02} {'mean': 0.52, 'std': 0.03} {'mean': 0.66, 'std': 0.04} {'mean': 0.74, 'std': 0.05} {'mean': 0.0, 'std': 0.

## Testing one run:

In [7]:
# %autoreload 2
# %reload_ext autoreload
# import pandas as pd
# from my_packages.common.classes import RagData
# from my_packages.evaluation.code_evaluation import evaluate_code, extract_code, run_model
# from my_packages.utils.file_utils import write_json_file

# #LOAD RAG DATA
# # Load JSONL dataset into Pandas DataFrame
# main_dataset_folder = '../../data/'
# df = pd.read_json(main_dataset_folder + 'rag_dataset.jsonl', lines=True)
# docs = df.to_dict(orient="records")
# print(f"Number of docs: {len(docs)}")

# language_files = [
#     # "overview.md",
#     "the-midio-language.md", 
#     "technical-details.midio", 
#     "partial-function-application.md",
#     "contexts.md",
#     "loops.midio",
#     "map-filter-reduce.md",
#     "writing-tests.md",
# ]
# order_mapping = {name.upper(): idx for idx, name in enumerate(language_files)}
# print(order_mapping)

# language_docs = [doc for doc in docs if os.path.basename(doc["file"]).upper() in [f.upper() for f in language_files]]
# language_docs = sorted(
#     language_docs,
#     key=lambda doc: order_mapping.get(os.path.basename(doc["file"]).upper(), float('inf'))
# )

# node_files = [ # How to use
#     "http.md",
#     "std.md",
# ] 

# native_files = [  # Native implementation. "### External functions  \nExternal functions in Midio are functions with a native implementation in Rust. These functions generally provide better performance and access to lower-level system features compared to functions written directly in Midio. Most of the functions in the Midio standard library are external functions, which form a foundation of useful and efficient building blocks for your programs.  \n> You currently cannot create your own external functions, but this will be possible in the future.",
#     "http_extern.midio",
#     "std_extern.midio"
# ]
# node_docs = [doc for doc in docs if os.path.basename(doc["file"]).upper() in [f.upper() for f in native_files]]
# print(f"Number of language docs: {len(language_docs)}")
# print(f"Number of node docs: {len(node_docs)}")
# formatted_language_context = "\n\n".join([doc["content"]for doc in language_docs])
# formatted_node_context = "\n\n".join([doc["content"]for doc in node_docs])

# with open("./language_doc", 'w') as f:
#     f.write(formatted_language_context)

# with open("./node_doc", 'w') as f:
#     f.write(formatted_node_context)

# rag_data = RagData(
#     OllamaEmbeddings(model="nomic-embed-text"), 
#     language_docs=language_docs,
#     node_docs=node_docs
# )
# rag_data.init_language_retriever()
# rag_data.init_node_retriever()

# docs = rag_data.language_retriever.similarity_search("Math.Modulo, List.Empty, Std.For, Std.Map")
# for doc in docs:
#     print(doc.page_content)
# docs = rag_data.node_retriever.similarity_search("Math.Modulo, List.Empty, Std.For, Std.Map")
# for doc in docs:
#     print(doc.page_content)
    
# tokens = client(model= "phi4:14b-fp16").get_num_tokens()

from my_packages.evaluation.code_evaluation import evaluate_code, run_model


model_result, largest_context = run_model(
    client,
    model_name,
    available_nodes=available_nodes,
    data=val_data[:2],
    example_pool=selector,
    max_new_tokens = 1000,
    temperature = 0.2,
    top_p = 0.9,
    top_k = 50,
    n = 1,
    seed = 9,
    debug =True, 
    prompt_type=PromptType.REGULAR,
)

metric_results_lists = evaluate_code (
    model_result,
    ks=[1],
    evaluation_metrics=["syntax"],
    experiment_name="test_RAG_5-shot",
    model_name=model_name,
    env="dev",
    hyperparams={"seed": 9, "temperature": 0.2, "top_p": 0.9, "top_k": 50},
    phase="validation"
)
## Optimizing for the first k in the ks list
pass_at_k_dict = metric_results_lists[0]
val_metric = pass_at_k_dict[f"pass@1"]
print(f"Validation with temp={0.2}, top_k={50} and top_p={0.9}. Gave pass@1={val_metric} and pass@ks={pass_at_k_dict}")


Prompt size: 5925


[1m=== Sample: 1 ===
[36m[1m User prompt: [0m
System: You are a Midio code generator and are going to solve some programming tasks for the node-based programming language Midio. 
You will get prompted with a task and you will respond with the code that might be used to solve the given task.

You must return the code inside a code block, in the following form:

```midio

    Midio code is placed here, without any comments.

```
Important that you generate only the code, so that it can be directly executed in a compiler.
Only the external functions that is provided can be used. If you need to create a new function, that does something different than the external functions provided, you can do so by creating a new function in the code.

Bellow is the list of external functions that you can choose from:
Function node name: Std.If
 Documentation: If `input` is true, the `then` trigger is executed, otherwise the `else` trigger is executed. `value` contains the value o

# Write experiment to file

In [None]:
from my_packages.db_service.few_shots.few_shot_db_service import export_collection


export_collection(experiment_name, "errors", "json", exclude_columns=[])

# Visualization

In [None]:

# Assuming the experiment 'GPT4_signature_exp1' exists,
# and you evaluated 100 candidates for model 'GPT-4':
from my_packages.db_service.data_visualization import visualize_error_flow_for_model


visualize_error_flow_for_model(experiment_name, models[0])
