<!-- ## Install dependencies -->

In [11]:


# For Notebook:
# %pip install ollama
# %pip install openai
# %pip install anthropic
# %pip install transformers
# %pip install tiktoken
# %pip install python-dotenv
# %pip install colorama

# For packages: 
# %pip install scikit-learn
# %pip install matplotlib
# %pip install numpy
# %pip install pandas
# %pip install iterative-stratification
# %pip install langchain
# %pip install "langsmith<0.2.0"

# For WebUI testing:
# %pip install open-webui

%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [12]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.prompting.few_shot import transform_code_data
from my_packages.utils.file_utils import read_dataset_to_json

main_dataset_folder = '../../data/MBPP_Midio_50/'

train_data = transform_code_data(read_dataset_to_json(main_dataset_folder + 'splits/train_dataset.json'))
val_data = transform_code_data(read_dataset_to_json(main_dataset_folder + 'splits/validation_dataset.json'))
test_data = transform_code_data(read_dataset_to_json(main_dataset_folder + 'splits/test_dataset.json'))

print(f"Train data: {len(train_data)}")
print(f"Val data: {len(val_data)}")
print(f"Test data: {len(test_data)}")

Train data: 32
Val data: 9
Test data: 9


# Model configurations

In [13]:
import os
import sys
from dotenv import load_dotenv
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.utils.tokens_utils import models_not_in_file, write_models_tokens_to_file
from my_packages.utils.server_utils import server_diagnostics, is_remote_server_reachable
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_anthropic import ChatAnthropic
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

model_provider = 'ollama'

load_dotenv("../../.env")
all_responses = [sample["response"] for sample in train_data] + [sample["response"] for sample in val_data] + [sample["response"] for sample in test_data]
print(f"Number of all responses: {len(all_responses)}")

match model_provider:
    case 'ollama':
        host = 'http://localhost:11434'
        if is_remote_server_reachable(url = host + "/api/tags"):
            print("Server is reachable.")
        else:
            server_diagnostics()
            if not is_remote_server_reachable(url = host + "/api/tags"):
                print("Ollama server is not reachable. Batch job might have finished. Try running bash script again.")

        client = ChatOllama
   
        models = [
            # 14b models:
            # "deepseek-r1:14b-qwen-distill-fp16",
            # "phi4:14b-fp16", #16k context length
            # "qwen2.5:14b-instruct-fp16", #128 k

            #32b models:
            "qwq:32b-fp16", #ctx: 32,768 tokens
            # "qwen2.5-coder:32b-instruct-fp16", #32,768 tokens
 
            # #70b models:
            # "llama3.3:70b-instruct-fp16", #ctx: 130k
            # "qwen2.5:72b-instruct-fp16", #ctx: 139k
        ]
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')

    case 'openai':
        openai_token = os.getenv('OPENAI_API_KEY')
        if not openai_token:
            raise Exception("OpenAI API key not found in .env file")
        client = ChatOpenAI
        models = [
            "gpt-4o",
            # "o1-preview", 
        ]
        # few_shot_messages = create_few_shot_messages(explained_used_libraries, train_prompts, train_responses, "NODE_GENERATOR_TEMPLATE", "developer")
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')

    case 'anthropic':
        anthropic_token = os.getenv('ANTHROPIC_API_KEY')
        if not anthropic_token:
            raise Exception("Anthropic API key not found in .env file")
        client = ChatAnthropic
        # embed_client = AnthropicEmbeddings
        models = [
            "claude-3-5-sonnet-latest"
        ]
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')
    case _:
        raise Exception("Model provider not supported")




Number of all responses: 50
Url: http://localhost:11434/api/tags
Response: <Response [200]>
Server is reachable.
Model: qwq:32b-fp16


Token indices sequence length is longer than the specified maximum sequence length for this model (1102 > 1024). Running this sequence through the model will result in indexing errors


# Task configurations

<!-- ## Experiments settings -->

In [14]:
import json
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.common.classes import PromptType
from my_packages.data_processing.attributes_processing import used_functions_to_string
from my_packages.utils.file_utils import read_dataset_to_json


prompt_prefix = "Create a function" # "e.g., "Create a flow"
NUM_SHOTS = 5
semantic_selector = False

dataset = read_dataset_to_json(main_dataset_folder + "MBPP-Midio-50.json")
used_functions_json = read_dataset_to_json(main_dataset_folder + "/metadata/used_external_functions.json")
available_nodes = used_functions_to_string(used_functions_json)
print(used_functions_json)


[{'type': 'function', 'function_name': 'Std.If', 'module_path': 'Std', 'doc': 'If `input` is true, the `then` trigger is executed, otherwise the `else` trigger is executed. `value` contains the value of `input`.', 'body': 'extern func(doc: "If `input` is true, the `then` trigger is executed, otherwise the `else` trigger is executed. `value` contains the value of `input`.") If {\n\n        in(x: 0, y: 0, name: "execute") trigger() execute\n\n        in(x: 0, y: 0, name: "input") property(Bool) predicate\n\n        out(x: 0, y: 0, name: "then") trigger() then\n\n        out(x: 0, y: 0, name: "else") trigger() else\n\n        out(x: 0, y: 0, name: "value") property(Bool) value\n\n    }', 'types': 'type CountContext Number\ntype AnyContext Any'}, {'type': 'function', 'function_name': 'Std.IfExpression', 'module_path': 'Std', 'doc': 'IfExpression is a function that allows you to create a conditional expression using a simple expression language provided through the `expression` parameter. E

# Init Example selector

In [15]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.prompting.example_selectors import get_coverage_example_selector, get_semantic_similarity_example_selector

example_pool = train_data
example_pool.sort(key=lambda x: int(x['task_id']))
print(f"Number of examples in the pool: {len(example_pool)}")

if semantic_selector:
    selector = get_semantic_similarity_example_selector(
        example_pool, 
        OllamaEmbeddings(model="nomic-embed-text"),
        shots=NUM_SHOTS,
        input_keys=["task"],
    )
else:
    selector = get_coverage_example_selector(
        example_pool, 
        label = "external_functions",
        shots=NUM_SHOTS,
        seed=9
    )
used_ids = []
for task in train_data:
    examples = selector.select_examples({"task": task["task"]})
    used_ids += [e['task_id'] for e in examples]
    print(f"Task: {task['task']}")


Number of examples in the pool: 32
Task: Create a function to find the smallest number in a list.
Task: Create a function to find the kth element in the given array using 1-based indexing.
Task: Create a function to find the minimum of three numbers.
Task: Create a function that takes two lists and returns true if they have at least one common element.
Task: Create a function to check whether the entered number is greater than the elements of the given array.
Task: Create a function to find the last digit of a given number.
Task: Create a function that matches a word containing 'z'.
Task: Create a function to check whether the given list contains consecutive numbers or not.
Task: Create a function to get the sum of the digits of a non-negative integer.
Task: Create a function to check whether a specified list is sorted or not.
Task: Create a function to check whether a list is a sublist of another list.
Task: Create a function which takes a list and returns a list with the same element

## Evaluation

In [None]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
%autoreload 2
%reload_ext autoreload
from my_packages.db_service.best_params_service import get_best_params
from my_packages.db_service.experiment_service import confirm_testing_rerun, confirm_validation_rerun, experiment_exists, pretty_print_experiment_collections, run_experiment_quality_checks, setup_experiment_collection
from my_packages.evaluation.code_evaluation import run_validation, run_testing
from my_packages.utils.tokens_utils import get_model_code_tokens_from_file
from colorama import Fore, Back, Style
import os
os.environ['EXPERIMENT_DB_NAME'] = "few_shot_experiments"


metrics = ['syntax', 'semantic'] #['syntax', 'semantic', 'tests'] OR ['syntax', 'semantic']
prompt_type = PromptType.SIGNATURE
example_selector = f"{'semantic' if semantic_selector else 'coverage'}"
experiment_name = f"{prompt_type.value}_{example_selector}_{NUM_SHOTS}_shot"

# Create separate database for this experiment
if experiment_exists(experiment_name):
    # delete_experiment(experiment_name)
    print(f"📂 Experiemnt '{experiment_name}' already exists.")
    pretty_print_experiment_collections(
        experiment_name,
        exclude_columns=["stderr", "stdout", "code_candidate", "test_result", "error_msg"]
    )
    if not run_experiment_quality_checks(experiment_name):
        print("\n❌ Experiment quality checks failed. Exiting.")
        raise Exception("Experiment quality checks failed.")
else:
    setup_experiment_collection(experiment_name)

results = {}

for model_name in models:
    print("\n\n")
    model = get_model_code_tokens_from_file(model_name, 'code_max_tokens.json')
    results[model_name] = (None, None)

    if confirm_validation_rerun(experiment_name, model_name, eval_method="hold_out"): # If best params exists for model, prompt user to rerun validation. If not exists, return true to run validation.

        validation_result = run_validation(
            client,
            model,
            available_nodes,
            val_data[:1],
            selector,
            temperatures=[0.2, 0.6, 0.9],
            top_ps=[0.2, 0.6, 0.9],
            top_ks=[10, 50, 100],
            ks=[1],
            seed=9,
            debug=True,
            optimizer_metric=metrics[-1],
            experiment_name=experiment_name,
            env="DEV", #'dev' for file storage and 'prod' for database storage, for errors, and best params
            prompt_type=prompt_type,
        )
        validation_result.print()
        test_results, final_result = run_testing( # Must rerun testing if validation is rerun
            client, 
            model,
            available_nodes,
            test_data,
            selector,
            temperature=validation_result.temperature,
            top_p=validation_result.top_p,
            top_k=validation_result.top_k,
            ks=[1, 2, 5, 10], # max(ks) generations per task. Thus, 10 generations per task.
            seeds=[3, 75, 346],
            debug=True,
            metrics=metrics, # current metrics: ["syntax", "semantic"] OR ["syntax", "semantic", "tests"]
            experiment_name=experiment_name,
            env="DEV", #'dev' for file storage and 'prod' for database storage, for errors and final result
            prompt_type=prompt_type,
            )
        results[model_name] = (test_results, final_result)

    else:
        validation_result = get_best_params( 
            experiment_name, 
            model["name"], 
            metrics[-1], # optimizer_metric
            k = 1
        )
        # validation_result = Run(
        #     phase="validation",
        #     temperature=0.6,
        #     top_p=0.9,
        #     top_k=10,
        #     metric_results={
        #         "pass@1": 0.2,
        #     },
        #     seed=None
        # )
    
        validation_result.print()
        if confirm_testing_rerun(experiment_name, model_name, eval_method="hold_out"): # If results exists for model, prompt user to rerun testing. If not exists, return true to run testing first time.
                test_results, final_result = run_testing(
                    client,
                    model,
                    available_nodes,
                    test_data,
                    selector,
                    temperature=validation_result.temperature,
                    top_p=validation_result.top_p,
                    top_k=validation_result.top_k,
                    ks=[1, 2, 5, 10], # max(ks) generations per task. Thus, 10 generations per task.
                    seeds=[3, 75, 346],
                    debug=True,
                    metrics=metrics, # current metrics: ["syntax", "semantic"] OR ["syntax", "semantic", "tests"]
                    experiment_name=experiment_name,
                    env="DEV", #'dev' for file storage and 'prod' for database storage, for errors and final result
                    prompt_type = prompt_type,
                )
                results[model_name] = (test_results, final_result)
    
        
    print(f"======\n{model_name} FINAL RESULTS: ===============================")
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    test_runs = results[model_name][0]
    final_result = results[model_name][1]
    if test_runs:
        for run in test_runs:
            run.print()
    if final_result:
        final_result.print()

      
# Print the final results
print("\nALL MODELS FINAL RESULTS:")
for model_name, (test_runs, final_result) in results.items():
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    for test_run in test_runs:
        test_run.print()
    final_result.print()



📂 Experiemnt 'signature_coverage_5_shot' already exists.

🔍 Existing collections for signature_coverage_5_shot: 

📂 Collection: signature_coverage_5_shot_best_params 
--------------------------------------------------
                     model_name optimizer_metric  temperature  top_p  top_k  seed              created_at  syntax@1  semantic@1
qwen2.5-coder:32b-instruct-fp16           syntax          0.2    0.2     10     9 2025-03-05 08:23:48.047  1.000000         NaN
           qwq:32b-preview-fp16           syntax          0.2    0.6     50     9 2025-03-05 08:23:47.165  0.888889         NaN
     llama3.3:70b-instruct-fp16           syntax          0.2    0.9     50     9 2025-03-05 08:23:47.720  0.888889         NaN
                         gpt-4o           syntax          0.6    0.6     -1     9 2025-03-05 08:22:29.627  1.000000         NaN
                         gpt-4o         semantic          0.2    0.6     -1     9 2025-03-05 08:23:14.732       NaN    0.888889
...
📊 qwen2.5:

KeyboardInterrupt: 

<!-- ## Langsmith evaluate -->

# Pretty print

In [None]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.db_service.experiment_service import confirm_testing_rerun, confirm_validation_rerun, experiment_exists, pretty_print_experiment_collections, run_experiment_quality_checks, setup_experiment_collection
experiment_name = f"regular_coverage_1_shot"
pretty_print_experiment_collections(experiment_name, exclude_columns=["stderr", "stdout", "code_candidate", "test_result", "error_msg"])


🔍 Existing collections for regular_coverage_1_shot: 

📂 Collection: regular_coverage_1_shot_results 
--------------------------------------------------
                     model_name         seed  temperature  top_p  top_k            ks                   metrics              created_at                    syntax@1                    syntax@2                    syntax@5                   syntax@10                  semantic@1                  semantic@2                  semantic@5                 semantic@10                   tests@1                   tests@2                   tests@5                  tests@10
                  phi4:14b-fp16 [3, 75, 346]          0.2    0.6     10 [1, 2, 5, 10] [syntax, semantic, tests] 2025-03-05 11:17:16.609 {'mean': 0.53, 'std': 0.01} {'mean': 0.63, 'std': 0.02} {'mean': 0.74, 'std': 0.02}  {'mean': 0.78, 'std': 0.0} {'mean': 0.43, 'std': 0.02} {'mean': 0.52, 'std': 0.03} {'mean': 0.66, 'std': 0.04} {'mean': 0.74, 'std': 0.05} {'mean': 0.0, 'std': 0.

## Testing one run:

In [24]:
%autoreload 2
%reload_ext autoreload
from my_packages.evaluation.code_evaluation import evaluate_code, extract_code, run_model

code = """
<think>
Okay, I need to create a Midio function called opposite_signs that checks if two integers have opposite signs. Let's think about how to do this.

First, the problem is to determine if one number is positive and the other is negative. So, their product should be negative because positive times negative is negative. But wait, what if either of them is zero? The problem says "integers", but does zero count as having an opposite sign? Probably not. Because zero is neither positive nor negative. So if either x or y is zero, the function should return false.

So the logic would be: if (x < 0 && y >= 0) || (x >=0 && y <0), then true else false. Alternatively, multiply them and check if the product is negative, but only if neither is zero.

Hmm, maybe using multiplication is a concise way. Let's see:

If x * y < 0, then they have opposite signs. But if either is zero, the product would be zero, which is not less than zero. So that works because if either is zero, the condition fails. So yes, multiplying them and checking if it's negative should work.

So I need to compute the product of x and y, then check if it's less than zero. The result of that check will be the output.

Looking at the available functions, there's Math.Mul which can multiply two numbers. Then use Math.LessThan to compare the product with 0.

Let me structure this:

1. Take inputs x and y.
2. Multiply them using Math.Mul.
3. Check if the result is less than 0 using Math.LessThan.
4. Output the boolean result from that comparison.

So in Midio nodes, I need to connect x and y into a Mul node's first and second inputs. Then take the result to LessThan's left input, and 0 to its right. The output of LessThan is the desired Bool.

Wait, but what about if one or both are zero? Let me test with examples:

Case1: x=5, y=-3 → product -15 <0 → true (correct)
Case2: x=-2, y=4 → product -8 → true
Case3: x=0, y=5 → 0 → not less than 0 → false (correct)
Case4: x=0, y=-3 → same as above.
Case5: Both negative: x=-2, y-3 → product positive → false. Correct.

Yes, this approach works.

So the nodes needed are:

- Math.Mul to multiply x and y
- Math.LessThan to compare with 0

Then connect Mul.result to LessThan.left, 0 to right. The output is LessThan's result.

Now translating that into Midio code structure as per the function signature provided.

The function has inputs x_853326 (x) and y_5390f5 (y), and output output_3339a3.

So in the Midio code:

Import statements first, then define the function. Inside the function:

Instance a Mul node, take x and y as inputs to its first and second parameters.

Then instance a LessThan node, left is the Mul.result, right is 0.

Connect the LessThan's result output to the output_3339a3 property.

So code would look like this:

```midio

    import("std", Std_k98ojb)
import("http", Http_q7o96c)

module() main {
    func(doc: "checks whether the given two integers have opposite sign or not.") opposite_signs {
        in(x: -426, y: -248, name: "x") property(Number) x_853326
        in(x: -420, y: -107, name: "y") property(Number) y_5390f5

        out(x: 159, y: -219, name: "output") property(Bool) output_3339a3

        instance(x: 200, y: 100) mul_1 root.Std_k98ojb.Math.Mul {}
        instance(x: 300, y: 150) lessThan_1 root.Std_k98ojb.Math.LessThan {}

        x_853326 -> mul_1.first
        y_5390f5 -> mul_1.second
        mul_1.result -> lessThan_1.left
        0 -> lessThan_1.right
        lessThan_1.result -> output_3339a3
    }

    

    instance(x: -200, y: 0) opposite_signs_instance root.main.opposite_signs {}
}

```

Wait but in Midio syntax, how are the connections written? Let me check previous examples.

Looking at prior code snippets:

For example, in the cube_nums function, they have:

list_63c248 -> map_f2330d.items

So the format is source -> destination.

In the Math nodes, like in sum_digits there's modulo_adfa16.result -> add_88bb8a.first.

Yes. So here, the Mul node's result goes to LessThan.left, and 0 is connected to right.

This should work.




    import("std", Std_k98ojb)
import("http", Http_q7o96c)

module() main {
    func(doc: "checks whether the given two integers have opposite sign or not.") opposite_signs {
        in(x: -426, y: -248, name: "x") property(Number) x_853326
        in(x: -420, y: -107, name: "y") property(Number) y_5390f5

        out(x: 159, y: -219, name: "output") property(Bool) output_3339a3

        instance(x: 184, y: -167) mul_3e8d0b root.Std_k98ojb.Math.Mul {}
        instance(x: 356, y: -162) lessthan_3c7f3a root.Std_k98ojb.Math.LessThan {}

        x_853326 -> mul_3e8d0b.first
        y_5390f5 -> mul_3e8d0b.second
        0 -> lessthan_3c7f3a.right
        mul_3e8d0b.result -> lessthan_3c7f3a.left
        lessthan_3c7f3a.result -> output_3339a3
    }

    

    instance(x: -364, y: -185) opposite_signs_2e3d0b root.main.opposite_signs {}
}


""" ## TEST uten think
print(extract_code(code))
model_name = "qwq:32b-fp16"
model_result, largest_context = run_model(
    client,
    model_name,
    available_nodes,
    val_data,
    selector,
    max_new_tokens = -1,
    temperature = 0.2,
    top_p = 0.9,
    top_k = 50,
    n = 1,
    seed = 9,
    debug =True, 
    prompt_type=PromptType.REGULAR,
)

metric_results_lists = evaluate_code (
    model_result,
    ks=[1],
    evaluation_metric=["semantic"],
    experiment_name=experiment_name,
    model_name=model_name,
    env="dev",
    hyperparams={"seed": 9, "temperature": 0.2, "top_p": 0.9, "top_k": 50},
    phase="validation"
)
## Optimizing for the first k in the ks list
pass_at_k_dict = metric_results_lists[0]
val_metric = pass_at_k_dict[f"pass@1"]
print(f"Validation with temp={0.2}, top_k={50} and top_p={0.9}. Gave pass@1={val_metric} and pass@ks={pass_at_k_dict}")


import("std", Std_k98ojb)
import("http", Http_q7o96c)

module() main {
    func(doc: "checks whether the given two integers have opposite sign or not.") opposite_signs {
        in(x: -426, y: -248, name: "x") property(Number) x_853326
        in(x: -420, y: -107, name: "y") property(Number) y_5390f5

        out(x: 159, y: -219, name: "output") property(Bool) output_3339a3

        instance(x: 200, y: 100) mul_1 root.Std_k98ojb.Math.Mul {}
        instance(x: 300, y: 150) lessThan_1 root.Std_k98ojb.Math.LessThan {}

        x_853326 -> mul_1.first
        y_5390f5 -> mul_1.second
        mul_1.result -> lessThan_1.left
        0 -> lessThan_1.right
        lessThan_1.result -> output_3339a3
    }

    

    instance(x: -200, y: 0) opposite_signs_instance root.main.opposite_signs {}
}
    > Generating k response..  (1/1)

KeyboardInterrupt: 

# Write experiment to file

In [None]:
from my_packages.db_service.few_shots.few_shot_db_service import export_collection


export_collection(experiment_name, "errors", "json", exclude_columns=[])

# Visualization

In [None]:

# Assuming the experiment 'GPT4_signature_exp1' exists,
# and you evaluated 100 candidates for model 'GPT-4':
from my_packages.db_service.data_visualization import visualize_error_flow_for_model


visualize_error_flow_for_model(experiment_name, models[0])
