<!-- ## Install dependencies -->

In [1]:


# For Notebook:
# %pip install ollama
# %pip install openai
# %pip install anthropic
# %pip install transformers
# %pip install tiktoken
# %pip install python-dotenv
# %pip install colorama



# For packages: 
# %pip install scikit-learn
# %pip install matplotlib
# %pip install numpy
# %pip install pandas
# %pip install iterative-stratification
# %pip install langchain
# %pip install "langsmith<0.2.0"


# For WebUI testing:
# %pip install open-webui



<!-- # Get data from Langsmith -->

# Get dataset from Langsmith

In [2]:
from dotenv import load_dotenv
from langsmith import Client
from langsmith.schemas import Example

dataset_name = "Code prediction"

# Load environment variables from the repository-level .env file before the client is created.
load_dotenv("../../.env")
langsmith_client = Client()


def _load_split(split_name):
    """Return all examples of `dataset_name` that belong to the given split, as a list.

    Args:
        split_name: Name of the split to request (e.g. "train", "validation", "test").

    Returns:
        A list with every example yielded by `langsmith_client.list_examples`.
    """
    return list(langsmith_client.list_examples(
        dataset_name=dataset_name,
        splits=[split_name],
        # Optional filters, handy while debugging:
        # metadata={"task_id": "1"},
        # limit=1
    ))


if langsmith_client.has_dataset(dataset_name=dataset_name):
    langsmith_dataset = langsmith_client.read_dataset(dataset_name=dataset_name)
    print("Existing dataset found.")

    train_data = _load_split("train")
    # Preview the first example; guarded so an empty split does not raise IndexError.
    if train_data:
        print(train_data[0])
    print(f"Number of train examples loaded: {len(train_data)}")

    val_data = _load_split("validation")
    print(f"Number of validation examples loaded: {len(val_data)}")

    test_data = _load_split("test")
    print(f"Number of test examples loaded: {len(test_data)}")

    # TODO: make the rest of the code work with Example objects instead of
    # [{'task_id': str, 'task': str, 'response': list, 'MBPP_task_id': str}].
    # TODO: create a dataset of type chat.
else:
    # NOTE: train_data / val_data / test_data are not defined on this path,
    # so the cells below cannot run until the dataset exists.
    print(f"No existing dataset found with name: {dataset_name}.")



Existing dataset found.
dataset_id=UUID('7000235e-def8-4df7-8dfa-1ade6fd1447e') inputs={'task': 'Create a function to check whether the given number is a perfect square or not.', 'function_signature': 'func(doc: "checks whether the given number is a perfect square or not.") is_perfect_square {\nin(x: 354, y: -53, name: "input") property(Number) input_f5c03d\n in(x: 161, y: -211, name: "execute") trigger() execute_27b8c3\n out(x: 1748, y: -10, name: "out") property(Bool) out_f3db9f\n out(x: 1894, y: 120, name: "continue") trigger() continue_8ba06b'} outputs={'response': 'import("std", Std_k98ojb)\nimport("http", Http_q7o96c)\n\nmodule() main {\n    func(doc: "checks whether the given number is a perfect square or not.") is_perfect_square {\n        in(x: 354, y: -53, name: "input") property(Number) input_f5c03d\n        in(x: 161, y: -211, name: "execute") trigger() execute_27b8c3\n\n        out(x: 1748, y: -10, name: "out") property(Bool) out_f3db9f\n        out(x: 1894, y: 120, name: 

<!-- # Init model provider -->

# Model configurations

In [3]:
import os
import sys
from dotenv import load_dotenv
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.utils.tokens_utils import models_not_in_file, write_models_tokens_to_file
from my_packages.utils.server_utils import server_diagnostics, is_remote_server_reachable
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_anthropic import ChatAnthropic
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

model_provider = 'ollama'

load_dotenv("../../.env")

# Gather the reference response of every example across the three splits
# (train first, then validation, then test).
all_responses = [
    sample.outputs['response']
    for split in (train_data, val_data, test_data)
    for sample in split
]
print(f"Number of all responses: {len(all_responses)}")

# Pick the chat-client class and the list of model names for the selected provider.
if model_provider == 'ollama':
    host = 'http://localhost:11434'
    if not is_remote_server_reachable(url=host + "/api/tags"):
        server_diagnostics()
        print("Ollama server is not reachable. Batch job might have finished. Try running bash script again.")
    else:
        print("Server is reachable.")
    client = ChatOllama

    # Uncomment the models that should be evaluated.
    models = [
        # 14b models:
        # "phi4:14b-fp16", #16k context length
        # "deepseek-r1:14b",
        "qwen2.5:14b-instruct-fp16", #128 k

        #32b models:
        # "qwq:32b-preview-fp16", #ctx: 32,768 tokens
        # "qwen2.5-coder:32b-instruct-fp16", #32,768 tokens

        # #70b models:
        # "llama3.3:70b-instruct-fp16", #ctx: 130k
        # "qwen2.5:72b-instruct-fp16", #ctx: 139k
    ]
elif model_provider == 'openai':
    openai_token = os.getenv('OPENAI_API_KEY')
    if not openai_token:
        raise Exception("OpenAI API key not found in .env file")
    client = ChatOpenAI
    models = [
        "gpt-4o",
        # "o1-preview",
    ]
    # few_shot_messages = create_few_shot_messages(explained_used_libraries, train_prompts, train_responses, "NODE_GENERATOR_TEMPLATE", "developer")
elif model_provider == 'anthropic':
    anthropic_token = os.getenv('ANTHROPIC_API_KEY')
    if not anthropic_token:
        raise Exception("Anthropic API key not found in .env file")
    client = ChatAnthropic
    # embed_client = AnthropicEmbeddings
    models = [
        "claude-3-5-sonnet-latest"
    ]
else:
    raise Exception("Model provider not supported")

# Common to every provider: ask which of the chosen models are not yet listed in
# code_max_tokens.json and hand those, together with all responses, to
# write_models_tokens_to_file.
models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')




Number of all responses: 50
Server is reachable.


# Task configurations

<!-- ## Experiments settings -->

In [None]:
from my_packages.data_processing.attributes_processing import used_functions_from_dataset, used_functions_to_string
from my_packages.utils.file_utils import read_dataset_to_json


# Experiment settings read by the cells below.
prompt_prefix = "Create a function" # e.g., "Create a flow"
# Number of few-shot examples; passed as `shots=` to the example selector.
NUM_SHOTS = 5
# True -> semantic-similarity example selector, False -> coverage example selector.
semantic_selector = False

# NOTE: despite its name, this is the path to a single JSON file, not a folder.
main_dataset_folder = '../../data/MBPP-Midio-50.json'
dataset = read_dataset_to_json(main_dataset_folder)
# Functions used in the dataset, first as returned by the helper, then rendered as a string.
used_functions_json = used_functions_from_dataset(dataset)
# Later handed to run_validation / run_testing as the available nodes.
available_nodes = used_functions_to_string(used_functions_json)


Library functions included in the dataset: 51


# Init Example selector

In [5]:
import sys
import numpy as np
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.prompting.example_selectors import get_coverage_example_selector, get_semantic_similarity_example_selector

def example_to_dict(example):
    """Flatten an example into a single-level dictionary.

    Args:
        example: Object exposing a ``dict()`` method whose result contains the
            nested ``inputs``, ``outputs`` and ``metadata`` mappings.

    Returns:
        A dict with the keys ``task``, ``function_signature``, ``response``,
        ``task_id``, ``MBPP_task_id`` and ``external_functions``.

    Raises:
        KeyError: If any of the expected sections or fields is missing.
    """
    record = example.dict()
    # (flat key, section of the record it is read from), in output order.
    field_sources = (
        ("task", "inputs"),
        ("function_signature", "inputs"),
        ("response", "outputs"),
        ("task_id", "metadata"),
        ("MBPP_task_id", "metadata"),
        ("external_functions", "metadata"),
    )
    return {field: record[section][field] for field, section in field_sources}

# Flatten every training example into a dict and order the pool numerically by task_id.
example_pool = sorted(
    (example_to_dict(example) for example in train_data),
    key=lambda entry: int(entry['task_id']),
)
print(f"Number of examples in the pool: {len(example_pool)}")

# Build the few-shot example selector chosen in the experiment settings.
if not semantic_selector:
    selector = get_coverage_example_selector(
        example_pool,
        label="external_functions",
        shots=NUM_SHOTS,
        seed=9,
    )
else:
    selector = get_semantic_similarity_example_selector(
        example_pool,
        OllamaEmbeddings(model="nomic-embed-text"),
        shots=NUM_SHOTS,
        input_keys=["task"],
    )

# Smoke test: select examples for a sample task and list the chosen task ids.
examples = selector.select_examples({"task": "Create a list of all the unique elements in a list."})

for e in examples:
    print(e['task_id'])

Number of examples in the pool: 32
Computes selection
50
29
14
47
4


## Evaluation

#### Testing functional correctness:

In [6]:
# import sys
# sys.path.append('../../')  # Add the path to the my_packages module
# from my_packages.evaluation.code_evaluation import calculate_pass_at_k_scores
# from my_packages.evaluation.metrics import check_correctness
# all_data = train_data + val_data + test_data
# results = {}
# for example in all_data:
#     task_id = example.metadata['task_id']
#     # if task_id in results:
#     #     continue
#     results[task_id] = [example.outputs['response']]

# test_results = check_correctness(results)
# print("\n\nTESTS FAILED:")
# for task_id, responses in test_results.items():
#     if not responses[0]['passed']:
#         print(task_id)
#         print(responses[0])
# pass_at_ks = calculate_pass_at_k_scores(results, ks=[1], metric="tests")
# Build one string holding every task followed by its reference response
# (one example per line), then count its tokens.
prompt_parts = []
for e in [*train_data, *val_data, *test_data]:
    prompt_parts.append(f"{e.inputs['task']} {e.outputs['response']}\n")
prompt = "".join(prompt_parts)

client(model="phi4:14b-fp16").get_num_tokens(prompt)


Token indices sequence length is longer than the specified maximum sequence length for this model (33689 > 1024). Running this sequence through the model will result in indexing errors


33689

In [7]:
import json
import sys
%load_ext autoreload
%autoreload 2
from my_packages.prompting.few_shot import get_prompt_template_variables
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.evaluation.code_evaluation import Run, run_validation, run_testing, calculate_final_result
from my_packages.utils.tokens_utils import get_model_code_tokens_from_file
from colorama import Fore, Back, Style

def dict_to_example(dict) -> Example:
    """Rebuild an ``Example`` from a flat dictionary (inverse of ``example_to_dict``).

    The previous version passed bare strings as ``inputs``, ``outputs`` and
    ``metadata``; examples loaded from the dataset carry dictionaries in those
    fields, so the values are nested again under their original keys here.

    Args:
        dict: Flat mapping with at least ``task``, ``response`` and
            ``MBPP_task_id``. ``function_signature``, ``task_id``,
            ``external_functions`` and ``id`` are used when present.
            (The parameter name shadows the builtin; it is kept so existing
            keyword calls continue to work.)

    Returns:
        An ``Example`` whose ``inputs``, ``outputs`` and ``metadata`` are dicts.

    Raises:
        KeyError: If ``task``, ``response`` or ``MBPP_task_id`` is missing.
    """
    from uuid import uuid4  # local import: only needed when no id is supplied

    inputs = {"task": dict["task"]}
    if "function_signature" in dict:
        inputs["function_signature"] = dict["function_signature"]

    outputs = {"response": dict["response"]}

    metadata = {"MBPP_task_id": dict["MBPP_task_id"]}
    for optional_key in ("task_id", "external_functions"):
        if optional_key in dict:
            metadata[optional_key] = dict[optional_key]

    # NOTE(review): langsmith's Example declares an `id` field; a fresh UUID is
    # generated when the flat dict carries none. Confirm this is acceptable
    # for the pinned langsmith version.
    example_id = dict["id"] if "id" in dict else uuid4()

    return Example(
        id=example_id,
        inputs=inputs,
        outputs=outputs,
        metadata=metadata,
    )

# Per-model outcome: model name -> (validation Run, test runs returned by run_testing).
results = {}
for model_name in models:
    print(f"Model: {model_name}")

    model = get_model_code_tokens_from_file(model_name, 'code_max_tokens.json')

    # Manual toggle: the preset Run below supplies fixed sampling parameters, so
    # the run_validation call further down is skipped. Comment out the
    # `Run(...)` assignment to leave `validation_result` as None and run
    # run_validation instead.
    validation_result = None
    validation_result = Run(
        phase="validation",
        temperature=0.6,
        top_p=0.9,
        top_k=10,
        metric_results={
            "pass@1": 0.2,
        },
        seed=None
    )

    # Identity check: `is None` is the idiomatic test and cannot be affected by a
    # custom `__eq__` on Run.
    if validation_result is None:
        validation_result = run_validation(
            client,
            model,
            available_nodes,
            val_data,
            selector,
            temperatures=[0.2, 0.6, 0.9],
            top_ps=[0.2, 0.6, 0.9],
            top_ks=[10, 50, 100],
            ks=[1],
            seed=9,
            debug=True,
            optimizer_metric="semantic", # "semantic"  "syntax", "tests"
        )
    validation_result.print()

    # NOTE(review): only the first test example is evaluated (`test_data[:1]`);
    # this looks like a debugging limit - confirm before reporting results.
    test_results = run_testing(
        client,
        model,
        available_nodes,
        test_data[:1],
        selector,
        temperature=validation_result.temperature,
        top_p=validation_result.top_p,
        top_k=validation_result.top_k,
        ks=[1, 5], # k generations per task
        seeds=[3, 75, 346],
        debug=True,
        metrics=["syntax", "semantic", "tests"] # current metrics: "semantic",  "syntax", "tests"
    )
    results[model["name"]] = (validation_result, test_results)

# Print the final results: per model, the validation run, each test run, and the
# run produced by calculate_final_result over all test runs.
print("\nFINAL RESULTS:")
for model_name, (val_run, test_runs) in results.items():
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    val_run.print()
    for test_run in test_runs:
        test_run.print()

    result_run = calculate_final_result(test_runs)
    result_run.print()

Model: qwen2.5:14b-instruct-fp16

===  VALIDATION ===
[32m[1m  > Temperature: 0.60
  > Top_k: 10.00
  > Top_p: 0.90
[0m  > Optimized metric result:: {
    "pass@1": 0.2
} > Metadata: null
[36m[1mTesting Phase:[0m

Tokens in the final prompt: 8876
Generating response for sample 1..

[1m=== Sample: 0 ===
[36m[1m User prompt: [0m
System: You are a Midio code generator and are going to solve some programming tasks for the node-based programming language Midio. 
You will get prompted with a task and you will respond with the code that might be used to solve the given task.

You must return the code inside a code block, in the following form:

```midio

    //Midio Code

```
Important that you generate only the code, so that it can be directly executed in a compiler.
Only the external functions that is provided can be used. If you need to create a new function, that does something different than the external functions provided, you can do so by creating a new function in the code.


<!-- ## Langsmith evaluate -->