<!-- ## Install dependencies -->

In [1]:


# For Notebook (uncomment the packages you still need to install):
# %pip install ollama
# %pip install openai
# %pip install anthropic
# %pip install transformers
# %pip install tiktoken
# %pip install python-dotenv
# %pip install colorama



# For packages: 
# %pip install scikit-learn
# %pip install matplotlib
# %pip install numpy
# %pip install pandas
# %pip install iterative-stratification
# %pip install langchain
# %pip install "langsmith<0.2.0"


# For WebUI testing:
# %pip install open-webui



<!-- # Get data from Langsmith -->

# Get dataset from Langsmith

In [2]:
from dotenv import load_dotenv
from langsmith import Client
from langsmith.schemas import Example

dataset_name = "Node prediction"

# LangSmith settings are read from the repository-level .env file.
load_dotenv("../../.env")
langsmith_client = Client()


def load_split(split, **filters):
    """Return the examples of `dataset_name` that belong to one split, as a list.

    Args:
        split: Split name, e.g. "train", "validation" or "test".
        **filters: Extra keyword arguments forwarded unchanged to
            `Client.list_examples`; handy while debugging, e.g.
            `limit=5` or `metadata={"task_id": "1"}`.

    Returns:
        A list with every example yielded by `list_examples` for that split.
    """
    return list(langsmith_client.list_examples(
        dataset_name=dataset_name,
        splits=[split],
        **filters,
    ))


if langsmith_client.has_dataset(dataset_name=dataset_name):
    langsmith_dataset = langsmith_client.read_dataset(dataset_name=dataset_name)
    print("Existing dataset found.")

    train_data = load_split("train")
    # Show one raw example as a sanity check. Guarded because indexing an
    # empty split would raise IndexError and abort the whole cell.
    if train_data:
        print(train_data[0])
    print(f"Number of train examples loaded: {len(train_data)}")

    val_data = load_split("validation")
    print(f"Number of validation examples loaded: {len(val_data)}")

    test_data = load_split("test")
    print(f"Number of test examples loaded: {len(test_data)}")

    # TODO: make the rest of the code work with Example objects instead of
    #       [{'task_id': str, 'task': str, 'response': list, 'MBPP_task_id': str}].
    # TODO: create a database of type chat.
else:
    # NOTE: train_data / val_data / test_data are only assigned in the branch
    # above, so cells that read them will fail until the dataset exists.
    print(f"No existing dataset found with name: {dataset_name}.")



Existing dataset found.
dataset_id=UUID('bb70a165-86a2-43b6-8843-37d8b541167f') inputs={'task': 'Create a function to check whether the given number is a perfect square or not.'} outputs={'response': 'Iteration.Map, List.GenerateRange, Math.LessThanOrEqual, Std.For, Logic.And, Std.If, Math.Expression, Math.Add, Math.Mul'} metadata={'task_id': '50', 'MBPP_task_id': '803', 'dataset_split': ['train'], 'external_functions': ['Iteration.Map', 'List.GenerateRange', 'Math.LessThanOrEqual', 'Std.For', 'Logic.And', 'Std.If', 'Math.Expression', 'Math.Add', 'Math.Mul']} id=UUID('3634f910-0ee5-4b1f-b2b7-c7710e5b8c90') created_at=datetime.datetime(2025, 2, 7, 12, 46, 5, 303611, tzinfo=datetime.timezone.utc) modified_at=datetime.datetime(2025, 2, 7, 12, 46, 5, 303611, tzinfo=datetime.timezone.utc) runs=[] source_run_id=None attachments={}
Number of train examples loaded: 32
Number of validation examples loaded: 9
Number of test examples loaded: 9


<!-- # Init model provider -->

# Model configurations

In [None]:
import os
import sys
from dotenv import load_dotenv
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.utils.tokens_utils import models_not_in_file, write_models_tokens_to_file
from my_packages.utils.server_utils import server_diagnostics, is_remote_server_reachable
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_anthropic import ChatAnthropic
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

model_provider = 'ollama'

load_dotenv("../../.env")
all_responses = [sample.outputs['response'] for sample in train_data] + [sample.outputs['response'] for sample in val_data] + [sample.outputs['response'] for sample in test_data]
print(f"Number of all responses: {len(all_responses)}")

match model_provider:
    case 'ollama':
        host = 'http://localhost:11434'
        if is_remote_server_reachable(url = host + "/api/tags"):
            print("Server is reachable.")
        else:
            server_diagnostics()
            print("Ollama server is not reachable. Batch job might have finished. Try running bash script again.")

        embed_client = OllamaEmbeddings
        client = ChatOllama
        models = [
            # 14b models:
            "phi4:14b-fp16",
            "qwen2.5:14b-instruct-fp16",

            #32b models:
            "qwq:32b-preview-fp16",
            "qwen2.5-coder:32b-instruct-fp16",
 
            #70b models:
            "llama3.3:70b-instruct-fp16",
            "qwen2.5:72b-instruct-fp16",
        ]
        models_not_tokenized = models_not_in_file(models, 'nodes_max_tokens.json')
        write_models_tokens_to_file(embed_client, models_not_tokenized, all_responses, 'nodes_max_tokens.json')

    case 'openai':
        openai_token = os.getenv('OPENAI_API_KEY')
        if not openai_token:
            raise Exception("OpenAI API key not found in .env file")
        client = ChatOpenAI
        embed_client = OpenAIEmbeddings
        models = [
            "gpt-4o",
            # "o1-preview", 
        ]
        # few_shot_messages = create_few_shot_messages(explained_used_libraries, train_prompts, train_responses, "NODE_GENERATOR_TEMPLATE", "developer")
        models_not_tokenized = models_not_in_file(models, 'nodes_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'nodes_max_tokens.json')

    case 'anthropic':
        anthropic_token = os.getenv('ANTHROPIC_API_KEY')
        if not anthropic_token:
            raise Exception("Anthropic API key not found in .env file")
        client = ChatAnthropic
        # embed_client = AnthropicEmbeddings
        models = [
            "claude-3-5-sonnet-latest"
        ]
        models_not_tokenized = models_not_in_file(models, 'nodes_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'nodes_max_tokens.json')
    case _:
        raise Exception("Model provider not supported")




Number of all responses: 50
Server is reachable.


# Task configurations

<!-- ## Experiments settings -->

In [None]:
from my_packages.data_processing.attributes_processing import used_functions_from_dataset, used_functions_to_string
from my_packages.utils.file_utils import read_dataset_to_json 

# Experiment settings read by the cells below.

# NOTE(review): not referenced by the visible cells; presumably the phrase each
# task prompt starts with - confirm where it is consumed.
prompt_prefix = "Create a function" # e.g., "Create a flow"
# Number of few-shot examples; passed as `shots=` when the selector is built.
NUM_SHOTS = 5
# True  -> get_semantic_similarity_example_selector is used,
# False -> get_coverage_example_selector is used (see the selector cell).
semantic_selector = True

# NOTE: despite the name, this is the path of a JSON file, not a folder.
main_dataset_folder = '../../data/MBPP_Midio_50/MBPP-Midio-50.json'
dataset = read_dataset_to_json(main_dataset_folder)
used_functions_json = used_functions_from_dataset(dataset)
# String form of the functions above; later handed to run_validation and
# run_testing as `available_nodes`.
available_nodes = used_functions_to_string(used_functions_json)


Library functions included in the dataset: 51


# Init Example selector

In [9]:
import sys
import numpy as np
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.prompting.example_selectors import get_coverage_example_selector, get_semantic_similarity_example_selector

def example_to_dict(example):
    """Flatten an example object into a plain, single-level dict.

    Args:
        example: Object whose `.dict()` returns a mapping with "inputs",
            "outputs" and "metadata" sub-mappings.

    Returns:
        A dict with the keys "task", "response", "task_id", "MBPP_task_id"
        and "external_functions", in that order.

    Raises:
        KeyError: If any of the expected keys is missing.
    """
    record = example.dict()
    flat = {
        "task": record["inputs"]["task"],
        "response": record["outputs"]["response"],
    }
    # The remaining fields are copied verbatim from the metadata.
    metadata = record["metadata"]
    for key in ("task_id", "MBPP_task_id", "external_functions"):
        flat[key] = metadata[key]
    return flat

# Flatten the training examples into dicts, ordered by numeric task id.
example_pool = sorted(
    (example_to_dict(example) for example in train_data),
    key=lambda entry: int(entry['task_id']),
)
print(f"Number of examples in the pool: {len(example_pool)}")

# Build the few-shot selector chosen by the `semantic_selector` flag.
if not semantic_selector:
    selector = get_coverage_example_selector(
        example_pool,
        label="external_functions",
        shots=NUM_SHOTS,
        seed=9,
    )
else:
    # NOTE(review): "nomic-embed-text" looks like an Ollama model tag, yet it is
    # requested whatever class `embed_client` holds - confirm for other providers.
    selector = get_semantic_similarity_example_selector(
        example_pool,
        embed_client(model="nomic-embed-text"),
        shots=NUM_SHOTS,
        input_keys=["task"],
    )

# Smoke test: show which task ids are selected for a sample task.
probe_task = {"task": "Create a list of all the unique elements in a list."}
examples = selector.select_examples(probe_task)

for shot in examples:
    print(shot['task_id'])

Number of examples in the pool: 32
47
39
7
36
49


## Evaluation

In [6]:
import json
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.evaluation.node_evaluation import run_validation, run_testing, print_validation_result, print_test_result, calculate_deviation
from my_packages.utils.tokens_utils import get_model_code_tokens_from_file
from colorama import Fore, Back, Style

def example_to_dict(example):
    """Flatten an example object into a plain, single-level dict.

    Args:
        example: Object whose `.dict()` returns a mapping with "inputs",
            "outputs" and "metadata" sub-mappings.

    Returns:
        A dict with the keys "task", "response", "task_id", "MBPP_task_id"
        and "external_functions", in that order.

    Raises:
        KeyError: If any of the expected keys is missing.
    """
    record = example.dict()
    flat = {
        "task": record["inputs"]["task"],
        "response": record["outputs"]["response"],
    }
    # The remaining fields are copied verbatim from the metadata.
    metadata = record["metadata"]
    for key in ("task_id", "MBPP_task_id", "external_functions"):
        flat[key] = metadata[key]
    return flat

def dict_to_example(dict) -> Example:
    """Build an `Example` from a flat example dict (inverse of `example_to_dict`).

    The previous version passed bare strings for `inputs`, `outputs` and
    `metadata`. Loaded examples carry mappings in those fields
    (`inputs={'task': ...}`, `outputs={'response': ...}`, `metadata={...}`),
    so the values are wrapped in the same layout here.

    Args:
        dict: Flat mapping with at least "task" and "response"; "task_id",
            "MBPP_task_id" and "external_functions" are copied into the
            metadata when present. (The parameter name shadows the builtin
            and is kept only so existing keyword callers keep working.)

    Returns:
        A new `Example` holding the given task, response and metadata.

    Raises:
        KeyError: If "task" or "response" is missing.
    """
    import uuid  # local import: only this helper needs it

    metadata_keys = ("task_id", "MBPP_task_id", "external_functions")
    return Example(
        # `Example` declares `id` as a required UUID; an example built locally
        # has none yet, so give it a fresh random one.
        # NOTE(review): confirm callers do not need a stable id.
        id=uuid.uuid4(),
        inputs={"task": dict["task"]},
        outputs={"response": dict["response"]},
        metadata={key: dict[key] for key in metadata_keys if key in dict},
    )

# Maps model name -> (validation_result, test_results).
results = {}
for model_name in models:
    print(f"Model: {model_name}")
    # Model entry loaded from the tokens file; it is indexed by "name" below.
    model = get_model_code_tokens_from_file(model_name, 'nodes_max_tokens.json')

    # To skip the validation sweep, replace None with a previous result, e.g.:
    # validation_result = {
    #     "seed": None,
    #     "val_f1": 0.58,
    #     "val_pass_ks": {"pass@1": 0.2},
    #     "temperature": 0.6,
    #     "top_k": 10,
    #     "top_p": 0.9,
    # }
    validation_result = None

    if validation_result is None:
        # Validation supplies the sampling parameters that testing reuses.
        validation_result = run_validation(
            client,
            model,
            available_nodes,
            val_data,
            selector,
            temperatures=[0.2, 0.6, 0.9],
            top_ps=[0.2, 0.6, 0.9],
            top_ks=[10, 50, 100],
            ks=[1],
            seed=9,
            debug=False,
        )
    print_validation_result(validation_result)

    test_results = run_testing(
        client,
        model,
        available_nodes,
        test_data,
        selector,
        temperature=validation_result["temperature"],
        top_p=validation_result["top_p"],
        top_k=validation_result["top_k"],
        ks=[1, 5], # k generations per task
        seeds=[3, 75, 346],
        debug=True,
    )
    results[model["name"]] = (validation_result, test_results)


# Print the final results
print("\nTesting and validation result:")
for model_name, (val_run, test_runs) in results.items():
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    print_validation_result(val_run)
    for run in test_runs:
        print_test_result(run)

    mean_f1, f1_std, mean_pass_k, pass_ks_std = calculate_deviation(test_runs)
    print("\nFINAL RESULTS:")
    print(f"\n{Fore.GREEN}{Style.BRIGHT}")
    print(f"Test F1 mean: {mean_f1}\n")
    print(f"Test F1 std: {f1_std}\n")
    print(f"Test Pass@ks mean: {json.dumps(mean_pass_k, indent=4)}\n")
    print(f"Test Pass@ks std: {json.dumps(pass_ks_std, indent=4)}\n")
    # The reset code used to be a bare f-string expression that was never
    # printed, so the green/bright style leaked into all following output.
    print(f"{Style.RESET_ALL}")


Model: phi4:14b-fp16
[36m[1mValidation Phase:[0m
Generating response for sample 8..
 == Pass@k computation ==

Total: [1 1 1 1 1 1 1 1 1], Correct: [0 1 0 1 0 1 1 0 1]
[0. 1. 0. 1. 0. 1. 1. 0. 1.]
Pass@K: {'pass@1': 0.5555555555555556}

== F1 Score computation ==

[33m[1mPartial match: F1 Score=0.38[0m

[32m[1m100% Correct response (exact match)[0m

[33m[1mPartial match: F1 Score=0.67[0m

[32m[1m100% Correct response (exact match)[0m

[33m[1mPartial match: F1 Score=0.20[0m

[32m[1m100% Correct response (exact match)[0m

[32m[1m100% Correct response (exact match)[0m

[33m[1mPartial match: F1 Score=0.50[0m

[33m[1mPartial match: F1 Score=0.50[0m

F1 Scores: [0.375, 1.0, 0.6666666666666666, 1.0, 0.2, 1.0, 1.0, 0.5, 0.5], Mean: 0.6935185185185185
Validation with temp=0.2, top_k=10 and top_p=0.2. Gave f1=0.6935185185185185 and pass@ks={'pass@1': 0.5555555555555556}
Generating response for sample 8..
 == Pass@k computation ==

Total: [1 1 1 1 1 1 1 1 1], Correct

<!-- ## Langsmith evaluate -->