<!-- ## Install dependencies -->

In [1]:


# Optional one-time setup: uncomment the `%pip install` lines that apply to
# install the dependencies from within the notebook.

# For Notebook:
# %pip install ollama
# %pip install openai
# %pip install anthropic
# %pip install transformers
# %pip install tiktoken
# %pip install python-dotenv
# %pip install colorama



# For packages:
# %pip install scikit-learn
# %pip install matplotlib
# %pip install numpy
# %pip install pandas
# %pip install iterative-stratification
# %pip install langchain
# %pip install "langsmith<0.2.0"


# For WebUI testing:
# %pip install open-webui

# Load IPython's autoreload extension; the evaluation cell further down
# switches it on with `%autoreload 2`.
%load_ext autoreload



<!-- # Get data from Langsmith -->

# Get dataset from LangSmith

In [2]:
from dotenv import load_dotenv
from langsmith import Client
from langsmith.schemas import Example

dataset_name = "Code prediction"

load_dotenv("../../.env")
langsmith_client = Client()


def _load_split(split: str) -> list:
    """Return all examples of `dataset_name` that belong to the given split.

    Args:
        split: Split name passed to `list_examples`, e.g. "train",
            "validation" or "test".

    Returns:
        The examples yielded by `langsmith_client.list_examples`, as a list.
    """
    return list(langsmith_client.list_examples(
        dataset_name=dataset_name,
        splits=[split],
        # Optional filters for quick experiments:
        # metadata={"task_id": "1"},
        # limit=1
    ))


# Bind every name up front so that later cells never hit a NameError (or
# silently reuse stale kernel state) when the dataset does not exist.
langsmith_dataset = None
train_data, val_data, test_data = [], [], []

if langsmith_client.has_dataset(dataset_name=dataset_name):
    langsmith_dataset = langsmith_client.read_dataset(dataset_name=dataset_name)
    print("Existing dataset found.")

    train_data = _load_split("train")
    if train_data:
        # Show one example as a sanity check; guarded so an empty split
        # does not raise IndexError.
        print(train_data[0])
    print(f"Number of train examples loaded: {len(train_data)}")

    val_data = _load_split("validation")
    print(f"Number of validation examples loaded: {len(val_data)}")

    test_data = _load_split("test")
    print(f"Number of test examples loaded: {len(test_data)}")

    # The splits are kept as Example objects. To get plain dicts instead:
    # [example.dict(include={"inputs", "outputs", "metadata"}) for example in train_data]

    # TODO: make the rest of the code work with Example objects instead of
    #       [{'task_id': str, 'task': str, 'response': list, 'MBPP_task_id': str}].
    # TODO: create a database of type chat.
else:
    print(f"No existing dataset found with name: {dataset_name}.")



Existing dataset found.
dataset_id=UUID('cfbffc6c-8db2-4ac1-aa5b-66381852d34f') inputs={'task': 'Create a function to check whether the given number is a perfect square or not.', 'function_signature': 'func(doc: "checks whether the given number is a perfect square or not.") is_perfect_square {\n\tin(x: 354, y: -53, name: "input") property(Number) input_f5c03d\n\tin(x: 161, y: -211, name: "execute") trigger() execute_27b8c3\n\tout(x: 1748, y: -10, name: "out") property(Bool) out_f3db9f\n\tout(x: 1894, y: 120, name: "continue") trigger() continue_8ba06b\n}'} outputs={'response': 'import("std", Std_k98ojb)\nimport("http", Http_q7o96c)\n\nmodule() main {\n    func(doc: "checks whether the given number is a perfect square or not.") is_perfect_square {\n        in(x: 354, y: -53, name: "input") property(Number) input_f5c03d\n        in(x: 161, y: -211, name: "execute") trigger() execute_27b8c3\n\n        out(x: 1748, y: -10, name: "out") property(Bool) out_f3db9f\n        out(x: 1894, y: 120

<!-- # Init model provider -->

# Model configurations

In [3]:
import os
import sys
from dotenv import load_dotenv
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.utils.tokens_utils import models_not_in_file, write_models_tokens_to_file
from my_packages.utils.server_utils import server_diagnostics, is_remote_server_reachable
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_anthropic import ChatAnthropic
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

model_provider = 'ollama'

load_dotenv("../../.env")
all_responses = [sample.outputs['response'] for sample in train_data] + [sample.outputs['response'] for sample in val_data] + [sample.outputs['response'] for sample in test_data]
print(f"Number of all responses: {len(all_responses)}")

match model_provider:
    case 'ollama':
        host = 'http://localhost:11434'
        if is_remote_server_reachable(url = host + "/api/tags"):
            print("Server is reachable.")
        else:
            server_diagnostics()
            print("Ollama server is not reachable. Batch job might have finished. Try running bash script again.")
        client = ChatOllama
   
    
        models = [
            # 14b models:
            #"phi4:14b-fp16", #16k context length
            "qwen2.5:14b-instruct-fp16", #128 k

            #32b models:
            # "qwq:32b-preview-fp16", #ctx: 32,768 tokens
            # "qwen2.5-coder:32b-instruct-fp16", #32,768 tokens
 
            # #70b models:
            # "llama3.3:70b-instruct-fp16", #ctx: 130k
            # "qwen2.5:72b-instruct-fp16", #ctx: 139k
        ]
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')

    case 'openai':
        openai_token = os.getenv('OPENAI_API_KEY')
        if not openai_token:
            raise Exception("OpenAI API key not found in .env file")
        client = ChatOpenAI
        models = [
            "gpt-4o",
            # "o1-preview", 
        ]
        # few_shot_messages = create_few_shot_messages(explained_used_libraries, train_prompts, train_responses, "NODE_GENERATOR_TEMPLATE", "developer")
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')

    case 'anthropic':
        anthropic_token = os.getenv('ANTHROPIC_API_KEY')
        if not anthropic_token:
            raise Exception("Anthropic API key not found in .env file")
        client = ChatAnthropic
        # embed_client = AnthropicEmbeddings
        models = [
            "claude-3-5-sonnet-latest"
        ]
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')
    case _:
        raise Exception("Model provider not supported")




Number of all responses: 50
Server is reachable.


# Task configurations

<!-- ## Experiments settings -->

In [4]:
from my_packages.data_processing.attributes_processing import used_functions_from_dataset, used_functions_to_string
from my_packages.utils.file_utils import read_dataset_to_json


# Prefix of the task prompts.
prompt_prefix = "Create a function" # e.g., "Create a flow"
# Number of few-shot examples requested from the example selector (`shots=`).
NUM_SHOTS = 5
# True  -> semantic-similarity example selector,
# False -> coverage example selector (see the "Init Example selector" cell).
semantic_selector = False
# Metrics handed to run_testing; the last entry is also used as the optimizer
# metric for validation, and the presence of 'tests' switches the experiment
# name from "regular" to "signature".
metrics = ['tests']

# NOTE: despite its name, this is the path to a single JSON file, not a folder.
main_dataset_folder = '../../data/MBPP-Midio-50.json'
dataset = read_dataset_to_json(main_dataset_folder)
# Library functions used across the dataset, and their string rendering that
# is later passed to run_validation / run_testing as `available_nodes`.
used_functions_json = used_functions_from_dataset(dataset)
available_nodes = used_functions_to_string(used_functions_json)


Library functions included in the dataset: 51


# Init Example selector

In [5]:
import sys
import numpy as np
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.prompting.example_selectors import get_coverage_example_selector, get_semantic_similarity_example_selector

def example_to_dict(example):
    """Flatten an example object into a single-level record.

    Args:
        example: Object whose `.dict()` returns a mapping with "inputs",
            "outputs" and "metadata" sub-mappings.

    Returns:
        A dict with the keys "task", "function_signature", "response",
        "task_id", "MBPP_task_id" and "external_functions", in that order.

    Raises:
        KeyError: If any of the expected keys is missing.
    """
    record = example.dict()

    # Prompt-side fields come from the inputs mapping.
    flat = {name: record["inputs"][name] for name in ("task", "function_signature")}
    # The reference solution is the only output field that is kept.
    flat["response"] = record["outputs"]["response"]
    # Identifiers and labels are carried over from the metadata mapping.
    for name in ("task_id", "MBPP_task_id", "external_functions"):
        flat[name] = record["metadata"][name]
    return flat

# Flatten the train examples into plain records, ordered by numeric task id.
example_pool = sorted(
    map(example_to_dict, train_data),
    key=lambda record: int(record['task_id']),
)
print(f"Number of examples in the pool: {len(example_pool)}")

# Pick the few-shot selection strategy configured by `semantic_selector`.
if not semantic_selector:
    selector = get_coverage_example_selector(
        example_pool,
        label="external_functions",
        shots=NUM_SHOTS,
        seed=9,
    )
else:
    embeddings = OllamaEmbeddings(model="nomic-embed-text")
    selector = get_semantic_similarity_example_selector(
        example_pool,
        embeddings,
        shots=NUM_SHOTS,
        input_keys=["task"],
    )

# Smoke test: run the selector on a sample task and list the chosen task ids.
sample_query = {"task": "Create a list of all the unique elements in a list."}
examples = selector.select_examples(sample_query)

for chosen in examples:
    print(chosen['task_id'])

Number of examples in the pool: 32
Computes selection
50
29
14
47
4


## Evaluation

In [None]:
import json
import logging
import sys
%autoreload 2
%reload_ext autoreload
from my_packages.common import Run
from my_packages.few_shot_db_service import delete_experiment, experiment_exists, get_best_params, pretty_print_experiment_collections, save_best_params_to_db, setup_experiment_collection
sys.path.append('../../')  # Add the path to the my_packages module

from my_packages.evaluation.code_evaluation import run_validation, run_testing, calculate_final_result
from my_packages.utils.tokens_utils import get_model_code_tokens_from_file
from colorama import Fore, Back, Style

def dict_to_example(dict) -> Example:
    """Build an `Example` from a flat record (inverse of `example_to_dict`).

    The examples loaded from LangSmith carry dict-valued `inputs`, `outputs`
    and `metadata` fields, so the flat values are wrapped back into dicts
    instead of being passed as bare strings.

    Args:
        dict: Flat record with at least the keys "task", "response" and
            "MBPP_task_id". "function_signature", "task_id" and
            "external_functions" are carried over when present.
            (The parameter name shadows the builtin; it is kept so existing
            keyword callers keep working.)

    Returns:
        An `Example` with `inputs`, `outputs` and `metadata` populated.

    Raises:
        KeyError: If one of the required keys is missing.
    """
    record = dict  # local alias so the builtin name is not used below

    inputs = {"task": record["task"]}
    if "function_signature" in record:
        inputs["function_signature"] = record["function_signature"]

    metadata = {"MBPP_task_id": record["MBPP_task_id"]}
    for key in ("task_id", "external_functions"):
        if key in record:
            metadata[key] = record[key]

    # NOTE(review): Example may additionally require fields such as `id` /
    # `dataset_id` — confirm against langsmith.schemas before relying on this.
    return Example(
        inputs=inputs,
        outputs={"response": record["response"]},
        metadata=metadata,
    )


############ MAKE database to store the ERRORS and RESULTS #################
# The experiment name encodes prompt template, selector type and shot count,
# e.g. "signature_coverage_5-shot".
example_selector = 'semantic' if semantic_selector else 'coverage'
prompt_template = 'signature' if 'tests' in metrics else 'regular'
experiment_name = f"{prompt_template}_{example_selector}_{NUM_SHOTS}-shot"

# Debug switch. While True, the stored/validated hyper-parameters are replaced
# by the fixed values below and the validation sweep is skipped (this is what
# the cell did unconditionally before). Set to False to use the best params
# from the database, running validation when none are stored.
USE_FIXED_VALIDATION_PARAMS = True

# NOTE(review): this runs on every execution, before the existence check.
# The recorded output shows "Deletion cancelled", so it presumably asks for
# confirmation — confirm before running unattended.
delete_experiment(experiment_name)

# Create separate database for this experiment
if experiment_exists(experiment_name):
    print(f"📂 Experiemnt '{experiment_name}' already exists.")
    pretty_print_experiment_collections(experiment_name)
else:
    setup_experiment_collection(experiment_name)


# model name -> (validation run, list of test runs)
results = {}
for model_name in models:
    print("\n\n")
    model = get_model_code_tokens_from_file(model_name, 'code_max_tokens.json')

    # NOTE(review): the lookup receives the whole `model` object while
    # save_best_params_to_db below receives model["name"] — confirm which
    # one the DB helpers expect.
    validation_result = get_best_params(
        experiment_name,
        model,
    )
    if USE_FIXED_VALIDATION_PARAMS:
        validation_result = Run(
            phase="validation",
            temperature=0.6,
            top_p=0.9,
            top_k=10,
            metric_results={
                "pass@1": 0.2,
            },
            seed=None
        )
    if not validation_result:
        # No stored parameters: sweep the sampling grid on the validation set.
        validation_result = run_validation(
            client,
            model,
            available_nodes,
            val_data,
            selector,
            temperatures=[0.2, 0.6, 0.9],
            top_ps=[0.2, 0.6, 0.9],
            top_ks=[10, 50, 100],
            ks=[1],
            seed=9,
            debug=False,
            optimizer_metric=metrics[-1],
        )
        # Save to MongoDB
        save_best_params_to_db(experiment_name, model["name"], metrics[-1], validation_result)

    validation_result.print()

    # NOTE(review): only the first test example is evaluated (`[:1]`); this
    # looks like a debugging leftover — confirm before reporting results.
    test_results = run_testing(
        client,
        model,
        available_nodes,
        test_data[:1],
        selector,
        temperature=validation_result.temperature,
        top_p=validation_result.top_p,
        top_k=validation_result.top_k,
        ks=[1, 2, 5, 10], # max(ks) generations per task. Thus, 10 generations per task.
        seeds=[3, 75, 346],
        debug=False,
        metrics=metrics, # current metrics: ["syntax", "semantic"] OR ["syntax", "semantic", "tests"]
        experiment_name=experiment_name,
        env="prod" #'dev' for file storage and 'prod' for database storage, for results and errors
    )

    # Store the validation run together with the test runs: the summary loop
    # below unpacks each entry as (val_run, test_runs).
    results[model_name] = (validation_result, test_results)

    # Print the final results
    print(f"======\n{model_name} FINAL RESULTS: ===============================")
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")

    for test_run in test_results:
        test_run.print()

    result_run = calculate_final_result(test_results)
    result_run.print()

# Print the final results
print("\nALL MODELS FINAL RESULTS:")
for model_name, (val_run, test_runs) in results.items():
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    val_run.print()
    for test_run in test_runs:
        test_run.print()

    result_run = calculate_final_result(test_runs)
    result_run.print()



❌ Deletion cancelled.
📂 Experiemnt 'signature_coverage_5-shot' already exists.

🔍 Existing collections for signature_coverage_5-shot: 

📂 Collection: signature_coverage_5-shot_best_params
--------------------------------------------------
⚠️ No data found in this collection.

📂 Collection: signature_coverage_5-shot_results
--------------------------------------------------
               model_name  seed  temperature  top_p  top_k            ks  metric                                                                                       pass_at_k
qwen2.5:14b-instruct-fp16     3          0.6    0.9     10 [1, 2, 5, 10] [tests] [{'pass@1': 0.09999999999999998, 'pass@2': 0.19999999999999996, 'pass@5': 0.5, 'pass@10': 1.0}]
--------------------------------------------------

📂 Collection: signature_coverage_5-shot_errors
--------------------------------------------------
               model_name  task_id  candidate_id metric test_result error_type                                          

Token indices sequence length is longer than the specified maximum sequence length for this model (8933 > 1024). Running this sequence through the model will result in indexing errors


Tokens in the final prompt: 8933
Generating response for sample 1..

<!-- ## Langsmith evaluate -->