<!-- ## Install dependencies -->

In [7]:


# For Notebook:
# %pip install ollama
# %pip install openai
# %pip install anthropic
# %pip install transformers
# %pip install tiktoken
# %pip install python-dotenv
# %pip install colorama

# For packages: 
# %pip install scikit-learn
# %pip install matplotlib
# %pip install numpy
# %pip install pandas
# %pip install iterative-stratification
# %pip install langchain
# %pip install "langsmith<0.2.0"

# For WebUI testing:
# %pip install open-webui

%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.prompting.few_shot import transform_code_data
from my_packages.utils.file_utils import read_dataset_to_json

main_dataset_folder = '../../data/MBPP_Midio_50/'

train_data = transform_code_data(read_dataset_to_json(main_dataset_folder + 'splits/train_dataset.json'))
val_data = transform_code_data(read_dataset_to_json(main_dataset_folder + 'splits/validation_dataset.json'))
test_data = transform_code_data(read_dataset_to_json(main_dataset_folder + 'splits/test_dataset.json'))

print(f"Train data: {len(train_data)}")
print(f"Val data: {len(val_data)}")
print(f"Test data: {len(test_data)}")

Train data: 32
Val data: 9
Test data: 9


# Model configurations

In [9]:
import os
import sys
from dotenv import load_dotenv
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.utils.tokens_utils import models_not_in_file, write_models_tokens_to_file
from my_packages.utils.server_utils import server_diagnostics, is_remote_server_reachable
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_anthropic import ChatAnthropic
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

model_provider = 'openai'

load_dotenv("../../.env")
all_responses = [sample["response"] for sample in train_data] + [sample["response"] for sample in val_data] + [sample["response"] for sample in test_data]
print(f"Number of all responses: {len(all_responses)}")

match model_provider:
    case 'ollama':
        host = 'http://localhost:11434'
        if is_remote_server_reachable(url = host + "/api/tags"):
            print("Server is reachable.")
        else:
            server_diagnostics()
            if not is_remote_server_reachable(url = host + "/api/tags"):
                print("Ollama server is not reachable. Batch job might have finished. Try running bash script again.")

        client = ChatOllama
   
        models = [
            # 14b models:
            "phi4:14b-fp16", #16k context length
            "qwen2.5:14b-instruct-fp16", #128 k

            #32b models:
            # "qwq:32b-preview-fp16", #ctx: 32,768 tokens
            # "qwen2.5-coder:32b-instruct-fp16", #32,768 tokens
 
            # #70b models:
            # "llama3.3:70b-instruct-fp16", #ctx: 130k
            # "qwen2.5:72b-instruct-fp16", #ctx: 139k
        ]
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')

    case 'openai':
        openai_token = os.getenv('OPENAI_API_KEY')
        if not openai_token:
            raise Exception("OpenAI API key not found in .env file")
        client = ChatOpenAI
        models = [
            "gpt-4o",
            # "o1-preview", 
        ]
        # few_shot_messages = create_few_shot_messages(explained_used_libraries, train_prompts, train_responses, "NODE_GENERATOR_TEMPLATE", "developer")
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')

    case 'anthropic':
        anthropic_token = os.getenv('ANTHROPIC_API_KEY')
        if not anthropic_token:
            raise Exception("Anthropic API key not found in .env file")
        client = ChatAnthropic
        # embed_client = AnthropicEmbeddings
        models = [
            "claude-3-5-sonnet-latest"
        ]
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_tokens_to_file(client, models_not_tokenized, all_responses, 'code_max_tokens.json')
    case _:
        raise Exception("Model provider not supported")




Number of all responses: 50


# Task configurations

<!-- ## Experiments settings -->

In [10]:
import json
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.common import PromptType
from my_packages.data_processing.attributes_processing import used_functions_to_string
from my_packages.utils.file_utils import read_dataset_to_json


prompt_prefix = "Create a function" # "e.g., "Create a flow"
NUM_SHOTS = 10
semantic_selector = False


dataset = read_dataset_to_json(main_dataset_folder + "MBPP-Midio-50.json")
used_functions_json = read_dataset_to_json(main_dataset_folder + "/metadata/used_external_functions.json")
available_nodes = used_functions_to_string(used_functions_json)
print(used_functions_json)


[{'type': 'function', 'function_name': 'Std.If', 'module_path': 'Std', 'doc': 'If `input` is true, the `then` trigger is executed, otherwise the `else` trigger is executed. `value` contains the value of `input`.', 'body': 'extern func(doc: "If `input` is true, the `then` trigger is executed, otherwise the `else` trigger is executed. `value` contains the value of `input`.") If {\n\n        in(x: 0, y: 0, name: "execute") trigger() execute\n\n        in(x: 0, y: 0, name: "input") property(Bool) predicate\n\n        out(x: 0, y: 0, name: "then") trigger() then\n\n        out(x: 0, y: 0, name: "else") trigger() else\n\n        out(x: 0, y: 0, name: "value") property(Bool) value\n\n    }', 'types': 'type CountContext Number\ntype AnyContext Any'}, {'type': 'function', 'function_name': 'Std.IfExpression', 'module_path': 'Std', 'doc': 'IfExpression is a function that allows you to create a conditional expression using a simple expression language provided through the `expression` parameter. E

# Init Example selector

In [11]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.prompting.example_selectors import get_coverage_example_selector, get_semantic_similarity_example_selector

example_pool = train_data
example_pool.sort(key=lambda x: int(x['task_id']))
print(f"Number of examples in the pool: {len(example_pool)}")

if semantic_selector:
    selector = get_semantic_similarity_example_selector(
        example_pool, 
        OllamaEmbeddings(model="nomic-embed-text"),
        shots=NUM_SHOTS,
        input_keys=["task"],
    )
else:
    selector = get_coverage_example_selector(
        example_pool, 
        label = "external_functions",
        shots=NUM_SHOTS,
        seed=9
    )

examples = selector.select_examples({"task": "Create a list of all the unique elements in a list."})

for e in examples:
    print(e['task_id'])

Number of examples in the pool: 32
50
29
14
47
4
37
25
26
28
45


## Evaluation

In [13]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
%autoreload 2
%reload_ext autoreload
from my_packages.db_service.best_params_service import get_best_params
from my_packages.db_service.experiment_service import confirm_testing_rerun, confirm_validation_rerun, experiment_exists, pretty_print_experiment_collections, run_experiment_quality_checks, setup_experiment_collection
from my_packages.evaluation.code_evaluation import run_validation, run_testing
from my_packages.utils.tokens_utils import get_model_code_tokens_from_file
from colorama import Fore, Back, Style
import os
os.environ['EXPERIMENT_DB_NAME'] = "few_shot_experiments"


metrics = ['syntax', 'semantic'] #['syntax', 'semantic', 'tests'] OR ['syntax', 'semantic']
prompt_type = PromptType.SIGNATURE
example_selector = f"{'semantic' if semantic_selector else 'coverage'}"
experiment_name = f"{prompt_type.value}_{example_selector}_{NUM_SHOTS}_shot"

# Create separate database for this experiment
if experiment_exists(experiment_name):
    # delete_experiment(experiment_name)
    print(f"📂 Experiemnt '{experiment_name}' already exists.")
    pretty_print_experiment_collections(
        experiment_name,
        exclude_columns=["stderr", "stdout", "code_candidate", "test_result", "error_msg"]
    )
    if not run_experiment_quality_checks(experiment_name):
        print("\n❌ Experiment quality checks failed. Exiting.")
        raise Exception("Experiment quality checks failed.")
else:
    setup_experiment_collection(experiment_name)

results = {}

for model_name in models:
    print("\n\n")
    model = get_model_code_tokens_from_file(model_name, 'code_max_tokens.json')
    results[model_name] = (None, None)

    if confirm_validation_rerun(experiment_name, model_name): # If best params exists for model, prompt user to rerun validation. If not exists, return true to run validation.

        validation_result = run_validation(
            client,
            model,
            available_nodes,
            val_data[:1],
            selector,
            temperatures=[0.2, 0.6, 0.9],
            top_ps=[0.2, 0.6, 0.9],
            top_ks=[10, 50, 100],
            ks=[1],
            seed=9,
            debug=True,
            optimizer_metric=metrics[-1],
            experiment_name=experiment_name,
            env="DEV", #'dev' for file storage and 'prod' for database storage, for errors, and best params
            prompt_type=prompt_type,
        )
        validation_result.print()
        test_results, final_result = run_testing( # Must rerun testing if validation is rerun
            client, 
            model,
            available_nodes,
            test_data,
            selector,
            temperature=validation_result.temperature,
            top_p=validation_result.top_p,
            top_k=validation_result.top_k,
            ks=[1, 2, 5, 10], # max(ks) generations per task. Thus, 10 generations per task.
            seeds=[3, 75, 346],
            debug=True,
            metrics=metrics, # current metrics: ["syntax", "semantic"] OR ["syntax", "semantic", "tests"]
            experiment_name=experiment_name,
            env="DEV", #'dev' for file storage and 'prod' for database storage, for errors and final result
            prompt_type=prompt_type,
            )
        results[model_name] = (test_results, final_result)

    else:
        validation_result = get_best_params( 
            experiment_name, 
            model, 
            metrics[-1], # optimizer_metric
            k = 1
        )
        # validation_result = Run(
        #     phase="validation",
        #     temperature=0.6,
        #     top_p=0.9,
        #     top_k=10,
        #     metric_results={
        #         "pass@1": 0.2,
        #     },
        #     seed=None
        # )
    
        validation_result.print()
        if confirm_testing_rerun(experiment_name, model_name): # If results exists for model, prompt user to rerun testing. If not exists, return true to run testing first time.
                test_results, final_result = run_testing(
                    client,
                    model,
                    available_nodes,
                    test_data,
                    selector,
                    temperature=validation_result.temperature,
                    top_p=validation_result.top_p,
                    top_k=validation_result.top_k,
                    ks=[1, 2, 5, 10], # max(ks) generations per task. Thus, 10 generations per task.
                    seeds=[3, 75, 346],
                    debug=True,
                    metrics=metrics, # current metrics: ["syntax", "semantic"] OR ["syntax", "semantic", "tests"]
                    experiment_name=experiment_name,
                    env="DEV", #'dev' for file storage and 'prod' for database storage, for errors and final result
                    prompt_type = prompt_type,
                )
                results[model_name] = (test_results, final_result)
    
        
    print(f"======\n{model_name} FINAL RESULTS: ===============================")
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    test_runs = results[model_name][0]
    final_result = results[model_name][1]
    if test_runs:
        for run in test_runs:
            run.print()
    if final_result:
        final_result.print()

      
# Print the final results
print("\nALL MODELS FINAL RESULTS:")
for model_name, (test_runs, final_result) in results.items():
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    for test_run in test_runs:
        test_run.print()
    final_result.print()



📂 Experiemnt 'signature_coverage_10_shot' already exists.

🔍 Existing collections for signature_coverage_10_shot: 

📂 Collection: signature_coverage_10_shot_results 
--------------------------------------------------
⚠️ No data found in this collection.

📂 Collection: signature_coverage_10_shot_errors 
--------------------------------------------------
⚠️ No data found in this collection.

📂 Collection: signature_coverage_10_shot_best_params 
--------------------------------------------------
⚠️ No data found in this collection.

=== Running quality checks for experiment: signature_coverage_10_shot ===
No models found for experiment 'signature_coverage_10_shot'.
✅ All models passed quality checks successfully.
=== End of quality checks ===




⚠️ No best params found for model 'gpt-4o' in experiment 'signature_coverage_10_shot_best_params'. Ready to rerun! 

Starting validation phase..
[36m[1mValidation Phase:[0m
Optimizing for metric: semantic
RESETTING TOP_K TO NONE FOR OPENAI MOD

KeyboardInterrupt: 

<!-- ## Langsmith evaluate -->

# Pretty print

In [17]:
experiment_name = f"regular_coverage_10_shot"
pretty_print_experiment_collections(experiment_name, exclude_columns=["stderr", "stdout", "code_candidate", "test_result", "error_msg"])


🔍 Existing collections for regular_coverage_10_shot: 

📂 Collection: regular_coverage_10_shot_best_params 
--------------------------------------------------
                     model_name optimizer_metric  temperature  top_p  top_k  seed              created_at
           qwq:32b-preview-fp16           syntax          0.6    0.2     10     9 2025-03-02 14:49:17.326
                  phi4:14b-fp16           syntax          0.2    0.2     10     9 2025-03-02 14:49:17.532
qwen2.5-coder:32b-instruct-fp16           syntax          0.2    0.2     10     9 2025-03-02 14:49:17.697
      qwen2.5:14b-instruct-fp16           syntax          0.2    0.6     50     9 2025-03-02 14:49:18.034
     llama3.3:70b-instruct-fp16           syntax          0.2    0.2     10     9 2025-03-02 14:49:18.362
...
Total documents/rows: 18      
--------------------------------------------------

📂 Collection: regular_coverage_10_shot_errors 
--------------------------------------------------
               model

## Testing one run:

In [None]:
from my_packages.evaluation.code_evaluation import evaluate_code, run_model


model_result, largest_context = run_model(
    client,
    "phi4:14b-fp16",
    available_nodes,
    val_data,
    selector,
    model["max_tokens"],
    0.2,
    0.9,
    50,
    [1],
    9,
    True, 
    ["tests"]
)

metric_results_lists = evaluate_code (
    model_result,
    ks=[1],
    evaluation_metric=["tests"],
    experiment_name=experiment_name,
    model_name=model["name"],
    env="dev",
    hyperparams={"seed": 9, "temperature": 0.2, "top_p": 0.9, "top_k": 50},
    phase="validation"
)
## Optimizing for the first k in the ks list
pass_at_k_dict = metric_results_lists[0]
val_metric = pass_at_k_dict[f"pass@1"]
print(f"Validation with temp={0.2}, top_k={50} and top_p={0.9}. Gave pass@1={val_metric} and pass@ks={pass_at_k_dict}")

RuntimeError: Only a single TORCH_LIBRARY can be used to register the namespace prims; please put all of your definitions in a single TORCH_LIBRARY block.  If you were trying to specify implementations, consider using TORCH_LIBRARY_IMPL (which can be duplicated).  If you really intended to define operators for a single namespace in a distributed way, you can use TORCH_LIBRARY_FRAGMENT to explicitly indicate this.  Previous registration of TORCH_LIBRARY was registered at /dev/null:241; latest registration was registered at /dev/null:241

# Write experiment to file

In [None]:
from my_packages.db_service.few_shots.few_shot_db_service import export_collection


export_collection(experiment_name, "errors", "json", exclude_columns=[])

# Visualization

In [None]:
from my_packages.db_service.few_shots.few_shot_db_service import visualize_error_flow_for_model
# Assuming the experiment 'GPT4_signature_exp1' exists,
# and you evaluated 100 candidates for model 'GPT-4':
visualize_error_flow_for_model(experiment_name, models[0])
