# Install dependencies

In [1]:


# # For Notebook:
# %pip install ollama
# %pip install openai
# %pip install anthropic
# %pip install transformers
# %pip install tiktoken
# %pip install python-dotenv
# %pip install colorama


# # For packages: 
# %pip install scikit-learn
# %pip install matplotlib
# %pip install numpy
# %pip install pandas
# %pip install iterative-stratification
# %pip install evaluate

# # For WebUI testing:
# %pip install open-webui



# Prepare data

In [2]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.data_processing.split_dataset import split_on_shots, read_dataset_to_json
from my_packages.data_processing.get_labels_data import used_libraries_from_dataset, used_libraries_to_string
from my_packages.analysis.analyze_datasets import analyze_library_distribution, analyze_instance_distribution, analyze_visual_node_types_distribution

# --- Split configuration -----------------------------------------------------
num_shot = 5  # Few-shot examples
eval_size_percentage = 0.5

# --- Data locations ----------------------------------------------------------
# Despite its name, `main_dataset_folder` points at a single JSON file.
main_dataset_folder = '../../data/mbpp_transformed_code_examples/sanitized-MBPP-midio.json'
DATA_DIR = '../../data/mbpp_transformed_code_examples/only_files'

# --- Load and split ----------------------------------------------------------
dataset = read_dataset_to_json(main_dataset_folder)

# The seed is pinned to 64 and the resulting split is also written to file.
train_data, val_data, test_data = split_on_shots(
    num_shot,
    eval_size_percentage,
    dataset,
    seed=64,
    write_to_file=True,
)

# Extract all unique nodes (library_functions) from the training split and
# turn them into the text passed to the few-shot prompt builder.
used_libraries_json = used_libraries_from_dataset(train_data)
explained_used_libraries = used_libraries_to_string(used_libraries_json)

# Bar charts of the distributions (uncomment to plot):
# analyze_library_distribution(train_data, val_data, test_data)
# analyze_instance_distribution(train_data, val_data, test_data)
# analyze_visual_node_types_distribution(train_data, val_data, test_data)

6
3
3
Training set size: 5
Validation set size: 3
Test set size: 3
Library functions included in the dataset: 21


# Prepare prompts

In [3]:
from my_packages.prompting.few_shot import split_and_format

# Directory passed to split_and_format for every split. It is derived from
# DATA_DIR (defined in the "Prepare data" cell) instead of repeating the literal
# path; the trailing slash is kept so the value is identical to the previous
# hard-coded string.
code_files_dir = f"{DATA_DIR}/"

# Build (prompts, responses) pairs for each split using the "CODE_TEMPLATE" prompt.
train_prompts, train_responses = split_and_format(train_data, "CODE_TEMPLATE", code_files_dir)
val_prompts, val_responses = split_and_format(val_data, "CODE_TEMPLATE", code_files_dir)
test_prompts, test_responses = split_and_format(test_data, "CODE_TEMPLATE", code_files_dir)

# Init model provider

In [4]:
import os
import ollama
from openai import OpenAI
from anthropic import Anthropic
from dotenv import load_dotenv
from my_packages.prompting.few_shot import create_few_shot_messages
from my_packages.utils.tokens_utils import write_models_code_tokens_to_file, models_not_in_file
from my_packages.utils.server_utils import server_diagnostics, is_remote_server_reachable

model_provider = 'ollama'

match model_provider:
    case 'ollama':
        host = 'http://localhost:11434'
        if is_remote_server_reachable(url = host + "/api/tags"):
            print("Server is reachable.")
        else:
            server_diagnostics()
            print("Ollama server is not reachable. Batch job might have finished. Try running bash script again.")

        embed_client = OpenAI(
            base_url=host+'/v1/',
            api_key='ollama',
        )
        client = ollama.Client(
            host=host,
        )
        models = [
            #14b models:
            "phi4:14b-fp16",
            # "qwen2.5:14b-instruct-fp16",
            # #32b models:
            # "qwq:32b-preview-fp16",
            # "qwen2.5-coder:32b-instruct-fp16",
            # #70b models:
            # "llama3.3:70b-instruct-fp16", 
            # "qwen2.5:72b-instruct-fp16",
            #ollama rm llama3.1:8b-instruct-fp16
        ]
        few_shot_messages = create_few_shot_messages(explained_used_libraries, train_prompts, train_responses, "CODE_GENERATOR_TEMPLATE", "system")
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_code_tokens_to_file(embed_client, models_not_tokenized, DATA_DIR, 'code_max_tokens.json')
        
    case 'openai':
        load_dotenv("../.env")
        openai_token = os.getenv('OPENAI_API_KEY')
        few_shot_messages = create_few_shot_messages(explained_used_libraries, train_prompts, train_responses, "CODE_GENERATOR_TEMPLATE", "developer")
        if not openai_token:
            raise Exception("OpenAI API key not found in .env file")
        client = OpenAI(api_key=openai_token)
        models = [
            "gpt-4o",
            # "o1-preview", 
        ]
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_code_tokens_to_file(client, models_not_tokenized, DATA_DIR, 'code_max_tokens.json')

    case 'anthropic':
        load_dotenv("../.env")
        anthropic_token = os.getenv('ANTHROPIC_API_KEY')
        if not anthropic_token:
            raise Exception("Anthropic API key not found in .env file")
        client = Anthropic(api_key=anthropic_token)
        models = [
            "claude-3-5-sonnet-20241022" #latest
        ]
        few_shot_messages = create_few_shot_messages(explained_used_libraries, train_prompts, train_responses, "CODE_GENERATOR_TEMPLATE", "system")
        models_not_tokenized = models_not_in_file(models, 'code_max_tokens.json')
        write_models_code_tokens_to_file(client, models_not_tokenized, DATA_DIR, 'code_max_tokens.json')
        
    case _:
        raise Exception("Model provider not supported")




Server is reachable.


In [5]:
import numpy as np
# Mean of three hand-entered values (their origin is not recorded in this notebook).
values = np.array([0.279, 0.22, 0.22])
print(values.mean())

0.23966666666666667


## Evaluation

In [8]:
from my_packages.evaluation.code_evaluation import Run, calculate_final_result, run_testing, run_validation
from my_packages.utils.tokens_utils import get_model_code_tokens_from_file
from colorama import Fore, Back, Style

# When True, the validation phase (hyper-parameter search) is skipped and a
# hard-coded placeholder Run is used instead, so the test phase can be exercised
# quickly. Set to False to run run_validation for every model.
# This replaces a chain of dead assignments (None -> dict -> Run) that made the
# run_validation branch silently unreachable.
USE_PLACEHOLDER_VALIDATION = True

# model name -> (validation Run, test runs returned by run_testing)
results = {}
for model_name in models:
    print(f"Model: {model_name}")
    model = get_model_code_tokens_from_file(model_name, 'code_max_tokens.json')

    if USE_PLACEHOLDER_VALIDATION:
        # NOTE(review): these are made-up values, not tuned ones. top_k=0.5 in
        # particular looks suspicious, since the real search below only tries
        # integer top_k values (10, 50, 100) - confirm before trusting results.
        validation_result = Run(
            phase="validation",
            temperature=0.5,
            top_p=0.5,
            top_k=0.5,
            metric_results={"pass@k_semantic": {"pass@1": 0.5}},
            seed=9
        )
    else:
        validation_result = run_validation(
            client,
            few_shot_messages,
            model,
            val_prompts,
            val_responses,
            temperatures=[0.2, 0.6, 0.9],
            top_ps=[0.2, 0.6, 0.9],
            top_ks=[10, 50, 100],
            ks=[1],
            seed=9,
            debug=True,
            optimizer_metric="semantic", # "semantic"  "syntax", "tests" or "all"
        )
    validation_result.print()

    # Test phase: reuse the sampling parameters carried by the validation Run.
    test_results = run_testing(
        client,
        few_shot_messages,
        model,
        test_prompts,
        test_responses,
        temperature=validation_result.temperature,
        top_p=validation_result.top_p,
        top_k=validation_result.top_k,
        ks=[1, 5], # k generations per task
        seeds=[3, 75, 346],
        debug=True,
        metrics=["syntax", "semantic"] # current metrics: "semantic",  "syntax", "tests"
    )
    results[model["name"]] = (validation_result, test_results)

# Print the final results
print("\nFINAL RESULTS:")
for model_name, (val_run, test_runs) in results.items():
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    val_run.print()
    for test_run in test_runs:
        test_run.print()

    # Combine the individual test runs into a single summary Run and show it.
    result_run = calculate_final_result(test_runs)
    result_run.print()

Model: phi4:14b-fp16

===  VALIDATION ===
  > Temperature: 0.50
  > Top_k: 0.50
  > Top_p: 0.50
[32m[1m  > Optimized metric result:: {
    "pass@k_semantic": {
        "pass@1": 0.5
    }
}[0m
[36m[1mTesting Phase:[0m

Generating response for sample 0..

[1m=== Sample: 0 ===
[36m[1m User prompt: [0m
You are an expert Midio code programmer. Genereate the code for the following task:
Create a flow to multiply two numbers.

You must return the code in the following form:

```midio

    //Midio Code

```

[33m[1m Assistant response: #1:
import("std", Std_k98ojb)
import("http", Http_q7o96c)

module() main { 
    func(doc: "Multiplies two numbers.") multiply {
        in(x: -56.1, y: 74.8, name: "num1") property(Number) num1_0e9a4f
        in(x: -54.6, y: -52.2, name: "num2") property(Number) num2_b7c0d5
        in(x: 3.7, y: -13.8, name: "execute") trigger() execute_92b2e1

        out(x: 91.4, y: -21.6, name: "continue") trigger() continue_ea1f50
        out(x: 96.5, y: 78.3, n

KeyboardInterrupt: 