## Install dependencies

In [1]:


# For Notebook:
# %pip install ollama
# %pip install openai
# %pip install anthropic
# %pip install transformers
# %pip install tiktoken
# %pip install python-dotenv
# %pip install colorama



# For packages: 
# %pip install scikit-learn
# %pip install matplotlib
# %pip install numpy
# %pip install pandas
# %pip install iterative-stratification
# %pip install langchain
# %pip install "langsmith<0.2.0"


# For WebUI testing:
# %pip install open-webui



# Get data from LangSmith

In [10]:
from dotenv import load_dotenv
from langsmith import Client

dataset_name = "Node prediction"

# Load environment variables from the project-level .env file before the client is created.
load_dotenv("../../.env")
langsmith_client = Client()


def load_split(split_name):
    """Return all examples of `dataset_name` that belong to `split_name`, as a list."""
    return list(langsmith_client.list_examples(
        dataset_name=dataset_name,
        splits=[split_name],
        # metadata={"task_id": "1"},
        # limit=5
    ))


# Bind the three splits up front so later cells never hit a NameError (or silently
# reuse values left over from a previous kernel run) when the dataset is missing.
train_data, val_data, test_data = [], [], []

if langsmith_client.has_dataset(dataset_name=dataset_name):
    langsmith_dataset = langsmith_client.read_dataset(dataset_name=dataset_name)
    print("Existing dataset found.")

    train_data = load_split("train")
    print(f"Number of train examples loaded: {len(train_data)}")
    val_data = load_split("validation")
    print(f"Number of validation examples loaded: {len(val_data)}")
    test_data = load_split("test")
    print(f"Number of test examples loaded: {len(test_data)}")
    # Show one sample for a quick sanity check; skipped when the split is empty.
    if test_data:
        print(f"Test: {test_data[0]}")

else:
    print(f"No existing dataset found with name: {dataset_name}.")

Existing dataset found.
Number of train examples loaded: 32
Number of validation examples loaded: 9
Number of test examples loaded: 9
Test: dataset_id=UUID('50a83b87-83af-4b42-ba0d-4e382fbc92b0') inputs={'task': 'Create a flow to find the last position of an element in a sorted array.'} outputs={'response': 'Std.For, Std.IfExpression'} metadata={'task_id': '41', 'MBPP_task_id': '793', 'dataset_split': ['test']} id=UUID('76cb9f27-b56d-49be-a6a1-e38dfb0d89e3') created_at=datetime.datetime(2025, 1, 31, 20, 8, 56, 792013, tzinfo=datetime.timezone.utc) modified_at=datetime.datetime(2025, 1, 31, 20, 8, 56, 792013, tzinfo=datetime.timezone.utc) runs=[] source_run_id=None attachments={}


# Init model provider

In [12]:
import os
import sys
from dotenv import load_dotenv
sys.path.append('../')  # Add the path to the my_packages module
from my_packages.utils.tokens_utils import models_not_in_file, write_models_nodes_tokens_to_file
from my_packages.utils.server_utils import server_diagnostics, is_remote_server_reachable
from langchain_ollama import OllamaEmbeddings, ChatOllama
from langchain_anthropic import ChatAnthropic
from langchain_openai import OpenAIEmbeddings, ChatOpenAI

model_provider = 'ollama'

load_dotenv("../../.env")
all_responses = [sample['response'] for sample in train_data] + [sample['response'] for sample in val_data] + [sample['response'] for sample in test_data]

match model_provider:
    case 'ollama':
        host = 'http://localhost:11434'
        if is_remote_server_reachable(url = host + "/api/tags"):
            print("Server is reachable.")
        else:
            server_diagnostics()
            print("Ollama server is not reachable. Batch job might have finished. Try running bash script again.")

        embed_client = OllamaEmbeddings
        client = ChatOllama
        models = [
            #14b models:
            "phi4:14b-fp16",
            # "qwen2.5:14b-instruct-fp16",
            #32b models:
            # "qwq:32b-preview-fp16",
            # "qwen2.5-coder:32b-instruct-fp16",
        #     # "deepseek-r1"
        #     #70b models:
            # "llama3.3:70b-instruct-fp16",
            # "qwen2.5:72b-instruct-fp16",
        ]
        # few_shot_messages = create_few_shot_messages(explained_used_libraries, train_prompts, train_responses, "NODE_GENERATOR_TEMPLATE", "system")
        models_not_tokenized = models_not_in_file(models, 'nodes_max_tokens.json')
        write_models_nodes_tokens_to_file(embed_client, models_not_tokenized, all_responses, 'nodes_max_tokens.json')

    case 'openai':
        openai_token = os.getenv('OPENAI_API_KEY')
        if not openai_token:
            raise Exception("OpenAI API key not found in .env file")
        client = ChatOpenAI
        embed_client = OpenAIEmbeddings
        models = [
            "gpt-4o",
            # "o1-preview", 
        ]
        # few_shot_messages = create_few_shot_messages(explained_used_libraries, train_prompts, train_responses, "NODE_GENERATOR_TEMPLATE", "developer")
        models_not_tokenized = models_not_in_file(models, 'nodes_max_tokens.json')
        write_models_nodes_tokens_to_file(client, models_not_tokenized, all_responses, 'nodes_max_tokens.json')

    case 'anthropic':
        anthropic_token = os.getenv('ANTHROPIC_API_KEY')
        if not anthropic_token:
            raise Exception("Anthropic API key not found in .env file")
        client = ChatAnthropic
        # embed_client = AnthropicEmbeddings
        models = [
            "claude-3-5-sonnet-latest"
        ]
        # few_shot_messages = create_few_shot_messages(explained_used_libraries, train_prompts, train_responses, "NODE_GENERATOR_TEMPLATE", "system")
        models_not_tokenized = models_not_in_file(models, 'nodes_max_tokens.json')
        write_models_nodes_tokens_to_file(client, models_not_tokenized, all_responses, 'nodes_max_tokens.json')
    case _:
        raise Exception("Model provider not supported")




ModuleNotFoundError: No module named 'my_packages'

## Experiments settings

In [None]:
from my_packages.prompting.few_shot import get_semantic_similarity_example_selector


prompt_prefix = "Create a flow"  # e.g., "Create a function"

selector = get_semantic_similarity_example_selector(
        train_data,
        embed_client(model=models[0]),
        shots=5,
    )


def _task_of(sample):
    """Return the task text of one dataset sample.

    Works for objects that keep their data in an `inputs` mapping as well as
    for plain mappings that have a 'task' key directly.
    """
    inputs = getattr(sample, "inputs", None)
    if inputs is not None:
        return inputs["task"]
    return sample["task"]


def select_and_print(task):
    """Run the selector once for `task`, print what it picked, and return the picked tasks.

    Prints the raw selection, the query task prefixed with "> ", and then each
    selected example's task and response. Returns the set of selected task strings.
    """
    examples = selector.select_examples({"task": task})
    print(examples)
    print("> " + task)

    selected_tasks = set()
    for example in examples:
        print(example['task'])
        print(example['response'])
        print("\n")
        selected_tasks.add(example['task'])
    return selected_tasks


# Query the selector three times with the same validation task to see whether
# it returns the same examples on every call.
task = _task_of(val_data[1])
all_1, all_2, all_3 = (select_and_print(task) for _ in range(3))

# Number of tasks that were selected in all three runs.
print(len(all_1 & all_2 & all_3))

# Evaluation

In [None]:
import json
import sys
import numpy as np
sys.path.append('../')  # Add the path to the my_packages module (must run before the my_packages imports)
from my_packages.prompting.few_shot import get_random_example_selector, get_semantic_similarity_example_selector
from my_packages.evaluation.node_evaluation import run_validation, run_testing, print_validation_result, print_test_result, calculate_deviation
from my_packages.utils.tokens_utils import get_model_code_tokens_from_file
from colorama import Fore, Back, Style
NUM_SHOTS = 5

# True -> random few-shot examples; False -> examples picked by semantic similarity.
random_selector = False

# Maps model name -> (validation result, test results).
results = {}
for model_name in models:
    print(f"Model: {model_name}")
    model = get_model_code_tokens_from_file(model_name, 'nodes_max_tokens.json')
    # TODO: Need to select few-shot examples for every task here,
    # and send a dictionary of tasks to the run_validation and run_testing functions.
    # Check that all examples produced are unique, i.e. the same task is not selected multiple times.
    if random_selector:
        selector = get_random_example_selector(
            train_data,
            embed_client(model=model_name),
            shots=NUM_SHOTS,
            seed=9
        )
    else:
        selector = get_semantic_similarity_example_selector(
            train_data,
            embed_client(model=model_name),
            shots=NUM_SHOTS,
        )

    # To skip the validation run, replace None with a previously obtained result, e.g.:
    validation_result = None
    # validation_result = {
    #     "seed": None,
    #     "val_f1": 0.58,
    #     "val_pass_ks": {"pass@1": 0.2},
    #     "temperature": 0.6,
    #     "top_k": 10,
    #     "top_p": 0.9,
    # }

    # NOTE(review): `available_nodes` is not defined in the cells visible here —
    # confirm it is created by an earlier cell before this one runs.
    if validation_result is None:
        validation_result = run_validation(
            client,
            model,
            available_nodes,
            val_data,
            selector,
            temperatures=[0.2, 0.6, 0.9],
            top_ps=[0.2, 0.6, 0.9],
            top_ks=[10, 50, 100],
            ks=[1],
            seed=9,
            debug=True,
        )
    print_validation_result(validation_result)

    # Test with the sampling parameters taken from the validation result.
    test_results = run_testing(
        client,
        model,
        available_nodes,
        test_data,
        selector,
        temperature=validation_result["temperature"],
        top_p=validation_result["top_p"],
        top_k=validation_result["top_k"],
        ks=[1, 5], # k generations per task
        seeds=[3, 75, 346],
        debug=False,
    )
    results[model["name"]] = (validation_result, test_results)

# Print the final results
print("\nTesting and validation result:")
for model_name, (val_run, test_runs) in results.items():
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    print_validation_result(val_run)
    for run in test_runs:
        print_test_result(run)


    mean_f1, f1_std, mean_pass_k, pass_ks_std = calculate_deviation(test_runs)
    print("\nFINAL RESULTS:")
    print(f"\n{Fore.GREEN}{Style.BRIGHT}")
    print(f"Test F1 mean: {mean_f1}\n")
    print(f"Test F1 std: {f1_std}\n")
    print(f"Test Pass@ks mean: {json.dumps(mean_pass_k, indent=4)}\n")
    print(f"Test Pass@ks std: {json.dumps(pass_ks_std, indent=4)}\n")
    # The reset sequence has to be printed; a bare f-string expression emits nothing
    # and would leave the green/bright styling active for all following output.
    print(Style.RESET_ALL)
