## Install dependencies

In [1]:


# Install the third-party packages into the kernel's environment (%pip targets the running kernel).
# openai, tiktoken and python-dotenv are imported directly in later cells; the remaining
# packages are presumably required by my_packages — TODO confirm.
# NOTE(review): versions are unpinned, so a fresh install may not reproduce earlier results.
%pip install ollama
%pip install openai
%pip install transformers
%pip install tiktoken
%pip install huggingface-hub
%pip install python-dotenv


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


## Prepare data

In [2]:
import sys
sys.path.append('../')  # Add the path to the my_packages module
from my_packages.data_processing.split_dataset import split_on_shots, split, read_dataset_to_json
from my_packages.data_processing.get_labels_data import used_libraries_from_dataset, used_libraries_to_string
from my_packages.analysis.analyze_datasets import analyze_library_distribution, analyze_instance_distribution, analyze_visual_node_types_distribution

# Load the transformed MBPP dataset.
# (Despite its name, main_dataset_folder holds the path of a single JSON file.)
main_dataset_folder = '../data/mbpp_transformed_code_examples/sanitized-MBPP-midio.json'
dataset = read_dataset_to_json(main_dataset_folder)

# Split configuration.
num_shot = 10  # number of few-shot examples
eval_size_percentage = 0.5  # NOTE(review): exact meaning is defined by split_on_shots — confirm there

# Fixed seed keeps the split reproducible; write_to_file=True is forwarded to split_on_shots.
train_data, val_data, test_data = split_on_shots(
    num_shot,
    eval_size_percentage,
    dataset,
    seed=64,
    write_to_file=True,
)

# Collect the unique nodes (library functions) used by the examples.
# Only train_data is scanned here, not the validation or test splits.
used_libraries_json = used_libraries_from_dataset(train_data)
available_nodes = {entry['function_name'] for entry in used_libraries_json}
explained_used_libraries = used_libraries_to_string(used_libraries_json)

# Optional bar charts of the distributions across the three splits; uncomment to render.
# analyze_library_distribution(train_data, val_data, test_data)
# analyze_instance_distribution(train_data, val_data, test_data)
# analyze_visual_node_types_distribution(train_data, val_data, test_data)

15
7
8
Library functions included in the dataset: 32


## Prompt utils

In [3]:
from my_packages.prompting.few_shot import split_and_format, create_few_shot_messages

# Format every split with the same template, in train -> val -> test order.
# Each call yields a (prompts, responses) pair that is unpacked into its own names.
(
    (train_prompts, train_responses),
    (val_prompts, val_responses),
    (test_prompts, test_responses),
) = (
    split_and_format(subset, "NODES_TEMPLATE")
    for subset in (train_data, val_data, test_data)
)

# Build the few-shot conversation from the training prompts/responses and the
# textual description of the used library functions.
few_shot_messages = create_few_shot_messages(
    explained_used_libraries,
    train_prompts,
    train_responses,
    "NODE_GENERATOR_TEMPLATE",
)


# Open source models

In [4]:
from openai import OpenAI


# Open-source models to evaluate, one {"name": <model tag>} dict per model.
open_models = [
    {"name": tag}
    for tag in (
        # Llama models:
        "llama3.3:70b-instruct-fp16",
        "llama3.3:70b-instruct-q8_0",
        "llama3.1:8b-instruct-fp16",
        # Other models:
        "codestral:22b-v0.1-q8_0",
        # Currently disabled:
        # "qwq:32b-preview-fp16",
        # "mistral-large:123b-instruct-2407-q4_K_M",
    )
]


### Calculate max tokens for each model

In [5]:
from my_packages.utils.tokens_utils import find_max_tokens_nodes_tokenizer, find_max_tokens_nodes_api

# Constants for this section. DATA_DIR and OUTPUT_JSON are defined but not used in this cell.
DATA_DIR = '../data/mbpp_transformed_code_examples/only_files'
OUTPUT_JSON = 'token_counts.json'
host = 'http://localhost:11434'

# OpenAI-compatible client pointed at the server's /v1/ endpoint.
client = OpenAI(
  base_url=host+'/v1/',
  api_key='ollama' # Ignored by Ollama. To use GPT models, this key needs to be set.
)

# Measure over every expected response (train + val + test).
all_responses = train_responses + val_responses + test_responses

# Build one {"name", "max_tokens"} entry per open-source model.
open_models_to_test = []
for model_info in open_models:
    # Token counts are obtained through the client's embeddings API for the given model.
    max_tokens = find_max_tokens_nodes_api(all_responses, model_info["name"], client.embeddings)
    model_result = {
        "name": model_info["name"],
        "max_tokens": max_tokens,
    }
    open_models_to_test.append(model_result)

print(open_models_to_test)

[{'name': 'codestral:22b-v0.1-q8_0', 'max_tokens': 43}]


# GPT-models

### Calculate max tokens for each model

In [6]:
import tiktoken
# GPT models to evaluate. "tokenization" names the tiktoken encoding to use when
# tiktoken cannot resolve the model name on its own.
gpt_models = [
    # {"name": "o1-preview", "tokenization": "o200k_base"},
    {"name": "gpt-4o", "tokenization": "o200k_base"},
]

# Build one {"name", "max_tokens"} entry per GPT model.
gpt_models_to_test = []
for model_info in gpt_models:
    try:
        encoding = tiktoken.encoding_for_model(model_info["name"])
    except KeyError:
        # encoding_for_model raises KeyError for model names it cannot map;
        # fall back to the encoding declared for this model.
        encoding = tiktoken.get_encoding(model_info["tokenization"])
    max_tokens = find_max_tokens_nodes_tokenizer(all_responses, encoding)
    model_result = {
        "name": model_info["name"],
        "max_tokens": max_tokens
    }
    gpt_models_to_test.append(model_result)
print(gpt_models_to_test)

[{'name': 'gpt-4o', 'max_tokens': 32}]


# Evaluation

## Open source models

In [11]:
from my_packages.evaluation.nodes_accuracy import run_nodes_evaluation, evaluate_nodes_accuracy
from my_packages.utils.server_utils import server_diagnostics, is_remote_server_reachable
from openai import OpenAI

# Base URL of the Ollama server that serves the open-source models.
host = 'http://localhost:11434'
if is_remote_server_reachable(url = host + "/api/tags"):
    print("Server is reachable.")
else:
    server_diagnostics()
    print("Ollama server is not reachable. Batch job might have finished. Try running bash script again.")
    # NOTE(review): execution continues past this point even when the server is
    # unreachable — consider stopping here instead of starting the evaluation.

# OpenAI-compatible client pointed at the server's /v1/ endpoint.
client = OpenAI(
  base_url=host+'/v1/',
  api_key='ollama' # Ignored by Ollama. To use GPT models, this key needs to be set.
)

# Evaluate every open-source model; results maps model name -> list of per-seed runs.
results = {}
available_nodes = set(item['function_name'] for item in used_libraries_json)
for model in open_models_to_test:
    print(f"model: {model['name']}")
    # NOTE(review): the GPT evaluation cell uses temperatures=[0.2, 0.5, 1.0] and
    # top_ps=[0.2, 0.5, 0.8]; confirm the differing grids are intentional.
    result = run_nodes_evaluation(
        client,
        few_shot_messages,
        model,
        available_nodes,
        val_prompts,
        val_responses,
        test_prompts,
        test_responses,
        temperatures=[0.2, 0.5, 0.8],
        top_ps=[0.2, 0.5, 1.0],
        seeds=[3, 75, 346],
        debug = True
    )
    results[model["name"]] = result

# Print the final results: one header per model, then one line per seed.
print("\nFinal Results:")
for model_name, runs in results.items():
    print(f"Model: {model_name}")
    for run in runs:
        seed = run["seed"]
        val_accuracy = run["validation_accuracy"]
        test_accuracy = run["test_accuracy"]
        best_temperature = run["temperature"]
        best_top_p = run["top_p"]
        print(
            f"  Seed {seed}: "
            f"Validation accuracy: {val_accuracy:.2f}, "
            f"Test accuracy: {test_accuracy:.2f}, "
            f"Best temperature: {best_temperature:.2f}, "
            f"Best top_p: {best_top_p:.2f}"
        )


Server is reachable.
model: codestral:22b-v0.1-q8_0
VALIDATION Phase
Validating with temperature: 0.2, and top_p: 0.2
Tested with temp=0.2 and top_p=0.2. Gave accuracy=0.4
Validating with temperature: 0.2, and top_p: 0.5
Generating response for sample 2..

KeyboardInterrupt: 

In [8]:

from openai import OpenAI
from dotenv import load_dotenv
import os

# Read the OpenAI API key from the project's .env file.
load_dotenv("../.env")
openai_token = os.getenv('OPENAI_API_KEY')

if openai_token:
    # NOTE(review): `client` and `results` reuse the names from the open-source
    # evaluation cell, so running this cell overwrites those values.
    client = OpenAI(api_key=openai_token)

    # Evaluate every GPT model; results maps model name -> list of per-seed runs.
    results = {}
    available_nodes = set(item['function_name'] for item in used_libraries_json)
    for model in gpt_models_to_test:
        # NOTE(review): the open-source evaluation cell uses temperatures=[0.2, 0.5, 0.8]
        # and top_ps=[0.2, 0.5, 1.0]; confirm the differing grids are intentional.
        result = run_nodes_evaluation(
            client,
            few_shot_messages,
            model,
            available_nodes,
            val_prompts,
            val_responses,
            test_prompts,
            test_responses,
            temperatures=[0.2, 0.5, 1.0],
            top_ps=[0.2, 0.5, 0.8],
            seeds=[3, 75, 346]
        )
        results[model["name"]] = result

    # Print the final results: one header per model, then one line per seed.
    print("\nFinal Results:")
    for model_name, runs in results.items():
        print(f"Model: {model_name}")
        for run in runs:
            seed = run["seed"]
            val_accuracy = run["validation_accuracy"]
            test_accuracy = run["test_accuracy"]
            best_temperature = run["temperature"]
            best_top_p = run["top_p"]
            print(
                f"  Seed {seed}: "
                f"Validation accuracy: {val_accuracy:.2f}, "
                f"Test accuracy: {test_accuracy:.2f}, "
                f"Best temperature: {best_temperature:.2f}, "
                f"Best top_p: {best_top_p:.2f}"
            )
else:
    print("OPENAI_API_KEY is not set in your environment variables.")

VALIDATION Phase
Validating with temperature: 0.2, and top_p: 0.2
Generating response for sample 2..

KeyboardInterrupt: 