## Install dependencies

In [None]:


%pip install ollama
%pip install openai
%pip install transformers
%pip install tiktoken
%pip install huggingface-hub
%pip install python-dotenv


## Prepare data

In [None]:
import json
import sys
sys.path.append('../')  # Add the path to the my_packages module
from my_packages.data_processing.split_dataset import split_on_shots, split
from my_packages.data_processing.get_labels_data import used_libraries_from_dataset
from my_packages.analysis.analyze_datasets import analyze_library_distribution, analyze_instance_distribution, analyze_visual_node_types_distribution

def used_libraries_to_string(data):
    """Render library-function records as one text block.

    Every record in ``data`` must provide the keys ``'function_name'`` and
    ``'doc'``. Each record becomes a ``Name: ...`` line followed by a
    ``Documentation: ...`` line and a blank line; the rendered records are
    concatenated in input order. An empty ``data`` yields an empty string.
    """
    rendered_entries = [
        f"Name: {func['function_name']}\nDocumentation: {func['doc']}\n\n"
        for func in data
    ]
    return "".join(rendered_entries)

# Path to the JSON dataset file (despite the variable name, this is a file, not a folder).
main_dataset_folder = '../data/mbpp_transformed_code_examples/sanitized-MBPP-midio.json'

# Load the main dataset into memory.
with open(main_dataset_folder, 'r') as file:
    dataset = json.load(file)

num_shot = 10 # Number of few-shot examples
eval_size_percentage = 0.5
# Split into train / validation / test with a fixed seed.
# NOTE(review): write_to_file=True presumably persists the splits to disk — confirm in split_on_shots.
train_data, val_data, test_data = split_on_shots(num_shot, eval_size_percentage, dataset, seed = 64, write_to_file=True)

# Collect the library functions used in the training split only, then render
# them as text for the prompt context.
used_libraries_json = used_libraries_from_dataset(train_data)
explained_used_libraries = used_libraries_to_string(used_libraries_json)

# Distribution analyses over the three splits (library functions, instances, visual node types).
analyze_library_distribution(train_data, val_data, test_data)
analyze_instance_distribution(train_data, val_data, test_data)
analyze_visual_node_types_distribution(train_data, val_data, test_data)

## Prompt utils

In [None]:
import os
import re
def extract_nodes(response_text):
    """Return the node list contained in a model response.

    When the response contains a fenced ```midio block, the stripped content
    of the first such block is returned. Otherwise the response is assumed to
    be the bare node list already, and the whole stripped text is returned.
    """
    # DOTALL lets the block content span several lines; the lazy group stops
    # at the first closing fence.
    fenced_block = re.search(fr"```midio(.*?)```", response_text, re.DOTALL)
    node_text = fenced_block.group(1) if fenced_block else response_text
    return node_text.strip()

def read_file(_file):
    """Return the full text content of the file at path ``_file``.

    The file is opened in text mode with the platform default encoding and is
    closed again before the function returns.
    """
    with open(_file) as handle:
        contents = handle.read()
    return contents

def get_prompt_responses(data: list):
    """Build the prompt and reference-response strings for a dataset split.

    The two template files are looked up relative to the parent of the current
    working directory (``templates/prompts/FEW_SHOT_NODES_TEMPLATE.file`` and
    ``templates/responses/NODES_TEMPLATE.file``).

    Args:
        data: Iterable of task records. Each record must provide ``'prompts'``
            (the first entry is used as the task description) and
            ``'library_functions'`` (joined with ``", "`` into the response).

    Returns:
        A tuple ``(prompts, responses)`` of two lists, aligned by position
        with ``data``.

    Raises:
        FileNotFoundError: If either template file does not exist. Raising
            here gives a precise error instead of a confusing unpacking
            failure at the call site.
    """
    script_path = os.path.dirname(os.getcwd())
    prompt_template_path = os.path.join(script_path, 'templates/prompts/FEW_SHOT_NODES_TEMPLATE.file')
    node_template_path = os.path.join(script_path, 'templates/responses/NODES_TEMPLATE.file')

    # Fail early, and name the exact path that is missing.
    for template_path in (prompt_template_path, node_template_path):
        if not os.path.exists(template_path):
            raise FileNotFoundError(f"Template file not found: {template_path}")

    prompt_template = read_file(prompt_template_path)
    node_template = read_file(node_template_path)

    prompts = [prompt_template.format(task_description=task['prompts'][0]) for task in data]
    responses = [node_template.format(nodes=(", ".join(task['library_functions']))) for task in data]
    return (prompts, responses)

def create_few_shot_prompt(train_prompts, train_responses):
    """Assemble the few-shot chat history used as context for every request.

    The first message (role ``developer``) holds the instructions together
    with the textual list of allowed nodes taken from the module-level
    ``explained_used_libraries``. It is followed by one ``user`` /
    ``assistant`` message pair per training example, paired by position.

    Returns:
        A list of chat message dicts with ``role`` and ``content`` keys.
    """
    guiding = "You are an expert node selector for the flow-based language Midio. You will get prompted with a task and you will respond with the built-in nodes (aka library functions) that can be used to solve the task.\n"
    node_list = f"Only use the following nodes (library functions):\n {explained_used_libraries}\n\n"
    messages = [{"role": "developer", "content": guiding + node_list}]
    for question, answer in zip(train_prompts, train_responses):
        messages.extend((
            {"role": "user", "content": question},
            {"role": "assistant", "content": answer},
        ))
    return messages

# Build aligned (prompts, responses) lists for each split from the template files.
train_prompts, train_responses = get_prompt_responses(train_data)
val_prompts, val_responses = get_prompt_responses(val_data)
test_prompts, test_responses = get_prompt_responses(test_data)

# The training pairs become the few-shot chat history that is reused as context later on.
few_shot_messages = create_few_shot_prompt(train_prompts, train_responses)
# Prints the complete message list (context plus every training example) for inspection.
print(few_shot_messages)

## Models to test and Calculations of max tokens

1. Define models
2. Loop through all responses for the train, validation and test data (created by the `get_prompt_responses()` function in the previous cell) to find the maximum number of tokens a response needs for each model

### Open source models

In [None]:
from openai import OpenAI
from my_packages.utils.tokens import find_max_tokens_nodes_tokenizer, find_max_tokens_nodes_api
from transformers import AutoTokenizer
from huggingface_hub import login
import os
from dotenv import load_dotenv

# Open-source models to evaluate; "name" is the model tag passed to the server.
open_models = [
    {"name": "llama3.1:8b"},
    # {"name": "llama3.3:70b-instruct-fp16", "tokenization": "meta-llama/Meta-Llama-3.1-8B"},
    # {"name": "llama3.3:70b-instruct-q8_0", "tokenization": "meta-llama/Meta-Llama-3.1-8B"},
    # {"hg_name": "mistralai/Mistral-Small-Instruct-2409", "ollama_name": ""},
    # {"hg_name": "meta-llama/Llama-3.3-70B-Instruct", "ollama_name": ""},
    # {"hg_name": "meta-llama/CodeLlama-70b-Instruct-hf", "ollama_name": ""},
    # {"hg_name": "meta-llama/Llama-3.2-90B-Vision-Instruct", "ollama_name": ""}
]

DATA_DIR = '../data/mbpp_transformed_code_examples/only_files'
OUTPUT_JSON = 'token_counts.json'
host = 'http://localhost:11434'

# OpenAI-compatible client pointed at the local server's /v1/ endpoint.
client = OpenAI(
  base_url=host+'/v1/',
  api_key='ollama' #Ignored with 'ollama'. To use GPT models, this key need to be set.
)

# The token budget is derived from every reference response across all three splits.
all_responses = train_responses + val_responses + test_responses
open_models_to_test = []
for model_info in open_models:
    # The embeddings endpoint is handed to the helper, which reports the maximum token count.
    max_tokens = find_max_tokens_nodes_api(all_responses, model_info["name"], client.embeddings)
    model_result = {
        "name": model_info["name"],
        "max_tokens": max_tokens,
    }
    open_models_to_test.append(model_result)

print(open_models_to_test)

### GPT-models

In [None]:
import tiktoken

# GPT models to evaluate. "tokenization" names the tiktoken encoding to use
# when the installed tiktoken cannot resolve the model name itself.
gpt_models = [
    {"name": "o1-preview", "tokenization": "o200k_base"},
    {"name": "gpt-4o", "tokenization": "o200k_base"},
]

gpt_models_to_test = []
for model_info in gpt_models:
    try:
        encoding = tiktoken.encoding_for_model(model_info["name"])
    except KeyError:
        # tiktoken raises KeyError for model names it has no mapping for
        # (e.g. newer models on an older release); use the declared encoding.
        encoding = tiktoken.get_encoding(model_info["tokenization"])
    # Maximum token count over all reference responses for this tokenizer.
    max_tokens = find_max_tokens_nodes_tokenizer(all_responses, encoding)
    model_result = {
        "name": model_info["name"],
        "max_tokens": max_tokens
    }
    gpt_models_to_test.append(model_result)
print(gpt_models_to_test)

## Test the models with different seeds, temperatures and top_p values

In [None]:
from my_packages.utils.ollama_utils import is_remote_server_reachable
import ollama # https://github.com/ollama/ollama-python
from openai import OpenAI
import time
from subprocess import call

host = 'http://localhost:11434'

# Check that the server answers on its tags endpoint; if it does not, run the
# SSH forwarding script.
if is_remote_server_reachable(host + "/api/tags"):
    print("Server is reachable.")
else:
    rc = call("../SSH_FORWARDING.sh")
    # NOTE(review): rc is not inspected here, so the message below is printed
    # whether or not the forwarding script succeeded.
    print("Ollama server is not reachable. Batch job might have finished. Try running bash script again.")

# NOTE(review): this client instance is not bound to a name, so it is discarded
# right away; later module-level `ollama.*` calls presumably use the library's
# default client instead — confirm.
ollama.Client(
  host=host,
)
# OpenAI-compatible client pointed at the same host's /v1/ endpoint.
client = OpenAI(
  base_url=host+'/v1/',
  api_key='ollama' #Ignored with 'ollama'. To use GPT models, this key need to be set.
)

# Function to generate and evaluate responses
def evaluate(client, model, prompts, responses, seed, max_new_tokens=50, temperature=0.7, top_p=0.9):
    """Generate a node selection for every prompt and score it against its reference.

    Each prompt is sent as a user message after the module-level
    ``few_shot_messages``. A sample counts as correct when every node of the
    reference response is also among the generated nodes that are known
    library functions (function names in the module-level
    ``used_libraries_json``). Unknown generated nodes are ignored.

    A failing request is retried up to three times. Between attempts the
    server is probed and, if unreachable, the SSH forwarding script is run;
    when that script fails the function waits 60 seconds before retrying.

    Args:
        client: Object exposing ``chat.completions.create`` (the notebook
            passes an OpenAI client).
        model: Model name forwarded to the API.
        prompts: Prompts to evaluate.
        responses: Reference responses, paired with ``prompts`` by position.
        seed: Sampling seed forwarded to the API.
        max_new_tokens: Forwarded to the API as ``max_tokens``.
        temperature: Sampling temperature forwarded to the API.
        top_p: Nucleus-sampling value forwarded to the API.

    Returns:
        The fraction of prompts answered correctly, ``0.0`` when ``prompts``
        is empty, or ``-1`` as soon as one request still fails after all
        retries.
    """
    total = len(prompts)
    if total == 0:
        # Nothing to score; avoids a ZeroDivisionError in the final ratio.
        return 0.0

    # The set of selectable nodes is the same for every sample, so build it once.
    library_functions = set(item['function_name'] for item in used_libraries_json)
    max_retries = 3
    correct = 0
    for index, (prompt, true_response) in enumerate(zip(prompts, responses)):
        retries = 0
        while retries < max_retries:
            try:
                print("Generating response..", end="\r")
                generated = client.chat.completions.create(
                    model=model,
                    messages=few_shot_messages + [{"role": "user", "content": prompt}],
                    # template="{{ if .Prompt }}<|start_header_id|>user<|end_header_id|>\n\n{{ .Prompt }}<|eot_id|>{{ end }}<|start_header_id|>assistant<|end_header_id|>\n\n```midio\n{{ .Response }}\n```<|eot_id|>", # Template enforces code block
                    max_tokens=max_new_tokens,
                    seed=seed,
                    temperature=temperature,
                    top_p=top_p,
                    stream=False,
                    stop=["```<|eot_id|>"]  # Ensure the response stops after the code block
                )

                break
            except Exception as e:
                retries += 1
                print(f"Attempt {retries} failed with error: {e}")
                if is_remote_server_reachable(host + "/api/tags"):
                    print("Server is reachable.")
                else:
                    rc = call("../SSH_FORWARDING.sh")
                    # Check the result
                    if rc == 0:
                        print("Command executed successfully!")
                    else:
                        print(f"SSH command failed with return code {rc}")
                        print("Try to manually connect to the server again. ")
                        # Wait one minute before trying again
                        try:
                            for remaining in range(60, 0, -1):
                                print(f"Next try in {remaining} seconds", end="\r")  # Print on the same line
                                time.sleep(1)  # Wait for 1 second
                            print("Time's up!                          \n")  # Clear the line after completion
                        except KeyboardInterrupt:
                            # Interrupting only cuts the wait short; the retry loop continues.
                            print("\nCountdown interrupted!")

        else:
            # Loop ended without `break`: every attempt failed, abort the whole evaluation.
            print("Failed to get a response from the server after " + str(retries) + " attempts.")
            return -1

        # The API may return no text content (None); treat that as an empty answer.
        generated_text = generated.choices[0].message.content or ""

        # Extract nodes from the generated response and transform to a set
        generated_nodes = extract_nodes(generated_text)
        generated_nodes_set = set(generated_nodes.replace(",", "").split())
        # Extract nodes from the true response and transform to a set
        true_response_nodes = extract_nodes(true_response)
        true_response_set = set(true_response_nodes.replace(",", "").split())

        print(f"\n\nSample: {index}")
        print(f"Prompt: {prompt}")
        print(f"Generated response:\n {generated_nodes}")
        print(f"True response:\n {true_response}")

        # Remove invalid nodes from the generated nodes
        valid_generated_nodes = generated_nodes_set.intersection(library_functions)

        if true_response_set.issubset(valid_generated_nodes):
            print("correct response")
            correct += 1
        else:
            print("Invalid response")

    return correct / total

results = {}

# Pull models through a client bound to `host`, so they are fetched on the
# same server that serves the chat requests (module-level `ollama.pull` uses
# the library's default client instead).
ollama_client = ollama.Client(host=host)

for model in open_models_to_test:
    print(f"Model: {model['name']}")
    ollama_client.pull(model['name'])  # Make sure the model is available on the server

    # Validate the model on the validation set with different hyperparameters and a fixed seed
    SEED = 42 # During validation phase
    print("max_tokens in dataset with current pipeline:", model["max_tokens"])

    temperatures = [0.5, 0.7, 0.9]
    top_ps = [0.2, 0.5, 1.0]
    best_accuracy = 0
    # Fallback parameters, kept when no combination scores above 0.
    best_params = {"temperature": 0.7, "top_p": 0.9}

    # Grid search over temperature x top_p on the validation split.
    for temp in temperatures:
        for top_p in top_ps:
            accuracy = evaluate (
                client,
                model['name'],
                val_prompts,
                val_responses,
                SEED,
                model["max_tokens"],
                temp,
                top_p
            )
            print(f"Tested with temp={temp} and top_p={top_p}. Gave accuracy={accuracy}")
            # evaluate() returns -1 when the server could not be reached; such a
            # run can never beat the current best.
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = {"temperature": temp, "top_p": top_p}

    print(f"Best Hyperparameters for {model['name']}: {best_params}, Validation Accuracy: {best_accuracy:.2f}")

    # Test the model on the test set with different seeds and the best hyperparameters.
    seeds = [3, 75, 346]
    for seed in seeds:
        print(f"\nTesting with Seed: {seed}")

        test_accuracy = evaluate(
            client,
            model['name'],
            test_prompts,
            test_responses,
            seed,
            model["max_tokens"],
            temperature=best_params["temperature"],
            top_p=best_params["top_p"]
        )

        print(f"Test Accuracy for {model['name']}: {test_accuracy:.2f}")

        # One entry per seed, grouped by model name.
        results.setdefault(model["name"], []).append({
            "seed": seed,
            "validation_accuracy": best_accuracy,
            "test_accuracy": test_accuracy
        })

# Summary: one header line per model, then one line per test seed.
print("\nFinal Results:")
for model_name, metrics in results.items():
    print(f"{model_name}:")
    for run in metrics:
        print(f"  Seed {run['seed']}: Validation Accuracy: {run['validation_accuracy']:.2f}, Test Accuracy: {run['test_accuracy']:.2f}")
