<!-- ## Install dependencies -->

In [1]:


# For Notebook:
# %pip install ollama
# %pip install openai
# %pip install anthropic
# %pip install transformers
# %pip install tiktoken
# %pip install python-dotenv
# %pip install colorama

# For packages: 
# %pip install scikit-learn
# %pip install matplotlib
# %pip install numpy
# %pip install pandas
# %pip install iterative-stratification
# %pip install langchain
# %pip install "langsmith<0.2.0"

# For WebUI testing:
# %pip install open-webui
import os
# Name of the experiment database used by this notebook; the evaluation cell
# assigns the same value again before importing the db_service helpers.
# NOTE(review): presumably read by my_packages.db_service to pick the database — confirm.
os.environ['EXPERIMENT_DB_NAME'] = "context_experiments"

# Load IPython's autoreload extension. The reload mode (`%autoreload 2`) is only
# selected later, in the evaluation cell.
%load_ext autoreload

# Static splits

In [2]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.data_processing.attributes_processing import used_functions_to_string
from my_packages.prompting.prompt_building import transform_code_data
from my_packages.utils.file_utils import read_dataset_to_json

# Root folder of the MBPP-Midio-50 dataset, relative to this notebook.
# Keep the trailing slash: code in this notebook appends relative paths to it
# with plain string concatenation.
main_dataset_folder = '../../data/MBPP_Midio_50/'


def get_hold_out_splits(main_dataset_folder):
    """Load the static hold-out train/validation/test splits.

    Each split is read from
    ``<main_dataset_folder>/splits/hold_out/<split>_dataset.json`` with
    ``read_dataset_to_json`` and then passed through ``transform_code_data``.
    The size of every split is printed.

    Args:
        main_dataset_folder: Dataset root folder, with or without a trailing
            path separator.

    Returns:
        A ``(train_data, val_data, test_data)`` tuple with the transformed splits.
    """
    # Local import so the function does not depend on another cell having imported os.
    import os

    hold_out_dir = os.path.join(main_dataset_folder, 'splits', 'hold_out')

    def load_split(split_name):
        # Read one split file and apply the same transformation to every split.
        split_path = os.path.join(hold_out_dir, f'{split_name}_dataset.json')
        return transform_code_data(read_dataset_to_json(split_path))

    train_data = load_split('train')
    val_data = load_split('validation')
    test_data = load_split('test')

    print(f"Train data: {len(train_data)}")
    print(f"Val data: {len(val_data)}")
    print(f"Test data: {len(test_data)}")
    return train_data, val_data, test_data

# Use the static validation split for hyperparameter tuning:
_, val_data, _ = get_hold_out_splits(main_dataset_folder=main_dataset_folder)

# `main_dataset_folder` already ends with "/", so relative paths are appended
# without a leading slash (avoids a "//" in the resulting path).
dataset = read_dataset_to_json(main_dataset_folder + "MBPP-Midio-50.json")
used_functions_json = read_dataset_to_json(main_dataset_folder + "metadata/used_external_functions.json")
# String form of the external function metadata; passed to the evaluation
# helpers further down as `available_nodes`.
available_nodes = used_functions_to_string(used_functions_json)
print(used_functions_json)




Script is located in: /root/Thesis_project/my_packages/common
Project is located in: /root/Thesis_project
Train data: 32
Val data: 9
Test data: 9
[{'type': 'function', 'function_name': 'Std.If', 'module_path': 'Std', 'doc': 'If `input` is true, the `then` trigger is executed, otherwise the `else` trigger is executed. `value` contains the value of `input`.', 'body': 'extern func(doc: "If `input` is true, the `then` trigger is executed, otherwise the `else` trigger is executed. `value` contains the value of `input`.") If {\n\n        in(x: 0, y: 0, name: "execute") trigger() execute\n\n        in(x: 0, y: 0, name: "input") property(Bool) predicate\n\n        out(x: 0, y: 0, name: "then") trigger() then\n\n        out(x: 0, y: 0, name: "else") trigger() else\n\n        out(x: 0, y: 0, name: "value") property(Bool) value\n\n    }', 'types': 'type CountContext Number\ntype AnyContext Any'}, {'type': 'function', 'function_name': 'Std.IfExpression', 'module_path': 'Std', 'doc': 'IfExpression 

# 3_fold splits

In [3]:

import os
from my_packages.data_processing.split_dataset import create_kfold_splits

# Dataset root folder; same value as assigned in the static-splits cell.
main_dataset_folder = '../../data/MBPP_Midio_50/'
# Index of the fold that is loaded below and reported as "Chosen fold".
run_k_fold = 0 # 0-2, because we have 3 folds

def get_k_fold_splits(main_dataset_folder, k, k_folds=3):
    """Load the train/test data of one fold of the k-fold splits.

    If ``<main_dataset_folder>/splits/<k_folds>_fold`` does not exist yet, the
    splits are first created with ``create_kfold_splits`` from the hold-out
    train and test data (the hold-out validation split is left out).

    Args:
        main_dataset_folder: Dataset root folder.
        k: Zero-based index of the fold to load; must satisfy ``0 <= k < k_folds``.
        k_folds: Total number of folds.

    Returns:
        A ``(train_data, test_data)`` tuple for fold ``k``.

    Raises:
        ValueError: If ``k`` is not a valid fold index for ``k_folds`` folds.
    """
    # Fail early with a clear message instead of a missing-file error later on.
    if not 0 <= k < k_folds:
        raise ValueError(
            f"k must be between 0 and {k_folds - 1} for {k_folds} folds, got {k}"
        )

    fold_dir = os.path.join(main_dataset_folder, 'splits', f'{k_folds}_fold')

    if os.path.exists(fold_dir):
        print(f"Using existing {k_folds}-fold splits")
    else:
        print(f"Creating {k_folds}-fold splits")
        train, _, test = get_hold_out_splits(main_dataset_folder)
        # Only use train+test for the k-fold splits; the validation split is excluded.
        # NOTE(review): the target folder is not passed to create_kfold_splits —
        # presumably it writes to the location read below; confirm.
        create_kfold_splits((train + test), k_folds=k_folds, write_to_file=True)

    train_data = read_dataset_to_json(os.path.join(fold_dir, f'train_dataset_{k}.json'))
    test_data = read_dataset_to_json(os.path.join(fold_dir, f'test_dataset_{k}.json'))

    print(f"Train data: {len(train_data)}")
    print(f"Test data: {len(test_data)}")
    return train_data, test_data

# Load the train/test data of the selected fold; the k-fold split is used for testing.
train_data, test_data = get_k_fold_splits(main_dataset_folder, run_k_fold, k_folds=3)
print(f"Chosen fold: {run_k_fold}")



Using existing 3-fold splits
Train data: 27
Test data: 14
Chosen fold: 0


# Init client and selected models

<!-- ## Experiments settings -->

In [4]:

import sys
sys.path.append('../../')  # Add the path to the my_packages module
import sys
from dotenv import load_dotenv
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.common.config import model_configs
from langchain_ollama import OllamaEmbeddings

model_provider = 'ollama'

load_dotenv("../../.env")

# Gather the "response" field of every sample, in the order
# k-fold train -> hold-out validation -> k-fold test.
all_responses = [
    sample["response"]
    for split in (train_data, val_data, test_data)
    for sample in split
]
print(f"Number of all responses: {len(all_responses)}")

# Create the client and the list of models to evaluate for the chosen provider.
client, models = model_configs(
    all_responses=all_responses,
    model_provider=model_provider,
    models=["qwq:32b-fp16"],
)

Script is located in: /root/Thesis_project/my_packages/common
Project is located in: /root/Thesis_project
Env is located in: /root/Thesis_project/.env
Number of all responses: 50
Url: http://localhost:11434/api/tags
Response: <Response [200]>
Server is reachable.


# Init Example selector

In [5]:
import sys
sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.prompting.example_selectors import get_coverage_example_selector, get_semantic_similarity_example_selector

# prompt_prefix = "Create a function" # "e.g., "Create a flow"
NUM_SHOTS = 1
semantic_selector = True

# Build the pool as a sorted copy: `sorted` leaves `train_data` untouched, so
# running this cell does not silently reorder data owned by an earlier cell.
example_pool = sorted(train_data, key=lambda x: int(x['task_id']))
print(f"Number of examples in the pool: {len(example_pool)}")

if semantic_selector:
    # Select the shots by embedding similarity on the "task" field.
    selector = get_semantic_similarity_example_selector(
        example_pool,
        OllamaEmbeddings(model="nomic-embed-text"),
        shots=NUM_SHOTS,
        input_keys=["task"],
    )
else:
    # Select the shots with the coverage selector over the "external_functions" label.
    selector = get_coverage_example_selector(
        example_pool,
        label="external_functions",
        shots=NUM_SHOTS,
        seed=9
    )
used_ids = []
# for task in train_data:
#     examples = selector.select_examples({"task": task["task"]})
#     used_ids += [e['task_id'] for e in examples]
#     print(f"Task: {task['task']}")


Number of examples in the pool: 27


## Evaluation

In [6]:
import os
import sys
sys.path.append('../../')  # Add the path to the my_packages module
%autoreload 2
%reload_ext autoreload
os.environ['EXPERIMENT_DB_NAME'] = "context_experiments"
from my_packages.common.classes import PromptType
from my_packages.db_service import get_db_connection
from my_packages.db_service.best_params_service import get_db_best_params
from my_packages.db_service.experiment_service import confirm_rerun, experiment_exists, pretty_print_experiment_collections, run_experiment_quality_checks, setup_experiment_collection
from my_packages.evaluation.code_evaluation import run_validation, run_testing
from my_packages.utils.tokens_utils import get_model_code_tokens_from_file
from my_packages.common.rag import init_rag_data
from colorama import Fore, Back, Style


# Resolve the notebook folder and the project root two levels above it.
script_dir = os.getcwd()
project_dir = os.path.abspath(os.path.join(script_dir, "..", ".."))

sys.path.append(project_dir)
print("Script is located in:", script_dir)
print("Project is located in:", project_dir)


# Evaluation settings.
# metrics is either ['syntax', 'semantic', 'tests'] or ['syntax', 'semantic'].
metrics = ['syntax', 'semantic']
prompt_type = PromptType.REGULAR
example_selector = 'similarity' if semantic_selector else 'coverage'
# experiment_type is either "RAG" or "full-context".
experiment_type = "RAG"
max_ctx = 16000
context_data = init_rag_data()
# Name under which this experiment's results are stored.
experiment_name = f"{prompt_type.value}_{experiment_type}_{NUM_SHOTS}_shot"
# Name of the earlier experiment whose best hyper-parameters are looked up below.
best_params_experiment_name = f"{prompt_type.value}_{example_selector}_{NUM_SHOTS}_shot"

print(experiment_name)

# Create separate database for this experiment
if not experiment_exists(experiment_name):
    setup_experiment_collection(experiment_name)
else:
    # delete_experiment(experiment_name)
    print(f"📂 Experiemnt '{experiment_name}' already exists.")
    pretty_print_experiment_collections(
        experiment_name,
        exclude_columns=["stderr", "stdout", "code_candidate", "test_result", "error_msg"],
    )
    # Stop the cell when the stored experiment does not pass the quality checks.
    if not run_experiment_quality_checks(experiment_name, ignore_best_params=True):
        print("\n❌ Experiment quality checks failed. Exiting.")
        raise Exception("Experiment quality checks failed.")

# Maps each model name to a (test_results, final_result) pair. The pair stays
# (None, None) when testing is not (re)run for that model.
results = {}
for model_name in models:
    print("\n\n")
    model = get_model_code_tokens_from_file(model_name, f'{project_dir}/data/max_tokens.json')
    results[model_name] = (None, None)

    # Look up the best hold-out hyper-parameters for this model, optimised for
    # the last entry of `metrics`.
    validation_result = get_db_best_params(
        best_params_experiment_name,
        model["name"],
        [metrics[-1]],  # optimizer_metric
        k=1,
        eval_method="hold_out",
        # Use the few_shot_experiments database to get best params
        db_connection=get_db_connection("few_shot_experiments"),
    )[0]

    # If results exist for the model, the user is prompted whether to rerun
    # testing. If none exist, this returns true so testing runs the first time.
    if confirm_rerun(experiment_name, model_name, eval_method="3_fold", phase="testing"):
        test_results, final_result = run_testing(
            client,
            model,
            available_nodes,
            test_data,
            selector,
            temperature=validation_result["temperature"],
            top_p=validation_result["top_p"],
            top_k=validation_result["top_k"],
            # max(ks) generations per task. Thus, 10 generations per task.
            ks=[1, 2, 5, 10],
            seeds=[3, 75, 346],
            debug=True,
            # current metrics: ["syntax", "semantic"] OR ["syntax", "semantic", "tests"]
            metrics=metrics,
            experiment_name=experiment_name,
            # 'dev' for file storage and 'prod' for database storage, for errors and final result.
            # NOTE(review): the value is upper-case here while that note uses
            # lower-case names — confirm how the case is handled.
            env="DEV",
            prompt_type=prompt_type,
            rag_data=context_data,
            max_ctx=max_ctx,
        )
        results[model_name] = (test_results, final_result)

    print(f"======\n{model_name} FINAL RESULTS: ===============================")
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    # Print only what is available; both entries are None when testing was skipped.
    test_runs, final_result = results[model_name]
    for run in test_runs or []:
        run.print()
    if final_result:
        final_result.print()

      
# Print the final results
print("\nALL MODELS FINAL RESULTS:")
for model_name, (test_runs, final_result) in results.items():
    print(f"{Style.BRIGHT}{Fore.CYAN} Model: {model_name} {Style.RESET_ALL}")
    # A model whose testing was not (re)run keeps the (None, None) placeholder.
    # Guard both entries so the summary does not fail on None.
    for test_run in test_runs or []:
        test_run.print()
    if final_result:
        final_result.print()



Script is located in: /root/Thesis_project/experiments/context
Project is located in: /root/Thesis_project
Number of language docs: 20
Number of node docs: 290
✅ FAISS index saved successfully.
✅ FAISS index saved successfully.
regular_RAG_1_shot
📂 Experiemnt 'regular_RAG_1_shot' already exists.

🔍 Existing collections for regular_RAG_1_shot: 

📂 Collection: regular_RAG_1_shot_errors 
--------------------------------------------------
⚠️ No data found in this collection.

📂 Collection: regular_RAG_1_shot_results 
--------------------------------------------------
⚠️ No data found in this collection.

📂 Collection: regular_RAG_1_shot_best_params 
--------------------------------------------------
⚠️ No data found in this collection.

=== Running quality checks for hold_out experiment: regular_RAG_1_shot ===
No models found for hold_out experiment 'regular_RAG_1_shot'.
✅ All models passed quality checks successfully.
=== End of quality checks ===




✅ Existing hold_out best parameters f

Token indices sequence length is longer than the specified maximum sequence length for this model (2718 > 1024). Running this sequence through the model will result in indexing errors


Using 16000 for context window

Prompt size = 2718, leaving 8282 tokens for RAG + buffer.
Total available tokens after prompt:  8282
allocating remaining 5632  tokens to node context
Estimated to extract k = 55 documents.
Initial docs contain 8195 tokens of availbale 5632 tokens
Removed 19 documents to fit within 5632 tokens. From 55 docs to 36 docs
Final prompt size: 10935 (plus 5000 for output).
    > Generating n response..  (1/10)

KeyboardInterrupt: 

<!-- ## Langsmith evaluate -->

## Testing one run:

In [None]:
%autoreload 2
%reload_ext autoreload
from my_packages.common.rag import init_rag_data
from my_packages.evaluation.code_evaluation import evaluate_code, run_model

# NOTE: this cell relies on `model`, `model_name` and `PromptType` left behind
# by the evaluation cell, so that cell must have been run first.

# Build the RAG data once and reuse it for both the token count and the run.
# Each init_rag_data() call repeats the document loading and index saving
# shown in the cell output.
# NOTE(review): assumes run_model does not need a freshly built RagData instance — confirm.
context = init_rag_data()
tokens = client(model=model["name"], num_ctx=4000).get_num_tokens(context.formatted_language_context)
print(f"Number of tokens: {tokens}")

# Sampling settings for this single run, defined once so the generation call,
# the stored hyper-parameters and the log line cannot drift apart.
one_run_hyperparams = {"seed": 9, "temperature": 0.2, "top_p": 0.9, "top_k": 50}

model_result, largest_context = run_model(
    client,
    model_name,
    available_nodes=available_nodes,
    data=val_data[:2],  # only the first two validation samples
    example_pool=selector,
    max_new_tokens=1000,
    temperature=one_run_hyperparams["temperature"],
    top_p=one_run_hyperparams["top_p"],
    top_k=one_run_hyperparams["top_k"],
    n=1,
    seed=one_run_hyperparams["seed"],
    debug=True,
    prompt_type=PromptType.REGULAR,
    rag_data=context,
    max_ctx=16000
)

metric_results_lists = evaluate_code(
    model_result,
    ks=[1],
    evaluation_metrics=["syntax"],
    experiment_name="test_RAG_5-shot",
    model_name=model_name,
    env="dev",
    hyperparams=one_run_hyperparams,
    phase="validation"
)
## Optimizing for the first k in the ks list
pass_at_k_dict = metric_results_lists[0]
val_metric = pass_at_k_dict["pass@1"]
print(
    f"Validation with temp={one_run_hyperparams['temperature']}, "
    f"top_k={one_run_hyperparams['top_k']} and top_p={one_run_hyperparams['top_p']}. "
    f"Gave pass@1={val_metric} and pass@ks={pass_at_k_dict}"
)


Number of language docs: 20
Number of node docs: 290
✅ FAISS index saved successfully.
✅ FAISS index saved successfully.
Number of tokens: 2612
Number of language docs: 20
Number of node docs: 290
✅ FAISS index saved successfully.
✅ FAISS index saved successfully.
Total available tokens:  9282


AttributeError: 'RagData' object has no attribute 'language_docs'