In [None]:
import datetime
import json
import numpy as np
import re
from my_packages.evaluation.metrics import estimate_pass_at_k
from my_packages.prompting.few_shot import create_few_shot_prompt, create_final_node_prompt, get_semantic_similarity_example_selector
from my_packages.utils.server_utils import server_diagnostics
from my_packages.evaluation.models import invoke_anthropic_model, invoke_openai_model, invoke_o1_model, invoke_ollama_model
from colorama import Fore, Style
from sklearn.metrics import f1_score
from langchain_ollama import ChatOllama
from langchain_anthropic import ChatAnthropic
from langchain_openai import ChatOpenAI
from langsmith import Client, evaluate
from langsmith.schemas import Example, Run
from langsmith.beta import convert_runs_to_test
from sklearn.metrics import f1_score
from langsmith.schemas import Example

def f1_score_summary_evaluator(outputs: list[dict], reference_outputs: list[dict]) -> dict:
    """
    Computes the mean node-set F1 score across all examples.

    Each "response" string is turned into a set of node names by removing
    commas and splitting on whitespace. The per-example F1 is then
    2 * |predicted & reference| / (|predicted| + |reference|), which is the
    binary F1 over the union of both sets.

    Parameters:
    - outputs: List of predicted outputs, one dict per example, each holding
      a "response" string.
    - reference_outputs: List of the corresponding reference (expected)
      outputs, each holding a "response" string.

    Returns:
    - dict: {"key": "mean_f1_score", "score": <mean F1>}. The score is 0.0
      when there are no examples.
    """
    f1_scores = []
    for predicted, reference in zip(outputs, reference_outputs):
        # NOTE(review): commas are deleted, not treated as separators, so
        # "A,B" (no space) becomes the single token "AB".
        predicted_nodes = set(predicted["response"].replace(",", "").split())
        reference_nodes = set(reference["response"].replace(",", "").split())

        total_nodes = len(predicted_nodes) + len(reference_nodes)
        if total_nodes == 0:
            # Both sets empty: F1 would be 0/0, so score the example as 0.0.
            f1_scores.append(0.0)
            continue

        shared_nodes = len(predicted_nodes & reference_nodes)
        f1_scores.append(2 * shared_nodes / total_nodes)

    # Guard against an empty experiment before averaging.
    mean_f1_score = sum(f1_scores) / len(f1_scores) if f1_scores else 0.0

    return {"key": "mean_f1_score", "score": mean_f1_score}


def extract_nodes(response_text: str):
    """Return the contents of the first ```midio fenced block in the response.

    The text between the opening ```midio marker and the next ``` is
    returned with surrounding whitespace stripped. When no such fenced block
    is present, the whole response is returned stripped instead.
    """
    # DOTALL lets the lazy group span the line breaks inside the fence.
    fenced = re.search(r"```midio(.*?)```", response_text, re.DOTALL)
    if fenced is None:
        return response_text.strip()
    return fenced.group(1).strip()

# def calculate_pass_at_k_scores(target, response):
#     """Computes Pass@K for LangSmith evaluation."""
#     print("Output: ", target)
#     result_dict = {response.outputs["response"]: target.outputs["response"]}
#     pass_at_k = estimate_pass_at_k(result_dict, ks=[1, 3, 5])  # Adjust `ks` as needed
#     return pass_at_k.mean()  # Return average score for LangSmith tracking

def calculate_f1_score(target: "Run", response: "Example") -> float:
    """Computes the node-set F1 score for one example (LangSmith evaluator).

    Despite the parameter names, ``target`` carries the prediction and
    ``response`` carries the reference: LangSmith calls two-argument
    evaluators that use custom argument names with the run first and the
    dataset example second (confirm against the installed langsmith version).

    Each "response" string is turned into a set of node names by removing
    commas and splitting on whitespace. The score is
    2 * |predicted & reference| / (|predicted| + |reference|), i.e. the
    binary F1 over the union of both sets.

    Parameters:
    - target: Object whose ``outputs["response"]`` is the predicted node string.
    - response: Object whose ``outputs["response"]`` is the reference node string.

    Returns:
    - float: F1 in [0, 1]; 0.0 when both node sets are empty.
    """
    # NOTE(review): commas are deleted, not treated as separators, so
    # "A,B" (no space) becomes the single token "AB".
    predicted_nodes = set(target.outputs["response"].replace(",", "").split())
    reference_nodes = set(response.outputs["response"].replace(",", "").split())

    total_nodes = len(predicted_nodes) + len(reference_nodes)
    if total_nodes == 0:
        # Both sets empty: F1 would be 0/0, so score the example as 0.0.
        return 0.0

    return 2 * len(predicted_nodes & reference_nodes) / total_nodes

def generate_response(client, model, task, available_nodes, example_pool):
    """Predict nodes for ``task`` with a few-shot prompt and the given model.

    Parameters:
    - client: Callable that builds the chat model; it is invoked with the
      model name plus the fixed generation settings below.
    - model: Model name forwarded to ``client``.
    - task: Task description used both to pick few-shot examples and to fill
      the prompt.
    - available_nodes: Value substituted for ``external_functions`` in the prompt.
    - example_pool: Example selector exposing ``select_examples``.

    Returns:
    - dict: {"response": <nodes extracted from the model reply>}.
    """
    # Pick the few-shot examples for this task and assemble the prompt template.
    shots = example_pool.select_examples({"task": task})
    final_prompt_template = create_final_node_prompt(
        create_few_shot_prompt(shots, "NODES_TEMPLATE"),
        "NODE_GENERATOR_TEMPLATE",
        "NODES_TEMPLATE",
    )
    chain_inputs = {"task": task, "external_functions": available_nodes}

    # The prompt is rendered here only so it can be inspected in the output;
    # the chain below formats the template again on its own.
    print(f"Prompt: {final_prompt_template.format(**chain_inputs)}")

    llm_options = {
        "model": model,
        "temperature": 0.7,
        "num_predict": 256,
        "top_p": 0.9,
        "top_k": 50,
        "stream": False,
        "num_ctx": 10000,
        "stop": ["```<|eot_id|>"],
    }
    llm = client(**llm_options)

    print("generating repsonse ..")
    reply = (final_prompt_template | llm).invoke(chain_inputs, {"run_name": "Node Prediction"})

    # A dict is returned explicitly; if a plain string were returned, LangSmith
    # would wrap it in a dict with 'output' as the key.
    return {"response": extract_nodes(reply.content)}

def evaluate_nodes(client, model, available_nodes, example_pool):
    """Run the node-prediction experiment through LangSmith ``evaluate()``.

    The target passed to LangSmith forwards each example's "task" input to
    ``generate_response``. Results are scored per example with
    ``calculate_f1_score`` and summarised with ``f1_score_summary_evaluator``,
    then printed and returned.
    """

    def predict(inputs):
        # Receives one example's inputs and returns the model's prediction.
        return generate_response(client, model, inputs["task"], available_nodes, example_pool)

    # Only the first entry of the global ``test_data`` is evaluated here.
    results = evaluate(
        predict,
        data=[test_data[0]],
        evaluators=[calculate_f1_score],
        summary_evaluators=[f1_score_summary_evaluator],
        experiment_prefix="Node Prediction Evaluation",
        upload_results=True,
    )
    print(results)
    return results

# Driver for this cell: build the few-shot example selector, then run the
# evaluation. `example_to_dict`, `train_data`, `embed_client`, `models` and
# `available_nodes` are not defined here and must already exist in the
# notebook session.
# The selector is built from the training examples, keyed on "task", with 5
# shots; presumably it ranks by embedding similarity, per the helper's name.
selector = get_semantic_similarity_example_selector(
    [example_to_dict(example) for example in train_data], 
    embed_client(model=models[0]),
    shots=5,
    input_keys=["task"],
)
# The ChatOllama class itself (not an instance) is passed on;
# generate_response() instantiates it on every call.
client = ChatOllama
# Not referenced by any live code in this cell (only by commented-out code).
langsmith_client = Client()
# Evaluate the first configured model, using the selector as the example pool.
evaluate_nodes(client, models[0], available_nodes, selector)

# langsmith_client.create_run(
#     name="ExampleRun",
#     run_type="chain",
#     inputs={"input": "Test input"},
#     outputs={"output": "Tesst output"},
#     project_name="Thesis_project",
#     end_time=datetime.datetime.now(),
# )

# convert_runs_to_test(
    
# )
# Select runs named "extractor" whose root traces received good feedback
# runs = client.list_runs(
#     project_name="<your_project>",
#     filter='eq(name, "extractor")',
#     trace_filter='and(eq(feedback_key, "user_score"), eq(feedback_score, 1))',
# )
# runs_as_test(runs, dataset_name="Extraction Good")
