In [40]:
import re
import os
import sys
import pandas as pd

from my_packages.utils.file_utils import read_code_file

sys.path.append('../../')  # Add the path to the my_packages module
os.environ['EXPERIMENT_DB_NAME'] = "few_shot_experiments"
from my_packages.db_service.error_service import delete_error_docs, errors_to_df, pretty_print_errors
from my_packages.evaluation.midio_compiler import compile_code
from my_packages.db_service.experiment_service import experiment_exists, pretty_print_experiment_collections, run_experiment_quality_checks, setup_experiment_collection

experiment_name = f"signature_similarity_10_shot"
error_type = "tests"
filter = {
    "experiment_name": experiment_name,
    "model": "llama3.2:3b-instruct-fp16",
    "eval_method": "3_fold"
}
df = errors_to_df(experiment_name, filter=filter)

def categorize_syntax_error(stderr):
    match = re.search(r'Error:\s*(.*?)(?:\n|:)', stderr)

    if match:
        return match.group(1).strip()
    
    if "expected node" in stderr.lower():
        return "Unexpected node"
    
    if "code is not compile ready" in stderr.lower():
        return "Not compile ready"
    
    return match.group(1).strip() if match else "Other syntax error"


def categorize_semantic_errors(messages):
    categorized_errors = set()

    for msg in messages:
        msg_lower = msg.lower()

        if "unable to resolve type" in msg_lower or "failed to resolve symbol" in msg_lower:
            categorized_errors.add("Unresolved symbol")

        elif "arrow from" in msg_lower and "is not allowed" in msg_lower:
            categorized_errors.add("Invalid connection")

        elif "negative context production" in msg_lower:
            categorized_errors.add("Invalid context dependency")

        elif "function header" in msg_lower:
            categorized_errors.add("Invalid function header")

        elif "leaf node" in msg_lower:
            categorized_errors.add("Invalid AST structure")

        elif "expected function or event" in msg_lower:
            categorized_errors.add("Expected function or event")

        elif "compiler plugin encountered errors" in msg_lower:
            categorized_errors.add("Compiler plugin error")


    if not categorized_errors:
        return {"Other Semantic Error"}

    return categorized_errors


def categorize_test_errors(test_result):
    if test_result:
    
        total = 0
        passed = 0
        for test in test_result.get("test_results", []):
            for assertion in test.get("assertions", []):
                total += 1
                if assertion.get("kind") == "Passed":
                    passed += 1
        if total < 3:
            total = 3
        return f"{passed}/{total}"
    return "0/3"


def extract_semantic_errors(messages):
    semantic_errors = []
    for msg in messages:
        match = re.search(r"SemanticAnalysisError\(@\d+\): (.+?)(?:, backtrace|$)", msg)
        if match:
            semantic_errors.append(match.group(1).strip())
    return semantic_errors if semantic_errors else None

def extract_failed_tests(test_result, category):
    if not isinstance(test_result, dict):
        return "No test results found."

    failed_msgs = []
    for test in test_result.get("test_results", []):
        for assertion in test.get("assertions", []):
            if assertion.get("kind") == "Failed":
                expected = assertion.get("expect")
                actual = assertion.get("actual")
                failed_msgs.append(f"Failed test: expected `{expected}`, got `{actual}`")

    return "\n".join(failed_msgs)

def extract_test_error(category, error_msg, test_result):
    message = f"{category} tests passed.\n "

    message += f"Test results:\n{extract_failed_tests(test_result, category)}\n"
    message += "\n".join(error_msg[:4])
    return message

    

df["syntax_error"] = df["stderr"]
df["syntax_category"] = df["stderr"].apply(categorize_syntax_error)

df["semantic_error"] = df["error_msg"].apply(extract_semantic_errors)
df["semantic_category"] = df["error_msg"].apply(categorize_semantic_errors)

df["tests_category"] = df["test_result"].apply(categorize_test_errors)
df["tests_error"] = df.apply(
    lambda row: extract_test_error(row["tests_category"], row["error_msg"], row["test_result"]),
    axis=1
)


# Show all rows
pd.set_option('display.max_rows', None)     

# Show all columns
pd.set_option('display.max_columns', None)

# Show full content in each cell
pd.set_option('display.max_colwidth', None)

print(df.columns)
print(df["error_type"].unique())
filtered_df  = df[df["error_type"] == error_type][[
    "test_result",
    "task_id", 
    "error_type",
    f"{error_type}_category", 
    "error_msg", 
    f"{error_type}_error", 
    "stderr", 
    "stdout"
]]


# Create a DataFrame of unique error categories
def get_error_category_counts(error_df, column_name):
    # Drop rows where the category column is null
    filtered = error_df[error_df[column_name].notnull()]
    filtered = error_df

    # Ensure the values are sets or lists, then flatten them
    all_categories = []

    for categories in filtered[column_name]:
        if isinstance(categories, (set, list)):
            all_categories.extend(categories)
        elif isinstance(categories, str):
            # Handle comma-separated strings, just in case
            all_categories.extend([cat.strip() for cat in categories.split(",")])
        else:
            all_categories.append(str(categories))  # fallback

    # Count frequencies
    category_series = pd.Series(all_categories)
    category_counts = category_series.value_counts().reset_index()
    category_counts.columns = ["category", "count"]

    return category_counts
print(get_error_category_counts(filtered_df, f"{error_type}_category"))

filtered_df



Index(['model_name', 'task_id', 'candidate_id', 'metric', 'error_type',
       'error_msg', 'code_candidate', 'test_result', 'stderr', 'stdout',
       'phase', 'seed', 'temperature', 'top_p', 'top_k', 'created_at',
       'eval_method', 'fold', 'syntax_error', 'syntax_category',
       'semantic_error', 'semantic_category', 'tests_category', 'tests_error'],
      dtype='object')
['syntax' 'semantic' 'tests']
  category  count
0      0/3    814
1      1/3    191
2      2/3    120
3      0/4     27
4      2/4     24


Unnamed: 0,test_result,task_id,error_type,tests_category,error_msg,tests_error,stderr,stdout
70,"{'num_tests': 1, 'num_passed': 0, 'test_results': [{'name': 'Test check_greater', 'assertions': [{'kind': 'Passed', 'expect': 'null', 'actual': 'null'}, {'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Failed', 'expect': 'true', 'actual': 'null'}], 'passed': False}]}",8,tests,1/3,[],"1/3 tests passed.\n Test results:\nFailed test: expected `true`, got `null`\nFailed test: expected `true`, got `null`\n",,"Installing dependencies for midio_example@0.1.0\n\nNo external dependencies\n\nBuilding package...\nPackage built successfully!\n\n{\n ""num_tests"": 1,\n ""num_passed"": 0,\n ""test_results"": [\n {\n ""name"": ""Test check_greater"",\n ""assertions"": [\n {\n ""kind"": ""Passed"",\n ""expect"": ""null"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n }\n ],\n ""passed"": false\n }\n ]\n}"
72,"{'num_tests': 1, 'num_passed': 0, 'test_results': [{'name': 'Test check_greater', 'assertions': [{'kind': 'Passed', 'expect': 'null', 'actual': 'null'}, {'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Failed', 'expect': 'true', 'actual': 'null'}], 'passed': False}]}",8,tests,1/3,[],"1/3 tests passed.\n Test results:\nFailed test: expected `true`, got `null`\nFailed test: expected `true`, got `null`\n",,"Installing dependencies for midio_example@0.1.0\n\nNo external dependencies\n\nBuilding package...\nPackage built successfully!\n\n{\n ""num_tests"": 1,\n ""num_passed"": 0,\n ""test_results"": [\n {\n ""name"": ""Test check_greater"",\n ""assertions"": [\n {\n ""kind"": ""Passed"",\n ""expect"": ""null"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n }\n ],\n ""passed"": false\n }\n ]\n}"
74,"{'num_tests': 1, 'num_passed': 0, 'test_results': [{'name': 'Test text_match_wordz', 'assertions': [{'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Passed', 'expect': 'null', 'actual': 'null'}], 'passed': False}]}",10,tests,1/3,[],"1/3 tests passed.\n Test results:\nFailed test: expected `true`, got `null`\nFailed test: expected `true`, got `null`\n",,"Installing dependencies for midio_example@0.1.0\n\nNo external dependencies\n\nBuilding package...\nPackage built successfully!\n\n{\n ""num_tests"": 1,\n ""num_passed"": 0,\n ""test_results"": [\n {\n ""name"": ""Test text_match_wordz"",\n ""assertions"": [\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Passed"",\n ""expect"": ""null"",\n ""actual"": ""null""\n }\n ],\n ""passed"": false\n }\n ]\n}"
75,"{'num_tests': 1, 'num_passed': 0, 'test_results': [{'name': 'Test text_match_wordz', 'assertions': [{'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Passed', 'expect': 'null', 'actual': 'null'}], 'passed': False}]}",10,tests,1/3,[],"1/3 tests passed.\n Test results:\nFailed test: expected `true`, got `null`\nFailed test: expected `true`, got `null`\n",,"Installing dependencies for midio_example@0.1.0\n\nNo external dependencies\n\nBuilding package...\nPackage built successfully!\n\n{\n ""num_tests"": 1,\n ""num_passed"": 0,\n ""test_results"": [\n {\n ""name"": ""Test text_match_wordz"",\n ""assertions"": [\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Passed"",\n ""expect"": ""null"",\n ""actual"": ""null""\n }\n ],\n ""passed"": false\n }\n ]\n}"
76,"{'num_tests': 1, 'num_passed': 0, 'test_results': [{'name': 'Test text_match_wordz', 'assertions': [{'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Passed', 'expect': 'null', 'actual': 'null'}], 'passed': False}]}",10,tests,1/3,[],"1/3 tests passed.\n Test results:\nFailed test: expected `true`, got `null`\nFailed test: expected `true`, got `null`\n",,"Installing dependencies for midio_example@0.1.0\n\nNo external dependencies\n\nBuilding package...\nPackage built successfully!\n\n{\n ""num_tests"": 1,\n ""num_passed"": 0,\n ""test_results"": [\n {\n ""name"": ""Test text_match_wordz"",\n ""assertions"": [\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Passed"",\n ""expect"": ""null"",\n ""actual"": ""null""\n }\n ],\n ""passed"": false\n }\n ]\n}"
77,"{'num_tests': 1, 'num_passed': 0, 'test_results': [{'name': 'Test text_match_wordz', 'assertions': [{'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Passed', 'expect': 'null', 'actual': 'null'}], 'passed': False}]}",10,tests,1/3,[],"1/3 tests passed.\n Test results:\nFailed test: expected `true`, got `null`\nFailed test: expected `true`, got `null`\n",,"Installing dependencies for midio_example@0.1.0\n\nNo external dependencies\n\nBuilding package...\nPackage built successfully!\n\n{\n ""num_tests"": 1,\n ""num_passed"": 0,\n ""test_results"": [\n {\n ""name"": ""Test text_match_wordz"",\n ""assertions"": [\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Passed"",\n ""expect"": ""null"",\n ""actual"": ""null""\n }\n ],\n ""passed"": false\n }\n ]\n}"
78,"{'num_tests': 1, 'num_passed': 0, 'test_results': [{'name': 'Test text_match_wordz', 'assertions': [{'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Passed', 'expect': 'null', 'actual': 'null'}], 'passed': False}]}",10,tests,1/3,[],"1/3 tests passed.\n Test results:\nFailed test: expected `true`, got `null`\nFailed test: expected `true`, got `null`\n",,"Installing dependencies for midio_example@0.1.0\n\nNo external dependencies\n\nBuilding package...\nPackage built successfully!\n\n{\n ""num_tests"": 1,\n ""num_passed"": 0,\n ""test_results"": [\n {\n ""name"": ""Test text_match_wordz"",\n ""assertions"": [\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Passed"",\n ""expect"": ""null"",\n ""actual"": ""null""\n }\n ],\n ""passed"": false\n }\n ]\n}"
79,"{'num_tests': 1, 'num_passed': 0, 'test_results': [{'name': 'Test text_match_wordz', 'assertions': [{'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Passed', 'expect': 'null', 'actual': 'null'}], 'passed': False}]}",10,tests,1/3,[],"1/3 tests passed.\n Test results:\nFailed test: expected `true`, got `null`\nFailed test: expected `true`, got `null`\n",,"Installing dependencies for midio_example@0.1.0\n\nNo external dependencies\n\nBuilding package...\nPackage built successfully!\n\n{\n ""num_tests"": 1,\n ""num_passed"": 0,\n ""test_results"": [\n {\n ""name"": ""Test text_match_wordz"",\n ""assertions"": [\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Passed"",\n ""expect"": ""null"",\n ""actual"": ""null""\n }\n ],\n ""passed"": false\n }\n ]\n}"
80,"{'num_tests': 1, 'num_passed': 0, 'test_results': [{'name': 'Test text_match_wordz', 'assertions': [{'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Passed', 'expect': 'null', 'actual': 'null'}], 'passed': False}]}",10,tests,1/3,[],"1/3 tests passed.\n Test results:\nFailed test: expected `true`, got `null`\nFailed test: expected `true`, got `null`\n",,"Installing dependencies for midio_example@0.1.0\n\nNo external dependencies\n\nBuilding package...\nPackage built successfully!\n\n{\n ""num_tests"": 1,\n ""num_passed"": 0,\n ""test_results"": [\n {\n ""name"": ""Test text_match_wordz"",\n ""assertions"": [\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Passed"",\n ""expect"": ""null"",\n ""actual"": ""null""\n }\n ],\n ""passed"": false\n }\n ]\n}"
81,"{'num_tests': 1, 'num_passed': 0, 'test_results': [{'name': 'Test text_match_wordz', 'assertions': [{'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Failed', 'expect': 'true', 'actual': 'null'}, {'kind': 'Passed', 'expect': 'null', 'actual': 'null'}], 'passed': False}]}",10,tests,1/3,[],"1/3 tests passed.\n Test results:\nFailed test: expected `true`, got `null`\nFailed test: expected `true`, got `null`\n",,"Installing dependencies for midio_example@0.1.0\n\nNo external dependencies\n\nBuilding package...\nPackage built successfully!\n\n{\n ""num_tests"": 1,\n ""num_passed"": 0,\n ""test_results"": [\n {\n ""name"": ""Test text_match_wordz"",\n ""assertions"": [\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Failed"",\n ""expect"": ""true"",\n ""actual"": ""null""\n },\n {\n ""kind"": ""Passed"",\n ""expect"": ""null"",\n ""actual"": ""null""\n }\n ],\n ""passed"": false\n }\n ]\n}"


In [2]:

sys.path.append('../../')  # Add the path to the my_packages module
from my_packages.db_service.data_visualization import visualize_error_flow_for_model



# Assuming the experiment 'GPT4_signature_exp1' exists,
# and you evaluated 100 candidates for model 'GPT-4':
# visualize_error_flow_for_model(experiment_name, "gpt-4o")
pretty_print_experiment_collections(experiment_name, filter={"eval_method": "hold_out"})


NameError: name 'sys' is not defined

In [32]:
import json
from my_packages.evaluation.midio_compiler import get_json_test_result


test = """
Installing dependencies for midio_example@0.1.0\n\nNo external dependencies\n\nBuilding package...\nPackage built successfully!\n\n{\n "num_tests": 1,\n "num_passed": 1,\n "test_results": [\n {\n "name": "Test text_match_wordz",\n "assertions": [\n {\n "kind": "Passed",\n "expect": "true",\n "actual": "true"\n },\n {\n "kind": "Passed",\n "expect": "true",\n "actual": "true"\n },\n {\n "kind": "Passed",\n "expect": "false",\n "actual": "false"\n }\n ],\n "passed": true\n }\n ]\n}CUSTOM WARNING: Orignal code starts with 'func' keyword, but added imports and modules manually\n
"""
import re
import json

def json_e(output):
    try:
        # Match the first full JSON object using balanced braces (basic greedy match)
        match = re.search(r'{.*}', output, re.DOTALL)
        if not match:
            return {}
        json_text = match.group(0)
        return json.loads(json_text)
    except json.JSONDecodeError as e:
        return {}

    
json_e(test)

{'num_tests': 1,
 'num_passed': 1,
 'test_results': [{'name': 'Test text_match_wordz',
   'assertions': [{'kind': 'Passed', 'expect': 'true', 'actual': 'true'},
    {'kind': 'Passed', 'expect': 'true', 'actual': 'true'},
    {'kind': 'Passed', 'expect': 'false', 'actual': 'false'}],
   'passed': True}]}