In [None]:
from utils.helper import get_api_key, validate_prediction, load_data, ExperimentStats
import dspy
import mlflow

If the cell below takes too long, check that the `mlflow` server is still running in your terminal.

Start the MLflow server with the following command:

```bash
mlflow server --backend-store-uri sqlite:///data/mlflow.db --port 5005
```

In [None]:
# Set up the MLflow experiment.
# Before executing this cell, start the tracking server in a terminal:
#   mlflow server --backend-store-uri sqlite:///data/mlflow.db --port 5005

# Point the MLflow client at the local tracking server; the port here must
# match the --port value the server was started with.
mlflow.set_tracking_uri("http://127.0.0.1:5005")
# Select the experiment that the runs started later in this notebook belong to.
mlflow.set_experiment("dspy_evaluation")

# Enable automatic logging for DSPy
mlflow.dspy.autolog()
print("✓ MLflow tracking enabled")
print("View results: http://localhost:5005 or http://127.0.0.1:5005")

In [None]:
# Configure your values here
# Model identifier handed to the LM client in the next cell.
model_name = 'groq/llama-3.1-8b-instant'
# Obtained through the project helper rather than hard-coded in the notebook.
# NOTE(review): presumably read from the GROQ_API_KEY environment variable — confirm in utils.helper.
api_key = get_api_key('GROQ_API_KEY')
# Base URL of the provider API; passed as `api_base` in the next cell.
api_endpoint = 'https://api.groq.com/openai/v1'
# Passed as `cache=` when the LM client is built in the next cell.
useCache = False

In [None]:
# Create the language-model client from the values in the configuration cell.
# `dspy.LM` is DSPy's LM client class: it takes the model identifier as its
# first argument and forwards the API key, base URL, and cache flag.
llm = dspy.LM(
    model_name,
    api_key=api_key,
    api_base=api_endpoint,
    cache=useCache
)

# Set default LLM so every DSPy module in this notebook uses it
dspy.settings.configure(lm=llm)

In [None]:
# Load the dataset file; the 'workshop' entry of the result is used further
# down to build the DSPy examples.
ds = load_data('../data/dataset.yaml')

In [None]:
# ========================================
# DEMO: Metric Function (provided)
# ========================================
# This function evaluates how good a fix is

def metric_function(example, prediction, trace=None):
    """Score a predicted fix against the example's test case.

    Args:
        example: Dataset item; its 'test_case' entry is handed to the validator.
        prediction: Fixer output; only its `fixed_code` attribute is read.
        trace: Not used by this metric; accepted so the function can be
            called with the extra argument.

    Returns:
        The score returned by `validate_prediction`. The comment returned
        alongside it is discarded.
    """
    score, _comment = validate_prediction(prediction.fixed_code, example['test_case'])
    return score


In [None]:
# Baseline: Dummy fixer (returns original code)
class DummyFixer(dspy.Module):
    """Baseline fixer that performs no repair.

    It makes no LM call and hands the input code back unchanged, giving
    the evaluation a reference score to compare the real fixer against.
    """

    def forward(self, content, traceback) -> dspy.Prediction:
        """Return `content` untouched as the "fixed" code.

        Args:
            content: The code snippet to be fixed.
            traceback: Error context for the snippet; ignored by this baseline.

        Returns:
            A prediction with a placeholder `analysis` and `fixed_code`
            equal to `content`.
        """
        unchanged_code = content
        return dspy.Prediction(analysis="Code analysis", fixed_code=unchanged_code)

dummy_fixer = DummyFixer()

In [None]:
# Wrap every record of the 'workshop' split in a dspy `Example`, marking
# 'content' and 'traceback' as the fields that are fed to the fixer as inputs.
dataset = [
    dspy.Example(record).with_inputs('content', 'traceback')
    for record in ds['workshop']
]

In [None]:
# ========================================
# Your CodeFixer from Section 3
# ========================================

class AnalyzeSignature(dspy.Signature):
    """Explain the problem in the code"""
    # NOTE: in a DSPy signature the docstring and the field descriptions are
    # prompt text, not just documentation; rewording them changes what the
    # model is asked to do.
    # Inputs: the code under inspection and any extra error context.
    snippet = dspy.InputField(description="Code snippet")
    context = dspy.InputField(description="Extra context about issue, like syntax error, etc.")
    # Output: the model's description of the issue.
    summary = dspy.OutputField(description="Issue details")

class FixSignature(dspy.Signature):
    """Fix the code based on analysis"""
    # NOTE: in a DSPy signature the docstring and the field descriptions are
    # prompt text, not just documentation; rewording them changes what the
    # model is asked to do.
    # Inputs: the original code, the error context, and a prior analysis of the issue.
    snippet = dspy.InputField(description="Code snippet")
    context = dspy.InputField(description="Extra context about issue, like syntax error, etc.")
    analysis = dspy.InputField(description="Analysis of the issue")
    # Output: the repaired code.
    fixed_code = dspy.OutputField(description="Fixed code snippet")


class CodeFixer(dspy.Module):
    """Module to analyze and fix code issues

    Runs two LM steps in sequence: a chain-of-thought analysis of the
    problem, then a fix step that receives that analysis as an extra input.
    """

    def __init__(self):
        # Initialise dspy.Module's own state before attaching sub-modules;
        # DSPy's custom-module pattern calls the base constructor first.
        super().__init__()
        self.analyze = dspy.ChainOfThought(AnalyzeSignature)
        self.fix = dspy.Predict(FixSignature)

    def forward(self, content, traceback) -> dspy.Prediction:
        """Analyze the snippet, then produce a fixed version of it.

        Args:
            content: The code snippet to fix.
            traceback: Extra context about the issue (e.g. the error output).

        Returns:
            A prediction with `analysis` (the summary from the analysis
            step) and `fixed_code` (the output of the fix step).
        """
        analysis_res = self.analyze(snippet=content, context=traceback)
        # The analysis summary is passed on so the fix step can build on it.
        fix_res = self.fix(snippet=content, context=traceback, analysis=analysis_res.summary)
        return dspy.Prediction(
            analysis=analysis_res.summary,
            fixed_code=fix_res.fixed_code)

fixer = CodeFixer()

In [None]:
# ========================================
# DEMO: Evaluation Setup (provided)
# ========================================
# Collector that experiment results are added to after each evaluation.
stats = ExperimentStats(dataset)

# One evaluator shared by all fixers: it scores a module on `dataset`
# with `metric_function`, on a single thread, showing progress.
evaluate = dspy.Evaluate(
    metric=metric_function,
    devset=dataset,
    num_threads=1,
    display_progress=True,
)

In [None]:
# Evaluate the do-nothing baseline inside its own MLflow run so it shows up
# as a separate entry in the tracking UI.
with mlflow.start_run(run_name="baseline_dummy"):
    print("Evaluating dummy fixer (baseline)...")
    dummy_result = evaluate(dummy_fixer)
    stats.add_experiment('dummy', dummy_result)

    # Autologging is enabled; these add custom info on top of it.
    mlflow.log_param("fixer_type", "dummy")
    # NOTE(review): the score is divided by 100, so it is presumably reported
    # as a percentage — confirm against the installed DSPy version.
    mlflow.log_metric("pass_rate", dummy_result.score / 100)

print("✓ Logged to MLflow: baseline_dummy")

In [None]:
# Show the stats gathered so far; when the notebook is run top to bottom only
# the 'dummy' baseline experiment has been added at this point.
stats.get_stats()

In [None]:
# ========================================
# TASK 1: Evaluate Your CodeFixer (with MLflow)
# ========================================
# Run evaluation AND track it in MLflow
#
# TODO:
# 1. Start an MLflow run with name "my_codefixer"
# 2. Run evaluate(fixer)
# 3. Add to stats
# 4. Log the pass rate to MLflow
# ========================================

# YOUR CODE HERE:
with mlflow.start_run(run_name="my_codefixer"):
    # Score the real fixer with the same evaluator as the baseline.
    result = evaluate(fixer)
    stats.add_experiment('my_fixer', result)

    # Tag the run and record its pass rate, as in the baseline cell.
    mlflow.log_param("fixer_type", "codefixer_v1")
    mlflow.log_metric("pass_rate", result.score / 100)

In [None]:
# Show the stats again; 'my_fixer' has now been added next to the 'dummy' baseline.
stats.get_stats()

In [None]:
# ========================================
# Analyze Results
# ========================================
# Look at the stats table above and at the MLflow UI
#
# Questions to discuss:
# 1. Which types of errors does your fixer handle best?
# 2. Which examples still fail? Why?
# 3. What's the score improvement over baseline?
# ========================================