In [None]:
# Standard library
import logging
import os
import sys

# Add src directory to system path.
# This must happen BEFORE any project-local import below: on a fresh kernel
# the import machinery can only find modules that live under ../src once the
# directory is on sys.path (presumably both `core` and `utils` live there —
# confirm against the repository layout).
SRC_DIR = os.path.abspath('../src')
if SRC_DIR not in sys.path:
    # Guarded so that re-running the cell does not append duplicate entries.
    sys.path.append(SRC_DIR)

# Project-local imports
from core.markdown_correction_service import MarkdownCorrectionService
from utils import (
    load_config_and_secrets,
)

# Create Logger
# A dedicated, non-propagating logger for this notebook that writes
# timestamped records through a single stream handler.
logger = logging.getLogger("english-correction-notebook")
logger.setLevel(logging.INFO)

formatter = logging.Formatter(
    "%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)

# logging.getLogger() returns the same object for the same name, so handlers
# attached by a previous execution of this cell are still present. Drop them
# first; otherwise every re-run would make each message print one more time.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

logger.addHandler(stream_handler)
# Keep records away from the root logger so they are not emitted twice.
logger.propagate = False

# Locations of the YAML configuration and secrets files.
CONFIG_PATH = "../configs/configs.yaml"
SECRETS_PATH = "../configs/secrets.yaml"

# Location of the local .gguf model file. The default is the path this
# notebook has always used; set the LOCAL_MODEL_PATH environment variable to
# run the notebook on a machine where the model is stored elsewhere.
LOCAL_MODEL_PATH = os.environ.get(
    "LOCAL_MODEL_PATH",
    "/home/jovyan/datafabric/llama3.1-8b-instruct/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf",
)

# `secrets` is later queried with .get(), i.e. it is treated as a mapping
# (or a falsy value when no secrets are available).
config, secrets = load_config_and_secrets(CONFIG_PATH, SECRETS_PATH)

In [None]:
import mlflow
from mlflow.models import evaluate

# Name under which the logged model is registered; defined once so the
# registration call and the log message cannot drift apart.
REGISTERED_MODEL_NAME = "MarkdownCorrector"

mlflow.set_experiment("markdown-correction-experiment")

# Log the correction service as an MLflow model inside a dedicated run, then
# register the logged model under REGISTERED_MODEL_NAME.
with mlflow.start_run(run_name="markdown-correction-run") as run:
    MarkdownCorrectionService.log_model(
        llm_artifact=LOCAL_MODEL_PATH,
        config_yaml=CONFIG_PATH,
        secrets_yaml=SECRETS_PATH,
    )

    # NOTE(review): "markdown_corrector" is presumably the artifact path that
    # MarkdownCorrectionService.log_model writes to — confirm against its
    # implementation, since the URI below depends on it.
    model_uri = f"runs:/{run.info.run_id}/markdown_corrector"
    mlflow.register_model(model_uri, REGISTERED_MODEL_NAME)

    logger.info("Model registered: %s", REGISTERED_MODEL_NAME)

In [None]:
import pandas as pd

from mlflow.metrics import (
    ari_grade_level,
    flesch_kincaid_grade_level,
    exact_match,
    rouge1,
    rougeL
)
from core.llm_metrics import (
    semantic_similarity_metric,
    grammar_error_count_metric,
    grammar_error_rate_metric,
    grammar_improvement_metric,
    grammar_score_metric,
    readability_improvement_metric,
    llm_judge_metric,
    llm_judge_metric_local,
    generate_gpt_gold_standards
)

import json

# Generate GPT gold standards
# Logged through the notebook logger (instead of print) so the message is
# timestamped and formatted like every other message in this notebook.
logger.info("Generating GPT gold standards...")

# Explicit UTF-8 so decoding does not depend on the platform's default
# encoding. The loaded list gets its own name rather than `results`, which
# this notebook also uses for the evaluation result.
with open("results.json", "r", encoding="utf-8") as f:
    correction_results = json.load(f)

original_texts = [item["original"] for item in correction_results]

# Pass API key to the function (None when no secrets were loaded)
api_key = secrets.get("OPEN_AI_API_KEY") if secrets else None
gpt_gold_standards = generate_gpt_gold_standards(original_texts, api_key)

# Create evaluation rows: one per original text, paired positionally with
# GPT's correction, which serves as the gold standard.
eval_rows = [
    {
        "markdown": original,
        "gpt_corrected": gpt_gold,
    }
    for original, gpt_gold in zip(original_texts, gpt_gold_standards)
]

# zip() stops at the shorter input, so missing gold standards would otherwise
# shrink the evaluation set without any visible sign.
if len(eval_rows) != len(original_texts):
    logger.warning(
        "Only %d gold standards were paired with %d original texts; "
        "unpaired texts are excluded from the evaluation.",
        len(eval_rows),
        len(original_texts),
    )

# Create evaluation DataFrame
eval_df = pd.DataFrame(eval_rows)

# Run evaluation
# Re-open the run that logged the model so the evaluation output is attached
# to that same run. Without an active run, MLflow's fluent API creates runs
# implicitly: the metrics would end up in separate auto-created runs, and the
# one started by mlflow.log_metrics would be left open afterwards.
with mlflow.start_run(run_id=run.info.run_id):
    results = mlflow.evaluate(
        model=model_uri,
        data=eval_df,
        targets="gpt_corrected",
        feature_names=["markdown"],
        extra_metrics=[
            ari_grade_level(),
            flesch_kincaid_grade_level(),
            exact_match(),
            rouge1(),
            rougeL(),
            semantic_similarity_metric,
            grammar_error_count_metric,
            grammar_error_rate_metric,
            grammar_improvement_metric,
            grammar_score_metric,
            readability_improvement_metric,
            #llm_judge_metric,
            llm_judge_metric_local
        ]
    )

    logger.info("Evaluation results:")
    logger.info(results.metrics)
    mlflow.log_metrics(results.metrics)