<h1 style="text-align: center; font-size: 50px;">Automated Evaluation with Structured Outputs</h1>

# Notebook Overview
- Imports
- Configurations
- Verify Assets
- Helper Functions
- Load Data & Validate Columns
- Sort TotalScore and scores
- Logging Model to MLflow
- Fetching the Latest Model Version from MLflow
- Loading the Model and Running Inference


# Imports

In [None]:
%pip install -r ../requirements.txt --quiet

In [None]:
# ─────── Standard Library ───────
import time
import json
import logging
import re
from typing import List, Dict, Any
import httpx
from pathlib import Path
import warnings
import sys
import os
import torch

import pandas as pd
from tqdm.auto import tqdm
from llama_cpp import Llama, LlamaGrammar
import multiprocessing

# ─────── MLflow Integration ───────
import mlflow
import mlflow.pyfunc
from mlflow.models.signature import ModelSignature
from mlflow.types import Schema, ColSpec, DataType, ParamSpec, ParamSchema
from mlflow.tracking import MlflowClient

# === Internal modules ===

# Make modules that live two directories above the working directory importable.
src_path = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)


# Configurations

In [None]:
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
# Measure total runtime
start_notebook = time.time()

In [None]:
# --- Evaluation inputs / outputs ---------------------------------------------
# CSV that is read as input, and CSV that receives the ranked results.
INPUT_PATH = "../data/2025 ISEF Project Abstracts.csv"
OUTPUT_PATH = "../Sorted_by_Score.csv"

# Column holding each record's unique ID, and column holding the text to score.
KEY_COLUMN = "BoothNumber"
EVAL_COLUMN = "AbstractText"

# Criteria the model is asked to score for every record.
CRITERIA = [
    "Originality", "ScientificRigor", "Clarity",
    "Relevance", "Feasibility", "Brevity",
]

# Number of input rows placed into a single prompt.
BATCH_SIZE = 5

# --- MLflow experiment configuration -----------------------------------------
EXPERIMENT_NAME = "LLaMA_Evaluator_Experiment"
RUN_NAME = "LLaMA_Evaluator_Run"
MODEL_NAME = "LlamaEvaluatorModel"

# --- Paths -------------------------------------------------------------------
# Alternative model kept for reference:
#   /home/jovyan/datafabric/Meta-Llama-3.1-8B-Instruct-Q8_0/Meta-Llama-3.1-8B-Instruct-Q8_0.gguf
LLAMA_GGUF_PATH = "/home/jovyan/datafabric/llama2-7b/ggml-model-f16-Q5_K_M.gguf"

In [None]:
%%time

# 3. Load & configure the local LLaMA model
# Alternative model kept for reference:
#   /home/jovyan/datafabric/Meta-Llama-3-8B-Instruct-Q8_0/Meta-Llama-3-8B-Instruct-Q8_0.gguf
local_model_path = LLAMA_GGUF_PATH

# The thread count is passed as `n_threads`, the name the llama-cpp-python
# constructor actually defines; the previous `num_threads` spelling is not a
# constructor parameter, so the requested thread count never took effect.
#
# NOTE(review): `max_tokens`, `temperature`, `repeat_penalty`, `streaming`,
# `stop` and `rope_scaling` look like per-call / wrapper options rather than
# `Llama` constructor parameters; presumably they are absorbed by the
# constructor's `**kwargs` and ignored. They are left untouched here; confirm
# against the installed llama-cpp-python version before removing them.
llm = Llama(
    model_path=local_model_path,
    n_gpu_layers=-1,   # -1 requests offloading all layers to the GPU
    n_batch=128,
    n_ctx=8192,
    max_tokens=512,
    f16_kv=True,
    use_mmap=True,
    low_vram=True,
    rope_scaling=None,
    temperature=0.0,
    repeat_penalty=1.0,
    streaming=False,
    stop=None,
    seed=42,
    n_threads=multiprocessing.cpu_count(),
    verbose=False,
)

In [None]:
# The prompt is derived from KEY_COLUMN and CRITERIA, so it always lists
# exactly the criteria configured above.

# One "- <criterion>" bullet per line.
criteria_bullets = "\n".join([f"- {name}" for name in CRITERIA])

# Fields of the example object; every criterion carries the placeholder score 7.
example_fields = ", ".join([f'"{name}": 7' for name in CRITERIA])

# Pieces of the instruction text, concatenated without any separator.
_instruction_parts = (
    "You are an expert evaluator.  ",
    "For each input record, score 1–10 on these criteria:\n",
    f"{criteria_bullets}\n\n",
    "Respond *only* with a valid JSON object of the form:\n",
    "{\n",
    '  "results": [\n',
    f'    {{ "{KEY_COLUMN}": "...", {example_fields} }}\n',
    "  ]\n",
    "}\n",
    "Do not include any other text, explanation, or markup.",
)
SYSTEM_INSTRUCTIONS = "".join(_instruction_parts)

In [None]:
# Dedicated notebook logger: INFO level, timestamped lines on a stream handler,
# and no propagation to the root logger.
logger = logging.getLogger("automated_evaluation_logger")
logger.setLevel(logging.INFO)

# logging.getLogger() returns the same object on every call, so re-running this
# cell would otherwise attach one more handler each time and print every log
# line multiple times. Only attach the handler when none is present yet.
if not logger.handlers:
    formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S")
    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)
logger.propagate = False

In [None]:
# Report which device PyTorch would use: CUDA when available, otherwise the CPU.
device = torch.device("cpu")
if torch.cuda.is_available():
    device = torch.device("cuda")
print(f"Using device: {device}")

In [None]:
logger.info('Notebook execution started.')

# Verify Assets

In [None]:
def log_asset_status(asset_path: str, asset_name: str, success_message: str, failure_message: str) -> None:
    """
    Log whether a required asset exists on disk.

    An existing asset is reported at INFO level; a missing one is reported at
    WARNING level so that it stands out from routine messages.

    Parameters:
        asset_path (str): File or directory path to check.
        asset_name (str): Name of the asset for logging context.
        success_message (str): Text appended to the message if the asset exists.
        failure_message (str): Text appended to the message if it does not.
    """
    if Path(asset_path).exists():
        logger.info("%s is properly configured. %s", asset_name, success_message)
    else:
        # A missing asset is a problem the user has to act on, not routine info.
        logger.warning("%s is not properly configured. %s", asset_name, failure_message)


# Report the status of each model asset.
# NOTE(review): local_model_path is assigned from LLAMA_GGUF_PATH, so both
# entries check the same file under two different labels.
_missing_asset_hint = "Please create and download the required assets in your project on AI Studio."
for _asset_path, _asset_label in (
    (local_model_path, "LLaMA Local model"),
    (LLAMA_GGUF_PATH, "LLaMA model"),
):
    log_asset_status(
        asset_path=_asset_path,
        asset_name=_asset_label,
        success_message="",
        failure_message=_missing_asset_hint,
    )

# Helper Functions

In [None]:
def chunk_list(lst: List[Any], size: int) -> List[List[Any]]:
    """
    Split ``lst`` into consecutive slices of at most ``size`` items.

    Parameters:
        lst: Items to split; order is preserved.
        size: Maximum number of items per chunk; must be at least 1.

    Returns:
        A list of chunks. The last chunk may be shorter; an empty ``lst``
        yields an empty list.

    Raises:
        ValueError: If ``size`` is smaller than 1. Without this check a
            negative size would silently produce no chunks at all.
    """
    if size < 1:
        raise ValueError(f"size must be a positive integer, got {size}")
    return [lst[start : start + size] for start in range(0, len(lst), size)]

# Load the "json_arr" grammar, which constrains output to a top-level JSON array.
GRAMMAR_URL = "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/grammars/json_arr.gbnf"

# Follow redirects and fail loudly on a non-2xx response: otherwise the body of
# an error page (or an empty redirect body) would be handed to the grammar
# parser as if it were GBNF text.
_grammar_response = httpx.get(GRAMMAR_URL, follow_redirects=True)
_grammar_response.raise_for_status()
grammar_text = _grammar_response.text
json_arr_grammar = LlamaGrammar.from_string(grammar_text)

def evaluate_batch(batch_df: pd.DataFrame) -> List[Dict[str, Any]]:
    """
    Score one batch of rows with a single grammar-constrained LLM call.

    Every row contributes its KEY_COLUMN (as a string) and EVAL_COLUMN value to
    a JSON payload that is appended to SYSTEM_INSTRUCTIONS. Generation runs
    under the json_arr grammar, and the completion text is parsed as JSON.

    Returns:
        The parsed top-level JSON array, unchanged.

    Raises:
        RuntimeError: If the parsed completion is not a list.
    """
    records = []
    for _, row in batch_df.iterrows():
        records.append({KEY_COLUMN: str(row[KEY_COLUMN]), EVAL_COLUMN: row[EVAL_COLUMN]})
    prompt = "\n\n".join((SYSTEM_INSTRUCTIONS, json.dumps(records, indent=2)))

    completion: Dict[str, Any] = llm(
        prompt,
        grammar=json_arr_grammar,
        max_tokens=-1,
        temperature=0.0,
    )
    raw_text = completion["choices"][0]["text"]
    parsed = json.loads(raw_text)
    if not isinstance(parsed, list):
        raise RuntimeError(f"Expected JSON array, got {type(parsed)}:\n{raw_text}")
    return parsed

# Load Data & Validate Columns

In [None]:
# Read the input CSV and stop at the first required column that is absent.
df = pd.read_csv(INPUT_PATH)
_absent = next(
    (name for name in (KEY_COLUMN, EVAL_COLUMN) if name not in df.columns),
    None,
)
if _absent is not None:
    raise KeyError(f"Required column '{_absent}' not found in input CSV")

# Keys are handled as strings throughout the notebook.
df[KEY_COLUMN] = df[KEY_COLUMN].astype(str)

# Score the input in groups of BATCH_SIZE rows and collect every element the
# model returns, in batch order.
results: List[Dict[str, Any]] = []
_index_batches = chunk_list(df.index.tolist(), BATCH_SIZE)
for batch_idxs in tqdm(_index_batches, desc="Scoring batches", unit="batch"):
    results.extend(evaluate_batch(df.loc[batch_idxs]))

# Sort TotalScore and scores

In [None]:
# 7. Flatten the raw model output into one list of score objects.
# Each entry of `results` is one element of a parsed JSON array and may be:
#   - a wrapper object  {"results": [ {...}, ... ]}
#   - a score object    {KEY_COLUMN: ..., <criterion>: ...}
#   - a nested array    [ {...}, ... ]
# Plain score objects used to be discarded here; they are now kept, matching
# the flattening done in LlamaEvaluatorModel.predict.
flat_results: List[Dict[str, Any]] = []
for entry in results:
    if isinstance(entry, dict):
        wrapped = entry.get("results")
        if isinstance(wrapped, list):
            flat_results.extend(obj for obj in wrapped if isinstance(obj, dict))
        elif "results" not in entry:
            flat_results.append(entry)
        else:
            # A "results" key that does not hold a list cannot be interpreted.
            logger.warning("Ignoring unexpected batch entry: %r", entry)
    elif isinstance(entry, list):
        flat_results.extend(obj for obj in entry if isinstance(obj, dict))
    else:
        # Numbers, strings, nulls, etc. carry no scores.
        logger.warning("Ignoring unexpected batch entry: %r", entry)

# Tabulate the flattened score objects: one row per object.
scores_df = pd.DataFrame(flat_results)

# Without the key column the scores cannot be joined back onto the input rows.
if not (KEY_COLUMN in scores_df.columns):
    raise KeyError(
        f"Expected column '{KEY_COLUMN}' in scores_df, but got: {scores_df.columns.tolist()}"
    )

# Compare keys as strings on both sides of the join.
for _frame in (scores_df, df):
    _frame[KEY_COLUMN] = _frame[KEY_COLUMN].astype(str)

# Left join keeps every input row; TotalScore is the row-wise sum over the
# criteria columns, and rows are ranked from highest to lowest total.
combined = pd.merge(df, scores_df, on=KEY_COLUMN, how="left")
combined["TotalScore"] = combined[CRITERIA].sum(axis=1)
combined = combined.sort_values("TotalScore", ascending=False).reset_index(drop=True)

# Write the ranking and report the time elapsed since start_notebook was set.
combined.to_csv(OUTPUT_PATH, index=False)
elapsed = time.time() - start_notebook
print(f"✅ Done in {elapsed:.1f}s — output saved to '{OUTPUT_PATH}'")

In [None]:
combined

# Logging Model to MLflow

In [None]:
# URL of the llama.cpp "json_arr" GBNF grammar, read by LlamaEvaluatorModel.load_context.
GRAMMAR_URL = "https://raw.githubusercontent.com/ggerganov/llama.cpp/master/grammars/json_arr.gbnf"

class LlamaEvaluatorModel(mlflow.pyfunc.PythonModel):
    """
    A PythonModel that uses a local LLaMA model to score texts on multiple criteria.

    Predict signature:
      predict(self, context, model_input: DataFrame, params: Dict[str,Any])
    where `params` must include:
      - key_column (str): column holding each record's unique ID
      - eval_column (str): column holding the text to score
      - criteria (list of str, or a JSON-encoded list of str)
      - batch_size (int): number of rows placed into a single prompt
    """

    def load_context(self, context):
        """
        Load the GGUF model and the JSON-array grammar.

        Reads the model file from ``context.artifacts["llama_model_path"]`` and
        downloads the grammar text from ``GRAMMAR_URL``.
        """
        # 1. Load LLaMA model
        model_path = context.artifacts["llama_model_path"]
        # The thread count is passed as `n_threads`, the name the llama-cpp-python
        # constructor defines; `num_threads` is not a constructor parameter.
        # NOTE(review): `max_tokens`, `temperature`, `repeat_penalty`,
        # `streaming`, `stop` and `rope_scaling` are presumably swallowed by the
        # constructor's `**kwargs`; confirm before removing them.
        self.llm = Llama(
            model_path=model_path,
            n_gpu_layers=-1,
            n_batch=128,
            n_ctx=8192,
            max_tokens=512,
            f16_kv=True,
            use_mmap=True,
            low_vram=True,
            rope_scaling=None,
            temperature=0.0,
            repeat_penalty=1.0,
            streaming=False,
            stop=None,
            seed=42,
            n_threads=multiprocessing.cpu_count(),
            verbose=False,
        )
        # 2. Load JSON-array grammar; fail on a non-2xx response instead of
        #    parsing an error page as grammar text.
        response = httpx.get(GRAMMAR_URL, follow_redirects=True)
        response.raise_for_status()
        self.grammar = LlamaGrammar.from_string(response.text)

    @staticmethod
    def _build_prompt(key_column: str, criteria: List[str]) -> str:
        """Return the instruction text listing ``criteria`` and the expected JSON shape."""
        bullets = "\n".join(f"- {c}" for c in criteria)
        example_fields = ", ".join(f'"{c}": 7' for c in criteria)
        return (
            "You are an expert evaluator.  "
            "For each input record, score 1–10 on these criteria:\n"
            f"{bullets}\n\n"
            "Respond *only* with a valid JSON object of the form:\n"
            "{\n"
            '  "results": [\n'
            f'    {{ "{key_column}": "...", {example_fields} }}\n'
            "  ]\n"
            "}\n"
            "Do not include any other text, explanation, or markup."
        )

    @staticmethod
    def _flatten_scores(scored: List[Any]) -> List[Dict[str, Any]]:
        """Reduce raw array elements (wrappers, objects, nested arrays) to score objects."""
        flat: List[Dict[str, Any]] = []
        for item in scored:
            if isinstance(item, dict):
                wrapped = item.get("results")
                if isinstance(wrapped, list):
                    # Wrapper object: {"results": [ {...}, ... ]}
                    flat.extend(obj for obj in wrapped if isinstance(obj, dict))
                elif "results" not in item:
                    flat.append(item)
                else:
                    # "results" present but not a list: nothing usable inside.
                    logging.warning("Discarding non‑object item from model output: %r", item)
            elif isinstance(item, list):
                # e.g. model returned nested array: flatten one level
                flat.extend(obj for obj in item if isinstance(obj, dict))
            else:
                # Skip numbers, nulls, etc.
                logging.warning("Discarding non‑object item from model output: %r", item)
        return flat

    def predict(self, context, model_input: pd.DataFrame, params: Dict[str,Any]) -> pd.DataFrame:
        """
        Score every row of ``model_input`` and return it with the scores attached.

        Parameters:
            context: MLflow model context (not used by this method).
            model_input: DataFrame containing the key and evaluation columns.
            params: Dict with ``key_column``, ``eval_column``, ``criteria`` and
                ``batch_size`` (see the class docstring).

        Returns:
            ``model_input`` left-joined with the model's score columns on the key
            column, plus a ``TotalScore`` column holding the row-wise sum of the
            criteria columns.

        Raises:
            KeyError: A required param or DataFrame column is missing, or the
                model output lacks the key column or a criteria column.
            ValueError: ``batch_size`` is smaller than 1.
            RuntimeError: The model output is not a JSON array, or contains no
                score objects.
        """
        # 1. Extract config from params
        try:
            key_column   = params["key_column"]
            eval_column  = params["eval_column"]
            criteria     = params["criteria"]
            batch_size   = int(params["batch_size"])
        except KeyError as e:
            raise KeyError(f"Missing required param: {e}") from e

        # The logged signature declares None defaults for the column names, so
        # an omitted param arrives here as None rather than as a missing key.
        for name, value in (("key_column", key_column), ("eval_column", eval_column)):
            if not value:
                raise KeyError(f"Missing required param: '{name}'")
        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

        # If criteria passed as JSON string, parse it
        if isinstance(criteria, str):
            criteria = json.loads(criteria)
        # Drop repeated names (order kept): a duplicate would be selected twice
        # below and counted twice in TotalScore.
        criteria = list(dict.fromkeys(criteria))

        # 2. Validate DataFrame columns
        for col in (key_column, eval_column):
            if col not in model_input.columns:
                raise KeyError(f"Input DataFrame must contain column '{col}'")

        df = model_input.copy()
        df[key_column] = df[key_column].astype(str)

        # 3. Build prompt template
        prompt_template = self._build_prompt(key_column, criteria)

        # 4. Score in batches. Positional slicing is used so that an input
        #    index with repeated labels cannot duplicate rows within a batch.
        scored: List[Any] = []
        for start in range(0, len(df), batch_size):
            batch = df.iloc[start : start + batch_size]
            payload = [
                {key_column: r[key_column], eval_column: r[eval_column]}
                for _, r in batch.iterrows()
            ]
            prompt = prompt_template + "\n\n" + json.dumps(payload, indent=2)
            resp = self.llm(prompt, grammar=self.grammar, max_tokens=-1, temperature=0.0)
            arr = json.loads(resp["choices"][0]["text"])
            if not isinstance(arr, list):
                raise RuntimeError(f"Expected JSON array, got {type(arr)}:\n{arr!r}")
            scored.extend(arr)

        # 5. Flatten & clean model output
        flat = self._flatten_scores(scored)
        if not flat:
            raise RuntimeError("Model returned no valid score objects; check prompt/grammar.")

        # 6. Build scores DataFrame
        scores_df = pd.DataFrame(flat)
        if key_column not in scores_df.columns:
            raise KeyError(f"Missing '{key_column}' in scored output")
        scores_df[key_column] = scores_df[key_column].astype(str)

        # 7. Merge & compute TotalScore
        combined = df.merge(scores_df, on=key_column, how="left")
        missing = [c for c in criteria if c not in combined.columns]
        if missing:
            raise KeyError(f"Scored output is missing criteria columns: {missing}")
        combined["TotalScore"] = combined[criteria].sum(axis=1)
        return combined

    @classmethod
    def log_model(
        cls,
        model_name: str,
        llama_model_path: str,
    ):
        """
        Log and register the model in MLflow.

        The logged signature has no input/output schema (arbitrary DataFrame
        columns) and declares four params: key_column (str), eval_column (str),
        criteria (JSON string) and batch_size (int).

        Parameters:
            model_name: Used both as the artifact path and as the registered
                model name.
            llama_model_path: Path to the GGUF file, logged as the
                "llama_model_path" artifact.
        """
        DEMO_PATH = "../demo"
        artifacts = {
            "llama_model_path": llama_model_path,
            "demo": DEMO_PATH,
        }

        # Input schema: DataFrame only
        input_schema = None  # allow arbitrary columns

        # Output schema: will match input DF plus criteria columns + TotalScore
        # we omit explicit output schema for flexibility

        # Params schema. The default criteria list previously named
        # "Feasibility" twice, which double-counted it in TotalScore and left
        # "ScientificRigor" out.
        params_schema = ParamSchema([
            ParamSpec("key_column",  DataType.string,  None),
            ParamSpec("eval_column", DataType.string,  None),
            ParamSpec("criteria",    DataType.string,  '["Originality","ScientificRigor","Clarity","Relevance","Feasibility"]'),
            ParamSpec("batch_size",  DataType.long,    5),
        ])

        signature = ModelSignature(inputs=input_schema, outputs=None, params=params_schema)

        mlflow.pyfunc.log_model(
            artifact_path=model_name,
            python_model=cls(),
            artifacts=artifacts,
            signature=signature,
            registered_model_name=model_name,
        )
        logging.info(f"Logged LlamaEvaluatorModel '{model_name}' requiring key_column, eval_column, criteria, batch_size")


In [None]:
# 1 ──────────────────────────────────────────────────────────────────────────
# Global logging settings for the MLflow section
# ────────────────────────────────────────────────────────────────────────────
# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)
# Rebinds the module-level `logger` name: from here on `logger` refers to this
# logger rather than the "automated_evaluation_logger" configured earlier.
logger = logging.getLogger("llm‑mlflow")

In [None]:
%%time
# 2 ──────────────────────────────────────────────────────────────────────────
# Log and register the model
# ────────────────────────────────────────────────────────────────────────────
logger.info("Starting experiment: %s", EXPERIMENT_NAME)
mlflow.set_tracking_uri('/phoenix/mlflow')
mlflow.set_experiment(EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME) as run:
    run_id = run.info.run_id
    logger.info("Run ID: %s", run_id)

    # log_model only needs the model name and GGUF path (all runtime settings
    # come as params at inference). It passes `registered_model_name` to
    # mlflow.pyfunc.log_model, which already creates the registered model
    # version, so no separate mlflow.register_model() call is made here: that
    # call would add a second, duplicate version on every run.
    LlamaEvaluatorModel.log_model(
        model_name=MODEL_NAME,
        llama_model_path=LLAMA_GGUF_PATH,
    )
    logger.info("Registered model: %s", MODEL_NAME)

# Fetching the Latest Model Version from MLflow

In [None]:
# 3 ──────────────────────────────────────────────────────────────────────────
# Retrieve the latest version & signature
# ────────────────────────────────────────────────────────────────────────────
client = MlflowClient()

# get_latest_versions() and model stages are deprecated in MLflow; list all
# versions of the model instead and take the highest version number.
_model_versions = client.search_model_versions(f"name='{MODEL_NAME}'")
if not _model_versions:
    raise RuntimeError(f"No registered versions found for model '{MODEL_NAME}'")
latest_version = max(_model_versions, key=lambda mv: int(mv.version)).version
logger.info("Latest model version: %s", latest_version)

mi = mlflow.models.get_model_info(f"models:/{MODEL_NAME}/{latest_version}")
logger.info("Model signature:\n%s", mi.signature)

# Loading the Model and Running Inference

In [None]:
%%time
# 4 ──────────────────────────────────────────────────────────────────────────
# Load the model
# ────────────────────────────────────────────────────────────────────────────
# Registry URI built from MODEL_NAME and the version retrieved in the previous cell.
model_uri = f"models:/{MODEL_NAME}/{latest_version}"
# Load through the generic pyfunc flavor; `model.predict` is called in the next cell.
model     = mlflow.pyfunc.load_model(model_uri)

In [None]:
%%time
# 5 ──────────────────────────────────────────────────────────────────────────
# Run inference
# ────────────────────────────────────────────────────────────────────────────

# Two-row sample input: (booth number, abstract text).
sample_df = pd.DataFrame(
    [
        ("TEST001", "Investigating the effects of microplastics on marine life populations."),
        ("TEST002", "Developing a low‑cost solar charger for off‑grid applications."),
    ],
    columns=["BoothNumber", "AbstractText"],
)

# Runtime settings for this call; the criteria list is sent JSON-encoded.
inference_params = {
    "key_column": "BoothNumber",
    "eval_column": "AbstractText",
    "criteria": json.dumps(
        ["Originality", "ScientificRigor", "Clarity", "Relevance", "Feasibility", "Brevity"]
    ),
    "batch_size": 5,
}
predictions = model.predict(sample_df, params=inference_params)

logger.info("Inference results:")

predictions

In [None]:
logger.info('Notebook execution completed.')

Built with ❤️ using Z by HP AI Studio.