# MLflow 3 GenAI Walkthrough
This notebook walks through some of the key features of MLflow for generative AI applications. Additional documentation can be found here:

- [MLflow 3 for Generative AI](https://docs.databricks.com/aws/en/mlflow3/genai/)

In [0]:
%pip install -U --quiet mlflow

In [0]:
import mlflow
import mlflow.deployments
import pandas as pd
import json
import time
from datetime import datetime

MLflow tracks experiments and models in two key places: the tracking server and the model registry. These are set automatically with MLflow 3, and we can verify them below.

In [0]:
# Read each setting first, then report what MLflow resolved for this session.
tracking_uri = mlflow.get_tracking_uri()
registry_uri = mlflow.get_registry_uri()
mlflow_version = mlflow.__version__

print(f"MLflow Tracking URI: {tracking_uri}")
print(f"MLflow Registry URI: {registry_uri}")
print(f"MLflow Version: {mlflow_version}")

# Using the Deploy Client
The main way to interact with custom agents and foundation models behind a serving endpoint or application is the MLflow deployments client (`mlflow.deployments.get_deploy_client`).

In [0]:
# Create the deployments client used to talk to Databricks serving endpoints
client = mlflow.deployments.get_deploy_client("databricks")

# Fetch every serving endpoint this client can see
endpoints = client.list_endpoints()

# Print only the endpoints whose name contains "databricks"
print("Available endpoints:")
for endpoint in endpoints:
    endpoint_name = endpoint["name"]
    if "databricks" not in endpoint_name:
        continue
    print(f"- {endpoint_name}")

## Tracing
One of the most important parts of agent development is the ability to trace the path (and latency) of each agent step. In MLflow 3, experiments are the main container, and we recommend one experiment per application.

In [0]:
# Name of the serving endpoint every query in this notebook is sent to
MODEL_ENDPOINT = "databricks-llama-4-maverick"

@mlflow.trace
def query_foundation_model(prompt, temperature=0.7, max_tokens=100):
    """
    Query the foundation model via MLflow deployments.

    Parameters:
    prompt (str): The input prompt to send to the model. It is sent as a single
                  message with the "user" role.
    temperature (float): Sampling temperature for the model's response. Higher values mean more random completions.
                         Lower values make the output more focused and deterministic. Adjusting the temperature
                         allows control over the creativity and diversity of the generated responses.
    max_tokens (int): The maximum number of tokens to generate in the response.

    Returns:
    The message content of the first choice in the endpoint response, or None
    when the request (or reading the response) raises an exception.
    """
    try:
        response = client.predict(
            endpoint=MODEL_ENDPOINT,
            inputs={
                "messages": [{"role": "user", "content": prompt}],
                "temperature": temperature,
                "max_tokens": max_tokens
            }
        )
        return response['choices'][0]['message']['content']
    except Exception as e:
        # Deliberate best-effort: callers treat None as "no response", so the
        # error is reported and swallowed instead of aborting the whole run.
        print(f"Error querying model: {e}")
        return None

In [0]:
## Point MLflow at this user's experiment

# Look up the current user from the notebook context; experiments are
# generally stored under usernames in the workspace
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
username = notebook_context.userName().get()

# Create or set the experiment that the following runs are recorded under
experiment_name = f"/Users/{username}/gen_ai_prompt_engineering"
mlflow.set_experiment(experiment_name)

In [0]:
# Turn on MLflow autologging before making the first traced call.
mlflow.autolog()
# Smoke-test the traced helper with a high sampling temperature (1.5). It is
# the last expression in the cell, so its return value is the cell's output.
query_foundation_model("Quote Marcus Aurelius", temperature=1.5)

In [0]:
# Prompt templates to compare for customer service. Each one carries a
# "{question}" placeholder that is filled in with str.format.
prompt_templates = {
    "basic": "Answer this customer question: {question}",
    "friendly": (
        "As a helpful customer service representative, please answer this "
        "question with a warm and friendly tone: {question}"
    ),
    "detailed": (
        "Provide a comprehensive and detailed answer to this customer service "
        "question. Include relevant context and next steps: {question}"
    ),
    "concise": "Give a brief, direct answer to this customer question: {question}",
}

# Sample customer questions that every template is run against
customer_questions = [
    "How do I return a product?",
    "What is your refund policy?",
    "My order hasn't arrived yet, what should I do?",
    "Can I change my shipping address?",
]

# Function to run prompt engineering experiment
def run_prompt_experiment(template_name, template, questions, temperature=0.7):
    """Run one prompt template over a set of questions inside its own MLflow run.

    Parameters:
    template_name (str): Short label for the template; used in the run name
                         and logged as a parameter.
    template (str): Prompt template containing a "{question}" placeholder.
    questions (list): Questions to substitute into the template, one model
                      call per question.
    temperature (float): Sampling temperature forwarded to the model query.

    Returns:
    A list with one dict per question holding the keys "question",
    "formatted_prompt", "response", "response_length" and "timestamp". When the
    model returns nothing, "response" is "ERROR: No response received" and
    "response_length" is 0.
    """

    with mlflow.start_run(run_name=f"prompt_template_{template_name}"):
        # Log parameters
        mlflow.log_param("template_name", template_name)
        mlflow.log_param("template", template)
        mlflow.log_param("temperature", temperature)
        mlflow.log_param("model_endpoint", MODEL_ENDPOINT)
        mlflow.log_param("num_questions", len(questions))

        results = []
        # Whitespace-separated word count, used as a rough stand-in for tokens;
        # it is still reported under the existing "total_tokens" metric name.
        approx_token_count = 0
        start_time = time.time()

        for question in questions:
            formatted_prompt = template.format(question=question)
            response = query_foundation_model(formatted_prompt, temperature=temperature)

            if response:
                response_text = response
                response_length = len(response)
                approx_token_count += len(response.split())
            else:
                # A falsy response (None or empty) is recorded as an error row
                # so the question still appears in the results.
                response_text = "ERROR: No response received"
                response_length = 0

            results.append({
                "question": question,
                "formatted_prompt": formatted_prompt,
                "response": response_text,
                "response_length": response_length,
                "timestamp": datetime.now().isoformat(),
            })

            # Small delay to avoid rate limiting
            time.sleep(0.5)

        end_time = time.time()

        # Log metrics
        mlflow.log_metric("total_responses", len(results))
        if results:  # Avoid division by zero
            mlflow.log_metric(
                "avg_response_length",
                sum(r["response_length"] for r in results) / len(results),
            )
        mlflow.log_metric("total_tokens", approx_token_count)
        mlflow.log_metric("execution_time_seconds", end_time - start_time)

        # Log the results table straight from memory so no scratch CSV is
        # left behind in the notebook's working directory.
        results_df = pd.DataFrame(results)
        mlflow.log_text(results_df.to_csv(index=False), "prompt_results.csv")

        # Log individual responses as text files
        for i, result in enumerate(results):
            mlflow.log_text(
                f"Q: {result['question']}\n\nPrompt: {result['formatted_prompt']}\n\nA: {result['response']}",
                f"qa_pair_{i}.txt"
            )

        print(f"Completed experiment for template: {template_name}")
        return results
    
# Execute one experiment per prompt template, keyed by template name
all_results = {}
for template_name, template in prompt_templates.items():
    print(f"\n--- Running experiment: {template_name} ---")
    # Chained assignment keeps the latest run available as `results` too
    all_results[template_name] = results = run_prompt_experiment(
        template_name, template, customer_questions
    )

## Model Evaluation
This notebook goes through some basic model evaluation with custom metrics. We can also use the `databricks-agents` package to get tuned and efficient LLM judges.

In [0]:

# Define custom evaluation functions
def calculate_response_quality_score(question, response):
    """Score a response from 0 to 100 using four equally weighted heuristics.

    Each satisfied criterion is worth 25 points:
    - length: the response has between 20 and 500 characters (inclusive)
    - politeness: a courtesy term appears in the lower-cased response
    - relevance: the question and response share a whitespace-separated word
    - completeness: an action term appears in the lower-cased response
    """
    # Length check (not too short, not too long)
    length_ok = 20 <= len(response) <= 500

    lowered_response = response.lower()

    courtesy_terms = ('please', 'thank you', 'sorry', 'help', 'assist')
    action_terms = ('contact', 'visit', 'call', 'email', 'process', 'steps')

    # Words are compared exactly as split on whitespace, punctuation included
    shared_words = set(question.lower().split()) & set(lowered_response.split())

    criteria_met = (
        length_ok,
        any(term in lowered_response for term in courtesy_terms),
        bool(shared_words),
        any(term in lowered_response for term in action_terms),
    )

    # Booleans sum as 0/1, so this is 25 points per satisfied criterion
    return 25 * sum(criteria_met)

def evaluate_template_performance(template_name, results):
    """Evaluate overall performance of a prompt template inside its own MLflow run.

    Parameters:
    template_name (str): Label of the template being evaluated; used in the
                         run name, logged parameters and the report.
    results (list): Result dicts with at least the keys "question",
                    "response", "response_length" and "timestamp". Responses
                    starting with "ERROR:" are counted as errors and not scored.

    Returns:
    A summary dict. It always contains "template_name", "avg_quality_score",
    "total_questions", "valid_responses", "error_count", "error_rate",
    "high_quality_count", "medium_quality_count", "low_quality_count",
    "high_quality_percentage" and "evaluation_date". When at least one response
    was scored it also contains "min_quality_score", "max_quality_score" and
    "score_std".
    """

    with mlflow.start_run(run_name=f"evaluation_{template_name}"):
        mlflow.log_param("evaluation_template", template_name)
        mlflow.log_param("evaluation_timestamp", datetime.now().isoformat())
        mlflow.log_param("total_responses_evaluated", len(results))

        scores = []
        detailed_scores = []
        error_count = 0

        # Evaluate each response
        for i, result in enumerate(results):
            # Skip error responses
            if result["response"].startswith("ERROR:"):
                error_count += 1
                continue

            quality_score = calculate_response_quality_score(
                result["question"],
                result["response"]
            )
            scores.append(quality_score)

            detailed_scores.append({
                "question_id": i,
                "question": result["question"],
                "response": result["response"],
                "quality_score": quality_score,
                "response_length": result["response_length"],
                "timestamp": result["timestamp"]
            })

        # Handle case where no valid responses exist
        if not scores:
            print(f"Warning: No valid responses found for template '{template_name}'")
            mlflow.log_metric("avg_quality_score", 0)
            mlflow.log_metric("error_rate", 1.0)
            mlflow.log_metric("valid_responses", 0)
            # Include the keys that downstream comparison code reads
            # ("error_rate", "high_quality_percentage") so that a template
            # with no usable responses does not cause a KeyError later.
            return {
                "template_name": template_name,
                "avg_quality_score": 0,
                "total_questions": len(results),
                "valid_responses": 0,
                "error_count": error_count,
                "error_rate": 1.0,
                "high_quality_count": 0,
                "medium_quality_count": 0,
                "low_quality_count": 0,
                "high_quality_percentage": 0.0,
                "evaluation_date": datetime.now().isoformat()
            }

        # Calculate aggregate metrics
        avg_score = sum(scores) / len(scores)
        min_score = min(scores)
        max_score = max(scores)
        # A single score has no spread to measure, so report 0 in that case
        score_std = pd.Series(scores).std() if len(scores) > 1 else 0
        error_rate = error_count / len(results)

        # Log evaluation metrics
        mlflow.log_metric("avg_quality_score", avg_score)
        mlflow.log_metric("min_quality_score", min_score)
        mlflow.log_metric("max_quality_score", max_score)
        mlflow.log_metric("score_std", score_std)
        mlflow.log_metric("error_rate", error_rate)
        mlflow.log_metric("valid_responses", len(scores))
        mlflow.log_metric("error_count", error_count)

        # Calculate additional quality metrics
        high_quality_responses = sum(1 for score in scores if score >= 75)
        medium_quality_responses = sum(1 for score in scores if 50 <= score < 75)
        low_quality_responses = sum(1 for score in scores if score < 50)
        high_quality_percentage = (high_quality_responses / len(scores)) * 100

        mlflow.log_metric("high_quality_count", high_quality_responses)
        mlflow.log_metric("medium_quality_count", medium_quality_responses)
        mlflow.log_metric("low_quality_count", low_quality_responses)
        mlflow.log_metric("high_quality_percentage", high_quality_percentage)

        # Create evaluation summary
        eval_summary = {
            "template_name": template_name,
            "avg_quality_score": avg_score,
            "min_quality_score": min_score,
            "max_quality_score": max_score,
            "score_std": score_std,
            "total_questions": len(results),
            "valid_responses": len(scores),
            "error_count": error_count,
            "error_rate": error_rate,
            "high_quality_count": high_quality_responses,
            "medium_quality_count": medium_quality_responses,
            "low_quality_count": low_quality_responses,
            "high_quality_percentage": high_quality_percentage,
            "evaluation_date": datetime.now().isoformat()
        }

        # Artifacts are logged from in-memory text so no scratch files are
        # left in the working directory. detailed_scores is non-empty here
        # because the no-scores case returned above.
        eval_df = pd.DataFrame(detailed_scores)
        mlflow.log_text(eval_df.to_csv(index=False), "detailed_evaluation.csv")

        # Create score distribution summary (the last bin includes 100)
        score_distribution = {
            "score_bins": [0, 25, 50, 75, 100],
            "score_counts": [
                sum(1 for score in scores if 0 <= score < 25),
                sum(1 for score in scores if 25 <= score < 50),
                sum(1 for score in scores if 50 <= score < 75),
                sum(1 for score in scores if 75 <= score <= 100)
            ]
        }

        # Log score distribution
        mlflow.log_text(json.dumps(score_distribution, indent=2), "score_distribution.json")

        # Log summary as JSON
        mlflow.log_text(json.dumps(eval_summary, indent=2), "evaluation_summary.json")

        # Create human-readable evaluation report
        report = f"""=== EVALUATION REPORT: {template_name.upper()} ===
Total Questions: {len(results)}
Valid Responses: {len(scores)}
Error Rate: {error_rate:.2%}

QUALITY SCORES:
- Average: {avg_score:.2f}/100
- Range: {min_score:.0f} - {max_score:.0f}
- Standard Deviation: {score_std:.2f}

QUALITY DISTRIBUTION:
- High Quality (75-100): {high_quality_responses} ({(high_quality_responses/len(scores)*100):.1f}%)
- Medium Quality (50-74): {medium_quality_responses} ({(medium_quality_responses/len(scores)*100):.1f}%)
- Low Quality (0-49): {low_quality_responses} ({(low_quality_responses/len(scores)*100):.1f}%)

Evaluation completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}"""

        # Log the report
        mlflow.log_text(report, "evaluation_report.txt")

        print(f"Template '{template_name}' - Average Quality Score: {avg_score:.2f}/100")
        print(f"Valid Responses: {len(scores)}/{len(results)} (Error Rate: {error_rate:.2%})")

        return eval_summary

In [0]:
# # COMMAND ----------

# Score every template's results and collect the summaries by template name
evaluation_results = {}
print("=== EVALUATION RESULTS ===")
divider = "-" * 50
for template_name, results in all_results.items():
    eval_result = evaluate_template_performance(template_name, results)
    evaluation_results.update({template_name: eval_result})
    print(divider)

In [0]:
# Compare all templates, best average quality score first
comparison_df = pd.DataFrame([
    {
        "Template": name,
        "Avg Quality Score": result["avg_quality_score"],
        "Valid Responses": result["valid_responses"],
        # A summary for a template with no valid responses may not carry
        # these keys; fall back to "everything failed" instead of raising.
        "Error Rate": f"{result.get('error_rate', 1.0):.1%}",
        "High Quality %": f"{result.get('high_quality_percentage', 0.0):.1f}%",
    }
    for name, result in evaluation_results.items()
]).sort_values("Avg Quality Score", ascending=False)

print("\n=== TEMPLATE COMPARISON ===")
print(comparison_df.to_string(index=False))

# Log comparison results in the same experiment
with mlflow.start_run(run_name="template_comparison"):
    mlflow.log_param("comparison_timestamp", datetime.now().isoformat())
    mlflow.log_param("templates_compared", list(evaluation_results.keys()))

    # The table is sorted descending by score, so the first row is the winner
    best_template = comparison_df.iloc[0]["Template"]
    best_score = comparison_df.iloc[0]["Avg Quality Score"]

    mlflow.log_metric("best_template_score", best_score)
    mlflow.log_param("best_template_name", best_template)

    # Save the comparison table as an artifact directly from memory so no
    # scratch CSV is left in the working directory
    mlflow.log_text(comparison_df.to_csv(index=False), "template_comparison.csv")

    print(f"\n🏆 Best performing template: {best_template} (Score: {best_score:.2f})")

## Deployment and Monitoring
Once we've done some development and evaluation, the next thing we will want to do is deploy the chatbot so it can easily be used by an application. The deployment abstracts away the logic of the agent / generative AI application.

In [0]:
from mlflow.pyfunc import ChatAgent
from mlflow.types.agent import (
    ChatAgentChunk,
    ChatAgentMessage,
    ChatAgentResponse,
    ChatContext,
)
from typing import Any, Generator, Optional, Sequence, Union

# Pick the template in the first row of the comparison table for production use
best_template_name = comparison_df["Template"].iloc[0]
best_template = prompt_templates[best_template_name]

print("🚀 PRODUCTION DEPLOYMENT")
print(f"Selected template: {best_template_name}")
print(f"Template: {best_template}")



class ProdChatbot(ChatAgent):
    """ChatAgent wrapper that forwards the latest chat message to a streaming agent.

    The wrapped ``agent`` must provide ``stream(request, stream_mode="updates")``
    yielding dicts whose values are dict-like node outputs; any "messages"
    entries in those outputs are collected into the response.
    """

    def __init__(self, agent=None):
        """Store the agent that ``predict`` streams from.

        ``agent`` defaults to None so the class can still be constructed with
        no arguments; ``predict`` then fails with an explicit error.
        """
        super().__init__()
        self.agent = agent

    def predict(
        self,
        messages: list[ChatAgentMessage],
        context: Optional[ChatContext] = None,
        custom_inputs: Optional[dict[str, Any]] = None,
    ) -> ChatAgentResponse:
        """Send the most recent message to the agent and return its messages.

        Raises:
        RuntimeError: if no agent was supplied at construction time.
        """
        if self.agent is None:
            raise RuntimeError(
                "ProdChatbot has no agent configured; pass one to ProdChatbot(agent=...)"
            )

        # Slice rather than index: the converter is given a list, so the last
        # message has to stay wrapped in a one-element list.
        # NOTE(review): confirm against the ChatAgent docs whether the full
        # history (`messages`) should be forwarded instead of only the last turn.
        request = {"messages": self._convert_messages_to_dict(messages[-1:])}

        # Separate name so the incoming `messages` argument is not overwritten
        response_messages = []
        for event in self.agent.stream(request, stream_mode="updates"):
            for node_data in event.values():
                response_messages.extend(
                    ChatAgentMessage(**msg) for msg in node_data.get("messages", [])
                )
        return ChatAgentResponse(messages=response_messages)

def production_chatbot(question, user_id=None):
    """Production-ready chatbot function.

    Parameters:
    question (str): The customer question to answer.
    user_id: Accepted for callers that pass it; not used by the current logic.

    Returns:
    The model's response text, or whatever ``query_foundation_model`` returns
    when the query fails.
    """
    # The 'Logic': fill the selected template and query the model
    formatted_prompt = best_template.format(question=question)
    return query_foundation_model(formatted_prompt, temperature=0.7)

# Questions used to exercise the production chatbot end to end
test_questions = [
    "I want to return my order, how do I do that?",
    "What's your customer service phone number?",
    "My package is late, what should I do?",
]

print("\n=== PRODUCTION TESTING ===")
# Number the tests from 1 and give each call its own synthetic user id
for i, question in enumerate(test_questions, start=1):
    print(f"\nTest {i}: {question}")
    response = production_chatbot(question, user_id=f"test_user_{i}")
    print(f"Response: {response}")