<h1 style="text-align: center; font-size: 50px;"> 🤖 MLFlow Registration for Multimodal RAG</h1>

In [1]:
%pip install -r ../requirements.txt --quiet

Note: you may need to restart the kernel to use updated packages.


# MLflow Model Service

In this section, we demonstrate how to deploy a RAG-based chatbot service. This service provides a REST API endpoint that allows users to query the knowledge base with natural language questions, upload new documents to the knowledge base, and manage conversation history, all with built-in safeguards against sensitive information and toxicity. This service encapsulates all the functionality we developed in this notebook, including the document retrieval system, RAG-based question answering capabilities, and Galileo integration for protection, observation and evaluation. It demonstrates how to use our ChatbotService from the src/service directory. 

## Step 0: Imports and Environment Setup

In [2]:
# === Standard Library Imports ===
import gc
import json
import logging
import math
import os
import sys
import time
import tempfile
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, TypedDict

# === Third-Party Library Imports ===
import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import torch
from langchain_core.embeddings import Embeddings
from langchain.schema.document import Document
from langchain.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings
from mlflow.models.signature import ModelSignature
from mlflow.tracking import MlflowClient
from mlflow.types import ColSpec, DataType, Schema, TensorSpec
from PIL import Image as PILImage
from sentence_transformers import CrossEncoder, SentenceTransformer
from transformers import pipeline, AutoImageProcessor, AutoModel, AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, SiglipModel, SiglipProcessor

# === Project-Specific Imports ===
# Add the project root to the system path to allow importing from 'src'
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))

from src.components import SemanticCache, SiglipEmbeddings
from src.local_genai_judge import LocalGenAIJudge
from src.utils import (
    configure_hf_cache,
)

2025-07-27 09:28:59.824944: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-27 09:28:59.885032: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753608539.915328   10683 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753608539.927023   10683 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753608539.965360   10683 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [3]:
# Notebook-scoped logger that writes timestamped records to stdout.
logger = logging.getLogger("multimodal_rag_register_notebook")
logger.setLevel(logging.INFO)
# Keep records away from the root logger so each message is printed only once.
logger.propagate = False

# Attach the stdout handler a single time; re-running this cell must not stack handlers.
if len(logger.handlers) == 0:
    stream_handler = logging.StreamHandler(sys.stdout)
    formatter = logging.Formatter(
        fmt="%(asctime)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

## Step 1: Configurations

In [4]:
# --- MLflow Configuration ---
MODEL_NAME = "AIStudio-Multimodal-Chatbot-Model"
RUN_NAME = f"Register_{MODEL_NAME}"
EXPERIMENT_NAME = "AIStudio-Multimodal-Chatbot-Experiment"

# Directory with the InternVL weights that is passed to `MultimodalRagModel.log_model`.
# The default matches the original workspace layout; set the LOCAL_MODEL_PATH environment
# variable to point at a different location without editing the notebook.
LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH", "/home/jovyan/datafabric/InternVL3-8B-Instruct")

# Set MLflow tracking URI and experiment.
# This should be configured for your environment, e.g., a remote server or local file path.
mlflow.set_tracking_uri(os.getenv("MLFLOW_TRACKING_URI", "/phoenix/mlflow"))
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

logger.info(f"Using MLflow tracking URI: {mlflow.get_tracking_uri()}")
logger.info(f"Using MLflow experiment: '{EXPERIMENT_NAME}'")

Traceback (most recent call last):
  File "/opt/conda/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 329, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 427, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 1373, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 1366, in _read_helper
    result = read_yaml(root, file_name)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.1

2025-07-27 09:29:02 - INFO - Using MLflow tracking URI: /phoenix/mlflow
2025-07-27 09:29:02 - INFO - Using MLflow experiment: 'AIStudio-Multimodal-Chatbot-Experiment'


In [5]:
# Suppress every Python warning for the rest of the kernel session.
# Note that this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")


In [6]:
# Timestamped marker for the start of the run.
logger.info("Notebook execution started.")

2025-07-27 09:29:02 - INFO - Notebook execution started.


In [7]:
# Project helper imported from src.utils.
# NOTE(review): presumably configures where Hugging Face downloads are cached — confirm in src/utils.
configure_hf_cache()

## Step 2: MLflow Model Setup

In [8]:
class MultimodalRagModel(mlflow.pyfunc.PythonModel):
    """
    An MLflow PythonModel that encapsulates the entire Multimodal RAG pipeline.

    The workflow mirrors `run-notebook.ipynb`: multi-stage retrieval (vector search
    followed by cross-encoder reranking), image lookup restricted to the selected
    sources, multimodal generation with an InternVL model, and optional scoring of
    each answer by a judge that shares the generation model.

    Expected Artifacts during logging/loading:
      - "local_model_dir": Directory with the InternVL weights, tokenizer and image processor.
      - "e5_model_dir": Saved text-embedding model used for the text store and the cache.
      - "siglip_model_dir": Saved SigLIP model and processor used for image retrieval.
      - "cross_encoder_dir": Saved cross-encoder used for reranking.
      - "chroma_dir": Path to the persisted Chroma vectorstore directory.
      - "context_dir": Path to the root data directory, containing the `images` subdirectory.
      - "cache_dir": Path to the directory for the semantic cache.
    """

    # --------------------------------------------------------------------------
    # Helper Classes (Encapsulated from the original notebook)
    # --------------------------------------------------------------------------

    class InternVLMM:
        """Minimal, self-contained multimodal QA wrapper around a locally stored InternVL chat model."""

        def __init__(self, model_path: str, device: str, cache: Any):
            """
            Store the configuration and load tokenizer, image processor and model right away.

            Args:
                model_path: Directory holding the model files.
                device: "cuda" or "cpu"; 4-bit quantization is only configured for "cuda".
                cache: Object exposing `get(query, threshold=...)`, `set(query, value)` and `delete(query)`.
            """
            self.device = device
            self.model = None
            self.model_path = model_path
            self.tok = None
            self.image_processor = None
            self.cache = cache
            self._load()

        def generate(self, query: str, context: Dict[str, Any], force_regenerate: bool = False) -> Dict[str, Any]:
            """
            Answer `query`, serving from the semantic cache when possible.

            Args:
                query: The user's question.
                context: Keyword arguments forwarded to `_retrieve_mm`
                    (`text_db`, `image_db`, `siglip_embeds`, `cross_encoder`, ...).
                force_regenerate: When True, skip the cache lookup, delete the cached
                    entry for this query and run the full pipeline.

            Returns:
                A dict with "reply" (str) and "used_images" (list of image paths).
                Only successful generations are written to the cache.
            """
            if force_regenerate:
                logger.info(f"Forced regeneration for query: '{query}'. Clearing old cache entry.")
                self.cache.delete(query)
            else:
                cached_result = self.cache.get(query, threshold=0.92)
                if cached_result:
                    logger.info(f"SEMANTIC CACHE HIT for query: '{query}'")
                    return cached_result

            logger.info(f"CACHE MISS for query: '{query}'. Running full pipeline.")
            if self.model is None or self.tok is None:
                return {"reply": "Error: model not initialised.", "used_images": []}

            hits = self._retrieve_mm(query, **context)
            docs, images = hits["docs"], hits["images"]

            if not docs and not images:
                return {"reply": "I don't know based on the provided context.", "used_images": []}

            # Build prompt: every retrieved chunk is wrapped in a tag carrying its source name
            # so the model can cite it in the "Source Documents" section.
            context_str = "\n\n".join(
                f"<source_document name=\"{d.metadata.get('source', 'unknown')}\">\n{d.page_content}\n</source_document>"
                for d in docs
            )

            # The visual-analysis section is only requested when images were retrieved.
            visual_analysis_prompt = ""
            if images:
                visual_analysis_prompt = """
                ## **Visual Analysis**
                #### Answer Here
                First, provide a detailed description of what is shown in the provided image(s), based only on what you can see.
                """

            # Construct the final prompt
            user_content = f"""
                <task_instructions>
                Your response must follow this exact structure:
                {visual_analysis_prompt}
                ## **Synthesized Answer**
                #### Answer Here
                Next, answer the user's original query. Your answer must be synthesized from the provided text `<context>`. Use the visual analysis only if it is relevant. If the image is not relevant, rely solely on the text context to formulate your answer.
        
                ## **Source Documents**
                #### Answer Here
                At the very end of your response, cite the source from the context in brackets and backticks, like this: [`source-file-name.md`].
                </task_instructions>
        
                <context>
                    {context_str}
                </context>
        
                <user_query>
                     {query}
                </user_query>
        
                Now, generate the response following all instructions.
                """
            SYSTEM_PROMPT = """
                You are AI Studio DevOps Assistant. Your function is to analyze images and text, then answer questions based ONLY on the provided materials.
                
                **PERMANENT INSTRUCTIONS:**
                1.  **Analyze and Answer from Context**: Your entire response MUST be derived thoroughly from the provided `<context>` block or the user's image(s).
                2.  **Follow Output Structure**: You MUST follow the multi-part response structure outlined in the user's message. Completing all sections is mandatory.
                3.  **No External Knowledge**: You MUST NOT use any information outside the provided materials.
                4.  **No Hallucination**: Do not invent or assume any details. If information is not present, it does not exist.
                5.  **Handle Missing Information**: If the provided context or image(s) do not contain the answer, your ONLY response will be: "Based on the provided context, I cannot answer this question." Do not add any other words or explanation.
                """
            conversation = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_content}
            ]

            # NOTE(review): the prompt is rendered with the tokenizer's chat template and then
            # handed to `model.chat`; if the remote-code `chat` applies its own conversation
            # template, the template is applied twice — confirm against the model's code.
            prompt = self.tok.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=True
            )

            # Generate
            try:
                self._clear_cuda()
                pixel_values = self._process_images(images) if images else None
                reply = self.model.chat(
                    self.tok, pixel_values, prompt,
                    generation_config=dict(
                        max_new_tokens=16384,
                        pad_token_id=self.tok.pad_token_id,
                        eos_token_id=self.tok.eos_token_id,
                        repetition_penalty=1.10,
                        do_sample=True,         # Required for temperature / top_p to take effect.
                        temperature=0.3,        # Low temperature keeps answers close to the context.
                        top_p=0.9,              # Nucleus sampling cut-off.
                    ),
                )
                self._clear_cuda()
                result_dict = {"reply": reply, "used_images": images}

                self.cache.set(query, result_dict)
                return result_dict
            except RuntimeError as e:
                # Failed generations are reported to the caller but deliberately not cached.
                logger.error("InternVL generation failed: %s", e)
                return {"reply": f"Error during generation: {e}", "used_images": images}

        def _retrieve_mm(self, query: str, text_db: Chroma, image_db: Chroma, siglip_embeds: Any, cross_encoder: Any, k_txt: int = 4, k_img: int = 8, fetch_k: int = 20) -> Dict[str, Any]:
            """
            Performs hybrid retrieval for text and associated images.

            Args:
                query: The user's question.
                text_db: Chroma collection with the text chunks.
                image_db: Chroma collection whose documents store image paths in `page_content`.
                siglip_embeds: Embedding object exposing `embed_query` for the image collection.
                cross_encoder: Reranker exposing `predict` on (query, passage) pairs.
                k_txt: Number of text chunks kept after reranking.
                k_img: Maximum number of images returned.
                fetch_k: Number of candidates fetched from the text collection before reranking.

            Returns:
                {"docs": selected text documents, "images": list of image paths}.
            """
            # 1. Coarse recall (text)
            docs_and_init = text_db.similarity_search_with_score(query, k=fetch_k)
            if not docs_and_init:
                return {"docs": [], "images": []}
            docs, distances = zip(*docs_and_init)

            # 2. Rerank (text)
            rerank_scores = cross_encoder.predict([(query, d.page_content) for d in docs])

            # 3. Hybrid scoring.
            # Chroma's `similarity_search_with_score` returns a *distance* (lower means more
            # similar) while the cross-encoder score is higher-is-better. The distance is
            # therefore subtracted so that both terms agree with the descending sort below.
            hybrid_scores = [
                0.6 * float(rerank) - 0.4 * float(distance)
                for distance, rerank in zip(distances, rerank_scores)
            ]

            # 4. Select top-k text
            scored_docs = sorted(zip(docs, hybrid_scores), key=lambda x: x[1], reverse=True)
            selected_docs = [doc for doc, score in scored_docs[:k_txt]]

            # 5. Image retrieval, restricted to the sources of the selected text chunks.
            # Chunks without a "source" entry are skipped instead of raising KeyError, and
            # duplicates are removed while keeping the ranking order.
            sources = list(dict.fromkeys(
                src
                for src in ((d.metadata or {}).get("source") for d in selected_docs)
                if src is not None
            ))
            if not sources:
                # An empty `$in` filter is not a valid query; there is nothing to match anyway.
                return {"docs": selected_docs, "images": []}

            q_emb = siglip_embeds.embed_query(query)
            img_hits = image_db.similarity_search_by_vector(q_emb, k=k_img * 2, filter={"source": {"$in": sources}})
            images = [img.page_content for img in img_hits[:k_img]]

            return {"docs": selected_docs, "images": images}

        def _load(self):
            """Load tokenizer, image processor and model from `self.model_path`."""
            logger.info("Loading %s...", self.model_path)
            self._clear_cuda()

            self.tok = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True)
            # Generation passes `pad_token_id`; fall back to EOS when the tokenizer defines none.
            if self.tok.pad_token is None:
                self.tok.pad_token = self.tok.eos_token

            self.image_processor = AutoImageProcessor.from_pretrained(self.model_path, trust_remote_code=True, use_fast=True)

            # 4-bit NF4 quantization is only configured on CUDA; on CPU the model loads unquantized.
            q_cfg = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.bfloat16,
                bnb_4bit_use_double_quant=True,
            ) if self.device == "cuda" else None

            self.model = AutoModel.from_pretrained(
                self.model_path,
                quantization_config=q_cfg,
                torch_dtype=(torch.bfloat16 if self.device == "cuda" else torch.float32),
                low_cpu_mem_usage=True,
                use_flash_attn=False,
                trust_remote_code=True,
                device_map="auto" if self.device == "cuda" else None,
            ).eval()
            logger.info("Model loaded on %s.", self.device)

        def _process_images(self, image_paths: List[str]):
            """
            Load the given image files and convert them to a model-ready tensor.

            Returns:
                The processor's `pixel_values` moved to `self.device` with the dtype of the
                model's first parameter, or None when the list is empty or processing fails.
            """
            if not image_paths:
                return None
            try:
                pil_images = []
                for path in image_paths:
                    # `convert` returns an independent in-memory copy, so the file handle
                    # can be closed immediately instead of lingering until garbage collection.
                    with PILImage.open(path) as img:
                        pil_images.append(img.convert("RGB"))
                processed_data = self.image_processor(images=pil_images, return_tensors="pt")
                # Ensure pixel values are on the same device and dtype as the model
                pixel_values = processed_data['pixel_values'].to(device=self.device, dtype=next(self.model.parameters()).dtype)
                return pixel_values
            except Exception as e:
                logger.error("Image processing failed: %s", e)
                return None

        def _clear_cuda(self):
            """Release cached CUDA memory and wait for pending kernels; no-op without CUDA."""
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
                torch.cuda.synchronize()

    # --------------------------------------------------------------------------
    # MLflow pyfunc Methods
    # --------------------------------------------------------------------------

    def load_context(self, context: mlflow.pyfunc.PythonModelContext) -> None:
        """
        This method is called when loading an MLflow model. It initializes all
        necessary components using the artifacts logged with the model.

        Args:
            context: MLflow context whose `artifacts` mapping must contain the keys
                listed in the class docstring.
        """
        logger.info("--- Initializing MultimodalRagModel context ---")

        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        logger.info(f"Running on device: {self.device}")

        # The key "local_model_dir" must match the key used in the `artifacts` dict during logging.
        model_artifact_path = Path(context.artifacts["local_model_dir"])

        self.model_path = model_artifact_path.resolve()
        logger.info(f"Resolved local model path to: {self.model_path}")

        self.chroma_dir = Path(context.artifacts["chroma_dir"])
        self.context_dir = Path(context.artifacts["context_dir"])
        self.cache_dir = Path(context.artifacts["cache_dir"])
        logger.info(f"Artifacts loaded: chroma_dir='{self.chroma_dir}', context_dir='{self.context_dir}', cache_dir='{self.cache_dir}'")

        # Embedding models and the reranker are loaded from the bundled artifacts.
        logger.info("Loading embedding models and cross-encoder from local artifacts...")

        # 1. Get the local paths from the MLflow context
        e5_model_path = context.artifacts["e5_model_dir"]
        siglip_model_path = context.artifacts["siglip_model_dir"]
        cross_encoder_path = context.artifacts["cross_encoder_dir"]

        # 2. Initialize models using the local paths
        self.text_embed_model = HuggingFaceEmbeddings(
            model_name=e5_model_path,
            model_kwargs={"device": self.device}
        )
        self.siglip_embed_model = SiglipEmbeddings(
            model_id=siglip_model_path,
            device=self.device
        )
        self.cross_encoder = CrossEncoder(
            cross_encoder_path,
            device=self.device
        )

        logger.info("✅ Models loaded successfully from artifacts.")

        # Both collections live in the same persisted Chroma directory.
        logger.info("Loading ChromaDB vector stores...")
        self.text_db = Chroma(
            collection_name="mm_text",
            persist_directory=str(self.chroma_dir),
            embedding_function=self.text_embed_model,
        )
        self.image_db = Chroma(
            collection_name="mm_image",
            persist_directory=str(self.chroma_dir),
            embedding_function=self.siglip_embed_model,
        )
        logger.info(f"Text DB count: {self.text_db._collection.count()}, Image DB count: {self.image_db._collection.count()}")

        self.cache = SemanticCache(persist_directory=self.cache_dir, embedding_function=self.text_embed_model)

        # `InternVLMM` declares `model_path: str`, so hand it the resolved path as a string.
        self.mm_llm = self.InternVLMM(model_path=str(self.model_path), device=self.device, cache=self.cache)

        logger.info("Initializing evaluation judge by sharing the main model...")
        try:
            # The judge reuses the already loaded model and tokenizer instead of loading its own.
            self.judge = LocalGenAIJudge(
                model=self.mm_llm.model,
                tokenizer=self.mm_llm.tok
            )
            logger.info("✅ Evaluation judge initialized successfully.")

        except Exception as e:
            # Evaluation is optional: `predict` returns null scores when no judge is available.
            logger.error(f"Failed to initialize evaluation judge: {e}")
            self.judge = None

        logger.info("--- Context initialization complete ---")


    def predict(self, context: mlflow.pyfunc.PythonModelContext, model_input: pd.DataFrame) -> pd.DataFrame:
        """
        MLflow inference entrypoint.

        Args:
            context: MLflow context (unused; all state was prepared in `load_context`).
            model_input: DataFrame with a "query" column and an optional boolean
                "force_regenerate" column (missing or null values count as False).

        Returns:
            A DataFrame with one row per query and the columns "reply", "used_images",
            "faithfulness" and "relevance" (the two scores are None without a judge).
        """
        logger.info("Received prediction request.")
        queries = model_input["query"].tolist()

        # A missing column or a null cell means "do not force"; a raw NaN would be truthy.
        if "force_regenerate" in model_input:
            force_regenerate = [
                bool(flag) if pd.notna(flag) else False
                for flag in model_input["force_regenerate"].tolist()
            ]
        else:
            force_regenerate = [False] * len(queries)
        results = []

        retrieval_context = {
            "text_db": self.text_db,
            "image_db": self.image_db,
            "siglip_embeds": self.siglip_embed_model,
            "cross_encoder": self.cross_encoder
        }

        for i, query in enumerate(queries):
            logger.info(f"Processing query: '{query}'")

            # 1. Generate the answer. Work on a shallow copy so the evaluation scores added
            #    below never leak into the object held by the semantic cache.
            response_dict = dict(self.mm_llm.generate(query, retrieval_context, force_regenerate=force_regenerate[i]))

            # Re-run retrieval to get the context string for evaluation
            retrieved_info = self.mm_llm._retrieve_mm(query, **retrieval_context)
            context_str = "\n\n".join(d.page_content for d in retrieved_info["docs"])

            # 2. Run evaluation if the judge was loaded successfully
            if self.judge:
                # Create a single-row DataFrame for the judge
                eval_df = pd.DataFrame([{
                    "questions": query,
                    "result": response_dict["reply"],
                    "source_documents": context_str
                }])

                # Get scores and add them to the response dictionary
                response_dict["faithfulness"] = self.judge.evaluate_faithfulness(eval_df).iloc[0]
                response_dict["relevance"] = self.judge.evaluate_relevance(eval_df).iloc[0]
            else:
                # Provide default null values if the judge isn't available
                response_dict["faithfulness"] = None
                response_dict["relevance"] = None

            results.append(response_dict)

        return pd.DataFrame(results)

    @classmethod
    def log_model(cls, model_name: str, local_model_path: str) -> None:
        """
        Helper class method to log the MultimodalRagModel to MLflow.

        Supporting models are downloaded into a temporary directory that is removed
        after logging. Project paths are resolved against the parent of the current
        working directory, i.e. the notebook is expected to run from a subdirectory
        of the project root.

        Args:
            model_name: Artifact path under which the model is logged in the active run.
            local_model_path: Location of the InternVL model, logged as "local_model_dir".

        Raises:
            FileNotFoundError: If a required local file or directory does not exist.
                This is checked before any model is downloaded.
        """
        logger.info(f"--- Logging '{model_name}' to MLflow ---")

        # Resolve every project-relative input once so artifacts, requirements and code
        # paths all refer to the same root.
        project_root = Path.cwd().parent.resolve()
        requirements_file = project_root / "requirements.txt"
        source_dir = project_root / "src"
        chroma_dir = project_root / "data" / "chroma_store"
        context_dir = project_root / "data" / "context"
        cache_dir = chroma_dir / "semantic_cache"

        required_paths = {
            "chroma_dir": chroma_dir,
            "context_dir": context_dir,
            "cache_dir": cache_dir,
            "pip_requirements": requirements_file,
            "code_paths": source_dir,
        }
        # Only check the model location when it is a plain filesystem path, not a URI.
        if ":/" not in str(local_model_path):
            required_paths["local_model_dir"] = Path(local_model_path)

        missing = [f"{key}: {path}" for key, path in required_paths.items() if not path.exists()]
        if missing:
            raise FileNotFoundError("Required path(s) not found -> " + "; ".join(missing))

        # Use a temporary directory that gets automatically deleted
        with tempfile.TemporaryDirectory() as temp_dir:
            logger.info(f"Created temporary directory for models: {temp_dir}")
            temp_path = Path(temp_dir)

            # --- 1. Download models into the temporary directory ---
            # e5 model
            e5_path = temp_path / "e5-large-v2"
            e5_model = SentenceTransformer("intfloat/e5-large-v2")
            e5_model.save(str(e5_path))
            logger.info(f"✅ Temporarily saved e5-large-v2 to {e5_path}")

            # SigLIP model
            siglip_path = temp_path / "siglip2-base-patch16-224"
            SiglipModel.from_pretrained("google/siglip2-base-patch16-224").save_pretrained(siglip_path)
            SiglipProcessor.from_pretrained("google/siglip2-base-patch16-224").save_pretrained(siglip_path)
            logger.info(f"✅ Temporarily saved SigLIP to {siglip_path}")

            # Cross-Encoder model
            cross_encoder_path = temp_path / "ms-marco-MiniLM-L-6-v2"
            CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2").save(str(cross_encoder_path))
            logger.info(f"✅ Temporarily saved Cross-Encoder to {cross_encoder_path}")

            # --- 2. Define artifacts using paths from the temporary directory ---
            artifacts = {
                "local_model_dir": local_model_path,
                "e5_model_dir": str(e5_path),
                "siglip_model_dir": str(siglip_path),
                "cross_encoder_dir": str(cross_encoder_path),
                "chroma_dir": str(chroma_dir),
                "context_dir": str(context_dir),
                "cache_dir": str(cache_dir),
            }

            # --- 3. Log the model (MLflow will copy from the temp dir) ---
            # NOTE(review): `predict` treats "force_regenerate" as optional and returns
            # "used_images" as a list, while the signature declares a required boolean and
            # a string column — confirm which contract downstream consumers rely on.
            input_schema = Schema([
                ColSpec(DataType.string, "query"),
                ColSpec(DataType.boolean, "force_regenerate")
            ])
            output_schema = Schema([
                ColSpec(DataType.string, "reply"),
                ColSpec(DataType.string, "used_images"),
                ColSpec(DataType.double, "faithfulness"),
                ColSpec(DataType.double, "relevance"),
            ])
            signature = ModelSignature(inputs=input_schema, outputs=output_schema)

            mlflow.pyfunc.log_model(
                artifact_path=model_name,
                python_model=cls(),
                artifacts=artifacts,
                pip_requirements=str(requirements_file),
                signature=signature,
                code_paths=[str(source_dir)],
            )

        # The temporary directory and all its contents are automatically deleted here
        logger.info(f"✅ Successfully logged '{model_name}' and cleaned up temporary files.")

## Step 3: Start Run, Log & Register Model

In [9]:
%%time

# --- Start MLflow Run and Log the Model ---
# Everything happens inside one run: the pyfunc model and its artifacts are logged
# first, then that logged model is registered under MODEL_NAME.
try:
    with mlflow.start_run(run_name=RUN_NAME) as run:
        run_id = run.info.run_id
        logger.info("Started MLflow run: %s", run_id)

        # Log the model and all of its artifacts through the class helper.
        MultimodalRagModel.log_model(local_model_path=LOCAL_MODEL_PATH, model_name=MODEL_NAME)

        model_uri = "runs:/{}/{}".format(run_id, MODEL_NAME)
        logger.info("Registering model from URI: %s", model_uri)

        # Register the model in the MLflow Model Registry.
        mlflow.register_model(name=MODEL_NAME, model_uri=model_uri)
        logger.info("✅ Successfully registered model '%s'", MODEL_NAME)

except FileNotFoundError as exc:
    # Missing project files get a dedicated hint; everything else is logged with a traceback.
    logger.error("Error: A required file or directory was not found. Please ensure the project structure is correct.")
    logger.error("Details: %s", exc)
except Exception as exc:
    logger.exception("An unexpected error occurred during the MLflow run: %s", exc)

2025-07-27 09:29:02 - INFO - Started MLflow run: 7ba5a0f2454a460e85ff2f294ed62d31
2025-07-27 09:29:02 - INFO - --- Logging 'AIStudio-Multimodal-Chatbot-Model' to MLflow ---
2025-07-27 09:29:02 - INFO - Created temporary directory for models: /tmp/tmp76qsgq6l
2025-07-27 09:29:07 - INFO - ✅ Temporarily saved e5-large-v2 to /tmp/tmp76qsgq6l/e5-large-v2


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


2025-07-27 09:29:14 - INFO - ✅ Temporarily saved SigLIP to /tmp/tmp76qsgq6l/siglip2-base-patch16-224
2025-07-27 09:29:15 - INFO - ✅ Temporarily saved Cross-Encoder to /tmp/tmp76qsgq6l/ms-marco-MiniLM-L-6-v2


Downloading artifacts:   0%|          | 0/49 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/11 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/16 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/741 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2025-07-27 09:33:25 - INFO - ✅ Successfully logged 'AIStudio-Multimodal-Chatbot-Model' and cleaned up temporary files.
2025-07-27 09:33:25 - INFO - Registering model from URI: runs:/7ba5a0f2454a460e85ff2f294ed62d31/AIStudio-Multimodal-Chatbot-Model


Registered model 'AIStudio-Multimodal-Chatbot-Model' already exists. Creating a new version of this model...


2025-07-27 09:33:25 - INFO - ✅ Successfully registered model 'AIStudio-Multimodal-Chatbot-Model'
CPU times: user 5.2 s, sys: 56.2 s, total: 1min 1s
Wall time: 4min 23s


Created version '12' of model 'AIStudio-Multimodal-Chatbot-Model'.


In [10]:
# --- Retrieve the latest version from the Model Registry ---
# On any failure `model_uri_registry` is set to None so the following cells can skip loading.
try:
    client = MlflowClient()
    # `get_latest_versions(..., stages=[...])` depends on the deprecated model-stage API.
    # List every registered version instead and pick the highest version number;
    # versions are compared as integers because the registry reports them as strings.
    versions = client.search_model_versions(f"name='{MODEL_NAME}'")
    if not versions:
        raise RuntimeError(f"No registered versions found for model '{MODEL_NAME}'.")

    latest_version = max(versions, key=lambda mv: int(mv.version))
    logger.info(f"Found latest version '{latest_version.version}' for model '{MODEL_NAME}' in stage '{latest_version.current_stage}'.")
    # `source` is the location of the logged model that this version was created from.
    model_uri_registry = latest_version.source

except Exception as e:
    logger.error(f"Failed to retrieve model from registry: {e}", exc_info=True)
    model_uri_registry = None # Ensure variable exists


2025-07-27 09:33:26 - INFO - Found latest version '12' for model 'AIStudio-Multimodal-Chatbot-Model' in stage 'None'.


In [11]:
# Load the registered model as a generic pyfunc; `loaded_model` ends up as None whenever
# the registry lookup or the load itself failed, so later cells can test it directly.
if not model_uri_registry:
    logger.warning("Skipping model loading due to previous errors.")
    loaded_model = None
else:
    try:
        logger.info("Loading model from: %s", model_uri_registry)
        loaded_model = mlflow.pyfunc.load_model(model_uri=model_uri_registry)
        logger.info("✅ Successfully loaded model from registry.")
    except Exception as exc:
        logger.exception("Failed to load model from registry URI: %s", exc)
        loaded_model = None

2025-07-27 09:33:26 - INFO - Loading model from: /phoenix/mlflow/745750915506484464/7ba5a0f2454a460e85ff2f294ed62d31/artifacts/AIStudio-Multimodal-Chatbot-Model
2025-07-27 09:33:26 - INFO - --- Initializing MultimodalRagModel context ---
2025-07-27 09:33:26 - INFO - Running on device: cuda
2025-07-27 09:33:26 - INFO - Resolved local model path to: /phoenix/mlflow/745750915506484464/7ba5a0f2454a460e85ff2f294ed62d31/artifacts/AIStudio-Multimodal-Chatbot-Model/artifacts/InternVL3-8B-Instruct
2025-07-27 09:33:26 - INFO - Artifacts loaded: chroma_dir='/phoenix/mlflow/745750915506484464/7ba5a0f2454a460e85ff2f294ed62d31/artifacts/AIStudio-Multimodal-Chatbot-Model/artifacts/chroma_store', context_dir='/phoenix/mlflow/745750915506484464/7ba5a0f2454a460e85ff2f294ed62d31/artifacts/AIStudio-Multimodal-Chatbot-Model/artifacts/context', cache_dir='/phoenix/mlflow/745750915506484464/7ba5a0f2454a460e85ff2f294ed62d31/artifacts/AIStudio-Multimodal-Chatbot-Model/artifacts/semantic_cache'
2025-07-27 09:33

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

2025-07-27 09:34:54 - INFO - Model loaded on cuda.
2025-07-27 09:34:54 - INFO - Initializing evaluation judge by sharing the main model...
2025-07-27 09:34:54 - INFO - ✅ Evaluation judge initialized successfully.
2025-07-27 09:34:54 - INFO - --- Context initialization complete ---
2025-07-27 09:34:54 - INFO - ✅ Successfully loaded model from registry.


## Step 4: Display Results

In [12]:
# --- Helper Function to Display Test Results ---
def display_results(query: str, result_df: pd.DataFrame):
    """Print the query, the model's reply, and the paths of any retrieved images.

    Only the first row of ``result_df`` is displayed.

    Args:
        query: The question that was sent to the model.
        result_df: Prediction output with a "reply" column and a
            "used_images" column. "used_images" may hold either an actual
            list/tuple of paths or the string representation of a list
            (e.g. ``"['a.png', 'b.png']"``).

    Returns:
        None. All output is written to stdout.
    """
    # Function-scope import so this cell does not depend on the imports cell.
    import ast

    if result_df.empty:
        print("Received an empty result.")
        return

    reply = result_df["reply"].iloc[0]
    raw_images = result_df["used_images"].iloc[0]

    if isinstance(raw_images, (list, tuple)):
        # Already a sequence of paths; no parsing required.
        image_paths = list(raw_images)
    elif isinstance(raw_images, str) and raw_images.startswith('['):
        # The list arrives serialized as text. literal_eval only accepts Python
        # literals, so model-produced text can never execute code here
        # (unlike eval()). Anything that is not a plain list literal is
        # treated as "no images" rather than aborting the display.
        try:
            parsed = ast.literal_eval(raw_images)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            parsed = []
        image_paths = parsed if isinstance(parsed, list) else []
    else:
        image_paths = []

    print("---" * 20)
    print(f"❓ Query:\n{query}\n")
    print(f"🤖 Reply:\n{reply}\n")

    if image_paths:
        print(f"🖼️ Displaying {len(image_paths)} retrieved image(s):")
        # You can integrate the display_images function from the original notebook here
        # For simplicity, we'll just print the paths.
        for path in image_paths:
            print(f"  - {path}")
    else:
        print("▶ No images were retrieved for this query.")
    print("---" * 20 + "\n")

In [13]:
# Smoke-test the reloaded model on a few sample questions and print each
# answer together with its faithfulness and relevance scores.
if loaded_model:
    logger.info("Running sample inference with the loaded model...")

    sample_queries = [
        "What are the AI Blueprints Repository best practices?",
        "What are some feature flags that i can enable in AIStudio?",
        "How do i manually clean my environment without hooh?",
    ]

    for query in sample_queries:
        try:
            # Each request is a one-row DataFrame carrying the query and an
            # explicit force_regenerate flag (False for these sample calls).
            input_payload = pd.DataFrame([{"query": query, "force_regenerate": False}])

            result = loaded_model.predict(input_payload)

            # Read the first row by position (.iloc) rather than by label, the
            # same way display_results reads this frame, so the lookup does
            # not depend on the returned index containing the label 0.
            print(
                "faithfulness:", result["faithfulness"].iloc[0],
                "relevance: ", result["relevance"].iloc[0],
            )
            display_results(query, result)
        except Exception as e:
            # One failing query should not stop the remaining samples.
            logger.error(f"Prediction failed for query '{query}': {e}", exc_info=True)

else:
    logger.warning("Skipping sample inference because the model was not loaded.")

2025-07-27 09:34:54 - INFO - Running sample inference with the loaded model...
2025-07-27 09:34:54 - INFO - Received prediction request.
2025-07-27 09:34:54 - INFO - Processing query: 'What are the AI Blueprints Repository best practices?'
2025-07-27 09:34:54 - INFO - Most similar cached query: 'How do i run blueprints locally?' (Similarity: 0.6612)
2025-07-27 09:34:54 - INFO - CACHE MISS for query: 'What are the AI Blueprints Repository best practices?'. Running full pipeline.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


2025-07-27 09:35:02 - INFO - Added query to semantic cache: 'What are the AI Blueprints Repository best practices?'
DEBUG: Judge model raw responses: ['0.9']
DEBUG: Judge model raw responses: ['0.7']
faithfulness: 0.9 relevance:  0.7
------------------------------------------------------------
❓ Query:
What are the AI Blueprints Repository best practices?

🤖 Reply:
## **Synthesized Answer**
The AI Blueprints Repository best practices include:

1. Using `logging` module for enhanced maintainability instead of `print()`.
2 Use the Jupyter notebook if working with larger inputs.
* Define constants at the top to avoid hardcoded values.
* Utilize `pathlib` library for OS compatibility in file paths.
* Implement two main notebooks (`run-workflow.ipynb` and `register-model.ipynb`) per project.

## **Source Documents**

[`Data-Science-Team/Best-Practices-for-HP-AI-Studio-Blueprints-Repository.md`]

▶ No images were retrieved for this query.
-----------------------------------------------------

Built with ❤️ using Z by HP AI Studio.

## Step 5: Evaluate Hallucinations & Relevance

In [15]:
# Evaluation cell: run the sample queries with force_regenerate=True, then
# reopen the original MLflow run and log the average scores plus the full
# results table to it.

# Both the loaded model and the original run_id must be available.
if not (loaded_model and 'run_id' in locals()):
    logger.warning("Skipping logging because the model was not loaded or run_id was not found.")
else:
    logger.info(f"--- Reopening original run ({run_id}) to log pre-computed evaluations ---")

    # Evaluation dataset: one row per query, every row with force_regenerate=True.
    evaluation_payload = pd.DataFrame(
        {
            "query": [
                "What are the AI Blueprints Repository best practices?",
                "What are some feature flags that i can enable in AIStudio?",
                "How do i manually clean my environment without hooh?",
            ],
            "force_regenerate": [True, True, True],
        }
    )

    # predict() returns the replies together with their embedded scores.
    results_df = loaded_model.predict(evaluation_payload)

    # Carry the original query over so each row of the logged table is readable.
    results_df["query"] = evaluation_payload["query"]

    # Reopen the existing run by ID and attach the evaluation to it.
    with mlflow.start_run(run_id=run_id) as run:
        logger.info("Successfully reopened existing run. Logging metrics and artifacts...")

        # Column means over all evaluated queries.
        avg_faithfulness, avg_relevance = (
            results_df[score_column].mean()
            for score_column in ("faithfulness", "relevance")
        )

        # Averages become run-level metrics.
        mlflow.log_metrics({
            "avg_faithfulness": avg_faithfulness,
            "avg_relevance": avg_relevance
        })

        # The per-query results are stored as a table artifact on the same run.
        mlflow.log_table(data=results_df, artifact_file="inline_evaluation_results.json")

        logger.info("✅ Successfully logged metrics and artifacts to the original model run.")

2025-07-27 09:35:05 - INFO - --- Reopening original run (7ba5a0f2454a460e85ff2f294ed62d31) to log pre-computed evaluations ---
2025-07-27 09:35:05 - INFO - Received prediction request.
2025-07-27 09:35:05 - INFO - Processing query: 'What are the AI Blueprints Repository best practices?'
2025-07-27 09:35:05 - INFO - Forced regeneration for query: 'What are the AI Blueprints Repository best practices?'. Clearing old cache entry.
2025-07-27 09:35:05 - INFO - CACHE MISS for query: 'What are the AI Blueprints Repository best practices?'. Running full pipeline.
2025-07-27 09:35:18 - INFO - Added query to semantic cache: 'What are the AI Blueprints Repository best practices?'
DEBUG: Judge model raw responses: ['0.9']
DEBUG: Judge model raw responses: ['0.8']
2025-07-27 09:35:19 - INFO - Processing query: 'What are some feature flags that i can enable in AIStudio?'
2025-07-27 09:35:19 - INFO - Forced regeneration for query: 'What are some feature flags that i can enable in AIStudio?'. Clearing

In [14]:
# Emit a closing log line so the end of the notebook is visible in the output.
logger.info("✅ Notebook execution completed.")


2025-07-27 09:35:05 - INFO - ✅ Notebook execution completed.
