In [1]:
import logging
import time

# Configure the notebook-wide logger.
# logging.getLogger returns the same object for the same name on every call,
# so this cell must tolerate being executed more than once per kernel.
logger: logging.Logger = logging.getLogger("register_model_logger")
logger.setLevel(logging.INFO)
logger.propagate = False  # Prevent duplicate logs from parent loggers

# Timestamped "<date> - <level> - <message>" layout for every record
formatter: logging.Formatter = logging.Formatter(
    fmt="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Remove handlers attached by an earlier run of this cell; without this,
# every re-run adds another StreamHandler and each message is printed
# once per accumulated handler.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

# Configure and attach a single stream handler
stream_handler: logging.StreamHandler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

In [2]:
# Wall-clock timestamp taken at the start of the run; the last cell of the
# notebook subtracts it from its own timestamp to report total duration.
start_time: float = time.time()

logger.info("Model registration notebook execution started.")

2025-07-22 22:33:55 - INFO - Model registration notebook execution started.


<h1 style="text-align: center; font-size: 50px;"> Fine-Tuned Model Registration Service </h1>

This notebook demonstrates how to register a fine-tuned LLM comparison service that allows switching between base and fine-tuned models through a single MLflow endpoint. This follows the same pattern used across all AI-Blueprints for consistent model deployment and serving.

In [3]:
!pip install -r ../requirements.txt --quiet

In [4]:
import os
import sys
import yaml
from pathlib import Path
import logging
import warnings
import mlflow

# Make the parent of the current working directory importable so that the
# `core` and `src` packages imported below can be resolved.
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
if project_root not in sys.path:  # re-running the cell must not pile up duplicate entries
    sys.path.append(project_root)

# ===============================
# 🧠 Model Selection & Loading
# ===============================
from core.selection.model_selection import ModelSelector

# ===============================
# 🚀 Deployment & Registration
# ===============================
from core.deploy.deploy_fine_tuning import register_llm_comparison_model

# ===============================
# ⚙️ Utility Functions
# ===============================
from src.utils import (
    load_config_and_secrets,
    configure_proxy,
    login_huggingface,
    get_configs_dir,
    get_fine_tuned_models_dir,
    get_models_dir,
    format_model_path
)

2025-07-22 22:34:07.220240: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-22 22:34:07.239627: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753223647.275187    7895 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753223647.282643    7895 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753223647.300244    7895 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

[2025-07-22 22:34:10,346] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/opt/conda/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


## Configuration

In [5]:
# Suppress Python warnings
warnings.filterwarnings("ignore")

In [6]:
# --- Locations of the YAML files, as plain strings ---------------------------
CONFIG_PATH = str(get_configs_dir() / "config.yaml")
SECRETS_PATH = str(get_configs_dir() / "secrets.yaml")

# --- MLflow naming ------------------------------------------------------------
MLFLOW_EXPERIMENT_NAME = "AIStudio-Fine-Tuning-Experiment"
MODEL_SERVICE_RUN_NAME = "AIStudio-Fine-Tuning-Service-Run"
MODEL_SERVICE_NAME = "AIStudio-Fine-Tuning-Model"

# --- Models to register -------------------------------------------------------
# Both values must match what was used / produced by your training run.
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # base model identifier
FINE_TUNED_MODEL_NAME = "Orpo-TinyLlama-1.1B-Chat-v1.0-FT"  # fine-tuned model name

# Echo the effective settings so they are recorded in the notebook output.
for summary_line in (
    "üìã Model Registration Configuration:",
    f"   ‚Ä¢ Base model (HF): {BASE_MODEL}",
    f"   ‚Ä¢ Fine-tuned model: {FINE_TUNED_MODEL_NAME}",
    f"   ‚Ä¢ MLflow experiment: {MLFLOW_EXPERIMENT_NAME}",
    f"   ‚Ä¢ Service name: {MODEL_SERVICE_NAME}",
):
    logger.info(summary_line)

2025-07-22 22:34:12 - INFO - üìã Model Registration Configuration:
2025-07-22 22:34:12 - INFO -    ‚Ä¢ Base model (HF): TinyLlama/TinyLlama-1.1B-Chat-v1.0
2025-07-22 22:34:12 - INFO -    ‚Ä¢ Fine-tuned model: Orpo-TinyLlama-1.1B-Chat-v1.0-FT
2025-07-22 22:34:12 - INFO -    ‚Ä¢ MLflow experiment: AIStudio-Fine-Tuning-Experiment
2025-07-22 22:34:12 - INFO -    ‚Ä¢ Service name: AIStudio-Fine-Tuning-Model


In [7]:
# Read the configuration and secrets from the two paths defined above, then
# hand the configuration to configure_proxy (whether any proxy settings are
# actually applied is decided inside that helper).
config, secrets = load_config_and_secrets(
    CONFIG_PATH,
    SECRETS_PATH,
)
configure_proxy(config)

logger.info("‚úÖ Configuration loaded successfully")

2025-07-22 22:34:12 - INFO - ‚úÖ Configuration loaded successfully


In [8]:
# Authenticate against Hugging Face using the loaded secrets.
# This is best-effort: a failure is reported as a warning and the notebook
# carries on, since only gated models need authentication.
try:
    login_huggingface(secrets)
except Exception as exc:
    logger.warning(f"‚ö†Ô∏è Hugging Face authentication failed: {exc}")
    logger.info("Some models may not be accessible if they require authentication")
else:
    logger.info("‚úÖ Hugging Face authentication successful")

2025-07-22 22:34:12 - INFO - ‚úÖ Hugging Face authentication successful


‚úÖ Logged into Hugging Face successfully.


## Verify and Prepare Model Assets

Before registering the models, let's verify that both the base model and fine-tuned model are accessible. If the base model hasn't been downloaded locally yet, we'll download it using the same approach as the training workflow.

In [9]:
def verify_and_prepare_model_assets():
    """Check the fine-tuned model directory and prepare the base model locally.

    The fine-tuned model is only checked for existence under
    ``get_fine_tuned_models_dir() / FINE_TUNED_MODEL_NAME``. The base model is
    handed to ``ModelSelector.select_model`` and its local path is obtained
    from ``ModelSelector.format_model_path``.

    Returns:
        tuple: ``(fine_tuned_available, base_model_local_path)``.
            ``fine_tuned_available`` is True when the fine-tuned model
            directory exists. ``base_model_local_path`` is the path reported
            by the selector, or ``None`` when preparing the base model raised.
            The two values are independent, so callers can tell which asset
            is missing.
    """
    # Fine-tuned model: existence check only
    fine_tuned_path = get_fine_tuned_models_dir() / FINE_TUNED_MODEL_NAME
    fine_tuned_available = fine_tuned_path.exists()

    if fine_tuned_available:
        logger.info(f"‚úÖ Fine-tuned model found: {fine_tuned_path}")
    else:
        logger.warning(f"‚ö†Ô∏è Fine-tuned model not found: {fine_tuned_path}")
        logger.info("Please run the run-workflow.ipynb notebook first to create the fine-tuned model")

    # Base model: let ModelSelector select/prepare it, then ask for its local path
    logger.info(f"üîç Checking base model: {BASE_MODEL}")

    try:
        selector = ModelSelector()
        selector.select_model(BASE_MODEL)
        base_model_local_path = selector.format_model_path(BASE_MODEL)
    except Exception as exc:
        # logger.exception keeps the traceback so the failure can be diagnosed.
        logger.exception(f"‚ùå Failed to prepare base model: {exc}")
        # Report the fine-tuned status truthfully: a base-model failure must not
        # be presented to the caller as a missing fine-tuned model.
        return fine_tuned_available, None

    logger.info(f"‚úÖ Base model prepared locally: {base_model_local_path}")
    return fine_tuned_available, base_model_local_path

# Verify and prepare assets
assets_verified, base_model_path = verify_and_prepare_model_assets()

2025-07-22 22:34:12 - INFO - ‚úÖ Fine-tuned model found: /home/jovyan/AI-Blueprints/generative-ai/fine-tuning-with-orpo/output/fine_tuned_models/Orpo-TinyLlama-1.1B-Chat-v1.0-FT
2025-07-22 22:34:12 - INFO - üîç Checking base model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
2025-07-22 22:34:12,866 ‚Äî INFO ‚Äî [ModelSelector] Selected model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
2025-07-22 22:34:12,869 ‚Äî INFO ‚Äî [ModelSelector] Downloading model snapshot to: /home/jovyan/AI-Blueprints/generative-ai/fine-tuning-with-orpo/models/TinyLlama__TinyLlama-1.1B-Chat-v1.0


Fetching 10 files:   0%|          | 0/10 [00:00<?, ?it/s]

2025-07-22 22:34:13,089 ‚Äî INFO ‚Äî [ModelSelector] ‚úÖ Model downloaded successfully to: /home/jovyan/AI-Blueprints/generative-ai/fine-tuning-with-orpo/models/TinyLlama__TinyLlama-1.1B-Chat-v1.0
2025-07-22 22:34:13,090 ‚Äî INFO ‚Äî [ModelSelector] Loading model and tokenizer from: /home/jovyan/AI-Blueprints/generative-ai/fine-tuning-with-orpo/models/TinyLlama__TinyLlama-1.1B-Chat-v1.0
2025-07-22 22:34:49,994 ‚Äî INFO ‚Äî [ModelSelector] Checking model for ORPO compatibility...
2025-07-22 22:34:49,996 ‚Äî INFO ‚Äî [ModelSelector] ‚úÖ Model 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' is ORPO-compatible.
2025-07-22 22:34:50 - INFO - ‚úÖ Base model prepared locally: /home/jovyan/AI-Blueprints/generative-ai/fine-tuning-with-orpo/models/TinyLlama__TinyLlama-1.1B-Chat-v1.0


## Adaptive Model Registration Service

This section demonstrates how to register the **adaptive** LLM comparison model that automatically adjusts to different hardware and memory constraints. The model provides a single API endpoint that works efficiently across various deployment environments.

### Key Adaptive Features:
- **Automatic Device Selection**: Intelligently chooses between CPU and GPU based on availability
- **Dynamic Memory Management**: Adapts memory usage patterns based on available resources  
- **Smart Device Mapping**: Uses transformers' auto device mapping for optimal model distribution
- **Precision Optimization**: Automatically selects FP16 on GPU, FP32 on CPU for best performance
- **Robust Error Handling**: Graceful fallbacks when advanced features aren't available
- **Universal Compatibility**: Works in both memory-constrained and resource-rich environments

The service provides:

- **Base Model Inference**: Access to the original pre-trained model
- **Fine-Tuned Model Inference**: Access to the ORPO fine-tuned model  
- **Comparison Mode**: Switch between models using the `use_finetuning` parameter
- **Adaptive Performance**: Automatically optimizes for the deployment environment
- **Flexible Input**: Support for custom prompts and generation parameters

In [10]:
# Point MLflow at the tracking location and select the experiment.
# NOTE(review): '/phoenix/mlflow' is a hard-coded, environment-specific path —
# confirm it exists when running outside the environment this was written for.
mlflow.set_tracking_uri('/phoenix/mlflow')
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

# Registration needs both assets: the fine-tuned model on disk and a local
# path for the base model.
if assets_verified and base_model_path:
    try:
        logger.info("üìù Registering comparison model with:")
        logger.info(f"   ‚Ä¢ Base model path: {base_model_path}")
        logger.info(f"   ‚Ä¢ Fine-tuned model: {FINE_TUNED_MODEL_NAME}")

        # Register the adaptive LLM comparison model.
        # The fine-tuned model is passed by name; the recorded output shows
        # register_llm_comparison_model resolving it to a full path itself.
        register_llm_comparison_model(
            model_base_path=base_model_path,
            model_finetuned_path=FINE_TUNED_MODEL_NAME,
            experiment=MLFLOW_EXPERIMENT_NAME,
            run_name=MODEL_SERVICE_RUN_NAME,
            registry_name=MODEL_SERVICE_NAME,
            config_path=CONFIG_PATH
        )

        logger.info("‚úÖ Adaptive LLM comparison model registered successfully!")
        logger.info(f"Model name: {MODEL_SERVICE_NAME}")
        logger.info(f"Experiment: {MLFLOW_EXPERIMENT_NAME}")
        logger.info("This model automatically adapts to memory constraints and available hardware.")

    except Exception as exc:
        # logger.exception records the full traceback, so the "error details
        # above" referred to by the next message actually exist in the output.
        logger.exception(f"‚ùå Failed to register comparison model: {exc}")
        logger.info("Please check the error details above and ensure all dependencies are installed")

else:
    # Explain which prerequisite is missing instead of attempting registration.
    logger.error("‚ùå Cannot register model - required assets not found or not prepared")
    if not assets_verified:
        logger.info("Please run the run-workflow.ipynb notebook first to create the fine-tuned model")
    if not base_model_path:
        logger.info("Base model could not be downloaded or prepared locally")

2025-07-22 22:34:50 - INFO - üìù Registering comparison model with:
2025-07-22 22:34:50 - INFO -    ‚Ä¢ Base model path: /home/jovyan/AI-Blueprints/generative-ai/fine-tuning-with-orpo/models/TinyLlama__TinyLlama-1.1B-Chat-v1.0
2025-07-22 22:34:50 - INFO -    ‚Ä¢ Fine-tuned model: Orpo-TinyLlama-1.1B-Chat-v1.0-FT
2025-07-22 22:34:50,156 ‚Äî INFO ‚Äî Resolved base model path: /home/jovyan/AI-Blueprints/generative-ai/fine-tuning-with-orpo/models/TinyLlama__TinyLlama-1.1B-Chat-v1.0
2025-07-22 22:34:50,157 ‚Äî INFO ‚Äî Resolved fine-tuned model path: /home/jovyan/AI-Blueprints/generative-ai/fine-tuning-with-orpo/output/fine_tuned_models/Orpo-TinyLlama-1.1B-Chat-v1.0-FT


Downloading artifacts:   0%|          | 0/31 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/6 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Registered model 'AIStudio-Fine-Tuning-Model' already exists. Creating a new version of this model...
Created version '3' of model 'AIStudio-Fine-Tuning-Model'.
2025-07-22 22:37:59,471 ‚Äî INFO ‚Äî ‚úÖ Adaptive LLM comparison model registered as `AIStudio-Fine-Tuning-Model` (run bfa8cafd821547edbcc3814fdd65a711)
2025-07-22 22:37:59 - INFO - ‚úÖ Adaptive LLM comparison model registered successfully!
2025-07-22 22:37:59 - INFO - Model name: AIStudio-Fine-Tuning-Model
2025-07-22 22:37:59 - INFO - Experiment: AIStudio-Fine-Tuning-Experiment
2025-07-22 22:37:59 - INFO - This model automatically adapts to memory constraints and available hardware.


## Usage Instructions

Once the adaptive model is registered, you can use it through the MLflow model serving interface. The adaptive version automatically optimizes performance and memory usage based on your environment.

### Input Format
The model expects a pandas DataFrame with the following columns:
- `prompt` (string): The text prompt to generate from
- `use_finetuning` (boolean): Whether to use the fine-tuned model (True) or base model (False)
- `max_tokens` (integer, optional): Maximum number of tokens to generate (default: 128)

### Example Usage
```python
import pandas as pd
import mlflow

# Load the registered adaptive model
model = mlflow.pyfunc.load_model(f"models:/{MODEL_SERVICE_NAME}/latest")

# Create input data
input_data = pd.DataFrame({
    "prompt": ["Explain the importance of sustainable agriculture."],
    "use_finetuning": [True],  # Use fine-tuned model
    "max_tokens": [200]
})

# Generate response (automatically optimized)
response = model.predict(input_data)
print(response["response"].iloc[0])
```

### Adaptive Comparison Mode
The adaptive version efficiently handles model switching with automatic optimization:

```python
# Compare base vs fine-tuned (adaptive optimization)
prompts = ["Your test prompt here"]

for use_ft in [False, True]:
    input_data = pd.DataFrame({
        "prompt": prompts,
        "use_finetuning": [use_ft] * len(prompts),  # one flag per prompt
        "max_tokens": [150] * len(prompts)
    })
    response = model.predict(input_data)
    model_type = "Fine-tuned" if use_ft else "Base"
    print(f"{model_type} Model: {response['response'].iloc[0]}")
    # Model automatically handles device placement and memory management
```

### Adaptive Benefits
- **Environment Detection**: Automatically detects available hardware and memory
- **Performance Optimization**: Uses best settings for each deployment environment
- **Memory Safety**: Prevents OOM errors through intelligent memory management
- **Hardware Efficiency**: Leverages GPU acceleration when available, graceful CPU fallback
- **Robust Operation**: Handles various deployment scenarios without configuration changes

In [11]:
# Measure the total run time against the timestamp taken in the first cells
# and split it into whole minutes plus remaining seconds for display.
end_time: float = time.time()
elapsed_time: float = end_time - start_time
whole_minutes, elapsed_seconds = divmod(elapsed_time, 60)
elapsed_minutes: int = int(whole_minutes)

logger.info(f"‚è±Ô∏è Total execution time: {elapsed_minutes}m {elapsed_seconds:.2f}s")
logger.info("‚úÖ Model registration notebook execution completed successfully.")

2025-07-22 22:37:59 - INFO - ‚è±Ô∏è Total execution time: 4m 3.60s
2025-07-22 22:37:59 - INFO - ‚úÖ Model registration notebook execution completed successfully.


Built with ❤️ using [**HP AI Studio**](https://hp.com/ai-studio).