<h1 style="text-align: center; font-size: 50px;"> Register Model </h1>

# Notebook Overview

- Start Execution
- Install and Import Libraries
- Define User Constants
- Import Workflow Methods
- Log Results to MLflow

# Start Execution

In [1]:
import json
import logging
from datetime import datetime
import time

# Configure the notebook-wide logger.
# INFO and above are emitted; propagation is off so the root logger does not
# print the same record a second time.
logger: logging.Logger = logging.getLogger("register_model_logger")
logger.setLevel(logging.INFO)
logger.propagate = False  # Prevent duplicate logs from parent loggers

# Timestamped "date time - LEVEL - message" layout for every record
formatter: logging.Formatter = logging.Formatter(
    fmt="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# logging.getLogger() hands back the same logger object for the same name for
# as long as the kernel lives, so re-running this cell would otherwise stack a
# new handler on top of the old one and print every message multiple times.
# Drop whatever is already attached before adding the fresh handler.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)

# Configure and attach the single stream handler
stream_handler: logging.StreamHandler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

In [2]:
# Record the wall-clock start; the last cell subtracts this to report total run time.
start_time: float = time.time()
logger.info("Notebook execution started.")

2025-07-15 06:33:22 - INFO - Notebook execution started.


# Install and Import Libraries

In [3]:
%%time

# Install the packages listed in the requirements file two directories up.
# %pip (rather than !pip) installs into the environment of the running kernel;
# --quiet keeps the installer output short.
%pip install -r ../../requirements.txt --quiet


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
CPU times: user 54.2 ms, sys: 7.61 ms, total: 61.8 ms
Wall time: 1.58 s


In [4]:
# =============================
# Standard Library Imports
# =============================
import os
import sys                # sys.modules registry, used when loading the workflow notebook as a module
import types              # types.ModuleType hosts the code executed from the workflow notebook
import nbformat           # NOTE: third-party package (not stdlib); reads the workflow .ipynb file
import importlib.util
import warnings           # To manage and filter Python warnings
from pathlib import Path  # For object-oriented filesystem paths

# =============================
# Third-Party Library Imports
# =============================
import pandas as pd       # Data manipulation and analysis
import mlflow             # Experiment tracking and model logging

# Define User Constants

In [5]:
# ------------------------ Suppress Verbose Logs ------------------------
warnings.filterwarnings("ignore")

In [6]:
# Folder that holds the USA stock parquet files
DATASET_DIR: Path = Path("/home/jovyan/datafabric/USA_Stocks/")

# Dataset sizes to run the analysis on, expressed in millions of rows
SAMPLE_SIZES_TO_TEST: list[int] = [5, 10]

# Width (in days) of the rolling window applied in the time-series step
ROLLING_WINDOW_SIZE: int = 7

# MLflow experiment under which every run of this notebook is recorded
MLFLOW_EXPERIMENT_NAME: str = "USA Stock Analysis with Pandas"

# Import Workflow Methods

In [7]:
%%capture --no-display

nb_path = Path.cwd() / "run-workflow.ipynb"      # same folder as this notebook

module_name = "run_workflow_nb"

if module_name not in sys.modules:
    mod = types.ModuleType(module_name)
    sys.modules[module_name] = mod

    nb = nbformat.read(nb_path, as_version=4)
    code_cells = [c.source for c in nb.cells if c.cell_type == "code"]
    exec("\n\n".join(code_cells), mod.__dict__)

# ------------------------------------------------------------------------
from run_workflow_nb import describe_dataframe, aggregate_by_ticker, aggregate_by_ticker_week, compute_rolling_mean


# Log Results to MLflow

In this section, we log the dataset analysis results to MLflow — specifically, the time each operation took on each dataset. We call the functions defined in the workflow notebook and apply them to every sample size listed in `SAMPLE_SIZES_TO_TEST` (e.g. `[5, 10]`, in millions of rows).

In [8]:
# Point MLflow at the tracking store and select the experiment that groups the runs
mlflow.set_tracking_uri('/phoenix/mlflow')
mlflow.set_experiment(experiment_name=MLFLOW_EXPERIMENT_NAME)

# One MLflow run per dataset size. Each workflow helper is unpacked as a
# two-item result whose first item is logged as a "*_time_seconds" metric
# (presumably the elapsed time in seconds — confirm in the workflow notebook);
# the second item is discarded here.
for sample_size in SAMPLE_SIZES_TO_TEST:
    run_name = f"Standard Analysis - {sample_size}M"

    with mlflow.start_run(run_name=run_name):
        # Log configuration parameters
        mlflow.log_param("Computing", "cpu")
        mlflow.log_param("Dataset size in millions of rows", sample_size)

        # Build the path from DATASET_DIR so the dataset location is configured
        # in exactly one place (the user-constants cell) instead of being
        # repeated here as a second hard-coded literal.
        dataset_path = DATASET_DIR / f"usa_stocks_{sample_size}m.parquet"
        df = pd.read_parquet(dataset_path)

        print(f"\n--- Running Analysis for {sample_size}M Rows ---")

        # Description
        description_time, _ = describe_dataframe(df)
        mlflow.log_metric("Description_time_seconds", description_time)

        # Simple aggregation
        simple_agg_time, _ = aggregate_by_ticker(df)
        mlflow.log_metric("Simple_aggregation_time_seconds", simple_agg_time)

        # Composite aggregation
        composite_agg_time, _ = aggregate_by_ticker_week(df)
        mlflow.log_metric("Composite_aggregation_time_seconds", composite_agg_time)

        # Rolling window; the window size is part of the metric name so runs
        # with different window sizes stay distinguishable.
        rolling_time, _ = compute_rolling_mean(df, ROLLING_WINDOW_SIZE)
        mlflow.log_metric(f"Rolling_window_{ROLLING_WINDOW_SIZE}D_time_seconds", rolling_time)


--- Running Analysis for 5M Rows ---

--- Running Analysis for 10M Rows ---


In [9]:
# Total wall-clock duration since the start timestamp taken at the top of the notebook
end_time: float = time.time()
elapsed_time: float = end_time - start_time

# Split the duration into whole minutes and the leftover seconds in one step
_whole_minutes, elapsed_seconds = divmod(elapsed_time, 60)
elapsed_minutes: int = int(_whole_minutes)

logger.info(f"⏱️ Total execution time: {elapsed_minutes}m {elapsed_seconds:.2f}s")
logger.info("✅ Notebook execution completed successfully.")

2025-07-15 06:34:13 - INFO - ⏱️ Total execution time: 0m 51.44s
2025-07-15 06:34:13 - INFO - ✅ Notebook execution completed successfully.


Built with ❤️ using [**HP AI Studio**](https://hp.com/ai-studio).