<h1 style="text-align: center; font-size: 50px;"> Register Model </h1>

# Notebook Overview

- Start Execution
- Install and Import Libraries
- User Defined Constants
- Define MLFlow Model
- Log Results to MLFlow

# Start Execution

In [None]:
import logging
import time

# Configure logger
logger: logging.Logger = logging.getLogger("run_workflow_logger")
logger.setLevel(logging.INFO)
logger.propagate = False  # Prevent duplicate logs from parent loggers

# Set formatter
formatter: logging.Formatter = logging.Formatter(
    fmt="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# logging.getLogger() hands back the same logger object on every execution of
# this cell, so handlers attached by earlier runs are still present. Remove
# them first; otherwise each re-run adds one more handler and every message
# is emitted once per handler.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

# Configure and attach stream handler
stream_handler: logging.StreamHandler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

In [2]:
# Wall-clock reference point; the final cell subtracts it to report total runtime
start_time: float = time.time()
logger.info("Notebook execution started.")

2025-07-11 22:53:07 - INFO - Notebook execution started.


# Install and Import Libraries

In [3]:
%%time

# Install the packages listed in the requirements file one directory above
# this notebook; --quiet keeps pip's output to a minimum.
%pip install -r ../requirements.txt --quiet

Note: you may need to restart the kernel to use updated packages.
CPU times: user 19.7 ms, sys: 8.12 ms, total: 27.8 ms
Wall time: 1.18 s


In [None]:
# Enable GPU-accelerated pandas API via cuDF
# This cell sits ahead of the cell that runs `import pandas as pd`, so the
# extension is loaded before pandas is first imported in this notebook.
%load_ext cudf.pandas

In [None]:
# ------------------------- Standard Library Imports -------------------------

import warnings   # Warning management and filtering
import sys        # System-specific parameters and functions
import os
from pathlib import Path
import json

# Ensure project root is included in Python's module search path
sys.path.append('..') 

# ------------------------- Core Package Imports -------------------------

import pandas as pd                            # DataFrame operations
from src.opencellid_downloader import download_and_extract  # Data acquisition utility

# ------------------------- Visualization & Dashboard Libraries -------------------------

import hvplot.pandas                           # High-level plotting API for pandas/cuDF
import pydeck as pdk                           # WebGL-powered interactive geospatial visualizations
import panel as pn                             # Dashboarding framework for interactive apps
import param                                   # Parameterized configuration support
import time                                    # Execution timing utility
import mlflow                                  # Model tracking and experiment management

# Initialize Panel with the required extensions and a material design theme.
# "deckgl" is the only extension requested here; loading indicators are
# switched on and the "material" template is selected for the session.
pn.extension("deckgl", loading_indicator=True, template="material")

# User Defined Constants

In [None]:
# ------------------------ Suppress Warnings ------------------------
# Silences every Python warning from here on. This only affects the `warnings`
# module; it does not change the `logging` output configured at the top of
# the notebook.
warnings.filterwarnings("ignore")

In [None]:
# ------------------------- Paths -------------------------
# Dataset location used by the MLflow logging cell when loading the data.
# NOTE(review): load_opencellid() works on a directory and looks for
# "OCID_sample.csv" inside it, whereas this value names a CSV file - keep that
# in mind wherever this constant is handed to the helper.
DATA_PATH = '../data/cell_towers_us.csv'
# NOTE(review): not referenced by any cell visible in this notebook; confirm
# whether it is still needed.
DATA_URL = 'https://s3.amazonaws.com/mcc-mnc.net/mcc-mnc.csv'

# Define MLFlow Model

In [None]:
class OpenCellEDA(mlflow.pyfunc.PythonModel):
    """Pyfunc model whose only job is to hand back the saved dashboard HTML.

    The model expects an artifact registered under the key ``"dashboard"``
    that points to an HTML file on disk.
    """

    def load_context(self, context):
        """Store the local path of the ``"dashboard"`` artifact.

        Args:
            context: MLflow model context; ``context.artifacts["dashboard"]``
                is read and kept on the instance as ``dashboard_html``.
        """
        self.dashboard_html = context.artifacts["dashboard"]

    def predict(self, context, model_input: pd.DataFrame, params=None):
        """Return the dashboard HTML as a single-row DataFrame.

        Args:
            context: Unused; the artifact path was captured in ``load_context``.
            model_input: Ignored - the output does not depend on the input.
            params: Ignored.

        Returns:
            A DataFrame with one column, ``dashboard_html``, and one row
            holding the full text of the dashboard file.
        """
        # The notebook imports `Path` directly (`from pathlib import Path`);
        # the bare module name `pathlib` is never bound, so referring to
        # `pathlib.Path` here would raise NameError at prediction time.
        # Read with an explicit encoding so the result does not depend on the
        # platform's default locale.
        html_text = Path(self.dashboard_html).read_text(encoding="utf-8")
        return pd.DataFrame({"dashboard_html": [html_text]})
    
# --- helpers -------------------------------------------------
def load_opencellid(dataset_dir: str | Path) -> pd.DataFrame:
    """
    Download (if missing) and load the OpenCellID CSV into a DataFrame.
    """
    dataset_dir = Path(dataset_dir)
    dataset_dir.mkdir(parents=True, exist_ok=True)
    csv_path = dataset_dir / "OCID_sample.csv"

    if not csv_path.exists():
        download_and_extract(dataset_dir)  # your existing util

    return pd.read_csv(csv_path)


def describe_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarise *df* with ``DataFrame.describe`` across all columns.

    ``include="all"`` means non-numeric columns are part of the summary too.
    """
    summary = df.describe(include="all")
    return summary

def plot_panel_dashboard(df: pd.DataFrame, out_html: str | Path = "dashboard.html"):
    """
    Render the rows of *df* as points on a map and write it to *out_html*.

    The plot reads the ``lon`` and ``lat`` columns of *df*. The path that was
    passed in is returned unchanged so callers can log or open the file.
    """
    points_options = {
        "x": "lon",
        "y": "lat",
        "geo": True,
        "tiles": "OSM",
        "width": 800,
        "height": 450,
    }
    tower_map = df.hvplot.points(**points_options)

    # Wrap the plot in a Panel object and export it as an HTML file
    dashboard = pn.panel(tower_map)
    dashboard.save(out_html)
    return out_html


# Log Results to MLFlow

In [None]:
mlflow.set_tracking_uri("/phoenix/mlflow")
mlflow.set_experiment("OpenCellID-EDA")

# Output file for the rendered dashboard. This name was previously used below
# without ever being assigned, which raised NameError inside the run.
DASHBOARD_FILE = "dashboard.html"

with mlflow.start_run(run_name="EDA") as run:
    # --- load the dataset and record how long it took ---
    t0 = time.perf_counter()
    # load_opencellid() takes a *directory* (it creates it and looks for the
    # CSV inside), while DATA_PATH names a CSV file. Passing the file path
    # would try to create a directory called "cell_towers_us.csv", so hand
    # over the containing folder instead.
    # NOTE(review): confirm the data folder is the intended dataset directory.
    df = load_opencellid(Path(DATA_PATH).parent)
    mlflow.log_metric("load_time_sec", time.perf_counter() - t0)
    mlflow.log_metric("num_rows", len(df))

    # --- summary statistics ---
    t0 = time.perf_counter()
    stats = describe_dataset(df)
    mlflow.log_metric("describe_time_sec", time.perf_counter() - t0)

    stats_path = "stats.csv"
    # Keep the index: for a describe() result it holds the statistic names
    # (count, mean, ...). Dropping it leaves rows of unlabeled numbers.
    stats.to_csv(stats_path)
    mlflow.log_artifact(stats_path)

    # --- dashboard ---
    plot_panel_dashboard(df, DASHBOARD_FILE)
    mlflow.log_artifact(DASHBOARD_FILE, artifact_path="dashboard")

    # register a tiny pyfunc so users can load the dashboard from MLflow.
    # PythonModel subclasses have no `log_model` method of their own; the
    # model instance is logged through mlflow.pyfunc.log_model instead.
    mlflow.pyfunc.log_model(
        artifact_path="opencellid_dashboard_model",
        python_model=OpenCellEDA(),
        artifacts={"dashboard": DASHBOARD_FILE},
        registered_model_name="OpenCellID-EDA",
        pip_requirements="../requirements.txt",
    )

In [6]:
# Total wall-clock time since the start-of-notebook timestamp
end_time: float = time.time()
elapsed_time: float = end_time - start_time

# Split into whole minutes and leftover seconds in a single step
whole_minutes, elapsed_seconds = divmod(elapsed_time, 60)
elapsed_minutes: int = int(whole_minutes)

logger.info(f"⏱️ Total execution time: {elapsed_minutes}m {elapsed_seconds:.2f}s")
logger.info("✅ Notebook execution completed successfully.")

2025-07-11 22:55:27 - INFO - ⏱️ Total execution time: 2m 19.95s
2025-07-11 22:55:27 - INFO - ✅ Notebook execution completed successfully.


Built with ❤️ using [**HP AI Studio**](https://hp.com/ai-studio).