<h1 style="text-align: center; font-size: 50px;"> Spam Detection with NLP (Natural Language Processing) MLflow Integration </h1>

Notebook Overview
- Start Execution
- User Constants
- Install and Import Libraries
- Configure Settings
- Verify Assets
- Logging Model to MLflow
- Fetching the Latest Model Version from MLflow
- Loading the Model and Running Inference

## Start Execution

In [1]:
import logging
import time

# Configure the notebook-wide logger.
logger: logging.Logger = logging.getLogger("register_model_logger")
logger.setLevel(logging.INFO)
logger.propagate = False  # Prevent duplicate logs from parent loggers

# Set formatter
formatter: logging.Formatter = logging.Formatter(
    fmt="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# logging.getLogger() returns the same object for the same name, so re-running
# this cell in a live kernel would attach one more handler each time and every
# message would be printed repeatedly. Remove handlers left by earlier runs
# before attaching the fresh one so the cell is idempotent.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

# Configure and attach stream handler
stream_handler: logging.StreamHandler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

In [2]:
# Wall-clock timestamp (seconds since the epoch) taken at the start of the run;
# the final cell subtracts it from the end time to report total execution time.
start_time = time.time()  

logger.info("Notebook execution started.")

2025-08-01 17:15:01 - INFO - Notebook execution started.


## User Constants

In [3]:
# Sample message that the inference cell wraps in a DataFrame and sends to the
# registered model. Change it to try a different input.
TEXT = "You have won a free ticket!"

##  Install and Import Libraries

In [4]:
%pip install -r ../requirements.txt --quiet

Note: you may need to restart the kernel to use updated packages.


In [5]:
# ------------------------ System Utilities ------------------------
import warnings
from pathlib import Path
import os
import yaml

# ------------------------ Data Manipulation ------------------------
import pandas as pd

# ------------------------ Text Preprocessing ------------------------
import string
import nltk
import sys
nltk.download('stopwords')
from nltk.corpus import stopwords
from types import SimpleNamespace
from sklearn.metrics import classification_report

# ------------------------ Machine Learning tools ------------------------
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# ------------------------ MLflow for Experiment Tracking and Model Management ------------------------
import mlflow
from mlflow import MlflowClient
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec

# ------------------------ Utils Import ------------------------
sys.path.append("../src")
from utils import load_config

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Configure Settings

In [6]:
# Suppress Python warnings
# NOTE: with no category given, "ignore" silences every warning for the rest of
# the session, including deprecation notices raised by the libraries used below.
warnings.filterwarnings("ignore")

In [7]:
# ------------------------- Paths -------------------------
# Absolute locations inside the workspace container.
DATA_PATH = "/home/jovyan/datafabric/tutorial/spam_utf8.csv"  # spam dataset (CSV)
NLTK_DIR_LOCAL = "/home/jovyan/local/nltk_data"               # local NLTK data directory

# Locations relative to the notebook's working directory.
CONFIG_PATH = "../configs/config.yaml"
DEMO_FOLDER = "../demo"

# Project configuration, parsed by the helper imported from ../src/utils.
config = load_config(CONFIG_PATH)

# ------------------------ MLflow Integration ------------------------
# Names used for the experiment, the run, and the registered model.
EXPERIMENT_NAME = "Spam_Detection_Experiment"
RUN_NAME = "Spam_Detection_Run"
MODEL_NAME = "Spam_Detection_Model"

## Verify Assets

In [8]:
def log_asset_status(asset_path: str, asset_name: str, success_message: str, failure_message: str) -> None:
    """
    Log whether a required asset exists on disk.

    An existing asset is reported at INFO level. A missing asset is reported
    at WARNING level so it stands out from routine progress messages. The
    detail message is appended to the status sentence; trailing whitespace is
    stripped so an empty detail does not leave a dangling space.

    Parameters:
        asset_path (str): File or directory path to check.
        asset_name (str): Name of the asset for logging context.
        success_message (str): Extra text to log if the asset exists.
        failure_message (str): Extra text to log if the asset does not exist.
    """
    if Path(asset_path).exists():
        emit = logger.info
        status = "is properly configured."
        detail = success_message
    else:
        # A missing asset will break later cells; make it visible as a warning.
        emit = logger.warning
        status = "is not properly configured."
        detail = failure_message

    emit(f"{asset_name} {status} {detail}".rstrip())

# Each entry: (path to check, display name, hint logged when the path is missing).
required_assets = [
    (DATA_PATH, "Spam data", "Please create and download the required assets in your project on AI Studio."),
    (NLTK_DIR_LOCAL, "NLTK Path", "Please check if NLTK was downloaded."),
    (DEMO_FOLDER, "Demo Folder", "Please check if Demo folder was downloaded."),
]

# Report the status of every asset, in the order listed above.
for required_path, display_name, missing_hint in required_assets:
    log_asset_status(
        asset_path=required_path,
        asset_name=display_name,
        success_message="",
        failure_message=missing_hint,
    )

2025-08-01 17:15:04 - INFO - Spam data is properly configured. 
2025-08-01 17:15:04 - INFO - NLTK Path is properly configured. 
2025-08-01 17:15:04 - INFO - Demo Folder is properly configured. 


## Logging Model to MLflow

In [9]:
def ensure_local_stopwords(base_dir: str):
    """
    Make sure the English stopword list is available under ``base_dir``.

    Looks for ``<base_dir>/corpora/stopwords/english``. When it is absent, the
    parent directory is created and the NLTK ``stopwords`` package is
    downloaded into ``base_dir`` (download errors are raised). In both cases
    NLTK's search path is then replaced with ``[base_dir]``.

    Parameters:
        base_dir (str): Root directory of the local NLTK data tree.
    """
    english_list = Path(base_dir, 'corpora', 'stopwords', 'english')
    needs_download = not english_list.exists()

    if needs_download:
        english_list.parent.mkdir(parents=True, exist_ok=True)
        logger.info("‚¨áÔ∏è Downloading stopwords to %s ‚Ä¶", base_dir)
        nltk.download('stopwords', download_dir=base_dir, quiet=True, raise_on_error=True)

    # Replace (not extend) the search path so only this directory is consulted.
    nltk.data.path = [base_dir]


class SpamDetectionModel(mlflow.pyfunc.PythonModel):
    """
    MLflow pyfunc wrapper around a bag-of-words -> TF-IDF -> Multinomial Naive
    Bayes text classification pipeline.

    The pipeline is fitted inside ``load_context`` from the CSV referenced by
    the ``data_path`` artifact, so every call to ``load_context`` retrains the
    classifier.
    """

    def preprocess(self, text: str):
        """
        Tokenize a message for the vectorizer.

        Steps:
        1. Remove every character found in ``string.punctuation``.
        2. Split on whitespace.
        3. Drop tokens whose lowercase form is in ``self.stop_words``.

        Parameters:
            text (str): Raw message.

        Returns:
            list[str]: Remaining tokens, with their original casing.

        Raises:
            Exception: Any failure is logged and re-raised.
        """
        try:
            nopunc = ''.join(c for c in text if c not in string.punctuation)
            return [w for w in nopunc.split() if w.lower() not in self.stop_words]
        except Exception as e:
            logger.error(f"Error preprocessing: {str(e)}")
            raise

    def load_context(self, context):
        """
        Load stopwords, fit the classification pipeline, and read the config.

        Parameters:
            context: Object exposing an ``artifacts`` mapping with the keys
                ``nltk_data``, ``data_path`` and ``config``.

        Raises:
            Exception: Any failure is logged and re-raised. A half-initialized
                model (no ``pipeline`` attribute) would otherwise only fail
                later, at prediction time, with a misleading error.
        """
        try:
            # Point NLTK exclusively at the stopword corpus shipped as an artifact.
            nltk_dir = os.path.abspath(context.artifacts['nltk_data'])
            nltk.data.path = [nltk_dir]
            os.environ['NLTK_DATA'] = nltk_dir
            self.stop_words = set(stopwords.words('english'))

            # NOTE(review): `names=` without `header=0` keeps the first CSV row
            # as data. If the file starts with a header line, that line is
            # treated as a training sample -- confirm against the dataset.
            df = pd.read_csv(context.artifacts['data_path'], sep=',',
                             names=["label", "message", "v3", "v4", "v5"])
            X_tr, X_te, y_tr, y_te = train_test_split(
                df['message'], df['label'], test_size=0.2, random_state=42
            )

            self.pipeline = Pipeline([
                ('bow', CountVectorizer(analyzer=self.preprocess)),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB()),
            ])
            self.pipeline.fit(X_tr, y_tr)
            # Keep the hold-out split so callers can evaluate the fitted pipeline.
            self._X_test, self._y_test = X_te, y_te

            config_path = context.artifacts["config"]
            with open(config_path, 'r') as f:
                self.config = yaml.safe_load(f)

            logger.info("‚úÖ Model and configuration loaded successfully")

        except Exception as e:
            logger.error(f"Error loading context: {str(e)}")
            raise

    def predict(self, context, model_input):
        """
        Classify one or more messages.

        Parameters:
            context: Unused; present for the pyfunc interface.
            model_input: A DataFrame holding the messages in a ``text`` column
                (the first column is used when no ``text`` column exists), a
                single string, or any iterable of strings.

        Returns:
            The output of ``self.pipeline.predict``: one label per message.

        Raises:
            Exception: Any failure is logged and re-raised instead of silently
                returning ``None``.
        """
        try:
            if isinstance(model_input, pd.DataFrame):
                # Iterating a DataFrame yields its column labels, not its rows.
                # Passing the frame directly would classify the literal column
                # name, so the message column is extracted explicitly.
                column = 'text' if 'text' in model_input.columns else model_input.columns[0]
                messages = model_input[column]
            elif isinstance(model_input, str):
                # A bare string is a single document, not an iterable of documents.
                messages = [model_input]
            else:
                messages = model_input
            return self.pipeline.predict(messages)
        except Exception as e:
            logger.error(f"Error performing prediction: {str(e)}")
            raise

    @classmethod
    def log_model(cls, artifact_path, config_path, demo_path):
        """
        Log a freshly fitted model, its artifacts, and its signature to the
        active MLflow run.

        Parameters:
            artifact_path: Run-relative path under which the model is stored.
            config_path: Path to the YAML configuration file; uploaded as a run
                artifact and bundled with the model.
            demo_path: Path to the demo folder bundled with the model.

        Raises:
            Exception: Any failure is logged and re-raised.
        """
        try:
            # The stopword corpus must be uploaded before the model is fitted,
            # because load_context reads it back from the run's artifact location.
            mlflow.log_artifacts(NLTK_DIR_LOCAL, artifact_path='nltk_data')
            mlflow.log_artifact(config_path, artifact_path='config')
            nltk_artifact_uri = mlflow.get_artifact_uri('nltk_data')

            artifacts = {
                'data_path': DATA_PATH,
                'nltk_data': nltk_artifact_uri,
                'config': config_path,
                'demo': demo_path,
            }

            # Fit a model instance using a lightweight stand-in for the MLflow context.
            model = cls()
            model.load_context(SimpleNamespace(artifacts=dict(artifacts)))

            signature = ModelSignature(
                inputs=Schema([ColSpec('string', 'text')]),
                outputs=Schema([ColSpec('string')])
            )

            mlflow.pyfunc.log_model(
                artifact_path=artifact_path,
                python_model=model,
                artifacts=artifacts,
                signature=signature
            )

            logger.info("‚úÖ Model and artifacts successfully registered in MLflow")

        except Exception as e:
            logger.error(f"‚ùå Error logging model: {str(e)}")
            raise






In [10]:
logger.info(f'üöÄ Starting the experiment: {EXPERIMENT_NAME}')

# Select the tracking store and the experiment that will own the run.
mlflow.set_tracking_uri('/phoenix/mlflow')
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)

with mlflow.start_run(run_name=RUN_NAME) as run:
    logger.info(f"üìÅ Run's Artifact URI: {run.info.artifact_uri}")

    # Upload the NLTK data and the config file, then resolve where the NLTK
    # data landed so the model can read stopwords from the run's artifacts.
    mlflow.log_artifacts(NLTK_DIR_LOCAL, artifact_path='nltk_data')
    mlflow.log_artifact(CONFIG_PATH, artifact_path='config')
    nltk_artifact_uri = mlflow.get_artifact_uri('nltk_data')

    # Lightweight stand-in for the context object passed to load_context.
    ctx = SimpleNamespace(
        artifacts=dict(
            data_path=DATA_PATH,
            nltk_data=nltk_artifact_uri,
            config=CONFIG_PATH,
            demo=DEMO_FOLDER,
        )
    )

    # Fit the model and score it on the hold-out split it keeps internally.
    model = SpamDetectionModel()
    model.load_context(ctx)

    preds = model.pipeline.predict(model._X_test)
    report = classification_report(model._y_test, preds, output_dict=True)
    mlflow.log_metric('accuracy', report['accuracy'])

    # Log the pyfunc model under the run, then register it by run-relative URI.
    SpamDetectionModel.log_model(artifact_path=MODEL_NAME, config_path=CONFIG_PATH, demo_path=DEMO_FOLDER)

    model_uri = f"runs:/{run.info.run_id}/{MODEL_NAME}"
    mlflow.register_model(model_uri=model_uri, name=MODEL_NAME)

    logger.info(f"‚úÖ Model registered successfully with run ID: {run.info.run_id}")

2025-08-01 17:15:04 - INFO - üöÄ Starting the experiment: Spam_Detection_Experiment
2025-08-01 17:15:04 - INFO - üìÅ Run's Artifact URI: /phoenix/mlflow/711006087746390577/fd906992e7e5436e9fd7fd65895e8517/artifacts
2025-08-01 17:15:05 - INFO - ‚úÖ Model and configuration loaded successfully
2025-08-01 17:15:05 - INFO - ‚úÖ Model and configuration loaded successfully


Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/34 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

2025-08-01 17:15:10 - INFO - ‚úÖ Model and artifacts successfully registered in MLflow
Registered model 'Spam_Detection_Model' already exists. Creating a new version of this model...
Created version '21' of model 'Spam_Detection_Model'.
2025-08-01 17:15:11 - INFO - ‚úÖ Model registered successfully with run ID: fd906992e7e5436e9fd7fd65895e8517


## Fetching the Latest Model Version from MLflow

In [11]:
# Initialize the MLflow client
client = MlflowClient()

# Retrieve every registered version of MODEL_NAME and keep the highest one.
# `get_latest_versions(..., stages=["None"])` is deprecated along with registry
# stages and only sees versions still in the "None" stage. It could also return
# an empty list, which made the former `[0]` lookup fail with a bare IndexError.
model_metadata = client.search_model_versions(f"name='{MODEL_NAME}'")
if not model_metadata:
    raise RuntimeError(f"No registered versions found for model '{MODEL_NAME}'.")

# Versions may be reported as strings; compare them numerically.
latest_model_version = max(model_metadata, key=lambda mv: int(mv.version)).version

# Fetch model information, including its signature
model_info = mlflow.models.get_model_info(f"models:/{MODEL_NAME}/{latest_model_version}")

# Log the latest model version and its signature
logger.info(f"Latest Model Version: {latest_model_version}")
logger.info(f"Model Signature: {model_info.signature}")

2025-08-01 17:15:12 - INFO - Latest Model Version: 21
2025-08-01 17:15:12 - INFO - Model Signature: inputs: 
  ['text': string (required)]
outputs: 
  [string (required)]
params: 
  None



## Loading the Model and Running Inference

In [12]:
# Load the registered version through the generic pyfunc interface.
registered_model_uri = f"models:/{MODEL_NAME}/{latest_model_version}"
model = mlflow.pyfunc.load_model(model_uri=registered_model_uri)

# Echo the sample message, then wrap it in a one-row frame whose single
# column is named 'text', as declared in the model signature.
logger.info(TEXT)
text = pd.DataFrame(data=[TEXT], columns=['text'])

# Run the prediction and log the returned labels.
result = model.predict(text)
logger.info(result)

2025-08-01 17:15:12 - INFO - ‚úÖ Model and configuration loaded successfully
2025-08-01 17:15:12 - INFO - You have won a free ticket!
2025-08-01 17:15:12 - INFO - ['ham']


In [13]:
# Total wall-clock time since the start-of-notebook timestamp.
end_time: float = time.time()
elapsed_time: float = end_time - start_time

# Split into whole minutes and the remaining (fractional) seconds.
whole_minutes, elapsed_seconds = divmod(elapsed_time, 60)
elapsed_minutes: int = int(whole_minutes)

logger.info(f"‚è±Ô∏è Total execution time: {elapsed_minutes}m {elapsed_seconds:.2f}s")
logger.info("‚úÖ Notebook execution completed successfully.")

2025-08-01 17:15:12 - INFO - ‚è±Ô∏è Total execution time: 0m 10.99s
2025-08-01 17:15:12 - INFO - ‚úÖ Notebook execution completed successfully.


Built with ❤️ using [**Z by HP AI Studio**](https://zdocs.datascience.hp.com/docs/aistudio/overview).