<h1 style="text-align: center; font-size: 50px;"> Spam Detection with NLP (Natural Language Processing) MLflow Integration </h1>

Notebook Overview
- Start Execution
- User Constants
- Install and Import Libraries
- Configure Settings
- Verify Assets
- Logging Model to MLflow
- Fetching the Latest Model Version from MLflow
- Loading the Model and Running Inference

## Start Execution

In [1]:
import logging
import time

# Configure logger
logger: logging.Logger = logging.getLogger("register_model_logger")
logger.setLevel(logging.INFO)
logger.propagate = False  # Prevent duplicate logs from parent loggers

# Set formatter
formatter: logging.Formatter = logging.Formatter(
    fmt="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack a second handler and print every message twice.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

# Configure and attach stream handler
stream_handler: logging.StreamHandler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

In [2]:
# Wall-clock reference point; the final cell subtracts it from the end time
# to report the total execution time.
start_time: float = time.time()
logger.info("Notebook execution started.")

2025-07-16 12:11:49 - INFO - Notebook execution started.


## User Constants

In [3]:
# Sample message that is sent to the registered model in the inference section.
TEXT: str = 'You have won a free ticket!'

## Install and Import Libraries

In [4]:
# Install the packages listed in ../requirements.txt into the running kernel's
# environment; --quiet keeps pip's progress output out of the notebook.
%pip install -r ../requirements.txt --quiet

Note: you may need to restart the kernel to use updated packages.


In [5]:
# ------------------------ System Utilities ------------------------
import warnings
from pathlib import Path
import os

# ------------------------ Data Manipulation ------------------------
import pandas as pd

# ------------------------ Text Preprocessing ------------------------
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from types import SimpleNamespace
from sklearn.metrics import classification_report

# ------------------------ Machine Learning tools ------------------------
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# ------------------------ MLflow for Experiment Tracking and Model Management ------------------------
import mlflow
from mlflow import MlflowClient
from mlflow.models.signature import ModelSignature
from mlflow.types.schema import Schema, ColSpec

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Configure Settings

In [6]:
# Install a catch-all "ignore" filter so that Python warnings are not displayed
warnings.filterwarnings(action="ignore")

In [7]:
# ------------------------- Paths -------------------------
# CSV file that the model reads with pandas when it fits its pipeline
DATA_PATH = "/home/jovyan/datafabric/tutorial/spam_utf8.csv"
# Directory used as the local NLTK data root (stop-word corpus lives below it)
NLTK_DIR_LOCAL = "/home/jovyan/local/nltk_data"

# ------------------------ MLflow Integration ------------------------
# Names used for the MLflow experiment, the run, and the registered model
EXPERIMENT_NAME = "Spam_Detection_Experiment"
RUN_NAME = "Spam_Detection_Run"
MODEL_NAME = "Spam_Detection_Model"

## Verify Assets

In [8]:
def log_asset_status(asset_path: str, asset_name: str, success_message: str, failure_message: str) -> None:
    """
    Logs the status of a given asset based on its existence.

    An existing asset is reported at INFO level. A missing asset is reported at
    WARNING level so that the problem stands out from routine progress messages
    and is still emitted if the logger threshold is raised above INFO.

    Parameters:
        asset_path (str): File or directory path to check.
        asset_name (str): Name of the asset for logging context.
        success_message (str): Message to log if asset exists.
        failure_message (str): Message to log if asset does not exist.
    """
    if Path(asset_path).exists():
        logger.info(f"{asset_name} is properly configured. {success_message}")
    else:
        # A missing asset is a problem the user has to act on, not routine information.
        logger.warning(f"{asset_name} is not properly configured. {failure_message}")

# Assets to verify, in order: (path, display name, hint logged when the path is missing)
for _asset_path, _asset_name, _missing_hint in (
    (
        DATA_PATH,
        "Spam data",
        "Please create and download the required assets in your project on AI Studio.",
    ),
    (
        NLTK_DIR_LOCAL,
        "NLTK Path",
        "Please check if NLTK was downloaded.",
    ),
):
    log_asset_status(
        asset_path=_asset_path,
        asset_name=_asset_name,
        success_message="",
        failure_message=_missing_hint,
    )

2025-07-16 12:11:54 - INFO - Spam data is properly configured. 
2025-07-16 12:11:54 - INFO - NLTK Path is properly configured. 


## Logging Model to MLflow

In [None]:
def ensure_local_stopwords(base_dir: str) -> None:
    """
    Make sure the NLTK English stop-word list is available under ``base_dir``.

    If ``<base_dir>/corpora/stopwords/english`` does not exist, its parent
    directory is created and the ``stopwords`` package is downloaded into
    ``base_dir``. The download is requested with ``raise_on_error=True``.
    Afterwards NLTK's data search path is replaced with a list containing only
    ``base_dir``.

    Parameters:
        base_dir (str): Directory used as the NLTK data root.
    """
    sw_file = Path(base_dir) / 'corpora' / 'stopwords' / 'english'
    if not sw_file.exists():
        sw_file.parent.mkdir(parents=True, exist_ok=True)
        logger.info("⬇️ Downloading stopwords to %s …", base_dir)
        nltk.download('stopwords', download_dir=base_dir, quiet=True, raise_on_error=True)
    # Overwrites (does not extend) the search path, so only base_dir is consulted.
    nltk.data.path = [base_dir]

class SpamDetectionModel(mlflow.pyfunc.PythonModel):
    """
    MLflow pyfunc model that classifies messages as ham or spam.

    ``load_context`` fits a CountVectorizer -> TfidfTransformer -> MultinomialNB
    pipeline on the CSV referenced by the ``data_path`` artifact, using the
    English stop-word list found under the ``nltk_data`` artifact. ``predict``
    then applies that pipeline to the incoming messages.
    """

    # Column that carries the message text when the input is a DataFrame.
    TEXT_COLUMN = 'text'

    def preprocess(self, text: str):
        """
        Preprocesses the message, performing:
        1. Removal of all punctuation
        2. Removal of all stopwords
        3. Return of a list of the cleaned text

        Requires ``self.stop_words``, which is set by ``load_context``.
        Any exception is logged and re-raised.
        """
        try:

            nopunc = ''.join(c for c in text if c not in string.punctuation)
            return [w for w in nopunc.split() if w.lower() not in self.stop_words]

        except Exception as e:
            logger.error(f"Error preprocessing: {str(e)}")
            raise

    def load_context(self, context):
        """
        Load model artifacts and fit the pipeline.

        Parameters:
            context: Object exposing an ``artifacts`` mapping with the keys
                ``nltk_data`` (NLTK data directory) and ``data_path`` (CSV file).

        Raises:
            Exception: Any failure is logged and re-raised, so a model that could
                not be fitted is never left half-initialised.
        """
        try:

            # Point NLTK exclusively at the packaged data directory.
            nltk_dir = os.path.abspath(context.artifacts['nltk_data'])
            nltk.data.path = [nltk_dir]
            os.environ['NLTK_DATA'] = nltk_dir

            self.stop_words = set(stopwords.words('english'))

            # NOTE(review): passing names= makes pandas treat every row as data;
            # if the CSV has a header row it becomes a training sample — confirm.
            df = pd.read_csv(context.artifacts['data_path'], sep=',',
                             names=['label', 'message', '_1', '_2', '_3'])
            X_tr, X_te, y_tr, y_te = train_test_split(
                df['message'], df['label'], test_size=0.2, random_state=42
            )

            self.pipeline = Pipeline([
                ('bow',   CountVectorizer(analyzer=self.preprocess)),
                ('tfidf', TfidfTransformer()),
                ('clf',   MultinomialNB()),
            ])
            self.pipeline.fit(X_tr, y_tr)
            # Held-out split kept so the caller can evaluate the fitted pipeline.
            self._X_test, self._y_test = X_te, y_te

        except Exception as e:
            logger.error(f"Error loading context: {str(e)}")
            # Re-raise: swallowing the error would leave the model without a
            # pipeline and defer the failure to the first prediction.
            raise

    def predict(self, context, model_input):
        """
        Computes the prediction of whether it is ham or spam.

        Parameters:
            context: Unused.
            model_input: A DataFrame (messages are read from the ``text`` column,
                or from the first column if ``text`` is absent), a single string,
                or any iterable of strings.

        Returns:
            The labels produced by the fitted pipeline, one per message.

        Raises:
            Exception: Any failure is logged and re-raised instead of silently
                returning ``None``.
        """
        try:

            if isinstance(model_input, pd.DataFrame):
                # Iterating a DataFrame yields its column labels, not its rows,
                # so the message column has to be selected explicitly.
                if self.TEXT_COLUMN in model_input.columns:
                    messages = model_input[self.TEXT_COLUMN]
                else:
                    messages = model_input.iloc[:, 0]
            elif isinstance(model_input, str):
                # A bare string is one message, not an iterable of messages.
                messages = [model_input]
            else:
                messages = model_input

            return self.pipeline.predict(messages)

        except Exception as e:
            logger.error(f"Error performing prediction: {str(e)}")
            raise
            
# Point MLflow at the tracking store, select the experiment, and make sure the
# stop-word corpus exists locally before it is packaged as an artifact.
mlflow.set_tracking_uri("/phoenix/mlflow")
mlflow.set_experiment(EXPERIMENT_NAME)
ensure_local_stopwords(NLTK_DIR_LOCAL)

# RUN_NAME comes from the settings cell; naming the run makes it identifiable in MLflow.
with mlflow.start_run(run_name=RUN_NAME) as run:

    # Upload the local NLTK data directory and remember where MLflow stored it.
    mlflow.log_artifacts(NLTK_DIR_LOCAL, artifact_path='nltk_data')
    nltk_artifact_uri = mlflow.get_artifact_uri('nltk_data')

    # Minimal stand-in for the MLflow context: load_context only reads `.artifacts`.
    ctx = SimpleNamespace(artifacts={
        'data_path': DATA_PATH,
        'nltk_data': nltk_artifact_uri
    })

    # Fit the pipeline in-process so it can be evaluated before being logged.
    model = SpamDetectionModel()
    model.load_context(ctx)

    # Evaluate on the held-out split kept by load_context and record the accuracy.
    preds   = model.pipeline.predict(model._X_test)
    report  = classification_report(model._y_test, preds, output_dict=True)
    mlflow.log_metric('accuracy', report['accuracy'])

    # One string column named 'text' in, one unnamed string column out.
    signature = ModelSignature(
        inputs  = Schema([ColSpec('string', 'text')]),
        outputs = Schema([ColSpec('string')])
    )

    mlflow.pyfunc.log_model(
        artifact_path=MODEL_NAME,
        python_model=model,
        artifacts={
            'data_path': DATA_PATH,
            'nltk_data': nltk_artifact_uri
        },
        signature=signature,
        pip_requirements='../requirements.txt'
    )

    # Register the model that was just logged under this run.
    model_uri = f"runs:/{run.info.run_id}/{MODEL_NAME}"
    mlflow.register_model(model_uri=model_uri, name=MODEL_NAME)

logger.info(f'Registered the model: {MODEL_NAME}')
logger.info(f'✅ Stopwords packed in the artifact: {nltk_artifact_uri}')
 

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/34 [00:00<?, ?it/s]

Registered model 'Spam_Detection_Model' already exists. Creating a new version of this model...
Created version '2' of model 'Spam_Detection_Model'.
2025-07-16 12:11:57 - INFO - Registered the model: Spam_Detection_Model
2025-07-16 12:11:57 - INFO - ✅Stopwords packed in the artifact": /phoenix/mlflow/825651502965763105/288e6b4419f5478a993917b36ee86d19/artifacts/nltk_data


## Fetching the Latest Model Version from MLflow

In [10]:
# Initialize the MLflow client
client = MlflowClient()

# Look up every registered version of MODEL_NAME and keep the highest one.
# A stage-based lookup would miss the newest version once it leaves stage "None".
registered_versions = client.search_model_versions(f"name='{MODEL_NAME}'")
if not registered_versions:
    raise RuntimeError(
        f"No registered versions found for model '{MODEL_NAME}'. "
        "Run the model logging cell first."
    )

# Version numbers are compared as integers so that e.g. 10 ranks above 9.
model_metadata = [max(registered_versions, key=lambda mv: int(mv.version))]
latest_model_version = model_metadata[0].version  # Extract the latest model version

# Fetch model information, including its signature
model_info = mlflow.models.get_model_info(f"models:/{MODEL_NAME}/{latest_model_version}")

# Log the latest model version and its signature
logger.info(f"Latest Model Version: {latest_model_version}")
logger.info(f"Model Signature: {model_info.signature}")

2025-07-16 12:11:57 - INFO - Latest Model Version: 2
2025-07-16 12:11:57 - INFO - Model Signature: inputs: 
  ['text': string (required)]
outputs: 
  [string (required)]
params: 
  None



## Loading the Model and Running Inference

In [11]:
# Load the registered model version through the MLflow pyfunc interface
model = mlflow.pyfunc.load_model(
    model_uri="models:/{}/{}".format(MODEL_NAME, latest_model_version)
)

# Wrap the sample message in a one-row frame with a single 'text' column
text = pd.DataFrame([TEXT], columns=['text'])

# Score the sample and log whatever the model returns
result = model.predict(text)
logger.info(result)

2025-07-16 12:11:57 - INFO - ['ham']


In [12]:
# Split the elapsed wall-clock time into whole minutes and leftover seconds
end_time: float = time.time()
elapsed_time: float = end_time - start_time
whole_minutes, elapsed_seconds = divmod(elapsed_time, 60)
elapsed_minutes: int = int(whole_minutes)

logger.info(f"⏱️ Total execution time: {elapsed_minutes}m {elapsed_seconds:.2f}s")
logger.info("✅ Notebook execution completed successfully.")

2025-07-16 12:11:57 - INFO - ⏱️ Total execution time: 0m 8.23s
2025-07-16 12:11:57 - INFO - ✅ Notebook execution completed successfully.


Built with ❤️ using [**Z by HP AI Studio**](https://zdocs.datascience.hp.com/docs/aistudio/overview).