In [1]:
! pip install mlflow scikit-learn xgboost pandas numpy

Defaulting to user installation because normal site-packages is not writeable


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_curve, auc
import mlflow
import mlflow.sklearn

In [4]:
# Load the saved splits
train = pd.read_csv("train.csv")
validation = pd.read_csv("validation.csv")
test = pd.read_csv("test.csv")

# Prepare features and labels
X_train, y_train = train["Message"], train["Label"].apply(lambda x: 1 if x == "spam" else 0)
X_val, y_val = validation["Message"], validation["Label"].apply(lambda x: 1 if x == "spam" else 0)
X_test, y_test = test["Message"], test["Label"].apply(lambda x: 1 if x == "spam" else 0)

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)

In [5]:
def calculate_aucpr(y_true, y_pred_proba):
    precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
    return auc(recall, precision)

In [6]:
# Define the models to evaluate
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

# Start MLflow experiment
mlflow.set_experiment("Spam_Ham_Classification")

for model_name, model in models.items():
    with mlflow.start_run():
        # Train the model on the training set
        model.fit(X_train_tfidf, y_train)

        # Predict probabilities on the validation set
        y_val_pred_proba = model.predict_proba(X_val_tfidf)[:, 1]

        # Calculate AUCPR on the validation set
        val_aucpr = calculate_aucpr(y_val, y_val_pred_proba)
        print(f"{model_name} - Validation AUCPR: {val_aucpr}")

        # Log validation metrics
        mlflow.log_metric("Validation_AUCPR", val_aucpr)

        # Predict probabilities on the test set
        y_test_pred_proba = model.predict_proba(X_test_tfidf)[:, 1]

        # Calculate AUCPR on the test set
        test_aucpr = calculate_aucpr(y_test, y_test_pred_proba)
        print(f"{model_name} - Test AUCPR: {test_aucpr}")

        # Log test metrics
        mlflow.log_metric("Test_AUCPR", test_aucpr)

        # Log the model
        mlflow.sklearn.log_model(model, model_name)

2025/03/05 21:12:17 INFO mlflow.tracking.fluent: Experiment with name 'Spam_Ham_Classification' does not exist. Creating a new experiment.


Logistic Regression - Validation AUCPR: 0.9658774231329608
Logistic Regression - Test AUCPR: 0.9858878854003535




Random Forest - Validation AUCPR: 0.9728352244955467
Random Forest - Test AUCPR: 0.9864134189104092


Parameters: { "use_label_encoder" } are not used.



XGBoost - Validation AUCPR: 0.9514961552746767
XGBoost - Test AUCPR: 0.9707991111676805




In [8]:
# Retrieve runs and print AUCPR
experiment_id = mlflow.get_experiment_by_name("Spam_Ham_Classification").experiment_id
runs = mlflow.search_runs(experiment_id)

for _, run in runs.iterrows():
    print(f"Model: {run['tags.mlflow.runName']}, Validation AUCPR: {run['metrics.Validation_AUCPR']}, Test AUCPR: {run['metrics.Test_AUCPR']}")

Model: colorful-bear-88, Validation AUCPR: 0.9514961552746767, Test AUCPR: 0.9707991111676805
Model: wistful-hen-585, Validation AUCPR: 0.9728352244955467, Test AUCPR: 0.9864134189104092
Model: unleashed-duck-372, Validation AUCPR: 0.9658774231329608, Test AUCPR: 0.9858878854003535


In [9]:
# Find the best model based on Validation AUCPR
best_run = runs.loc[runs["metrics.Validation_AUCPR"].idxmax()]
best_model_name = best_run["tags.mlflow.runName"]
best_model_uri = f"runs:/{best_run['run_id']}/{best_model_name}"

# Register the best model
mlflow.register_model(best_model_uri, "Spam_Ham_Best_Model")

Successfully registered model 'Spam_Ham_Best_Model'.
Created version '1' of model 'Spam_Ham_Best_Model'.


<ModelVersion: aliases=[], creation_timestamp=1741189492062, current_stage='None', description=None, last_updated_timestamp=1741189492062, name='Spam_Ham_Best_Model', run_id='0cc0d6a01c384f6490441080e3b332bf', run_link=None, source='file:///home/pritam/Downloads/pritamAML2/Data/mlruns/308788947089622271/0cc0d6a01c384f6490441080e3b332bf/artifacts/wistful-hen-585', status='READY', status_message=None, tags={}, user_id=None, version=1>