In [1]:
! pip install mlflow scikit-learn xgboost pandas numpy

Defaulting to user installation because normal site-packages is not writeable


In [33]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_curve, auc
import mlflow
import mlflow.sklearn
import joblib

In [3]:
ls

best_model.ipynb  test.csv  train.csv  validation.csv


In [9]:
train.head()

Unnamed: 0,Label,Message
0,0,understand loss gain work school
1,0,dunno lei decide lor abt leona oops tot ben go...
2,0,fps
3,0,mum ive sent many many messages since got want...
4,0,long time remember today


In [23]:
# Load the saved splits
train = pd.read_csv("train.csv").dropna()
validation = pd.read_csv("validation.csv").dropna()
test = pd.read_csv("test.csv").dropna()

# Prepare features and labels
X_train, y_train = train["Message"], train["Label"]
X_val, y_val = validation["Message"], validation["Label"]
X_test, y_test = test["Message"], test["Label"]

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)


In [24]:
def calculate_aucpr(y_true, y_pred_proba):
    precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
    return auc(recall, precision)


In [25]:
# Define the models to evaluate
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

# Start MLflow experiment
mlflow.set_experiment("Spam_Ham_Classification")

for model_name, model in models.items():
    with mlflow.start_run():
        # Train the model on the training set
        model.fit(X_train_tfidf, y_train)

        # Predict probabilities on the validation set
        y_val_pred_proba = model.predict_proba(X_val_tfidf)[:, 1]

        # Calculate AUCPR on the validation set
        val_aucpr = calculate_aucpr(y_val, y_val_pred_proba)
        print(f"{model_name} - Validation AUCPR: {val_aucpr}")

        # Log validation metrics
        mlflow.log_metric("Validation_AUCPR", val_aucpr)

        # Predict probabilities on the test set
        y_test_pred_proba = model.predict_proba(X_test_tfidf)[:, 1]

        # Calculate AUCPR on the test set
        test_aucpr = calculate_aucpr(y_test, y_test_pred_proba)
        print(f"{model_name} - Test AUCPR: {test_aucpr}")

        # Log test metrics
        mlflow.log_metric("Test_AUCPR", test_aucpr)

        # Log the model
        mlflow.sklearn.log_model(model, model_name)

2025/04/02 15:07:59 INFO mlflow.tracking.fluent: Experiment with name 'Spam_Ham_Classification' does not exist. Creating a new experiment.


Logistic Regression - Validation AUCPR: 0.9407877292674868
Logistic Regression - Test AUCPR: 0.9278872892006962




Random Forest - Validation AUCPR: 0.9393194419611289
Random Forest - Test AUCPR: 0.9590036348815346


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost - Validation AUCPR: 0.895583971072911
XGBoost - Test AUCPR: 0.8893258696274254




In [26]:
# Retrieve runs and print AUCPR
experiment_id = mlflow.get_experiment_by_name("Spam_Ham_Classification").experiment_id
runs = mlflow.search_runs(experiment_id)

for _, run in runs.iterrows():
    print(f"Model: {run['tags.mlflow.runName']}, Validation AUCPR: {run['metrics.Validation_AUCPR']}, Test AUCPR: {run['metrics.Test_AUCPR']}")

Model: upset-horse-707, Validation AUCPR: 0.895583971072911, Test AUCPR: 0.8893258696274254
Model: polite-flea-618, Validation AUCPR: 0.9393194419611289, Test AUCPR: 0.9590036348815346
Model: learned-roo-747, Validation AUCPR: 0.9407877292674868, Test AUCPR: 0.9278872892006962


In [27]:
# Step 1: Get the experiment and all runs
experiment_id = mlflow.get_experiment_by_name("Spam_Ham_Classification").experiment_id
runs = mlflow.search_runs(experiment_id)

# Step 2: Select best run based on Validation AUCPR
best_run = runs.loc[runs["metrics.Validation_AUCPR"].idxmax()]
best_run_id = best_run["run_id"]

print(f"Best run ID: {best_run_id}")


Best run ID: 715dbf3c41484cbf8809ba437963ea1a


In [31]:
# Step 3: Specify the path where the model is saved in the run artifacts
model_artifact_path = "Logistic Regression"  # replace if your model is logged under a different name

# Step 4: Download it to a temporary directory
local_path = mlflow.artifacts.download_artifacts(run_id=best_run_id, artifact_path=model_artifact_path)
print(f"Model downloaded to: {local_path}")

Model downloaded to: /home/pritam/Documents/AppliedML/Assignment3/mlruns/275965308563201778/715dbf3c41484cbf8809ba437963ea1a/artifacts/Logistic Regression


In [37]:
# Step 5: Load the sklearn model from the downloaded path
model = mlflow.sklearn.load_model(local_path)

from sklearn.pipeline import Pipeline
import joblib
import mlflow
import mlflow.sklearn

# Combine vectorizer and model into a pipeline
pipeline = Pipeline([
    ("tfidf", vectorizer),
    ("classifier", model)
])

# Save pipeline locally
joblib.dump(pipeline, "best_model.joblib")
print("✅ Saved full pipeline as 'best_model.joblib'")

# Optionally log to MLflow (inside active run or standalone)
mlflow.sklearn.log_model(pipeline, "Spam_Ham_Pipeline")
print("📦 Logged pipeline model to MLflow as 'Spam_Ham_Pipeline'")


✅ Saved full pipeline as 'best_model.joblib'




📦 Logged pipeline model to MLflow as 'Spam_Ham_Pipeline'


In [35]:
model = joblib.load("best_model.joblib")