In [2]:
! pip install mlflow scikit-learn xgboost pandas numpy

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_curve, auc
import mlflow
import mlflow.sklearn
import joblib

In [4]:
ls

 Volume in drive C is Windows
 Volume Serial Number is E26E-0A81

 Directory of c:\Users\soumy\Downloads\Assignment 3

09-04-2025  16:44    <DIR>          .
09-04-2025  16:44    <DIR>          ..
09-04-2025  16:45    <DIR>          __pycache__
09-04-2025  16:45               725 app.py
09-04-2025  16:38            23,581 best_model.ipynb
09-04-2025  16:42         4,163,458 best_model.joblib
09-04-2025  16:41    <DIR>          mlruns
09-04-2025  16:43               721 score.py
09-04-2025  16:25            57,527 test.csv
09-04-2025  16:43             3,692 test.py
09-04-2025  16:24           172,124 train.csv
09-04-2025  16:25            57,222 validation.csv
               8 File(s)      4,479,050 bytes
               4 Dir(s)  47,824,965,632 bytes free


In [6]:
# Load the saved splits
train = pd.read_csv("train.csv").dropna()
validation = pd.read_csv("validation.csv").dropna()
test = pd.read_csv("test.csv").dropna()

# Prepare features and labels
X_train, y_train = train["Message"], train["Label"]
X_val, y_val = validation["Message"], validation["Label"]
X_test, y_test = test["Message"], test["Label"]

# Convert text data to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)
X_test_tfidf = vectorizer.transform(X_test)


In [7]:
def calculate_aucpr(y_true, y_pred_proba):
    precision, recall, _ = precision_recall_curve(y_true, y_pred_proba)
    return auc(recall, precision)


In [8]:
# Define the models to evaluate
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss")
}

# Start MLflow experiment
mlflow.set_experiment("Spam_Ham_Classification")

for model_name, model in models.items():
    with mlflow.start_run():
        # Train the model on the training set
        model.fit(X_train_tfidf, y_train)

        # Predict probabilities on the validation set
        y_val_pred_proba = model.predict_proba(X_val_tfidf)[:, 1]

        # Calculate AUCPR on the validation set
        val_aucpr = calculate_aucpr(y_val, y_val_pred_proba)
        print(f"{model_name} - Validation AUCPR: {val_aucpr}")

        # Log validation metrics
        mlflow.log_metric("Validation_AUCPR", val_aucpr)

        # Predict probabilities on the test set
        y_test_pred_proba = model.predict_proba(X_test_tfidf)[:, 1]

        # Calculate AUCPR on the test set
        test_aucpr = calculate_aucpr(y_test, y_test_pred_proba)
        print(f"{model_name} - Test AUCPR: {test_aucpr}")

        # Log test metrics
        mlflow.log_metric("Test_AUCPR", test_aucpr)

        # Log the model
        mlflow.sklearn.log_model(model, model_name)

Logistic Regression - Validation AUCPR: 0.9435203787965679
Logistic Regression - Test AUCPR: 0.9311391771644204




Random Forest - Validation AUCPR: 0.9485277845417935
Random Forest - Test AUCPR: 0.9675692370563139


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost - Validation AUCPR: 0.8920490150123725
XGBoost - Test AUCPR: 0.8939017016366597




In [9]:
# Retrieve runs and print AUCPR
experiment_id = mlflow.get_experiment_by_name("Spam_Ham_Classification").experiment_id
runs = mlflow.search_runs(experiment_id)

for _, run in runs.iterrows():
    print(f"Model: {run['tags.mlflow.runName']}, Validation AUCPR: {run['metrics.Validation_AUCPR']}, Test AUCPR: {run['metrics.Test_AUCPR']}")

Model: spiffy-bass-377, Validation AUCPR: 0.8920490150123725, Test AUCPR: 0.8939017016366597
Model: hilarious-zebra-468, Validation AUCPR: 0.9485277845417935, Test AUCPR: 0.9675692370563139
Model: wise-shrike-233, Validation AUCPR: 0.9435203787965679, Test AUCPR: 0.9311391771644204
Model: amusing-wolf-53, Validation AUCPR: nan, Test AUCPR: nan
Model: mysterious-mole-855, Validation AUCPR: 0.8920490150123725, Test AUCPR: 0.8939017016366597
Model: flawless-gnu-194, Validation AUCPR: 0.9495848297996785, Test AUCPR: 0.9683186020975065
Model: wistful-jay-646, Validation AUCPR: 0.9435203787965679, Test AUCPR: 0.9311391771644204


In [10]:
# Step 1: Get the experiment and all runs
experiment_id = mlflow.get_experiment_by_name("Spam_Ham_Classification").experiment_id
runs = mlflow.search_runs(experiment_id)

# Step 2: Select best run based on Validation AUCPR
best_run = runs.loc[runs["metrics.Validation_AUCPR"].idxmax()]
best_run_id = best_run["run_id"]

print(f"Best run ID: {best_run_id}")


Best run ID: f4ea130e6f9b4e6e8ca1472db087fea5


In [11]:
# Step 3: Specify the path where the model is saved in the run artifacts
model_artifact_path = "Random Forest" 

# Step 4: Download it to a temporary directory
local_path = mlflow.artifacts.download_artifacts(run_id=best_run_id, artifact_path=model_artifact_path)
print(f"Model downloaded to: {local_path}")

Model downloaded to: C:\Users\soumy\Downloads\Assignment 3\mlruns\705379048845976499\f4ea130e6f9b4e6e8ca1472db087fea5\artifacts\Random Forest


In [12]:
# Step 5: Load the sklearn model from the downloaded path
model = mlflow.sklearn.load_model(local_path)

from sklearn.pipeline import Pipeline
import joblib
import mlflow
import mlflow.sklearn

# Combine vectorizer and model into a pipeline
pipeline = Pipeline([
    ("tfidf", vectorizer),
    ("classifier", model)
])

# Save pipeline locally
joblib.dump(pipeline, "best_model.joblib")
print("✅ Saved full pipeline as 'best_model.joblib'")

# Optionally log to MLflow (inside active run or standalone)
mlflow.sklearn.log_model(pipeline, "Spam_Ham_Pipeline")
print("📦 Logged pipeline model to MLflow as 'Spam_Ham_Pipeline'")


✅ Saved full pipeline as 'best_model.joblib'




📦 Logged pipeline model to MLflow as 'Spam_Ham_Pipeline'


In [13]:
model = joblib.load("best_model.joblib")

In [14]:
import requests

url = "http://127.0.0.1:5000/score"
data = {
    "text": "You won a free prize!",
    "threshold": 0.5
}

response = requests.post(url, json=data)
print(response.json())


{'app': 'Spamseek', 'prediction': 0, 'propensity': 0.34}
