### Import Stuff

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_curve, auc
import mlflow
import mlflow.sklearn
import pickle
import os
from mlflow.models.signature import infer_signature

### Set Up Stuff

In [2]:
mlflow.set_experiment("SMS Spam Classification")

with open('data/vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)


In [3]:
def calculate_aucpr(model, X, y_true):
    y_score = model.predict_proba(X)[:, 1]
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    return auc(recall, precision)

print("\n=== Checking out first version ===")
!git checkout $(cat first_version_commit.txt) -- data/*.dvc
!dvc checkout

train = pd.read_csv('data/train.csv')
validation = pd.read_csv('data/validation.csv')
test = pd.read_csv('data/test.csv')

print("\nDistribution for first version (seed 42):")
print("Train:", train['label'].value_counts().to_dict())
print("Validation:", validation['label'].value_counts().to_dict())
print("Test:", test['label'].value_counts().to_dict())




=== Checking out first version ===
Building workspace index                             |5.00 [00:00, 1.04kentry/s]
Comparing indexes                                    |6.00 [00:00, 6.83kentry/s]
Applying changes                                      |3.00 [00:00, 2.50kfile/s]
[33mM[0m       data/validation.csv
[33mM[0m       data/train.csv
[33mM[0m       data/test.csv
[0m
Distribution for first version (seed 42):
Train: {0: 2404, 1: 383}
Validation: {0: 1212, 1: 181}
Test: {0: 1211, 1: 183}


### Train First Version

In [4]:
X_train = vectorizer.transform(train['text'])
y_train = train['label']
X_val = vectorizer.transform(validation['text'])
y_val = validation['label']
X_test = vectorizer.transform(test['text'])
y_test = test['label']

input_example = X_train[:5]


In [5]:

with mlflow.start_run(run_name="Logistic Regression"):
    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train, y_train)
    
    predictions = lr_model.predict(input_example)
    
    signature = infer_signature(input_example, predictions)
    
    val_aucpr = calculate_aucpr(lr_model, X_val, y_val)
    test_aucpr = calculate_aucpr(lr_model, X_test, y_test)
    
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("max_iter", 1000)
    mlflow.log_param("data_version", "seed 42")
    mlflow.log_metric("val_AUCPR", val_aucpr)
    mlflow.log_metric("test_AUCPR", test_aucpr)
    
    mlflow.sklearn.log_model(
        lr_model, 
        "logistic_regression_model",
        registered_model_name="LogisticRegressionSpamClassifier",
        signature=signature,
        input_example=input_example
    )
    
    print(f"Logistic Regression - Validation AUCPR: {val_aucpr}, Test AUCPR: {test_aucpr}")

with mlflow.start_run(run_name="Random Forest"):
    rf_model = RandomForestClassifier(n_estimators=100)
    rf_model.fit(X_train, y_train)
    
    predictions = rf_model.predict(input_example)
    
    signature = infer_signature(input_example, predictions)
    
    val_aucpr = calculate_aucpr(rf_model, X_val, y_val)
    test_aucpr = calculate_aucpr(rf_model, X_test, y_test)
    
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("data_version", "seed 42")
    mlflow.log_metric("val_AUCPR", val_aucpr)
    mlflow.log_metric("test_AUCPR", test_aucpr)
    
    mlflow.sklearn.log_model(
        rf_model, 
        "random_forest_model",
        registered_model_name="RandomForestSpamClassifier",
        signature=signature,
        input_example=input_example
    )
    
    print(f"Random Forest - Validation AUCPR: {val_aucpr}, Test AUCPR: {test_aucpr}")

with mlflow.start_run(run_name="Naive Bayes"):
    nb_model = MultinomialNB()
    nb_model.fit(X_train, y_train)
    
    predictions = nb_model.predict(input_example)
    
    signature = infer_signature(input_example, predictions)
    
    val_aucpr = calculate_aucpr(nb_model, X_val, y_val)
    test_aucpr = calculate_aucpr(nb_model, X_test, y_test)
    
    mlflow.log_param("model_type", "MultinomialNB")
    mlflow.log_param("data_version", "seed 42")
    mlflow.log_metric("val_AUCPR", val_aucpr)
    mlflow.log_metric("test_AUCPR", test_aucpr)
    
    mlflow.sklearn.log_model(
        nb_model, 
        "naive_bayes_model",
        registered_model_name="NaiveBayesSpamClassifier",
        signature=signature,
        input_example=input_example
    )
    
    print(f"Naive Bayes - Validation AUCPR: {val_aucpr}, Test AUCPR: {test_aucpr}")



Registered model 'LogisticRegressionSpamClassifier' already exists. Creating a new version of this model...
Created version '3' of model 'LogisticRegressionSpamClassifier'.


Logistic Regression - Validation AUCPR: 0.9669121739445934, Test AUCPR: 0.9575823254195548


Registered model 'RandomForestSpamClassifier' already exists. Creating a new version of this model...
Created version '3' of model 'RandomForestSpamClassifier'.


Random Forest - Validation AUCPR: 0.9852913349991586, Test AUCPR: 0.9797536516682984
Naive Bayes - Validation AUCPR: 0.9758677579099335, Test AUCPR: 0.9609885796097488


Registered model 'NaiveBayesSpamClassifier' already exists. Creating a new version of this model...
Created version '3' of model 'NaiveBayesSpamClassifier'.


### Display First Version

In [6]:

print("\nModel Comparison (First version data - seed 42):")
print(f"Logistic Regression - Validation AUCPR: {calculate_aucpr(lr_model, X_val, y_val)}")
print(f"Random Forest - Validation AUCPR: {calculate_aucpr(rf_model, X_val, y_val)}")
print(f"Naive Bayes - Validation AUCPR: {calculate_aucpr(nb_model, X_val, y_val)}")


Model Comparison (First version data - seed 42):
Logistic Regression - Validation AUCPR: 0.9669121739445934
Random Forest - Validation AUCPR: 0.9852913349991586
Naive Bayes - Validation AUCPR: 0.9758677579099335


### Switch to Second Version

In [7]:
print("\n=== Checking out second version ===")
!git checkout $(cat second_version_commit.txt) -- data/*.dvc
!dvc checkout

train = pd.read_csv('data/train.csv')
validation = pd.read_csv('data/validation.csv')
test = pd.read_csv('data/test.csv')

print("\nDistribution for second version (seed 123):")
print("Train:", train['label'].value_counts().to_dict())
print("Validation:", validation['label'].value_counts().to_dict())
print("Test:", test['label'].value_counts().to_dict())


=== Checking out second version ===
Building workspace index                             |5.00 [00:00, 1.54kentry/s]
Comparing indexes                                    |6.00 [00:00, 9.33kentry/s]
Applying changes                                      |3.00 [00:00, 3.81kfile/s]
[33mM[0m       data/train.csv
[33mM[0m       data/validation.csv
[33mM[0m       data/test.csv
[0m
Distribution for second version (seed 123):
Train: {0: 2413, 1: 374}
Validation: {0: 1208, 1: 185}
Test: {0: 1206, 1: 188}


### Train Second Version

In [8]:
X_train = vectorizer.transform(train['text'])
y_train = train['label']
X_val = vectorizer.transform(validation['text'])
y_val = validation['label']
X_test = vectorizer.transform(test['text'])
y_test = test['label']

client = mlflow.tracking.MlflowClient()

experiment = client.get_experiment_by_name("SMS Spam Classification")
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

print("\nAll tracked experiments:")
print(runs[['run_id', 'metrics.val_AUCPR', 'params.model_type']])

model_names = {
    "LogisticRegression": "LogisticRegressionSpamClassifier",
    "RandomForest": "RandomForestSpamClassifier",
    "NaiveBayes": "NaiveBayesSpamClassifier"
}

print("\nModel Performance on Second Version Data (seed 123):")
for name, model_name in model_names.items():
    latest_version = client.get_latest_versions(model_name)[0].version
    
    model_uri = f"models:/{model_name}/{latest_version}"
    model = mlflow.sklearn.load_model(model_uri)
    
    val_aucpr = calculate_aucpr(model, X_val, y_val)
    test_aucpr = calculate_aucpr(model, X_test, y_test)
    print(f"{name} (version {latest_version}) - Validation AUCPR: {val_aucpr}, Test AUCPR: {test_aucpr}")


All tracked experiments:
                             run_id  metrics.val_AUCPR   params.model_type
0  d085b6b98e5745e8a6b87e18ac86cc32           0.975868       MultinomialNB
1  6d1fd6bfb2d140de8c9555d37933eda3           0.985291        RandomForest
2  d8f176c66aa74efb922913193192a87f           0.966912  LogisticRegression
3  0424c4cec49a48ae845835b59abe3ce5           0.975868       MultinomialNB
4  d2d201106281428ea1d739a7d5905800           0.983223        RandomForest
5  3e825be38dfc4c20883070dc28a45694           0.966912  LogisticRegression
6  07d398e727c9458eadeb581f54fe677b           0.975868       MultinomialNB
7  a47026fddf804fa3b312779579559e8a           0.981535        RandomForest
8  ea54a789af6b463ea7b2a24a3a162453           0.966912  LogisticRegression

Model Performance on Second Version Data (seed 123):
LogisticRegression (version 3) - Validation AUCPR: 0.9816893780935787, Test AUCPR: 0.9672808496109955
RandomForest (version 3) - Validation AUCPR: 0.9962985855592688, Tes

  latest_version = client.get_latest_versions(model_name)[0].version
  latest_version = client.get_latest_versions(model_name)[0].version
  latest_version = client.get_latest_versions(model_name)[0].version
