In [2]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_curve, auc
import mlflow
import mlflow.sklearn
import pickle
import os
from mlflow.models.signature import infer_signature

# Start MLflow tracking
mlflow.set_experiment("SMS Spam Classification")

# Load the vectorizer
with open('data/vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

# Function to calculate AUCPR
def calculate_aucpr(model, X, y_true):
    y_score = model.predict_proba(X)[:, 1]
    precision, recall, _ = precision_recall_curve(y_true, y_score)
    return auc(recall, precision)

# Load the first version data
print("\n=== Checking out first version ===")
!git checkout $(cat first_version_commit.txt) -- data/*.dvc
!dvc checkout

# Load the data from first version
train = pd.read_csv('data/train.csv')
validation = pd.read_csv('data/validation.csv')
test = pd.read_csv('data/test.csv')

# Print distributions for first version
print("\nDistribution for first version (seed 42):")
print("Train:", train['label'].value_counts().to_dict())
print("Validation:", validation['label'].value_counts().to_dict())
print("Test:", test['label'].value_counts().to_dict())

# Prepare features and target
X_train = vectorizer.transform(train['text'])
y_train = train['label']
X_val = vectorizer.transform(validation['text'])
y_val = validation['label']
X_test = vectorizer.transform(test['text'])
y_test = test['label']

# Create a small input example for model signature
input_example = X_train[:5]

# Model 1: Logistic Regression
with mlflow.start_run(run_name="Logistic Regression"):
    # Train the model
    lr_model = LogisticRegression(max_iter=1000)
    lr_model.fit(X_train, y_train)
    
    # Generate predictions for signature
    predictions = lr_model.predict(input_example)
    
    # Infer model signature
    signature = infer_signature(input_example, predictions)
    
    # Evaluate
    val_aucpr = calculate_aucpr(lr_model, X_val, y_val)
    test_aucpr = calculate_aucpr(lr_model, X_test, y_test)
    
    # Log parameters and metrics
    mlflow.log_param("model_type", "LogisticRegression")
    mlflow.log_param("max_iter", 1000)
    mlflow.log_param("data_version", "seed 42")
    mlflow.log_metric("val_AUCPR", val_aucpr)
    mlflow.log_metric("test_AUCPR", test_aucpr)
    
    # Register the model with signature and input example
    mlflow.sklearn.log_model(
        lr_model, 
        "logistic_regression_model",
        registered_model_name="LogisticRegressionSpamClassifier",
        signature=signature,
        input_example=input_example
    )
    
    print(f"Logistic Regression - Validation AUCPR: {val_aucpr}, Test AUCPR: {test_aucpr}")

# Model 2: Random Forest
with mlflow.start_run(run_name="Random Forest"):
    # Train the model
    rf_model = RandomForestClassifier(n_estimators=100)
    rf_model.fit(X_train, y_train)
    
    # Generate predictions for signature
    predictions = rf_model.predict(input_example)
    
    # Infer model signature
    signature = infer_signature(input_example, predictions)
    
    # Evaluate
    val_aucpr = calculate_aucpr(rf_model, X_val, y_val)
    test_aucpr = calculate_aucpr(rf_model, X_test, y_test)
    
    # Log parameters and metrics
    mlflow.log_param("model_type", "RandomForest")
    mlflow.log_param("n_estimators", 100)
    mlflow.log_param("data_version", "seed 42")
    mlflow.log_metric("val_AUCPR", val_aucpr)
    mlflow.log_metric("test_AUCPR", test_aucpr)
    
    # Register the model with signature and input example
    mlflow.sklearn.log_model(
        rf_model, 
        "random_forest_model",
        registered_model_name="RandomForestSpamClassifier",
        signature=signature,
        input_example=input_example
    )
    
    print(f"Random Forest - Validation AUCPR: {val_aucpr}, Test AUCPR: {test_aucpr}")

# Model 3: Naive Bayes
with mlflow.start_run(run_name="Naive Bayes"):
    # Train the model
    nb_model = MultinomialNB()
    nb_model.fit(X_train, y_train)
    
    # Generate predictions for signature
    predictions = nb_model.predict(input_example)
    
    # Infer model signature
    signature = infer_signature(input_example, predictions)
    
    # Evaluate
    val_aucpr = calculate_aucpr(nb_model, X_val, y_val)
    test_aucpr = calculate_aucpr(nb_model, X_test, y_test)
    
    # Log parameters and metrics
    mlflow.log_param("model_type", "MultinomialNB")
    mlflow.log_param("data_version", "seed 42")
    mlflow.log_metric("val_AUCPR", val_aucpr)
    mlflow.log_metric("test_AUCPR", test_aucpr)
    
    # Register the model with signature and input example
    mlflow.sklearn.log_model(
        nb_model, 
        "naive_bayes_model",
        registered_model_name="NaiveBayesSpamClassifier",
        signature=signature,
        input_example=input_example
    )
    
    print(f"Naive Bayes - Validation AUCPR: {val_aucpr}, Test AUCPR: {test_aucpr}")

# Print summary of all models on first version data
print("\nModel Comparison (First version data - seed 42):")
print(f"Logistic Regression - Validation AUCPR: {calculate_aucpr(lr_model, X_val, y_val)}")
print(f"Random Forest - Validation AUCPR: {calculate_aucpr(rf_model, X_val, y_val)}")
print(f"Naive Bayes - Validation AUCPR: {calculate_aucpr(nb_model, X_val, y_val)}")

# Load the second version data
print("\n=== Checking out second version ===")
!git checkout $(cat second_version_commit.txt) -- data/*.dvc
!dvc checkout

# Load the data from second version
train = pd.read_csv('data/train.csv')
validation = pd.read_csv('data/validation.csv')
test = pd.read_csv('data/test.csv')

# Print distributions for second version
print("\nDistribution for second version (seed 123):")
print("Train:", train['label'].value_counts().to_dict())
print("Validation:", validation['label'].value_counts().to_dict())
print("Test:", test['label'].value_counts().to_dict())

# Prepare features and target for second version
X_train = vectorizer.transform(train['text'])
y_train = train['label']
X_val = vectorizer.transform(validation['text'])
y_val = validation['label']
X_test = vectorizer.transform(test['text'])
y_test = test['label']

# Get MLflow client
client = mlflow.tracking.MlflowClient()

# Get all runs from the experiment
experiment = client.get_experiment_by_name("SMS Spam Classification")
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Print all runs
print("\nAll tracked experiments:")
print(runs[['run_id', 'metrics.val_AUCPR', 'params.model_type']])

# Load and evaluate models on second version data
model_names = {
    "LogisticRegression": "LogisticRegressionSpamClassifier",
    "RandomForest": "RandomForestSpamClassifier",
    "NaiveBayes": "NaiveBayesSpamClassifier"
}

print("\nModel Performance on Second Version Data (seed 123):")
for name, model_name in model_names.items():
    # Get latest version
    latest_version = client.get_latest_versions(model_name)[0].version
    
    # Load the model
    model_uri = f"models:/{model_name}/{latest_version}"
    model = mlflow.sklearn.load_model(model_uri)
    
    # Evaluate
    val_aucpr = calculate_aucpr(model, X_val, y_val)
    test_aucpr = calculate_aucpr(model, X_test, y_test)
    print(f"{name} (version {latest_version}) - Validation AUCPR: {val_aucpr}, Test AUCPR: {test_aucpr}")


=== Checking out first version ===
Building workspace index                             |5.00 [00:00, 1.06kentry/s]
Comparing indexes                                    |6.00 [00:00, 8.57kentry/s]
Applying changes                                      |3.00 [00:00, 2.86kfile/s]
[33mM[0m       data/test.csv
[33mM[0m       data/train.csv
[33mM[0m       data/validation.csv
[0m
Distribution for first version (seed 42):
Train: {0: 2404, 1: 383}
Validation: {0: 1212, 1: 181}
Test: {0: 1211, 1: 183}


Registered model 'LogisticRegressionSpamClassifier' already exists. Creating a new version of this model...
Created version '2' of model 'LogisticRegressionSpamClassifier'.


Logistic Regression - Validation AUCPR: 0.9669121739445934, Test AUCPR: 0.9575823254195548


Registered model 'RandomForestSpamClassifier' already exists. Creating a new version of this model...
Created version '2' of model 'RandomForestSpamClassifier'.


Random Forest - Validation AUCPR: 0.9832233330222511, Test AUCPR: 0.9790682577863085
Naive Bayes - Validation AUCPR: 0.9758677579099335, Test AUCPR: 0.9609885796097488

Model Comparison (First version data - seed 42):
Logistic Regression - Validation AUCPR: 0.9669121739445934
Random Forest - Validation AUCPR: 0.9832233330222511
Naive Bayes - Validation AUCPR: 0.9758677579099335

=== Checking out second version ===


Registered model 'NaiveBayesSpamClassifier' already exists. Creating a new version of this model...
Created version '2' of model 'NaiveBayesSpamClassifier'.


Building workspace index                             |5.00 [00:00, 1.47kentry/s]
Comparing indexes                                    |6.00 [00:00, 10.1kentry/s]
Applying changes                                      |3.00 [00:00, 3.16kfile/s]
[33mM[0m       data/test.csv
[33mM[0m       data/validation.csv
[33mM[0m       data/train.csv
[0m
Distribution for second version (seed 123):
Train: {0: 2413, 1: 374}
Validation: {0: 1208, 1: 185}
Test: {0: 1206, 1: 188}

All tracked experiments:
                             run_id  metrics.val_AUCPR   params.model_type
0  0424c4cec49a48ae845835b59abe3ce5           0.975868       MultinomialNB
1  d2d201106281428ea1d739a7d5905800           0.983223        RandomForest
2  3e825be38dfc4c20883070dc28a45694           0.966912  LogisticRegression
3  07d398e727c9458eadeb581f54fe677b           0.975868       MultinomialNB
4  a47026fddf804fa3b312779579559e8a           0.981535        RandomForest
5  ea54a789af6b463ea7b2a24a3a162453           0.96691

  latest_version = client.get_latest_versions(model_name)[0].version
  latest_version = client.get_latest_versions(model_name)[0].version
  latest_version = client.get_latest_versions(model_name)[0].version
