In [1]:
# Install MLflow into the kernel's environment. The install log recorded below
# resolved mlflow 2.10.2; NOTE(review): consider pinning that version so a
# re-run reproduces the same dependency set.
%pip install mlflow

Collecting mlflow
  Downloading mlflow-2.10.2-py3-none-any.whl (19.5 MB)
Collecting importlib-metadata!=4.7.0,<8,>=3.7.0
  Downloading importlib_metadata-7.0.1-py3-none-any.whl (23 kB)
Collecting alembic!=1.10.0,<2
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
Collecting sqlparse<1,>=0.4.0
  Downloading sqlparse-0.4.4-py3-none-any.whl (41 kB)
Collecting querystring-parser<2
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting pyarrow<16,>=4.0.0
  Downloading pyarrow-15.0.0-cp38-cp38-win_amd64.whl (24.9 MB)
Collecting sqlalchemy<3,>=1.4.0
  Downloading SQLAlchemy-2.0.27-cp38-cp38-win_amd64.whl (2.1 MB)
Collecting cloudpickle<4
  Downloading cloudpickle-3.0.0-py3-none-any.whl (20 kB)
Collecting docker<8,>=4.0.0
  Downloading docker-7.0.0-py3-none-any.whl (147 kB)
Collecting markdown<4,>=3.3
  Downloading Markdown-3.5.2-py3-none-any.whl (103 kB)
Collecting Flask<4
  Downloading flask-3.0.2-py3-none-any.whl (101 kB)
Collecting waitress<3
  Downloading wa

You should consider upgrading via the 'c:\Users\Samriddha\AppData\Local\Programs\Python\Python38\python.exe -m pip install --upgrade pip' command.


In [12]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, average_precision_score
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

In [8]:
# Read the train / validation / test CSV files and separate text from labels
def load_split_data(train, validation, test):
    """Load three CSV splits and return their text and label columns.

    Each argument is a path (or anything ``pd.read_csv`` accepts) to a CSV
    file containing a ``text`` column and a ``spam`` column.

    Returns a 6-tuple in the order
    ``(X_train, y_train, X_val, y_val, X_test, y_test)`` where every ``X_*``
    is the ``text`` column and every ``y_*`` is the ``spam`` column of the
    corresponding file.
    """
    # All three files are read before any column is selected.
    frames = [pd.read_csv(csv_path) for csv_path in (train, validation, test)]

    # Flatten to (text, spam) pairs, keeping train -> validation -> test order.
    return tuple(
        frame[column]
        for frame in frames
        for column in ('text', 'spam')
    )

In [5]:
# Dataset location. Defaults to the original Assignment 2 folder so existing
# runs behave as before; set the SPAM_DATA_DIR environment variable to point
# the notebook at another copy of the data without editing this cell.
DATA_DIR = os.environ.get(
    'SPAM_DATA_DIR',
    r'C:\Users\Samriddha\OneDrive\Desktop\msc@cmi\cmi sem 4\Applied ML\AppliedML\Assignment 2',
)

# Paths stay plain strings, as before, so downstream code sees the same type.
train = os.path.join(DATA_DIR, 'train.csv')
validation = os.path.join(DATA_DIR, 'validation.csv')
test = os.path.join(DATA_DIR, 'test.csv')

In [9]:
# Read the three CSV files and unpack the text column (X_*) and spam column
# (y_*) of each split into notebook-level variables used by the cells below.
X_train, y_train, X_val, y_val, X_test, y_test = load_split_data(train, validation, test)

In [17]:
def _build_pipeline(model_name, params):
    """Return the TF-IDF + classifier pipeline selected by ``model_name``.

    ``params`` is forwarded as keyword arguments to the classifier constructor.
    Raises ValueError for an unrecognized name before any estimator is built.
    """
    if model_name == 'random_forest':
        classifier_step = ('classifier', RandomForestClassifier(random_state=42, **params))
    elif model_name == 'logistic_regression':
        classifier_step = ('clf', LogisticRegression(random_state=42, **params))
    elif model_name == 'SVM':
        # probability=True because predict_proba is called on the fitted
        # pipeline when computing AUCPR.
        classifier_step = ('classifier', SVC(probability=True, random_state=42, **params))
    else:
        raise ValueError("Model name not recognized.")

    return Pipeline([('tfidf', TfidfVectorizer()), classifier_step])


def train_and_log_model(model_name, X_train, y_train, X_val, y_val, params=None, tags=None):
    """Fit one text-classification pipeline, log it to MLflow, and register it.

    The pipeline (TF-IDF followed by the classifier chosen by ``model_name``)
    is fit on the training split and scored on the validation split. Params,
    metrics and the fitted model are logged to a new MLflow run named
    ``model_name``; the model artifact is stored as ``model_<model_name>``.
    A new version is then created in the Model Registry under ``model_name``
    and tagged with ``tags`` plus a ``'Created by'`` tag.

    Args:
        model_name: 'random_forest', 'logistic_regression' or 'SVM'.
        X_train, y_train: training inputs and labels passed to ``fit``.
        X_val, y_val: validation inputs and labels used for the metrics.
        params: optional dict of keyword arguments for the classifier
            constructor; the same dict is logged with ``mlflow.log_params``.
        tags: optional dict of model-version tags. The caller's dict is not
            modified.

    Raises:
        ValueError: if ``model_name`` is not recognized (raised before an
            MLflow run is started).
        mlflow.exceptions.MlflowException: if creating the registered model
            fails for any reason other than the model already existing.
    """
    # Work on copies: a shared default dict or the caller's dict must not be
    # mutated by the 'Created by' tag added below.
    params = dict(params) if params else {}
    tags = dict(tags) if tags else {}

    # Build (and thereby validate) the pipeline before opening a run, so an
    # unknown model name fails without starting an MLflow run.
    model = _build_pipeline(model_name, params)

    with mlflow.start_run(run_name=model_name):
        # Train the model
        model.fit(X_train, y_train)

        # Evaluate on the validation split
        y_pred_val = model.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred_val)
        # Column 1 of predict_proba is used as the score; assumes the positive
        # (spam) class is the second entry of the classifier's classes_ --
        # TODO confirm against the label encoding in the CSVs.
        aucpr = average_precision_score(y_val, model.predict_proba(X_val)[:, 1])

        # Log parameters, metrics, and the fitted model (once)
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("aucpr", aucpr)
        artifact_path = f"model_{model_name}"
        mlflow.sklearn.log_model(model, artifact_path)

        # Get the run ID
        run_id = mlflow.active_run().info.run_id

        # Create the registered model, tolerating only "already exists";
        # any other registry failure is propagated instead of being mislabeled.
        client = MlflowClient()
        try:
            client.create_registered_model(model_name)
        except mlflow.exceptions.MlflowException as exc:
            if exc.error_code != "RESOURCE_ALREADY_EXISTS":
                raise
            print(f"Model {model_name} already exists in the registry.")

        # Create a new version of the model in the registry
        model_uri = f"runs:/{run_id}/{artifact_path}"
        model_version_info = client.create_model_version(model_name, model_uri, run_id)

        # Add tags to the model version
        tags['Created by'] = 'samriddha'  # Replace with your identifier
        for tag_key, tag_value in tags.items():
            client.set_model_version_tag(
                model_name,
                model_version_info.version,
                tag_key,
                tag_value
            )

        print(f"Model {model_name}, version {model_version_info.version} registered in the MLflow Model Registry with tags {tags}.")
        print(f"Model: {model_name}, Accuracy: {accuracy}, AUCPR: {aucpr}")

In [14]:
# Select the MLflow experiment that subsequent runs are logged under; per the
# log output below, it is created when it does not exist yet.
mlflow.set_experiment("Email Spam Classification")

2024/02/20 20:27:14 INFO mlflow.tracking.fluent: Experiment with name 'Email Spam Classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///c:/Users/Samriddha/OneDrive/Desktop/msc%40cmi/cmi%20sem%204/Applied%20ML/AppliedML/mlruns/211608969076237354', creation_time=1708441034519, experiment_id='211608969076237354', last_update_time=1708441034519, lifecycle_stage='active', name='Email Spam Classification', tags={}>

In [20]:
# Train, evaluate on the validation split, and register each candidate model.
# Default params/tags are used, so every classifier runs with its library
# defaults apart from the fixed random_state set inside train_and_log_model.
model_names = ['random_forest', 'logistic_regression', 'SVM']
for model_name in model_names:
    train_and_log_model(model_name, X_train, y_train, X_val, y_val)

Model random_forest already exists in the registry.
Model random_forest, version 3 registered in the MLflow Model Registry with tags {'Created by': 'samriddha'}.
Model: random_forest, Accuracy: 0.9616055846422339, AUCPR: 0.9964756289961485
Model logistic_regression already exists in the registry.
Model logistic_regression, version 4 registered in the MLflow Model Registry with tags {'Created by': 'samriddha'}.
Model: logistic_regression, Accuracy: 0.9773123909249564, AUCPR: 0.9987395088020187
Model SVM already exists in the registry.
Model SVM, version 2 registered in the MLflow Model Registry with tags {'Created by': 'samriddha'}.
Model: SVM, Accuracy: 0.9895287958115183, AUCPR: 0.9998834498834499


The SVM model has the highest validation accuracy and AUCPR of the three models.

After executing the following command and navigating to http://localhost:5000 in a browser, the details of each run can be reviewed.

In [22]:
# Launch the MLflow tracking UI from the shell. NOTE(review): the command
# appears to run in the foreground until the cell is interrupted (see the ^C
# in the recorded output), so run it last or from a separate terminal.
!mlflow ui

^C
