# Applied Machine Learning
**Assignment 2**<br>
Shubhangi Sanyal (MDS202238)

## Model version control and experiment tracking

In [3]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.10.2-py3-none-any.whl.metadata (13 kB)
Collecting cloudpickle<4 (from mlflow)
  Downloading cloudpickle-3.0.0-py3-none-any.whl.metadata (7.0 kB)
Collecting importlib-metadata!=4.7.0,<8,>=3.7.0 (from mlflow)
  Downloading importlib_metadata-7.0.1-py3-none-any.whl.metadata (4.9 kB)
Collecting sqlparse<1,>=0.4.0 (from mlflow)
  Downloading sqlparse-0.4.4-py3-none-any.whl.metadata (4.0 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.0.0-py3-none-any.whl.metadata (3.5 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.0.2-py3-none-any.whl.metadata (3.6 kB)
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting pyarrow<16,>=4.0.0 (from mlflow)
  Downloading pyarrow-15.0.0-cp311-cp311-win_amd64.whl.metadata (3.1 kB)
Collecting markdown<4,>=3.3 (fro

In [4]:
import pandas as pd

In [1]:
%cd C:\Users\shubh\Downloads\AML_Ass2\Applied_ML\Assignment_2

C:\Users\shubh\Downloads\AML_Ass2\Applied_ML\Assignment_2


Loading the train, validation, and test datasets

In [5]:
train = pd.read_csv('train.csv')
val = pd.read_csv('validation.csv')
test = pd.read_csv('test.csv')

In [9]:
# Separating into text and label
X_train = train['text']
y_train = train['spam']
X_val = val['text']
y_val = val['spam']
X_test = test['text']
y_test = test['spam']

In [20]:
label_mapping = {'Spam': 1, 'Ham': 0}
y_train = y_train.replace(label_mapping)
y_val = y_val.replace(label_mapping)
y_test = y_test.replace(label_mapping)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, average_precision_score
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient

In [28]:
def train_and_log_model(model_name, X_train, y_train, X_val, y_val, params={},tags={}):
    with mlflow.start_run(run_name=model_name):
        # model pipeline
        if model_name == 'Random_Forest':
            model = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words='english')),
                ('clf', RandomForestClassifier(random_state=42, **params))
            ])
        elif model_name == 'Bernoulli_Naive_Bayes':
            model = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words='english')),
                ('clf', BernoulliNB(**params))
            ])
        elif model_name == 'Support_Vector_Machine':
            model = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words='english')),
                ('clf', SVC(random_state=42, probability=True, **params))
            ])
        else:
            raise ValueError("Model name not recognized.")
        
        # Training the model
        model.fit(X_train, y_train)

        # Evaluating the model on validation dataset
        y_pred_val = model.predict(X_val)
        accuracy = accuracy_score(y_val, y_pred_val)
        aucpr = average_precision_score(y_val, model.predict_proba(X_val)[:, 1])

        # Logging parameters, metrics, and model
        mlflow.log_params(params)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("aucpr", aucpr)
        mlflow.sklearn.log_model(model, f"model_{model_name}")
        
        # Registering the model
        mlflow.sklearn.log_model(model, f"model_{model_name}")

        # Run ID
        run_id = mlflow.active_run().info.run_id

        # model name in the MLflow Model Registry
        client = MlflowClient()
        try:
            client.create_registered_model(model_name)
        except Exception as e:
            print(f"Model {model_name} already exists in the registry.")

        # Creating a new version of the model in the registry
        model_uri = f"runs:/{run_id}/model_{model_name}"
        model_version_info = client.create_model_version(model_name, model_uri, run_id)

        # Adding tags to the model version
        tags['Created by'] = 'Shubhangi' 
        for tag_key, tag_value in tags.items():
            client.set_model_version_tag(
                model_name,
                model_version_info.version,
                tag_key,
                tag_value
            )

        print(f"Model {model_name}, version {model_version_info.version} registered in the MLflow Model Registry with tags {tags}.")
        print(f"Model: {model_name}, Accuracy: {accuracy}, AUCPR: {aucpr}")

Starting MLFlow Experiment

In [12]:
# Experiment name
mlflow.set_experiment("Email Spam-Ham Classification")

2024/02/20 12:11:14 INFO mlflow.tracking.fluent: Experiment with name 'Email Spam-Ham Classification' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///C:/Users/shubh/Downloads/AML_Ass2/Applied_ML/Assignment_2/mlruns/584957309804862089', creation_time=1708411274224, experiment_id='584957309804862089', last_update_time=1708411274224, lifecycle_stage='active', name='Email Spam-Ham Classification', tags={}>

In [13]:
tags = {
    "Review": "Passed",
    "Ready for Deployment": "Yes"
}

In [24]:
import warnings
warnings.filterwarnings('ignore')

In [29]:
# Training and logging models
model_names = ['Random_Forest', 'Bernoulli_Naive_Bayes', 'Support_Vector_Machine']
for model_name in model_names:
    train_and_log_model(model_name, X_train, y_train, X_val, y_val)

Model Random_Forest already exists in the registry.
Model Random_Forest, version 4 registered in the MLflow Model Registry with tags {'Created by': 'Shubhangi'}.
Model: Random_Forest, Accuracy: 0.9813736903376019, AUCPR: 0.9984828841061683
Model Bernoulli_Naive_Bayes already exists in the registry.
Model Bernoulli_Naive_Bayes, version 3 registered in the MLflow Model Registry with tags {'Created by': 'Shubhangi'}.
Model: Bernoulli_Naive_Bayes, Accuracy: 0.989522700814901, AUCPR: 0.9987744233703195
Model Support_Vector_Machine, version 1 registered in the MLflow Model Registry with tags {'Created by': 'Shubhangi'}.
Model: Support_Vector_Machine, Accuracy: 0.9918509895227008, AUCPR: 0.9996168860860241


In [30]:
"""
Viewing results of all versions of the models in the mlflow user interface
http://localhost:5000/
"""
! mlflow ui

^C


Evaluating final model on test dataset

In [33]:
# Chosen model after comparing results stored in the user interface
model_name = "Support_Vector_Machine" 
model_version = "1"  

In [34]:
model_uri = f"models:/{model_name}/{model_version}"
model = mlflow.sklearn.load_model(model_uri)

In [35]:
# Use the loaded model to make predictions on the test dataset
y_pred = model.predict(X_test)

# Calculate metrics on the test dataset
accuracy = accuracy_score(y_test, y_pred)
aucpr = average_precision_score(y_test, model.predict_proba(X_test)[:, 1])

In [36]:
# Print out the metrics
print(f"Accuracy on test dataset: {accuracy}")
print(f"AUCPR on test dataset: {aucpr}")

Accuracy on test dataset: 0.9965156794425087
AUCPR on test dataset: 0.999554367201426
