# Imports


In [1]:
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# from catboost import CatBoostClassifier
# from lightgbm import LGBMClassifier

from sklearn.metrics import average_precision_score, accuracy_score

import mlflow
from mlflow.tracking import MlflowClient

import json
import warnings

warnings.simplefilter("ignore")

# Loading the dataset


In [2]:
def load_dataset(train_path, val_path, test_path):
    """Read the three CSV splits and turn them into model-ready matrices.

    Each CSV is read with ``pd.read_csv`` and must provide the columns
    ``text``, ``is_reply`` and ``spam``. The ``text`` column is encoded with
    a ``TfidfVectorizer`` that is fitted on the training split only and then
    reused for validation and test; the ``is_reply`` column is appended to
    the TF-IDF matrix as one extra feature column.

    Parameters
    ----------
    train_path, val_path, test_path
        Locations of the training, validation and test CSV files.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)`` where each ``X``
        is the result of ``scipy.sparse.hstack`` and each ``y`` holds the
        values of the ``spam`` column.
    """
    train_df = pd.read_csv(train_path)
    val_df = pd.read_csv(val_path)
    test_df = pd.read_csv(test_path)

    tfidf = TfidfVectorizer()

    def _with_reply_flag(text_features, frame):
        # Append ``is_reply`` as a single column to the right of the TF-IDF features.
        reply_column = frame["is_reply"].values.reshape(-1, 1)
        return scipy.sparse.hstack([text_features, reply_column])

    # Only the training text is used to learn the vocabulary / IDF weights.
    X_train = _with_reply_flag(tfidf.fit_transform(train_df["text"]), train_df)
    y_train = train_df["spam"].values

    X_val = _with_reply_flag(tfidf.transform(val_df["text"]), val_df)
    y_val = val_df["spam"].values

    X_test = _with_reply_flag(tfidf.transform(test_df["text"]), test_df)
    y_test = test_df["spam"].values

    return X_train, X_val, X_test, y_train, y_val, y_test

In [3]:
# Locations of the pre-split CSV files, relative to the notebook's working directory.
train_path, val_path, test_path = (
    "../data/train.csv",
    "../data/validation.csv",
    "../data/test.csv",
)

# Build the feature matrices and label arrays for all three splits in one call.
(X_train, X_val, X_test, y_train, y_val, y_test) = load_dataset(
    train_path, val_path, test_path
)

# Tracking experiment runs with MLflow


In [4]:
# Registry of candidate models, keyed by the name used for MLflow runs and
# registered models. Each entry holds the estimator class ("model") and the
# preset keyword arguments ("params") used when a run is started with
# user_params=False. The origin of the numeric values is not shown in this
# notebook.
model_dict = dict(
    logistic_regression=dict(
        model=LogisticRegression,
        params=dict(
            random_state=42,
            penalty="elasticnet",
            solver="saga",
            C=0.9140473524721335,
            l1_ratio=0.0028218514946617293,
        ),
    ),
    # No preset parameters: the decision tree is always built with library defaults.
    decision_tree=dict(model=DecisionTreeClassifier, params=dict()),
    xgboost=dict(
        model=XGBClassifier,
        params=dict(
            max_depth=5,
            learning_rate=0.09806024570665134,
            subsample=0.5856590010393802,
        ),
    ),
)

# Evaluation functions, each called as metric(y_true, y_pred); the key is the
# name under which the value is printed and logged.
metrics = dict(accuracy=accuracy_score, aucpr=average_precision_score)

In [6]:
def track_model(
    model_name,
    X_train,
    y_train,
    X_val,
    y_val,
    params=None,
    run_name=None,
    user_params=True,
):
    """Train one configured model, score it on validation data and record it in MLflow.

    The estimator class is looked up in the module-level ``model_dict``; it is
    fitted on the training data, evaluated with every function in the
    module-level ``metrics`` dict, and then its parameters, metrics and the
    fitted model are logged to a new MLflow run. Finally a new version of the
    registered model ``model_name`` is created from the logged artifact.

    Parameters
    ----------
    model_name : str
        Key into ``model_dict``; also used as the artifact path and the
        registered-model name.
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_val, y_val
        Validation features and labels used for the metrics.
    params : dict, optional
        Keyword arguments for the estimator when ``user_params`` is true.
        ``None`` (the default) means "no arguments", i.e. library defaults.
    run_name : str, optional
        Name of the MLflow run; falls back to ``model_name`` when empty.
    user_params : bool, default True
        If true, build the estimator from ``params``; otherwise use the
        preset ``model_dict[model_name]["params"]``.

    Raises
    ------
    ValueError
        If ``model_name`` is not a key of ``model_dict``.
    mlflow.exceptions.MlflowException
        If the registry rejects the model for any reason other than the
        registered model already existing.
    """
    # Validate before opening a run so that a typo does not leave an empty run behind.
    if model_name not in model_dict:
        raise ValueError("Model name not recognised.")
    model_config = model_dict[model_name]

    if not run_name:
        run_name = model_name
    # ``None`` instead of a shared ``{}`` default avoids the mutable-default pitfall.
    if params is None:
        params = {}

    # The run is started with ``run_name`` so a caller-supplied name is honoured.
    with mlflow.start_run(run_name=run_name) as run:
        print(f"Starting run {run_name}")
        print(f"=>  Model name: {model_name}")

        # loading model params
        model_params = params if user_params else model_config["params"]
        print(f"=>  Model params:\n{json.dumps(model_params, indent=4)}")

        # fitting model and evaluating
        model = model_config["model"](**model_params)
        model.fit(X_train, y_train)
        # NOTE(review): every metric receives hard class labels from ``predict``.
        # ``average_precision_score`` is documented to take scores/probabilities,
        # so the logged "aucpr" is computed from thresholded predictions; consider
        # feeding it ``predict_proba`` scores instead.
        y_pred = model.predict(X_val)
        model_results = {
            metric_name: metric(y_val, y_pred)
            for metric_name, metric in metrics.items()
        }
        print(f"=>  Model results:\n{json.dumps(model_results, indent=4)}")

        # Logging params and metrics
        mlflow.log_params(model_params)
        for metric_name, metric_value in model_results.items():
            mlflow.log_metric(f"{model_name}_{metric_name}", metric_value)

        # logging model under the artifact path ``model_name``
        client = MlflowClient()
        mlflow.sklearn.log_model(model, model_name)

        current_run_id = run.info.run_id
        # ``runs:/`` URIs have the form runs:/<run_id>/<artifact_path>.
        model_uri = f"runs:/{current_run_id}/{model_name}"
        try:
            client.create_registered_model(model_name)
        except mlflow.exceptions.MlflowException as exc:
            # Only the "already registered" case is expected; anything else
            # (e.g. an unreachable registry) must not be reported as such.
            if exc.error_code != "RESOURCE_ALREADY_EXISTS":
                raise
            print(f"Model {model_name} already exists in the registry.")
        model_details = client.create_model_version(
            model_name, model_uri, current_run_id
        )
        print(
            f"=>  {model_name} version {model_details.version} has been logged to registry."
        )

In [7]:
# Baseline run: no params are passed and user_params defaults to True, so the
# estimator is constructed without any keyword arguments.
track_model(
    model_name="logistic_regression",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

Starting run logistic_regression
=>  Model name: logistic_regression
=>  Model params:
{}
=>  Model results:
{
    "accuracy": 0.981675392670157,
    "aucpr": 0.9347904473621006
}
=>  logistic_regression version 1 has been logged to registry.


In [8]:
# Same model, but user_params=False selects the preset hyperparameters stored
# in model_dict["logistic_regression"]["params"].
track_model(
    model_name="logistic_regression",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    user_params=False,
)

Starting run logistic_regression
=>  Model name: logistic_regression
=>  Model params:
{
    "random_state": 42,
    "penalty": "elasticnet",
    "solver": "saga",
    "C": 0.9140473524721335,
    "l1_ratio": 0.0028218514946617293
}
=>  Model results:
{
    "accuracy": 0.981675392670157,
    "aucpr": 0.9347904473621006
}
Model logistic_regression already exists in the registry.
=>  logistic_regression version 2 has been logged to registry.


In [10]:
# Baseline decision tree: constructed without any keyword arguments.
track_model(
    model_name="decision_tree",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

Starting run decision_tree
=>  Model name: decision_tree
=>  Model params:
{}
=>  Model results:
{
    "accuracy": 0.962478184991274,
    "aucpr": 0.861811467012522
}
=>  decision_tree version 1 has been logged to registry.


In [11]:
# Baseline XGBoost classifier: constructed without any keyword arguments.
track_model(
    model_name="xgboost",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
)

Starting run xgboost
=>  Model name: xgboost
=>  Model params:
{}
=>  Model results:
{
    "accuracy": 0.981675392670157,
    "aucpr": 0.9283437179986441
}
=>  xgboost version 1 has been logged to registry.


In [12]:
# user_params=False reads model_dict["decision_tree"]["params"], which is empty,
# so this run uses the same configuration as the baseline decision tree run.
# NOTE(review): no random_state is set for this estimator, which presumably
# explains why the scores differ between the two runs - confirm.
track_model(
    model_name="decision_tree",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    user_params=False,
)

Starting run decision_tree
=>  Model name: decision_tree
=>  Model params:
{}
=>  Model results:
{
    "accuracy": 0.9554973821989529,
    "aucpr": 0.8377703934107404
}
Model decision_tree already exists in the registry.
=>  decision_tree version 2 has been logged to registry.


In [13]:
# XGBoost with the preset hyperparameters from model_dict["xgboost"]["params"].
track_model(
    model_name="xgboost",
    X_train=X_train,
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    user_params=False,
)

Starting run xgboost
=>  Model name: xgboost
=>  Model params:
{
    "max_depth": 5,
    "learning_rate": 0.09806024570665134,
    "subsample": 0.5856590010393802
}
=>  Model results:
{
    "accuracy": 0.9755671902268761,
    "aucpr": 0.906409876134438
}
Model xgboost already exists in the registry.
=>  xgboost version 2 has been logged to registry.


In [14]:
# Launch the MLflow tracking UI to inspect and compare the runs logged above.
# NOTE: the server runs in the foreground, so this cell keeps executing (and
# blocks every later cell) until the kernel is interrupted.
!mlflow ui

[2024-02-20 16:26:29 +0530] [41273] [INFO] Starting gunicorn 21.2.0
[2024-02-20 16:26:29 +0530] [41273] [INFO] Listening at: http://127.0.0.1:5000 (41273)
[2024-02-20 16:26:29 +0530] [41273] [INFO] Using worker: sync
[2024-02-20 16:26:29 +0530] [41274] [INFO] Booting worker with pid: 41274
[2024-02-20 16:26:29 +0530] [41275] [INFO] Booting worker with pid: 41275
[2024-02-20 16:26:29 +0530] [41276] [INFO] Booting worker with pid: 41276
[2024-02-20 16:26:29 +0530] [41284] [INFO] Booting worker with pid: 41284
^C
[2024-02-20 16:29:31 +0530] [41273] [INFO] Handling signal: int
[2024-02-20 16:29:32 +0530] [41284] [INFO] Worker exiting (pid: 41284)
[2024-02-20 16:29:32 +0530] [41274] [INFO] Worker exiting (pid: 41274)
[2024-02-20 16:29:32 +0530] [41275] [INFO] Worker exiting (pid: 41275)
[2024-02-20 16:29:32 +0530] [41276] [INFO] Worker exiting (pid: 41276)


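Judged by validation AUCPR, logistic regression was the best model (≈0.935, vs. ≈0.928 for the best XGBoost run and ≈0.862 for the best decision tree run), so it is the one evaluated on the held-out test set below.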


In [18]:
# Refit the selected model (logistic regression with default hyperparameters)
# on the training split, then score it once on the held-out test split.
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# Apply every configured metric to the test labels and predictions.
model_results = dict(
    (metric_name, metric(y_test, y_pred)) for metric_name, metric in metrics.items()
)
print(f"Results:\n{json.dumps(model_results, indent=4)}")

Results:
{
    "accuracy": 0.9886561954624782,
    "aucpr": 0.9614294570483837
}
