In [1]:
import mlflow
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, confusion_matrix, auc, precision_recall_curve
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold
from mlflow.sklearn import log_model
import warnings


In [3]:
warnings.filterwarnings("ignore")

In [4]:
# Read the three dataset splits from CSV files in the working directory.
# For each split the 'text' column is bound to X_* and the 'spam' column
# to y_*.

# Training split
train = pd.read_csv("train.csv")
X_train = train["text"]
y_train = train["spam"]

# Test split
test = pd.read_csv("test.csv")
X_test = test["text"]
y_test = test["spam"]

# Validation split
val = pd.read_csv("validation.csv")
X_val = val["text"]
y_val = val["spam"]

In [5]:
# Benchmark three classifiers on a shared TF-IDF representation. Each model
# gets its own MLflow run that records its parameters, its test-set AUC-PR,
# the fitted pipeline, and a registered model version.

# Set MLflow experiment name
mlflow.set_experiment("spam_classification_experiment")

# Fixed seed so the stochastic estimators produce the same scores (and the
# same registered artifacts) on every Restart & Run All.
SEED = 42

# Vocabulary cap for the TF-IDF step. It is logged with every run because it
# shapes the features all benchmark models are trained on.
TFIDF_MAX_FEATURES = 2000

# Define benchmark models
models = {
    "Decision_Tree": DecisionTreeClassifier(random_state=SEED),
    "Logistic_Regression": LogisticRegression(random_state=SEED),
    "Random_Forest": RandomForestClassifier(random_state=SEED),
}

# Loop through each model
# NOTE(review): models are compared on the test split here; the validation
# split loaded earlier is not used in this cell - confirm that is intended.
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Build and train the model
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)),
            ('clf', model),
        ])
        pipeline.fit(X_train, y_train)

        # Predict probabilities on the test set. Column 1 is the second
        # class in the classifier's class ordering - presumably spam == 1;
        # verify against the label encoding of the CSVs.
        y_proba = pipeline.predict_proba(X_test)[:, 1]

        # Calculate precision-recall curve
        precision, recall, _ = precision_recall_curve(y_test, y_proba)

        # Calculate AUC-PR
        auc_pr = auc(recall, precision)

        # Log parameters and metrics. The vectorizer setting uses a
        # namespaced key because the tree-based classifiers already log a
        # parameter called `max_features` of their own.
        mlflow.log_params(model.get_params())
        mlflow.log_param("tfidf__max_features", TFIDF_MAX_FEATURES)
        mlflow.log_metric("AUCPR", auc_pr)
        model_info = mlflow.sklearn.log_model(pipeline, "model")

        print(f"{model_name} AUC-PR: {auc_pr}")

        # Register the model from the URI returned by log_model rather than
        # from a raw artifact-store path, so the registry version stays
        # linked to the run that produced it.
        mlflow.register_model(model_info.model_uri, model_name)



2024/02/21 18:33:37 INFO mlflow.tracking.fluent: Experiment with name 'spam_classification_experiment' does not exist. Creating a new experiment.


Decision_Tree AUC-PR: 0.9356442447807276


Successfully registered model 'Decision_Tree'.
Created version '1' of model 'Decision_Tree'.


Logistic_Regression AUC-PR: 0.9962349363606858


Successfully registered model 'Logistic_Regression'.
Created version '1' of model 'Logistic_Regression'.


Random_Forest AUC-PR: 0.9932400930668956


Successfully registered model 'Random_Forest'.
Created version '1' of model 'Random_Forest'.
