In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as pl

import mlflow
import mlflow.sklearn
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score,roc_curve, auc, accuracy_score, average_precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
import warnings
# Globally suppress every warning category for the rest of the session.
# NOTE(review): this also hides warnings raised by the libraries used below
# (e.g. failed or non-converged fits), so problems there will not be printed.
warnings.filterwarnings('ignore')

In [2]:
# Read the three pre-split CSV files (train / validation / test) from the
# current working directory, in that order.
train_data, val_data, test_data = map(
    pd.read_csv, ("./train.csv", "./val.csv", "./test.csv")
)

In [3]:
# Per-column count of missing values for each split, printed in
# train / val / test order on a single print call.
print(*(split.isna().sum() for split in (train_data, val_data, test_data)))

message    5
label      0
dtype: int64 message    0
label      0
dtype: int64 message    1
label      0
dtype: int64


In [4]:
# Remove every row that has a missing value in at least one column.
train_data, val_data, test_data = (
    split.dropna() for split in (train_data, val_data, test_data)
)

# Feature frames: everything except the "label" column.
X_train, X_val, X_test = (
    split.drop("label", axis=1) for split in (train_data, val_data, test_data)
)

# Target arrays: the "label" column of each split.
y_train, y_val, y_test = (
    split["label"].values for split in (train_data, val_data, test_data)
)

In [5]:
# TF-IDF features over the message text, with English stop words removed.
vectorizer = TfidfVectorizer(stop_words='english')

# The raw text is read from train_data / val_data / test_data (already stripped
# of NaN rows in the previous cell) instead of from X_train / X_val / X_test.
# Those three names are rebound to sparse matrices right here, so indexing them
# with ['message'] would raise if this cell were executed a second time.

# Fit the vocabulary and IDF weights on the training messages only
X_train = vectorizer.fit_transform(train_data['message'])

# Transform the validation and test data with the already-fitted vectorizer
X_val = vectorizer.transform(val_data['message'])
X_test = vectorizer.transform(test_data['message'])

In [None]:
# Benchmark three classifiers: grid-search each one on the training set, score
# the selected estimator on the validation and test sets, then log and register
# it in MLflow (one run per model).
mlflow.set_experiment("Benchmark_Models")

# Single dense row handed to MLflow as the logged model's input example.
input_example = X_test[:1].toarray()

# Fixed seed so the randomized estimators produce the same result on every run.
SEED = 42

models = {
    "Logistic Regression": {
        "model": LogisticRegression(random_state=SEED),
        # The default lbfgs solver only supports the 'l2' penalty, so pairing it
        # with 'l1' makes those candidates fail to fit and score NaN. Split the
        # grid: 'l2' keeps the default solver, 'l1' uses liblinear (which
        # supports it), so every candidate is actually evaluated.
        "param_grid": [
            {'C': [0.01, 0.1, 1, 10], 'penalty': ['l2']},
            {'C': [0.01, 0.1, 1, 10], 'penalty': ['l1'], 'solver': ['liblinear']},
        ],
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=SEED),
        "param_grid": {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, None]},
    },
    "SVM": {
        # probability=True is required for predict_proba below
        "model": SVC(probability=True, random_state=SEED),
        "param_grid": {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    },
}

# Best estimator found for each model name, filled in by the loop below.
best_models = {}


for model_name, details in models.items():
    with mlflow.start_run(run_name=model_name) as run:
        print(f"\nTraining {model_name}...")

        # Grid search for hyperparameter tuning (5-fold CV, accuracy, all cores)
        search = GridSearchCV(details["model"], details["param_grid"], cv=5, scoring='accuracy', n_jobs=-1)
        search.fit(X_train, y_train)
        best_model = search.best_estimator_
        best_models[model_name] = best_model

        # Validation accuracy of the refit best estimator
        val_accuracy = accuracy_score(y_val, best_model.predict(X_val))

        # Test predictions: hard labels (not used by the metrics below) and the
        # probability of the class in column 1 of predict_proba.
        y_test_pred = best_model.predict(X_test)
        y_test_prob = best_model.predict_proba(X_test)[:, 1]

        # AUC ROC score on the test set. A single 1-D score column is a binary
        # problem, so no multi_class option is passed.
        roc_auc = roc_auc_score(y_test, y_test_prob)

        # AUCPR (average precision) score on the test set
        aucpr = average_precision_score(y_test, y_test_prob)

        print(f"Best Params: {search.best_params_}")
        print(f"Validation Accuracy: {val_accuracy:.4f}")
        print(f"AUC ROC Score: {roc_auc:.4f}")
        print(f"AUCPR Score: {aucpr:.4f}")

        # Log the selected hyperparameters and the metrics to MLflow
        mlflow.log_params(search.best_params_)
        mlflow.log_metric("val_accuracy", val_accuracy)
        mlflow.log_metric("roc_auc", roc_auc)
        mlflow.log_metric("aucpr", aucpr)

        # Log model as a run artifact stored under the model's name
        mlflow.sklearn.log_model(sk_model=best_model, artifact_path=model_name, input_example=input_example)

        # Register the logged artifact in the model registry under the same name
        mlflow.register_model(f"runs:/{run.info.run_id}/{model_name}", model_name)

        print(f"{model_name} logged and registered in MLflow.\n")

print("All models trained and logged in MLflow successfully.")

2025/03/05 00:06:28 INFO mlflow.tracking.fluent: Experiment with name 'Benchmark_Models' does not exist. Creating a new experiment.



Training Logistic Regression...
Best Params: {'C': 10, 'penalty': 'l2'}
Validation Accuracy: 0.9709
AUC ROC Score: 0.9798
AUCPR Score: 0.9520


Successfully registered model 'Logistic Regression'.
Created version '1' of model 'Logistic Regression'.


Logistic Regression logged and registered in MLflow.


Training Random Forest...
Best Params: {'max_depth': None, 'n_estimators': 200}
Validation Accuracy: 0.9619
AUC ROC Score: 0.9902
AUCPR Score: 0.9627


Successfully registered model 'Random Forest'.
Created version '1' of model 'Random Forest'.


Random Forest logged and registered in MLflow.


Training SVM...
