In [None]:
import os
from itertools import product

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, RandomOverSampler
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_recall_curve, accuracy_score, precision_score, recall_score, auc, classification_report, ConfusionMatrixDisplay, make_scorer
from sklearn.tree import DecisionTreeClassifier

In [None]:
# SQLite creates the database file on demand but not its parent directory, so
# make sure "database/" exists before MLflow tries to open the store.
os.makedirs("database", exist_ok=True)

# Track every run in a local SQLite-backed MLflow store.
mlflow.set_tracking_uri("sqlite:///database/mlflow.db")
experiment = mlflow.set_experiment("SMS Spam Detection")

# Kept as a notebook-level name: train_model() reads `experiment_id` when it
# opens its runs.
experiment_id = experiment.experiment_id
print("Experiment ID:", experiment_id)

In [None]:
# Locations of the three pre-split CSV files, all under the relative "data" folder.
train_save_path, val_save_path, test_save_path = (
    os.path.join('data', split_file)
    for split_file in ('train.csv', 'val.csv', 'test.csv')
)

# Order matters: load_train_val_test() indexes this list as [train, val, test].
train_val_test_save_paths = [train_save_path, val_save_path, test_save_path]

train_val_test_save_paths

In [None]:
def load_train_val_test(train_val_test_save_paths, oversampler=None, label_col='label'):
    """Read the train/validation/test CSV files and split each into features and target.

    Parameters
    ----------
    train_val_test_save_paths : sequence of path-like
        Indexed as ``[train_path, val_path, test_path]``; each file is read
        with ``pd.read_csv``.
    oversampler : object with ``fit_resample(X, y)``, optional
        When given, it is applied to the training split only. The validation
        and test splits are never resampled.
    label_col : str, default 'label'
        Name of the target column present in every CSV.

    Returns
    -------
    tuple
        ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    def _split_features_target(frame):
        # Target is the label column; features are every remaining column.
        return frame.drop(label_col, axis=1), frame[label_col]

    train_data = pd.read_csv(train_val_test_save_paths[0])
    val_data = pd.read_csv(train_val_test_save_paths[1])
    test_data = pd.read_csv(train_val_test_save_paths[2])

    X_train, y_train = _split_features_target(train_data)

    # Compare against None explicitly: a sampler object that defines
    # __len__/__bool__ could evaluate as falsy and be silently skipped.
    if oversampler is not None:
        X_train, y_train = oversampler.fit_resample(X_train, y_train)

    X_val, y_val = _split_features_target(val_data)
    X_test, y_test = _split_features_target(test_data)

    return X_train, X_val, X_test, y_train, y_val, y_test

In [None]:
def train_model(X_train, X_val, y_train, y_val, classifier, param_grid, run_tags=None):
    """Grid-search ``classifier`` over ``param_grid`` and log every fit to MLflow.

    One MLflow run is opened per parameter combination (the Cartesian product
    of the value lists in ``param_grid``) under the notebook-level
    ``experiment_id``. Each run logs the parameters, the validation
    "precision" and "AUCPR" metrics, and the fitted model.

    Parameters
    ----------
    X_train, y_train
        Data used to fit each candidate.
    X_val, y_val
        Data used to score each candidate.
    classifier
        Estimator template exposing ``set_params``, ``fit``, ``predict`` and
        ``predict_proba``. It is cloned for every combination and never
        fitted or mutated itself.
    param_grid : dict
        Maps parameter name to the list of values to try.
    run_tags : dict, optional
        Tags attached to every run. When omitted, a notebook-level
        ``run_tags`` variable is used if one exists; otherwise no tags are set.

    Returns
    -------
    The fitted estimator with the highest validation precision (first one
    wins on ties). If ``param_grid`` yields no combinations, ``classifier``
    is returned unchanged.
    """
    classifier_name = classifier.__class__.__name__
    run_name = classifier_name + "_run"

    if run_tags is None:
        # Backward compatibility: this function used to read a global
        # `run_tags`, which raised NameError when the notebook never defined it.
        run_tags = globals().get("run_tags")

    best_model = None
    best_precision = 0.0

    param_names = list(param_grid.keys())

    for param in product(*param_grid.values()):
        param_dict = dict(zip(param_names, param))

        with mlflow.start_run(run_name=run_name, experiment_id=experiment_id):
            if run_tags:
                mlflow.set_tags(run_tags)

            mlflow.log_params(param_dict)

            # Fit a fresh clone per combination. set_params() returns the same
            # object, so reusing `classifier` would make `best_model` alias an
            # estimator that every later iteration refits, and the function
            # would return the last model rather than the best one.
            model = clone(classifier).set_params(**param_dict)
            model.fit(X_train, y_train)

            y_val_hat = model.predict(X_val)
            # Column 1 of predict_proba is the score of the second class in
            # model.classes_.
            y_val_prob = model.predict_proba(X_val)[:, 1]

            # NOTE(review): micro-averaged precision on single-label targets is
            # numerically the same as accuracy; confirm this is the intended
            # model-selection metric.
            current_precision = precision_score(y_val, y_val_hat, average='micro')
            mlflow.log_metric(key="precision", value=current_precision)

            # Area under the precision-recall curve (AUCPR) on the validation split.
            precision, recall, _ = precision_recall_curve(y_val, y_val_prob)
            aucpr = auc(recall, precision)
            mlflow.log_metric(key="AUCPR", value=aucpr)

            mlflow.sklearn.log_model(model, classifier_name)

            # Strict '>' keeps the earliest combination on ties; the first
            # fitted model is always accepted so a fitted estimator is returned.
            if best_model is None or current_precision > best_precision:
                best_precision = current_precision
                best_model = model

                print("Current Best Precision on Val: %.3f" % best_precision)

    if best_model is None:
        # The grid produced no combinations (one of its value lists is empty).
        best_model = classifier

    print("Overall Best Model:", best_model)
    print("Overall Best Precision on Val: %.3f" % best_precision)

    return best_model

In [None]:
# Load the splits before the first model is trained. Nothing earlier in the
# notebook assigns X_train/X_val/X_test or the matching y_* variables, so
# without this call a fresh "Restart & Run All" stops here with a NameError.
# Pass `oversampler=` (e.g. SMOTE()) to rebalance the training split.
X_train, X_val, X_test, y_train, y_val, y_test = load_train_val_test(train_val_test_save_paths)

classifier = LogisticRegression()

# Hyper-parameter grid; every combination becomes one MLflow run in train_model().
param_grid = {'penalty': ['l1', 'l2'],
              'C': [0.001, 0.01, 0.1, 1, 10],
              'solver': ['liblinear'],
              'max_iter': [100, 200, 500]}

best_logit = train_model(
                            X_train=X_train,
                            X_val=X_val,
                            y_train=y_train,
                            y_val=y_val,
                            classifier=classifier,
                            param_grid=param_grid
                            )

# Precision-recall curve of the selected model on the held-out test split.
y_test_prob = best_logit.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_test_prob)
plt.figure()
plt.plot(recall, precision, color='blue', label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title("Test Precision-Recall curve")
plt.legend()
plt.show()

# Calculate the Area under the PRCurve
aucpr = auc(recall, precision)
print("Area under the PRCurve for Test Data:", aucpr)

In [None]:
# Seed the forest so that the grid search, the selected model and the test
# metrics below are reproducible across kernel restarts.
classifier = RandomForestClassifier(random_state=42)

# Hyper-parameter grid; every combination becomes one MLflow run in train_model().
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

best_rfc = train_model(
                            X_train=X_train,
                            X_val=X_val,
                            y_train=y_train,
                            y_val=y_val,
                            classifier=classifier,
                            param_grid=param_grid
                            )

# Precision-recall curve of the selected model on the held-out test split.
y_test_prob = best_rfc.predict_proba(X_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, y_test_prob)
plt.figure()
plt.plot(recall, precision, color='blue', label='Precision-Recall curve')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title("Test Precision-Recall curve")
plt.legend()
plt.show()

# Calculate the Area under the PRCurve
aucpr = auc(recall, precision)
print("Area under the PRCurve for Test Data:", aucpr)

In [None]:
# Hard class predictions of the selected logistic regression on the test split.
y_test_hat = best_logit.predict(X_test)

# Draw the confusion matrix and title it through the display's own axes.
cm_display = ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_test_hat,
    values_format='.5g',
)
cm_display.ax_.set_title("Logistic Regression")
plt.show()

# Per-class precision / recall / F1 summary.
print(classification_report(y_test, y_test_hat))

In [None]:
# Hard class predictions of the selected random forest on the test split.
y_test_hat = best_rfc.predict(X_test)

# Draw the confusion matrix and title it through the display's own axes.
cm_display = ConfusionMatrixDisplay.from_predictions(
    y_test,
    y_test_hat,
    values_format='.5g',
)
cm_display.ax_.set_title("Random Forest Classifier")
plt.show()

# Per-class precision / recall / F1 summary.
print(classification_report(y_test, y_test_hat))