In [5]:
!pip install mlflow



In [6]:
import nltk
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn import metrics

import random
# Seed Python's built-in `random` module so anything drawing from it is repeatable.
# This does not seed NumPy; the estimators further down pass `random_state=42`
# themselves where they need determinism.
random.seed(42)

from urllib.parse import urlparse
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import mlflow
import mlflow.sklearn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

%matplotlib inline
import matplotlib.pyplot as plt

# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation and convergence warnings - consider narrowing the filter.
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# Read the raw SMS file (tab-separated, no header row) and name its two
# columns in the same expression.
raw_data = pd.read_csv(
    "/content/SMSSpamCollection",
    sep='\t',
    header=None,
).set_axis(['label', 'message'], axis=1)

# Display the frame as the cell output.
raw_data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [20]:
# Load the train / validation / test CSV files from the working directory.
train, val, test = (
    pd.read_csv(csv_path)
    for csv_path in ("train.csv", "validation.csv", "test.csv")
)

In [21]:
# For each split, separate the message text (features) from the label (target).
(X_train, y_train), (X_val, y_val), (X_test, y_test) = (
    (frame["message"], frame["label"]) for frame in (train, val, test)
)


In [25]:
def evaluate_model(model, X_test, y_test, pos_label="spam"):
    """
    Score a fitted classifier on a labelled dataset.

    Parameters
    ----------
    model : fitted estimator
        Must implement ``predict``. If it also implements ``predict_proba``,
        ROC-AUC is computed from the probability of ``pos_label``.
    X_test : array-like
        Inputs passed as-is to ``model.predict`` / ``model.predict_proba``.
    y_test : array-like
        True labels aligned with ``X_test``.
    pos_label : default "spam"
        Label treated as the positive class for precision, recall, F1 and
        ROC-AUC.

    Returns
    -------
    dict
        Keys ``"Precision"``, ``"Recall"``, ``"F1-Score"``, ``"Accuracy"`` and
        ``"ROC-AUC"``, each rounded to 4 decimals. ``"ROC-AUC"`` is NaN when
        the model cannot produce probabilities, so a missing score is never
        mistaken for a real (worst-case) value of 0.
    """
    y_pred = model.predict(X_test)

    precision = precision_score(y_test, y_pred, pos_label=pos_label)
    recall = recall_score(y_test, y_pred, pos_label=pos_label)
    f1_score_val = f1_score(y_test, y_pred, pos_label=pos_label)
    accuracy = accuracy_score(y_test, y_pred)

    if hasattr(model, 'predict_proba'):
        # Find the probability column that belongs to `pos_label` instead of
        # assuming it is column 1.
        pos_index = list(model.classes_).index(pos_label)
        pos_proba = model.predict_proba(X_test)[:, pos_index]
        # A boolean target pins the positive class explicitly, so the scores
        # and the ground truth always refer to the same class.
        is_positive = np.asarray(y_test) == pos_label
        roc_auc_val = roc_auc_score(is_positive, pos_proba)
    else:
        roc_auc_val = float("nan")

    return {
        "Precision": round(precision, 4),
        "Recall": round(recall, 4),
        "F1-Score": round(f1_score_val, 4),
        "Accuracy": round(accuracy, 4),
        "ROC-AUC": round(roc_auc_val, 4)
    }

In [26]:
# Naive Bayes
print("Naive Bayes\n\n")
pipeline_nb = make_pipeline(CountVectorizer(), MultinomialNB(alpha=0.1))
pipeline_nb.fit(X_train, y_train)

# Validation-set predictions, kept available for ad-hoc inspection
predictions = pipeline_nb.predict(X_val)

# Evaluation metrics on the validation set. Stored under a model-specific name:
# rebinding `metrics` would shadow the `sklearn.metrics` module imported at the
# top of the notebook.
metrics_nb = evaluate_model(pipeline_nb, X_val, y_val)

print("On validation Dataset:", end="\n")
print("Accuracy : " + str(round(metrics_nb['Accuracy'] * 100, 2)) + "%")
print("Precision : " + str(round(metrics_nb['Precision'] * 100, 2)) + "%")
print("Recall : " + str(round(metrics_nb['Recall'] * 100, 2)) + "%")
print("f1 score : " + str(round(metrics_nb['F1-Score'] * 100, 2)) + "%")
# This value is the area under the ROC curve (evaluate_model computes it with
# roc_auc_score), so it is labelled ROC-AUC rather than AUCPR.
print("ROC-AUC : " + str(round(metrics_nb['ROC-AUC'] * 100, 2)) + "%")

Naive Bayes


On validation Dataset:
Accuracy : 98.8%
Precision : 98.13%
Recall : 92.92%
f1 score : 95.45%
AUCPR : 97.89%


In [27]:
# Logistic Regression
print("Logistic Regression\n\n")
pipeline_lr = make_pipeline(CountVectorizer(), LogisticRegression(random_state=42))
pipeline_lr.fit(X_train, y_train)

# Validation-set predictions, kept available for ad-hoc inspection
predictions = pipeline_lr.predict(X_val)

# Evaluation metrics on the validation set. Stored under a model-specific name:
# rebinding `metrics` would shadow the `sklearn.metrics` module imported at the
# top of the notebook.
metrics_lr = evaluate_model(pipeline_lr, X_val, y_val)

print("On validation Dataset:", end="\n")
print("Accuracy : " + str(round(metrics_lr['Accuracy'] * 100, 2)) + "%")
print("Precision : " + str(round(metrics_lr['Precision'] * 100, 2)) + "%")
print("Recall : " + str(round(metrics_lr['Recall'] * 100, 2)) + "%")
print("f1 score : " + str(round(metrics_lr['F1-Score'] * 100, 2)) + "%")
# This value is the area under the ROC curve (evaluate_model computes it with
# roc_auc_score), so it is labelled ROC-AUC rather than AUCPR.
print("ROC-AUC : " + str(round(metrics_lr['ROC-AUC'] * 100, 2)) + "%")

Logistic Regression


On validation Dataset:
Accuracy : 98.56%
Precision : 100.0%
Recall : 89.38%
f1 score : 94.39%
AUCPR : 98.85%


In [28]:
# Random Forest Classifier
print("Random Forest Classifier\n\n")
pipeline_rf = make_pipeline(
    CountVectorizer(),
    RandomForestClassifier(random_state=42, max_depth=60, n_jobs=-1),
)
pipeline_rf.fit(X_train, y_train)

# Validation-set predictions, kept available for ad-hoc inspection
predictions = pipeline_rf.predict(X_val)

# Evaluation metrics on the validation set. Stored under a model-specific name:
# rebinding `metrics` would shadow the `sklearn.metrics` module imported at the
# top of the notebook.
metrics_rf = evaluate_model(pipeline_rf, X_val, y_val)

print("On validation Dataset:", end="\n")
print("Accuracy : " + str(round(metrics_rf['Accuracy'] * 100, 2)) + "%")
print("Precision : " + str(round(metrics_rf['Precision'] * 100, 2)) + "%")
print("Recall : " + str(round(metrics_rf['Recall'] * 100, 2)) + "%")
print("f1 score : " + str(round(metrics_rf['F1-Score'] * 100, 2)) + "%")
# This value is the area under the ROC curve (evaluate_model computes it with
# roc_auc_score), so it is labelled ROC-AUC rather than AUCPR.
print("ROC-AUC : " + str(round(metrics_rf['ROC-AUC'] * 100, 2)) + "%")

Random Forest Classifier


On validation Dataset:
Accuracy : 97.61%
Precision : 100.0%
Recall : 82.3%
f1 score : 90.29%
AUCPR : 99.56%


In [31]:
# Naive Bayes: log test-set metrics, the confusion matrix and the fitted
# pipeline to MLflow.
with mlflow.start_run(run_name="Naive Bayes"):
    y_pred = pipeline_nb.predict(X_test)
    mlflow.log_param("model_name", "Naive Bayes")

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision and recall, so the ground truth goes first.
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
    mlflow.log_metric("precision", precision_score(y_test, y_pred, pos_label='spam'))
    mlflow.log_metric("recall", recall_score(y_test, y_pred, pos_label='spam'))
    mlflow.log_metric("f1 score", f1_score(y_test, y_pred, pos_label='spam'))

    # Probability of the 'spam' class; the column is looked up from classes_
    # rather than assumed to be index 1.
    y_pred_proba = pipeline_nb.predict_proba(X_test)[:, list(pipeline_nb.classes_).index('spam')]
    # The value is the area under the ROC curve, so it is logged as "roc_auc"
    # (it was previously mislabelled "AUCPR").
    mlflow.log_metric("roc_auc", roc_auc_score(y_test == 'spam', y_pred_proba))
    mlflow.log_dict(confusion_matrix(y_test, y_pred).tolist(), "confusion_matrix.json")

    tracking_url_type = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=pipeline_nb,
        artifact_path="sklearn-model",
        registered_model_name="Naive Bayes model"
    )
    # The "model" artifact is logged exactly once; it is additionally registered
    # only when the tracking URI is not a local file store.
    if tracking_url_type != "file":
        mlflow.sklearn.log_model(pipeline_nb, "model", registered_model_name="Naive Bayes")
    else:
        mlflow.sklearn.log_model(pipeline_nb, "model")

Successfully registered model 'Naive Bayes model'.
Created version '1' of model 'Naive Bayes model'.


In [33]:
# Logistic Regression: log test-set metrics, the confusion matrix and the
# fitted pipeline to MLflow.
with mlflow.start_run(run_name="Logistic Regression"):
    y_pred = pipeline_lr.predict(X_test)
    mlflow.log_param("model_name", "Logistic Regression")

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision and recall, so the ground truth goes first.
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
    mlflow.log_metric("precision", precision_score(y_test, y_pred, pos_label='spam'))
    mlflow.log_metric("recall", recall_score(y_test, y_pred, pos_label='spam'))
    mlflow.log_metric("f1 score", f1_score(y_test, y_pred, pos_label='spam'))

    # Probability of the 'spam' class; the column is looked up from classes_
    # rather than assumed to be index 1.
    y_pred_proba = pipeline_lr.predict_proba(X_test)[:, list(pipeline_lr.classes_).index('spam')]
    # The value is the area under the ROC curve, so it is logged as "roc_auc"
    # (it was previously mislabelled "AUCPR").
    mlflow.log_metric("roc_auc", roc_auc_score(y_test == 'spam', y_pred_proba))
    mlflow.log_dict(confusion_matrix(y_test, y_pred).tolist(), "confusion_matrix.json")

    tracking_url_type = urlparse(mlflow.get_tracking_uri()).scheme
    # Register the Logistic Regression pipeline itself; the Naive Bayes pipeline
    # was previously being registered under this name by mistake.
    mlflow.sklearn.log_model(
        sk_model=pipeline_lr,
        artifact_path="sklearn-model",
        registered_model_name="Logistic Regression model"
    )
    # The "model" artifact is logged exactly once; it is additionally registered
    # only when the tracking URI is not a local file store.
    if tracking_url_type != "file":
        mlflow.sklearn.log_model(pipeline_lr, "model", registered_model_name="Logistic Regression")
    else:
        mlflow.sklearn.log_model(pipeline_lr, "model")

Successfully registered model 'Logistic Regression model'.
Created version '1' of model 'Logistic Regression model'.


In [35]:
# Random Forest: log test-set metrics, the confusion matrix and the fitted
# pipeline to MLflow.
with mlflow.start_run(run_name="Random Forest"):
    y_pred = pipeline_rf.predict(X_test)
    mlflow.log_param("model_name", "Random Forest")

    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round swaps precision and recall, so the ground truth goes first.
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))
    mlflow.log_metric("precision", precision_score(y_test, y_pred, pos_label='spam'))
    mlflow.log_metric("recall", recall_score(y_test, y_pred, pos_label='spam'))
    mlflow.log_metric("f1 score", f1_score(y_test, y_pred, pos_label='spam'))

    # Probability of the 'spam' class; the column is looked up from classes_
    # rather than assumed to be index 1.
    y_pred_proba = pipeline_rf.predict_proba(X_test)[:, list(pipeline_rf.classes_).index('spam')]
    # The value is the area under the ROC curve, so it is logged as "roc_auc"
    # (it was previously mislabelled "AUCPR").
    mlflow.log_metric("roc_auc", roc_auc_score(y_test == 'spam', y_pred_proba))
    mlflow.log_dict(confusion_matrix(y_test, y_pred).tolist(), "confusion_matrix.json")

    tracking_url_type = urlparse(mlflow.get_tracking_uri()).scheme
    mlflow.sklearn.log_model(
        sk_model=pipeline_rf,
        artifact_path="sklearn-model",
        registered_model_name="Random Forest model"
    )
    # The "model" artifact is logged exactly once; it is additionally registered
    # only when the tracking URI is not a local file store.
    if tracking_url_type != "file":
        mlflow.sklearn.log_model(pipeline_rf, "model", registered_model_name="Random Forest")
    else:
        mlflow.sklearn.log_model(pipeline_rf, "model")

Successfully registered model 'Random Forest model'.
Created version '1' of model 'Random Forest model'.
