In [80]:
#Importing the required functions
import pandas as pd
import numpy as np
import regex as re
import nltk
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import auc
from sklearn.svm import SVC
import warnings
warnings.filterwarnings("ignore")
import random
import mlflow
random.seed(1024)

**Reading in the train, test and validation data**

In [81]:
# File locations of the train / validation / test CSVs and of mod_df.csv
path_train, path_val, path_test, path_mod_df = (
    './data/train.csv',
    './data/validation.csv',
    './data/test.csv',
    './data/mod_df.csv',
)

# Read each CSV into its own DataFrame (same order as the paths above)
train_df, val_df, test_df, raw_data = map(
    pd.read_csv, (path_train, path_val, path_test, path_mod_df)
)

Converting the response from 'spam' and 'ham' to 0 and 1 respectively.

In [82]:
# Encode the text labels as integers: 'ham' -> 1, 'spam' -> 0.
label_codes = {'ham': 1, 'spam': 0}
for frame, label_col in ((train_df, 'y_train'), (val_df, 'y_val'), (test_df, 'y_test')):
    # Series.map turns every value that is not a key of the dict into NaN, so
    # mapping an already-encoded column (e.g. when this cell is run a second
    # time) would wipe all labels. Only encode columns that are not numeric yet.
    if not pd.api.types.is_numeric_dtype(frame[label_col]):
        frame[label_col] = frame[label_col].map(label_codes)

**Creating the bag of words transformer**

In [83]:
# Learn the vocabulary from the training messages only (fit returns the fitted vectorizer)
vectorizer = CountVectorizer().fit(train_df['X_train'])

# vocabulary_ is the term -> column-index mapping learned above; its size is
# the number of bag-of-words features
bow_transformer = vectorizer.vocabulary_
print(len(bow_transformer))

7331


**Converting all the data to be used into bag of words form**

In [84]:
# Turn each split's messages into a document-term count matrix using the
# vocabulary fitted on the training data
X_train, X_val, X_test = (
    vectorizer.transform(messages)
    for messages in (train_df['X_train'], val_df['X_val'], test_df['X_test'])
)

print(X_train.shape, X_val.shape, X_test.shape)

(4514, 7331) (502, 7331) (558, 7331)


**Creating the tf-idf transformer**

In [85]:
# Fit the tf-idf weighting on the training counts only
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train)

**Converting the data into tf-idf form**

In [86]:
# Re-weight the count matrices of all three splits with the fitted tf-idf transformer
tfidf_X_train, tfidf_X_val, tfidf_X_test = (
    tfidf_transformer.transform(counts) for counts in (X_train, X_val, X_test)
)

print(tfidf_X_train.shape, tfidf_X_val.shape, tfidf_X_test.shape)

(4514, 7331) (502, 7331) (558, 7331)


**Multinomial Naive Bayes Model based on tf-idf features**

**Building a function to create the spam detection model and compute the evaluation metrics for the predicted values based on the test set**

In [87]:
def MNB_model(alp):
    """Fit a Multinomial Naive Bayes model on the tf-idf training data and score it on the test set.

    Parameters
    ----------
    alp : float
        Smoothing parameter passed to ``MultinomialNB(alpha=...)``.

    Returns
    -------
    list
        ``[accuracy, aupcr]`` where ``accuracy`` is computed from the hard
        class predictions on the test set and ``aupcr`` is the area under the
        precision-recall curve for the class labelled 1.

    Notes
    -----
    Reads the module-level ``tfidf_X_train``, ``tfidf_X_test``, ``train_df``
    and ``test_df``.
    """
    spam_detection_model = MultinomialNB(alpha=alp).fit(tfidf_X_train, train_df.y_train)

    # Accuracy is defined on hard class labels.
    test_predictions = spam_detection_model.predict(tfidf_X_test)
    acc_sc = accuracy_score(test_df.y_test, test_predictions)

    # A precision-recall curve needs continuous scores: with hard 0/1
    # predictions there is only a single usable threshold, so the "curve"
    # degenerates to three points. Use the predicted probability of class 1,
    # looking its column up in classes_ rather than assuming the column order.
    positive_col = list(spam_detection_model.classes_).index(1)
    test_scores = spam_detection_model.predict_proba(tfidf_X_test)[:, positive_col]
    precision, recall, _ = precision_recall_curve(test_df.y_test, test_scores, pos_label=1)

    # Area under the precision-recall curve
    aupcr = auc(recall, precision)

    return [acc_sc, aupcr]

**Logistic Regression Model based on tf-idf features**

**Creating the spam detection model and computing the evaluation metrics for the predicted values based on the test set**

In [88]:
def Log_model(C):
    """Fit a Logistic Regression model on the tf-idf training data and score it on the test set.

    Parameters
    ----------
    C : float
        Value passed to ``LogisticRegression(C=...)``.

    Returns
    -------
    list
        ``[accuracy, aupcr]`` where ``accuracy`` is computed from the hard
        class predictions on the test set and ``aupcr`` is the area under the
        precision-recall curve for the class labelled 1.

    Notes
    -----
    Reads the module-level ``tfidf_X_train``, ``tfidf_X_test``, ``train_df``
    and ``test_df``.
    """
    # Use the function argument, not a global: the model must be built from
    # the value the caller passed in, whatever the caller's variable is named.
    spam_detection_model_2 = LogisticRegression(C=C)
    spam_detection_model_2.fit(tfidf_X_train, train_df.y_train)

    # Accuracy is defined on hard class labels.
    test_predictions = spam_detection_model_2.predict(tfidf_X_test)
    acc_sc = accuracy_score(test_df.y_test, test_predictions)

    # A precision-recall curve needs continuous scores: with hard 0/1
    # predictions it degenerates to three points. Use the predicted
    # probability of class 1, locating its column through classes_.
    positive_col = list(spam_detection_model_2.classes_).index(1)
    test_scores = spam_detection_model_2.predict_proba(tfidf_X_test)[:, positive_col]
    precision, recall, _ = precision_recall_curve(test_df.y_test, test_scores, pos_label=1)

    # Area under the precision-recall curve
    aupcr = auc(recall, precision)

    return [acc_sc, aupcr]

**Support Vector Classifier Model**

**Creating the spam detection model and computing the evaluation metrics for the predicted values based on the test set**

In [89]:
def SVC_Model(C):
    """Fit a Support Vector Classifier on the tf-idf training data and score it on the test set.

    Parameters
    ----------
    C : float
        Value passed to ``SVC(C=...)``.

    Returns
    -------
    list
        ``[accuracy, aupcr]`` where ``accuracy`` is computed from the hard
        class predictions on the test set and ``aupcr`` is the area under the
        precision-recall curve for the class labelled 1.

    Notes
    -----
    Reads the module-level ``tfidf_X_train``, ``tfidf_X_test``, ``train_df``
    and ``test_df``.
    """
    # Use the function argument, not a global: the model must be built from
    # the value the caller passed in, whatever the caller's variable is named.
    spam_detection_model_3 = SVC(C=C)
    spam_detection_model_3.fit(tfidf_X_train, train_df.y_train)

    # Accuracy is defined on hard class labels.
    test_predictions = spam_detection_model_3.predict(tfidf_X_test)
    acc_sc = accuracy_score(test_df.y_test, test_predictions)

    # A precision-recall curve needs continuous scores: with hard 0/1
    # predictions it degenerates to three points. SVC only exposes
    # predict_proba when built with probability=True, so rank the test
    # messages by their signed distance to the separating hyperplane instead
    # (for a binary problem, larger values favour classes_[1], i.e. label 1
    # with the 0/1 encoding used here).
    test_scores = spam_detection_model_3.decision_function(tfidf_X_test)
    precision, recall, _ = precision_recall_curve(test_df.y_test, test_scores, pos_label=1)

    # Area under the precision-recall curve
    aupcr = auc(recall, precision)

    return [acc_sc, aupcr]

In [90]:
from mlflow.tracking import MlflowClient

# Set storage directory.
# Raw string: '\m' is not a valid escape sequence, so the plain literal only
# worked by accident and triggers an invalid-escape warning on recent Python
# versions. The resulting value is unchanged.
mlflow.set_tracking_uri(r'MLFlow_Logs\mlruns')

# Create the client only after the tracking URI is configured: a client built
# without an explicit tracking_uri picks up whatever URI is active at
# construction time, so creating it earlier would point it at the default store.
client = MlflowClient()

# Set experiment (kept as the last expression so the cell displays the Experiment)
mlflow.set_experiment('SMS Spam Classification Model Evaluation')

<Experiment: artifact_location='MLFlow_Logs\\mlruns/362212374075415730', creation_time=1677226009667, experiment_id='362212374075415730', last_update_time=1677226009667, lifecycle_stage='active', name='SMS Spam Classification Model Evaluation', tags={}>

In [95]:
mlflow.set_experiment('SMS Spam Classification Model Evaluation')

# Running the models and logging the runs with MLFlow.
# One entry per model family:
#   (name logged as "Model", name of the tuned hyper-parameter, values to try, model function)
# Each model function takes the hyper-parameter value and returns [accuracy, AUPCR].
regularisation_grid = [0.1, 0.5, 1, 10, 20, 50, 100]
model_grid = [
    ("Multinomial Naive Bayes", "Alpha", np.arange(0.05, 2.25, 0.25), MNB_model),
    ("Logistic Regression", "C", regularisation_grid, Log_model),
    # The SVC runs must call SVC_Model and log the value as "C"; calling
    # MNB_model here would just log more Naive Bayes runs under the SVC name.
    ("Support Vector Classifier", "C", regularisation_grid, SVC_Model),
]

for md_name, param_name, param_values, model_fn in model_grid:
    for i in param_values:
        # One MLflow run per (model, hyper-parameter value) pair
        with mlflow.start_run() as run:
            # Log parameters
            mlflow.log_param("Model", md_name)
            mlflow.log_param(param_name, i)
            # Running the model
            model_metrics = model_fn(i)
            # Logging metrics
            mlflow.log_metric("Accuracy", model_metrics[0])
            mlflow.log_metric("AUPCR", model_metrics[1])
        
