In [68]:
import mlflow
import mlflow.sklearn
from urllib.parse import urlparse
import mlflow.keras

In [69]:
#importing the required libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import average_precision_score
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [70]:
#loading the dataset
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("validation.csv")
test_df = pd.read_csv("test.csv")

#splitting the datframe into X and y
X_train = train_df['Message']
y_train = train_df['Label']
X_val = val_df['Message']
y_val = val_df['Label']
X_test = test_df['Message']
y_test = test_df['Label']

In [71]:
# get tf-idf values of training data

tfidf = TfidfVectorizer()

train_tfidf = tfidf.fit_transform(X_train)
train_tfidf

<3342x6682 sparse matrix of type '<class 'numpy.float64'>'
	with 44748 stored elements in Compressed Sparse Row format>

In [72]:
#get the tf-idf values of testing data
test_tfidf = tfidf.transform(X_test)
test_tfidf

<1115x6682 sparse matrix of type '<class 'numpy.float64'>'
	with 13486 stored elements in Compressed Sparse Row format>

In [73]:
#get the tf-idf values of validation data
val_tfidf = tfidf.transform(X_val)
val_tfidf

<1115x6682 sparse matrix of type '<class 'numpy.float64'>'
	with 13584 stored elements in Compressed Sparse Row format>

## Applying Naive Bayes

In [74]:
#use Naive Bayes classifier to fit the training data
alpha = 1
with mlflow.start_run():
    naive_bayes_classifier = MultinomialNB(alpha = alpha)
    naive_bayes_classifier.fit(train_tfidf, y_train)

    #predict the results on test data
    y_pred = naive_bayes_classifier.predict(test_tfidf)

    #accuracy score of Naive Bayes
    score1 = metrics.accuracy_score(y_test, y_pred)
    aucpr1= average_precision_score(y_test, y_pred)
    print("The accuracy score of Naive Bayes is : ",score1)
    print("AUCPR of Naive Bayes is: ", aucpr1)

    #Confusion matrix of Naive Bayes

    print(metrics.classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))
    print(metrics.confusion_matrix(test_df["Label"], y_pred))
    

    mlflow.log_param("alpha",alpha)
    mlflow.log_metric("Accuracy", score1)
    mlflow.log_metric("AUCPR", aucpr1)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":

            # Register the model
            # There are other ways to use the Model Registry, which depends on the use case,
            # please refer to the doc for more information:
            # https://mlflow.org/docs/latest/model-registry.html#api-workflow
            mlflow.sklearn.log_model(naive_bayes_classifier, "model1", registered_model_name="MultinomialNB_SMS")
    else:
            mlflow.sklearn.log_model(naive_bayes_classifier, "model1")

The accuracy score of Naive Bayes is :  0.9515695067264573
AUCPR of Naive Bayes is:  0.6810835544980325
              precision    recall  f1-score   support

         Ham       0.95      1.00      0.97       968
        Spam       1.00      0.63      0.78       147

    accuracy                           0.95      1115
   macro avg       0.97      0.82      0.87      1115
weighted avg       0.95      0.95      0.95      1115

[[968   0]
 [ 54  93]]


## Applying SVM

In [75]:
from sklearn import svm

clf = svm.SVC()
clf.fit(train_tfidf,y_train)
y_pred_svm = clf.predict(test_tfidf)

#accuracy score of SVM
score2 = metrics.accuracy_score(y_test, y_pred_svm)
aucpr2 = average_precision_score(y_test, y_pred_svm)
print("The accuracy score of SVM is : ",score2)
print("The AUCPR of SVM is: ", aucpr2)

#Confusion matrix of SVM

print(metrics.classification_report(y_test, y_pred_svm, target_names=['Ham', 'Spam']))
print(metrics.confusion_matrix(test_df["Label"], y_pred_svm))

The accuracy score of SVM is :  0.9811659192825112
The AUCPR of SVM is:  0.8743925303822805
              precision    recall  f1-score   support

         Ham       0.98      1.00      0.99       968
        Spam       0.98      0.87      0.92       147

    accuracy                           0.98      1115
   macro avg       0.98      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

[[966   2]
 [ 19 128]]


#### Hyperparameter Tuning for SVM

In [76]:
from sklearn.model_selection import GridSearchCV
param_grid={"C":[0.1, 1, 10, 100], "kernel":("linear", "rbf")}
grid= GridSearchCV(clf, param_grid, refit=True, verbose=3)
grid.fit(train_tfidf,y_train)
y_pred_grid = grid.predict(val_tfidf)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END ..............C=0.1, kernel=linear;, score=0.900 total time=   0.2s
[CV 2/5] END ..............C=0.1, kernel=linear;, score=0.895 total time=   0.2s
[CV 3/5] END ..............C=0.1, kernel=linear;, score=0.906 total time=   0.2s
[CV 4/5] END ..............C=0.1, kernel=linear;, score=0.907 total time=   0.2s
[CV 5/5] END ..............C=0.1, kernel=linear;, score=0.886 total time=   0.2s
[CV 1/5] END .................C=0.1, kernel=rbf;, score=0.867 total time=   0.4s
[CV 2/5] END .................C=0.1, kernel=rbf;, score=0.865 total time=   0.4s
[CV 3/5] END .................C=0.1, kernel=rbf;, score=0.867 total time=   0.4s
[CV 4/5] END .................C=0.1, kernel=rbf;, score=0.867 total time=   0.4s
[CV 5/5] END .................C=0.1, kernel=rbf;, score=0.867 total time=   0.4s
[CV 1/5] END ................C=1, kernel=linear;, score=0.981 total time=   0.2s
[CV 2/5] END ................C=1, kernel=linear;,

In [77]:
#accuracy score of SVM
score2 = metrics.accuracy_score(y_val, y_pred_grid)
print("The accuracy score of SVM is : ",score2)

#Confusion matrix of SVM

print(metrics.classification_report(y_val, y_pred_grid, target_names=['Ham', 'Spam']))
print(metrics.confusion_matrix(val_df["Label"], y_pred_grid))

The accuracy score of SVM is :  0.9829596412556054
              precision    recall  f1-score   support

         Ham       0.98      1.00      0.99       961
        Spam       0.97      0.90      0.94       154

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115

[[957   4]
 [ 15 139]]


In [78]:
grid.best_params_

{'C': 10, 'kernel': 'linear'}

In [79]:
#fitting SVM after hyperparameter tuning

C = 10
kernel = 'linear'
#fitting SVM
with mlflow.start_run():
    clf = svm.SVC(C=C, kernel =kernel,probability=True)
    clf.fit(train_tfidf,y_train)
    y_pred_svm = clf.predict(test_tfidf)

    #accuracy score of SVM
    score2 = metrics.accuracy_score(y_test, y_pred_svm)
    aucpr2 = average_precision_score(y_test, y_pred_svm)
    print("The accuracy score of SVM is : ",score2)
    print("The AUCPR of SVM is: ", aucpr2)

    #Confusion matrix of SVM

    print(metrics.classification_report(y_test, y_pred_svm, target_names=['Ham', 'Spam']))
    print(metrics.confusion_matrix(test_df["Label"], y_pred_svm))

    mlflow.log_param("C",C)
    mlflow.log_param("kernel",kernel)
    mlflow.log_metric("Accuracy", score2)
    mlflow.log_metric("AUCPR", aucpr2)

    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":

            # Register the model
            # There are other ways to use the Model Registry, which depends on the use case,
            # please refer to the doc for more information:
            # https://mlflow.org/docs/latest/model-registry.html#api-workflow
            mlflow.sklearn.log_model(clf, "model2", registered_model_name="SVM_SMS")
    else:
            mlflow.sklearn.log_model(clf, "model2")

The accuracy score of SVM is :  0.9883408071748879
The AUCPR of SVM is:  0.9216272770237621
              precision    recall  f1-score   support

         Ham       0.99      1.00      0.99       968
        Spam       0.99      0.93      0.95       147

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

[[966   2]
 [ 11 136]]


In [80]:
import pickle

In [81]:
filename = 'best_model.sav'
pickle.dump(clf, open(filename, 'wb'))

## Neural Network

In [95]:
#Importing the required libraries
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout, LSTM, Bidirectional

In [83]:
# Defining pre-processing hyperparameters
max_len = 50 
trunc_type = "post" 
padding_type = "post" 
oov_tok = "<OOV>" 
vocab_size = 500
embeding_dim = 16
n_dense = 24

#Tokenizing training data
tokenizer = Tokenizer(500)
tokenizer.fit_on_texts(X_train)

In [84]:
# Sequencing and padding on training and testing 
train_seq = tokenizer.texts_to_sequences(X_train)
train_pad = pad_sequences(train_seq, maxlen = max_len)
val_seq = tokenizer.texts_to_sequences(X_val)
val_pad = pad_sequences(val_seq, maxlen = max_len)
test_seq = tokenizer.texts_to_sequences(X_test)
test_pad = pad_sequences(test_seq, maxlen = max_len)

In [85]:
#Creating the neural network
model = Sequential()
model.add(Embedding(vocab_size, embeding_dim, input_length=max_len))
model.add(GlobalAveragePooling1D())
model.add(Dense(24, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [86]:
# fitting the model to our data

with mlflow.start_run():
    model.compile(loss='binary_crossentropy',optimizer='adam' ,metrics=['accuracy'])
    model.fit(train_pad, y_train, epochs=30, validation_data=(val_pad, y_val))
    # Model performance on validation data
    score3 = model.evaluate(val_pad, y_val)

    mlflow.log_param("activation function", "relu")
    mlflow.log_metric("test loss", score3[0])
    mlflow.log_metric("test accuracy", score3[1])
    
    # Log Model
    mlflow.keras.log_model(model, "model3")

    
    tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
    if tracking_url_type_store != "file":

            # Register the model
            # There are other ways to use the Model Registry, which depends on the use case,
            # please refer to the doc for more information:
            # https://mlflow.org/docs/latest/model-registry.html#api-workflow
            mlflow.sklearn.log_model(model, "model3", registered_model_name="NN_SMS")
    else:
            mlflow.sklearn.log_model(model, "model3")

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30




INFO:tensorflow:Assets written to: C:\Users\SUCHET~1\AppData\Local\Temp\tmpdvd9xqi9\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\SUCHET~1\AppData\Local\Temp\tmpdvd9xqi9\model\data\model\assets


Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\dense
......vars
.........0
.........1
...layers\dense_1
......vars
.........0
.........1
...layers\embedding
......vars
.........0
...layers\global_average_pooling1d
......vars
...metrics\mean
......vars
.........0
.........1
...metrics\mean_metric_wrapper
......vars
.........0
.........1
...optimizer
......vars
.........0
.........1
.........10
.........2
.........3
.........4
.........5
.........6
.........7
.........8
.........9
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2023-02-23 20:40:25         1982
metadata.json                                  2023-02-23 20:40:25           64
variables.h5                                   2023-02-23 20:40:25       124548


#### Hyperparameter tuning for Neural Network

In [87]:
#Changing the loss function
# fitting the model on our data
model.compile(loss='binary_crossentropy',optimizer='SGD' ,metrics=['accuracy'])
fit_model = model.fit(train_pad, y_train, epochs=30, validation_data=(val_pad, y_val))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [88]:
# Model performance on validation data 
model.evaluate(val_pad, y_val)



[0.07026489078998566, 0.9793722033500671]

In [89]:
# Changing the number of epochs

# fitting a dense spam detector model
model.compile(loss='binary_crossentropy',optimizer='adam' ,metrics=['accuracy'])
fit_model = model.fit(train_pad, y_train, epochs=50, validation_data=(val_pad, y_val))
# Model performance on validation data 
model.evaluate(val_pad, y_val)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


[0.15704137086868286, 0.9784753322601318]

As we can see the best accuracy is observed for Adam and number of epochs 30, so we predict the model on the test data.

In [90]:
# fitting the model 
model.compile(loss='binary_crossentropy',optimizer='adam' ,metrics=['accuracy'])
fit_model = model.fit(train_pad, y_train, epochs=30, validation_data=(test_pad, y_test))

# Model performance on test data 
results = model.evaluate(test_pad, y_test)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [91]:
print("Neural Network gives us an accuracy of :", results[1])

Neural Network gives us an accuracy of : 0.9802690744400024
