In [2]:
import nltk
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn import metrics

import random
random.seed(37)

from urllib.parse import urlparse
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

%matplotlib inline
import matplotlib.pyplot as plt

nltk.download('stopwords')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# The raw SMS Spam Collection file is tab-separated and ships without a header
# row, so the two columns are named explicitly as part of the read.
raw_data = pd.read_csv(
    "/content/SMSSpamCollection",
    sep='\t',
    header=None,
).set_axis(['label', 'message'], axis=1)

# Bare last expression -> rich (truncated) preview of the frame.
raw_data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# Load the three pre-built splits (train / validation / test) in one pass.
train, val, test = map(
    pd.read_csv,
    (
        "/content/train_3.csv",
        "/content/validation_3.csv",
        "/content/test_3.csv",
    ),
)

In [5]:
# Model input is the "message" column; the target is the "label" column.
X_train, X_val, X_test = (split["message"] for split in (train, val, test))
y_train, y_val, y_test = (split["label"] for split in (train, val, test))


In [6]:
def evaluate_model(model, X_test, y_test, pos_label='spam'):
    """
    Score a fitted classifier on a labelled dataset.

    Parameters
    ----------
    model : fitted estimator or pipeline
        Must expose ``predict``. If it also exposes ``predict_proba`` (and
        ``classes_``), a ROC-AUC is computed from the positive-class
        probabilities.
    X_test : array-like
        Inputs passed straight to ``model.predict`` / ``model.predict_proba``.
    y_test : array-like
        True labels for ``X_test``.
    pos_label : label, default 'spam'
        The label treated as the positive class for precision, recall, F1 and
        ROC-AUC.

    Returns
    -------
    dict
        Keys "Precision", "Recall", "F1-Score", "Accuracy" and "ROC-AUC",
        each rounded to 4 decimal places. "ROC-AUC" is 0 when the model has
        no ``predict_proba``.
    """
    y_pred = model.predict(X_test)

    # The confusion-matrix unpacking that used to sit here was never read and
    # raised whenever only one class was present, so it has been dropped.
    precision = precision_score(y_test, y_pred, pos_label=pos_label)
    recall = recall_score(y_test, y_pred, pos_label=pos_label)
    f1_score_val = f1_score(y_test, y_pred, pos_label=pos_label)
    accuracy = accuracy_score(y_test, y_pred)

    if hasattr(model, 'predict_proba'):
        # Look the positive class up in classes_ rather than assuming it is
        # column 1 of the probability matrix.
        pos_index = list(model.classes_).index(pos_label)
        pos_scores = model.predict_proba(X_test)[:, pos_index]
        # Binarise the truth against pos_label so the label orientation always
        # matches the orientation of the scores.
        roc_auc_val = roc_auc_score(np.asarray(y_test) == pos_label, pos_scores)
    else:
        roc_auc_val = 0

    return {
        "Precision": round(precision,4),
        "Recall": round(recall,4),
        "F1-Score": round(f1_score_val,4),
        "Accuracy": round(accuracy,4),
        "ROC-AUC": round(roc_auc_val,4)
    }

In [15]:
# Naive Bayes
print("Naive Bayes\n\n")
pipeline_nb = make_pipeline(CountVectorizer(), MultinomialNB(alpha = 0.1))
pipeline_nb.fit(X_train, y_train)

# Score the fitted pipeline on the validation set.
# The result is kept under a model-specific name: rebinding `metrics` here
# would shadow the `sklearn.metrics` module imported at the top of the notebook.
# evaluate_model runs predict itself, so no separate prediction pass is needed.
nb_metrics = evaluate_model(pipeline_nb, X_val, y_val)

print("On validation Dataset:", end = "\n")
print(f"Accuracy : {round(nb_metrics['Accuracy']*100, 2)}%")
print(f"Precision : {round(nb_metrics['Precision']*100, 2)}%")
print(f"Recall : {round(nb_metrics['Recall']*100, 2)}%")
print(f"f1 score : {round(nb_metrics['F1-Score']*100, 2)}%")
# This value is the area under the ROC curve, not the precision-recall curve.
print(f"ROC-AUC : {round(nb_metrics['ROC-AUC']*100, 2)}%")

Naive Bayes


On validation Dataset:
Accuracy : 97.67%
Precision : 90.97%
Recall : 92.16%
f1 score : 91.56%
AUCPR : 98.07%


In [11]:
# Logistic Regression
print("Logistic Regression\n\n")
pipeline_lr = make_pipeline(CountVectorizer(), LogisticRegression(random_state = 42))
pipeline_lr.fit(X_train, y_train)

# Score the fitted pipeline on the validation set.
# The result is kept under a model-specific name: rebinding `metrics` here
# would shadow the `sklearn.metrics` module imported at the top of the notebook.
# evaluate_model runs predict itself, so no separate prediction pass is needed.
lr_metrics = evaluate_model(pipeline_lr, X_val, y_val)

print("On validation Dataset:", end = "\n")
print(f"Accuracy : {round(lr_metrics['Accuracy']*100, 2)}%")
print(f"Precision : {round(lr_metrics['Precision']*100, 2)}%")
print(f"Recall : {round(lr_metrics['Recall']*100, 2)}%")
print(f"f1 score : {round(lr_metrics['F1-Score']*100, 2)}%")
# This value is the area under the ROC curve, not the precision-recall curve.
print(f"ROC-AUC : {round(lr_metrics['ROC-AUC']*100, 2)}%")

Logistic Regression


On validation Dataset:
Accuracy : 96.86%
Precision : 97.58%
Recall : 79.08%
f1 score : 87.36%
AUCPR : 99.16%


In [16]:

# Random Forest Classifier
print("Random Forest Classifier\n\n")
pipeline_rf = make_pipeline(CountVectorizer(), RandomForestClassifier(random_state = 42, max_depth=60, n_jobs=-1))
pipeline_rf.fit(X_train, y_train)

# Score the fitted pipeline on the validation set.
# The result is kept under a model-specific name: rebinding `metrics` here
# would shadow the `sklearn.metrics` module imported at the top of the notebook.
# evaluate_model runs predict itself, so no separate prediction pass is needed.
rf_metrics = evaluate_model(pipeline_rf, X_val, y_val)

print("On validation Dataset:", end = "\n")
print(f"Accuracy : {round(rf_metrics['Accuracy']*100, 2)}%")
print(f"Precision : {round(rf_metrics['Precision']*100, 2)}%")
print(f"Recall : {round(rf_metrics['Recall']*100, 2)}%")
print(f"f1 score : {round(rf_metrics['F1-Score']*100, 2)}%")
# This value is the area under the ROC curve, not the precision-recall curve.
print(f"ROC-AUC : {round(rf_metrics['ROC-AUC']*100, 2)}%")

Random Forest Classifier


On validation Dataset:
Accuracy : 95.78%
Precision : 100.0%
Recall : 69.28%
f1 score : 81.85%
AUCPR : 99.04%


# DISPLAY METRICS

In [18]:
# Build the comparison table from the fitted pipelines instead of hand-copied
# numbers, so it cannot drift out of sync when a model cell is re-run.
fitted_pipelines = {
    'Naive Bayes': pipeline_nb,
    'Logistic Regression': pipeline_lr,
    'Random Forest': pipeline_rf,
}
validation_results = {
    model_name: evaluate_model(pipeline, X_val, y_val)
    for model_name, pipeline in fitted_pipelines.items()
}

# Store the metrics in a dictionary (one list per column, in display order)
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC']
data = {'Model': list(validation_results)}
data.update({
    column: [validation_results[model_name][column] for model_name in validation_results]
    for column in metric_columns
})

# Create a Pandas DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
display(df)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,ROC-AUC
0,Naive Bayes,0.9767,0.9097,0.9216,0.9156,0.9807
1,Logistic Regression,0.9686,0.9758,0.7908,0.8736,0.9916
2,Random Forest,0.9578,1.0,0.6928,0.8185,0.9904


# WHICH IS THE BEST MODEL?

For spam detection, we don't want important non-spam messages to be classified as spam, although it is acceptable to miss a few spam messages (since spam messages are already very few). So high precision is preferred. Of the 3 models, both Logistic Regression and Random Forest have high precision, but Logistic Regression has more balanced metrics. Hence our best model is Logistic Regression.

# SAVING THE BEST MODEL: LOGISTIC REGRESSION

In [20]:
import joblib
from google.colab import drive

# Google Drive has to be mounted before the save path below is used.
drive.mount(mountpoint='/content/drive')

# Persist the fitted logistic-regression pipeline object to Drive.
model_save_path = '/content/drive/My Drive/logistic_regression_model.pkl'
joblib.dump(value=pipeline_lr, filename=model_save_path)

Mounted at /content/drive


['/content/drive/My Drive/logistic_regression_model.pkl']