## Import libaries

In [1]:
import pandas as pd # to handle dataframes/tables
import numpy as np  # to handle numerical operations
from sklearn.feature_extraction.text import TfidfVectorizer # to convert text to TF-IDF features
from sklearn.model_selection import train_test_split # to split data into training and testing sets
from sklearn.naive_bayes import MultinomialNB # Naive Bayes classifier
from sklearn.svm import SVC # Support Vector Machine classifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # to evaluate model performance


In [2]:
# load dataset
# 'encoding' parameter is used to handle special characters in the dataset
df_raw = pd.read_csv('spam.csv', encoding='latin-1')

# display first 5 rows of the dataset
df_raw.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# remove unnecessary columns
df_raw = df_raw[['v1', 'v2']]

# rename columns for better understanding
df_raw.columns = ['label', 'message']

df_raw.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# copy the dataframe to a new variable for cleaning
df_clean = df_raw.copy()

In [5]:
# function to clean and tokenize the text data
def clean_text(text):
    # convert text to lowercase
    text = text.lower()
    # remove punctuation and special characters
    text = ''.join(char for char in text if char.isalnum() or char.isspace())
    # tokenize the text into words
    tokens = text.split()
    return tokens


In [6]:
# apply the cleaning function to the 'message' column
df_clean['tokens'] = df_clean['message'].apply(clean_text)

# display the cleaned dataframe
df_clean.head()

Unnamed: 0,label,message,tokens
0,ham,"Go until jurong point, crazy.. Available only ...","[go, until, jurong, point, crazy, available, o..."
1,ham,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,U dun say so early hor... U c already then say...,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"Nah I don't think he goes to usf, he lives aro...","[nah, i, dont, think, he, goes, to, usf, he, l..."


In [7]:
# view size of the cleaned dataframe
df_clean.shape

(5572, 3)

## Convert dataset to TF-IDF

In [8]:
# create a TF-IDF vectorizer object
vectorizer = TfidfVectorizer()

# fit and transform the tokens column into TF-IDF features
X = vectorizer.fit_transform(df_clean['tokens'].apply(lambda x: ' '.join(x)))

# display the shape of the TF-IDF matrix
X.shape

(5572, 9523)

## Model Selection

In [9]:
# split the dataset into training and testing sets

y = df_clean['label'].map({'ham': 0, 'spam': 1})  # convert labels to binary values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
#display training and testing dataset sizes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4457, 9523), (1115, 9523), (4457,), (1115,))

In [11]:
# Naive Bayes Classifier

# create a Multinomial Naive Bayes classifier
nb_classifier = MultinomialNB()

# train the classifier on the training data
nb_classifier.fit(X_train, y_train)



0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [12]:
# Support Vector Machine (SVM) Classifier

# create a Support Vector Machine classifier
svm_classifier = SVC(kernel='linear', random_state=42)

# train the classifier on the training data
svm_classifier.fit(X_train, y_train)


0,1,2
,C,1.0
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [13]:
# compare both model classifiers based on accuracy, precision, recall, and F1-score

def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return accuracy, precision, recall, f1

# evaluate Naive Bayes classifier
nb_metrics = evaluate_model(nb_classifier, X_test, y_test)
print("Naive Bayes Classifier Metrics:")
print(f"Accuracy: {nb_metrics[0]:.4f}, Precision: {nb_metrics[1]:.4f}, Recall: {nb_metrics[2]:.4f}, F1-Score: {nb_metrics[3]:.4f}")

# evaluate SVM classifier
svm_metrics = evaluate_model(svm_classifier, X_test, y_test)
print("SVM Classifier Metrics:")
print(f"Accuracy: {svm_metrics[0]:.4f}, Precision: {svm_metrics[1]:.4f}, Recall: {svm_metrics[2]:.4f}, F1-Score: {svm_metrics[3]:.4f}")

# display comparison of both models
columns = ["Accuracy", "Precision", "Recall", "F1-Score"]

results_df = pd.DataFrame(
    [nb_metrics, svm_metrics],
    index=["Naive Bayes", "Linear SVM"],
    columns=columns
)

results_df

Naive Bayes Classifier Metrics:
Accuracy: 0.9561, Precision: 1.0000, Recall: 0.6733, F1-Score: 0.8048
SVM Classifier Metrics:
Accuracy: 0.9794, Precision: 0.9847, Recall: 0.8600, F1-Score: 0.9181


Unnamed: 0,Accuracy,Precision,Recall,F1-Score
Naive Bayes,0.956054,1.0,0.673333,0.804781
Linear SVM,0.979372,0.984733,0.86,0.918149
