In [2]:
import pandas as pd
df = pd.read_csv('sms_spam.csv', index_col=0)
df.head(20)

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
ham,"Go until jurong point, crazy.. Available only ..."
ham,Ok lar... Joking wif u oni...
spam,Free entry in 2 a wkly comp to win FA Cup fina...
ham,U dun say so early hor... U c already then say...
ham,"Nah I don't think he goes to usf, he lives aro..."
spam,FreeMsg Hey there darling it's been 3 week's n...
ham,Even my brother is not like to speak with me. ...
ham,As per your request 'Melle Melle (Oru Minnamin...
spam,WINNER!! As a valued network customer you have...
spam,Had your mobile 11 months or more? U R entitle...


In [4]:
df = df.reset_index()
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

In [5]:
df.head(20)

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [6]:
#convert all test to lowercase
df['text'] = df['text'].str.lower()

In [7]:
#remove non-alphabetic characters (punctuation)
df['text'] = df['text'].str.replace(r'[^a-zA-Z\s]', '')


In [9]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# download stopwords if not already downloaded
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))

df['text'] = df['text'].apply(lambda x: ' '.join([word for word in word_tokenize(x) if word not in stop_words]))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


In [10]:
#apply stemming to reduce words to their root form
from nltk.stem import PorterStemmer
ps = PorterStemmer()
df['text'] = df['text'].apply(lambda x: ' '.join([ps.stem(word) for word in word_tokenize(x)]))

In [11]:
from sklearn.model_selection import train_test_split

#split data into training and testing sets (80/20)
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Create a pipeline that combines the CountVectorizer and the Naive Bayes classifier
model = make_pipeline(CountVectorizer(), MultinomialNB())

# Train the model using the training data
model.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = model.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, roc_auc_score

# a. Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# b. Precision, Recall, and F1-score
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='binary')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

# c. Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print(f'Confusion Matrix:\n{conf_matrix}')

# d. ROC-AUC Score
roc_auc = roc_auc_score(y_test, y_pred)
print(f'ROC-AUC Score: {roc_auc}')

Accuracy: 0.9874439461883409
Precision: 0.972027972027972
Recall: 0.9328859060402684
F1-score: 0.952054794520548
Confusion Matrix:
[[962   4]
 [ 10 139]]
ROC-AUC Score: 0.964372559645393
