<h1 style="color:green">Text Classification and Spam Detection with Naive Bayes using Pandas and Numpy</h1>


In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
dataset = pd.read_table('SMSSpamCollection', header = None, names = ['Labels', 'SMS'])

In [3]:
dataset

Unnamed: 0,Labels,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
#df = dataset.head()
df = dataset
df

Unnamed: 0,Labels,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [5]:
df['SMS'] = df['SMS'].apply(lambda x: re.sub(r'\W', ' ', x.lower()).split())
df

Unnamed: 0,Labels,SMS
0,ham,"[go, until, jurong, point, crazy, available, o..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f..."
3,ham,"[u, dun, say, so, early, hor, u, c, already, t..."
4,ham,"[nah, i, don, t, think, he, goes, to, usf, he,..."
...,...,...
5567,spam,"[this, is, the, 2nd, time, we, have, tried, 2,..."
5568,ham,"[will, ü, b, going, to, esplanade, fr, home]"
5569,ham,"[pity, was, in, mood, for, that, so, any, othe..."
5570,ham,"[the, guy, did, some, bitching, but, i, acted,..."


In [6]:
unique_words = set()
for sms in df['SMS']:
    unique_words.update(sms)

In [7]:
#unique_words

In [8]:
sorted_words = sorted(unique_words)
word_counts = pd.DataFrame(0, columns=sorted_words, index=df.index)
word_counts

Unnamed: 0,0,00,000,000pes,008704050406,0089,0121,01223585236,01223585334,0125698789,...,zogtorius,zoom,zouk,zyada,èn,é,ú1,ü,〨ud,鈥
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
for i, sms in enumerate(df['SMS']):
    word_counts.iloc[i] = [sms.count(word) for word in sorted_words]

In [10]:
final_df = pd.concat([df, word_counts], axis=1)
final_df

Unnamed: 0,Labels,SMS,0,00,000,000pes,008704050406,0089,0121,01223585236,...,zogtorius,zoom,zouk,zyada,èn,é,ú1,ü,〨ud,鈥
0,ham,"[go, until, jurong, point, crazy, available, o...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[ok, lar, joking, wif, u, oni]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,"[u, dun, say, so, early, hor, u, c, already, t...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[nah, i, don, t, think, he, goes, to, usf, he,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,spam,"[this, is, the, 2nd, time, we, have, tried, 2,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,ham,"[will, ü, b, going, to, esplanade, fr, home]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5569,ham,"[pity, was, in, mood, for, that, so, any, othe...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,ham,"[the, guy, did, some, bitching, but, i, acted,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
train_size = int(0.9 * len(final_df))
train_size

5014

In [12]:
train_data, test_data = final_df[:train_size], final_df[train_size:]

In [13]:
train_data

Unnamed: 0,Labels,SMS,0,00,000,000pes,008704050406,0089,0121,01223585236,...,zogtorius,zoom,zouk,zyada,èn,é,ú1,ü,〨ud,鈥
0,ham,"[go, until, jurong, point, crazy, available, o...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,"[ok, lar, joking, wif, u, oni]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,"[free, entry, in, 2, a, wkly, comp, to, win, f...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,"[u, dun, say, so, early, hor, u, c, already, t...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,"[nah, i, don, t, think, he, goes, to, usf, he,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5009,ham,"[go, fool, dont, cheat, others, ok]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5010,ham,"[my, mobile, number, pls, sms, ur, mail, id, c...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5011,ham,"[by, the, way, rencontre, is, to, meet, again,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5012,spam,"[you, have, won, a, guaranteed, 1000, cash, or...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
test_data

Unnamed: 0,Labels,SMS,0,00,000,000pes,008704050406,0089,0121,01223585236,...,zogtorius,zoom,zouk,zyada,èn,é,ú1,ü,〨ud,鈥
5014,ham,"[uncle, g, just, checking, up, on, you, do, ha...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5015,ham,"[hello, boytoy, geeee, i, m, missing, you, tod...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5016,ham,"[i, think, the, other, two, still, need, to, g...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5017,ham,"[hey, gals, u, all, wanna, meet, 4, dinner, at...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5018,spam,"[dear, 0776xxxxxxx, u, ve, been, invited, to, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,spam,"[this, is, the, 2nd, time, we, have, tried, 2,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,ham,"[will, ü, b, going, to, esplanade, fr, home]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5569,ham,"[pity, was, in, mood, for, that, so, any, othe...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,ham,"[the, guy, did, some, bitching, but, i, acted,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
spam_messages = train_data[train_data['Labels'] == 'spam']
ham_messages = train_data[train_data['Labels'] == 'ham']

In [16]:
p_spam = len(spam_messages) / len(train_data)
p_ham = len(ham_messages) / len(train_data)
print(p_spam, p_ham)

0.1346230554447547 0.8653769445552453


In [17]:
vocabulary = set()
for sms in train_data['SMS']:
    for word in sms:
        vocabulary.add(word)

In [18]:
#print(vocabulary)

In [19]:
word_count_spam = {word: 0 for word in vocabulary}
word_count_ham = {word: 0 for word in vocabulary}

In [20]:
for sms in spam_messages['SMS']:
    for word in sms:
        word_count_spam[word] += 1

In [21]:
#print(word_count_spam)

In [22]:
for sms in ham_messages['SMS']:
    for word in sms:
        word_count_ham[word] += 1

In [23]:
#print(word_count_ham)

In [24]:
def classify_sms(sms):
    spam_prob = p_spam
    ham_prob = p_ham

    for word in sms:
        if word in vocabulary:
            spam_prob *= (word_count_spam[word] + 1) / (len(spam_messages) + len(vocabulary))
            ham_prob *= (word_count_ham[word] + 1) / (len(ham_messages) + len(vocabulary))

    return 'spam' if spam_prob > ham_prob else 'ham'


In [25]:
test_data['Predicted'] = test_data['SMS'].apply(classify_sms)
test_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['Predicted'] = test_data['SMS'].apply(classify_sms)


Unnamed: 0,Labels,SMS,0,00,000,000pes,008704050406,0089,0121,01223585236,...,zoom,zouk,zyada,èn,é,ú1,ü,〨ud,鈥,Predicted
5014,ham,"[uncle, g, just, checking, up, on, you, do, ha...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
5015,ham,"[hello, boytoy, geeee, i, m, missing, you, tod...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
5016,ham,"[i, think, the, other, two, still, need, to, g...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
5017,ham,"[hey, gals, u, all, wanna, meet, 4, dinner, at...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
5018,spam,"[dear, 0776xxxxxxx, u, ve, been, invited, to, ...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spam
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,spam,"[this, is, the, 2nd, time, we, have, tried, 2,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,spam
5568,ham,"[will, ü, b, going, to, esplanade, fr, home]",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,ham
5569,ham,"[pity, was, in, mood, for, that, so, any, othe...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham
5570,ham,"[the, guy, did, some, bitching, but, i, acted,...",0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ham


In [26]:
pd_accuracy = (test_data['Labels'] == test_data['Predicted']).sum() / len(test_data)
pd_accuracy

0.974910394265233

<h1 style="color:green">Text Classification with Naive Bayes for Spam Detection using scikit-learn</h1>


In [27]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [28]:
dataset = pd.read_table('SMSSpamCollection', header = None, names = ['Labels', 'SMS'])
nb_df = dataset
nb_df

Unnamed: 0,Labels,SMS
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [29]:
X_train, X_test, y_train, y_test = train_test_split(nb_df['SMS'], nb_df['Labels'], test_size=0.2, random_state=42)

In [30]:
vectorizer = CountVectorizer()
X_train_counts = vectorizer.fit_transform(X_train)
X_test_counts = vectorizer.transform(X_test)

In [31]:
classifier = MultinomialNB()
classifier.fit(X_train_counts, y_train)

MultinomialNB()

In [32]:
predictions = classifier.predict(X_test_counts)

In [33]:
nb_accuracy = accuracy_score(y_test, predictions)
nb_accuracy

0.9919282511210762

In [34]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

         ham       0.99      1.00      1.00       966
        spam       1.00      0.94      0.97       149

    accuracy                           0.99      1115
   macro avg       1.00      0.97      0.98      1115
weighted avg       0.99      0.99      0.99      1115



In [35]:
print(f'Difference in accuracy of email spam detection using scikit-learn and pandas is: {nb_accuracy - pd_accuracy}')

Difference in accuracy of email spam detection using scikit-learn and pandas is: 0.017017856855843227


<h2 style="color:green">Summary</h2>


In this project, we implemented an email spam detection system using two powerful libraries, scikit-learn and pandas. By employing scikit-learn's Naive Bayes classifier, we achieved an accuracy of approximately 99.19%. Comparatively, our implementation using pandas resulted in an accuracy of around 97.49%. The primary purpose of using pandas was to provide a transparent and instructive demonstration of the inner workings of the Naive Bayes model. This approach, while slightly less accurate, serves as a valuable educational tool for understanding the core principles of email spam detection and the mechanics of the Naive Bayes algorithm. It highlights the trade-off between model simplicity and predictive performance, making it an excellent choice for educational and illustrative purposes in machine learning.