In [1]:
import os
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [2]:
# A list of common english words which should not affect predictions
stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone',
             'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount',
             'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around',
             'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before',
             'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
             'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de',
             'describe', 'detail', 'did', 'do', 'does', 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg',
             'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone',
             'everything', 'everywhere', 'except', 'few', 'fifteen', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'for',
             'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had',
             'has', 'hasnt', 'have', 'having', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed',
             'interest', 'into', 'is', 'it', 'its', 'itself', 'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less',
             'ltd', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly',
             'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', 'next', 'nine',
             'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
             'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
             'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed', 'seeming',
             'seems', 'serious', 'several', 'she', 'should', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 
             'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system',
             't', 'take', 'ten', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there',
             'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thickv', 'thin', 'third', 'this',
             'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward',
             'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we',
             'well', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby',
             'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom',
             'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself',
             'yourselves']

In [3]:
data = pd.read_csv(r'C:\Users\Sheon\Desktop\spam.csv', header = None, encoding = 'latin-1')
print(data)

X = []
Y = []

for category in data[0]:
    Y.append(category)

for text in data[1]:
    X.append(text)

X.pop(0)
Y.pop(0)

X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y, test_size = 0.25, random_state = 0)

vocab = {}
for i in range(len(X_train)):
    word_list = []
    for word in X_train[i].split():
        word_new = word.strip(string.punctuation).lower()
        if (len(word_new) > 2)  and (word_new not in stopwords):  
            if word_new in vocab:
                vocab[word_new] += 1
            else:
                vocab[word_new] = 1

num_words = [0 for i in range(max(vocab.values())+1)]
freq = [i for i in range(max(vocab.values())+1)] 
for key in vocab:
    num_words[vocab[key]] += 1

cutoff_freq = 20
# For deciding cutoff frequency
num_words_above_cutoff = len(vocab) - sum(num_words[0 : cutoff_freq]) 
print("Number of words with frequency higher than cutoff frequency({}) :".format(cutoff_freq), num_words_above_cutoff)

features = []
for key in vocab:
    if vocab[key] >= cutoff_freq:
        features.append(key)

print(features)

X_train_dataset = np.zeros((len(X_train),len(features)))
for i in range(len(X_train)):
    word_list = [word.strip(string.punctuation).lower() for word in X_train[i].split()]
    for word in word_list:
        if word in features:
            X_train_dataset[i][features.index(word)] += 1
            
X_test_dataset = np.zeros((len(X_test),len(features)))
for i in range(len(X_test)):
    word_list = [word.strip(string.punctuation).lower() for word in X_test[i].split()]
    for word in word_list:
        if word in features:
            X_test_dataset[i][features.index(word)] += 1

         0                                                  1    2    3    4
0       v1                                                 v2  NaN  NaN  NaN
1      ham  Go until jurong point, crazy.. Available only ...  NaN  NaN  NaN
2      ham                      Ok lar... Joking wif u oni...  NaN  NaN  NaN
3     spam  Free entry in 2 a wkly comp to win FA Cup fina...  NaN  NaN  NaN
4      ham  U dun say so early hor... U c already then say...  NaN  NaN  NaN
5      ham  Nah I don't think he goes to usf, he lives aro...  NaN  NaN  NaN
6     spam  FreeMsg Hey there darling it's been 3 week's n...  NaN  NaN  NaN
7      ham  Even my brother is not like to speak with me. ...  NaN  NaN  NaN
8      ham  As per your request 'Melle Melle (Oru Minnamin...  NaN  NaN  NaN
9     spam  WINNER!! As a valued network customer you have...  NaN  NaN  NaN
10    spam  Had your mobile 11 months or more? U R entitle...  NaN  NaN  NaN
11     ham  I'm gonna be home soon and i don't want to tal...  NaN  NaN  NaN

In [4]:
# Using sklearn's Multinomial Naive Bayes
clf = MultinomialNB()
clf.fit(X_train_dataset, Y_train)

sklearn_score_train = clf.score(X_train_dataset, Y_train)
print("Sklearn's score on training data :", sklearn_score_train)
sklearn_score_test = clf.score(X_test_dataset, Y_test)
print("Sklearn's score on testing data :", sklearn_score_test)
Y_test_pred = clf.predict(X_test_dataset)
print("Classification report for testing data :")
print(classification_report(Y_test, Y_test_pred))

Sklearn's score on training data : 0.966499162479062
Sklearn's score on testing data : 0.9605168700646087
Classification report for testing data :
             precision    recall  f1-score   support

        ham       0.97      0.98      0.98      1196
       spam       0.89      0.83      0.86       197

avg / total       0.96      0.96      0.96      1393

