In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt1
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
%matplotlib inline

In [4]:
#import the dataset
path = "./datasets/P6_P7_spam.csv"
df = pd.read_csv(path,encoding = 'latin-1')

In [5]:
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [6]:
df.shape

(5572, 5)

In [7]:
df = df[['v1','v2']]
# df['length']=df['v2'].apply(len)

In [8]:
df.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
# remove punctuations and stopwords
def text_process(mess):
    nopunc =[char for char in mess if char not in string.punctuation]
    nopunc=''.join(nopunc)
    clean = [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]
    if(len(clean) < 9):
        for i in range(9-len(clean)):
            clean.append(11304)
    return clean[:9]

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/sujay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
df['v2'] = df['v2'].apply(text_process)
df['length'] = df['v2'].apply(len)
unique_words = []
for msg in df['v2']:
    for word in msg:
        unique_words.append(word)
unique_words = set(unique_words)
df.head()

Unnamed: 0,v1,v2,length
0,ham,"[Go, jurong, point, crazy, Available, bugis, n...",9
1,ham,"[Ok, lar, Joking, wif, u, oni, 11304, 11304, 1...",9
2,spam,"[Free, entry, 2, wkly, comp, win, FA, Cup, final]",9
3,ham,"[U, dun, say, early, hor, U, c, already, say]",9
4,ham,"[Nah, dont, think, goes, usf, lives, around, t...",9


In [12]:
len(unique_words)

8508

In [13]:
vocab = {}
i=0
for word in unique_words:
    vocab[word]=i
    i+=1

In [14]:
vocab

{'rightly': 0,
 'Almost': 1,
 'cheesy': 2,
 'X': 3,
 'browsin': 4,
 'sooooo': 5,
 'created': 6,
 'wondar': 7,
 'officewhats': 8,
 'envy': 9,
 'Executive': 10,
 'dates': 11,
 'Mmm': 12,
 '0796XXXXXX': 13,
 'technical': 14,
 'nelson': 15,
 'Amrita': 16,
 'PARTY': 17,
 'welp': 18,
 'band': 19,
 '2U': 20,
 '09061790126': 21,
 'earlier': 22,
 'Poop': 23,
 'tis': 24,
 'Realy': 25,
 'Dear1': 26,
 'dehydration': 27,
 'jason': 28,
 'jontin': 29,
 'smsing': 30,
 'deliver': 31,
 'fixd': 32,
 'Becomes': 33,
 'breadstick': 34,
 'kotees': 35,
 'facilities': 36,
 'freak': 37,
 'chick': 38,
 'visit': 39,
 'cherish': 40,
 'picture': 41,
 'Bonus': 42,
 'arab': 43,
 'wondarfull': 44,
 'FINE': 45,
 'wwwcashbincouk': 46,
 'ABTA': 47,
 'WRONGTAKE': 48,
 'hyde': 49,
 'absolutely': 50,
 'met': 51,
 'PISS': 52,
 'Baaaaaaaabe': 53,
 'Shah': 54,
 'successful': 55,
 'Wrong': 56,
 'Loyalty': 57,
 'kindly': 58,
 'recent': 59,
 '80082': 60,
 'sarcastic': 61,
 '24hrs': 62,
 '08712402972': 63,
 'Start': 64,
 'shanghai

In [15]:
def transform_data(data_set,vocab):
    ds=[]
    for row in data_set:
        temp = []
        for word in row:
            temp.append(vocab[word])
        ds.append(temp)
    return ds
            

In [16]:
msg_train,msg_test,label_train,label_test = train_test_split(transform_data(df['v2'],vocab),df['v1'],test_size=0.2)

In [17]:
# improves accuracy significantly
# tfidf_transformer = TfidfTransformer(use_idf = False)
# msg_train=tfidf_transformer.transform(msg_train)
# msg_test=tfidf_transformer.transform(msg_test)
# msg_train.shape

In [18]:
spam_detect_model = MultinomialNB().fit(msg_train,label_train)

In [19]:
y_pred = spam_detect_model.predict(msg_test)
y_pred[:5]

array(['ham', 'ham', 'spam', 'ham', 'ham'], dtype='<U4')

In [20]:
print("accuracy: ",accuracy_score(label_test,y_pred))
print(classification_report(label_test,y_pred))
print(confusion_matrix(label_test,y_pred))

accuracy:  0.600896860986547
              precision    recall  f1-score   support

         ham       0.91      0.60      0.72       959
        spam       0.20      0.62      0.30       156

    accuracy                           0.60      1115
   macro avg       0.55      0.61      0.51      1115
weighted avg       0.81      0.60      0.66      1115

[[573 386]
 [ 59  97]]
