In [21]:
import pandas as pd
import numpy as np
import nltk 
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
data = pd.read_csv("spam.csv")

In [3]:
data.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [4]:
data.dropna(axis=1, inplace=True)

In [5]:
data.head()

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
data.rename(columns={'v1':"Type","v2":"Message"},inplace=True)

In [7]:
data.head()

Unnamed: 0,Type,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
data.Type.value_counts()

ham     4825
spam     747
Name: Type, dtype: int64

In [9]:
corpus = []
ps = PorterStemmer()

Preprocessing

In [10]:
for i in range(0,5572):
    #Regular Expresion
    '''
    Replace email addresses with 'emailaddr'
    Replace URLs with 'httpaddr'
    Replace money symbols with 'moneysymb'
    Replace phone numbers with 'phonenumbr'
    Replace numbers with 'numbr'
    '''
    msg = data['Message'][i] 
    msg = re.sub('\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', data['Message'][i])
    msg = re.sub('(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', 'httpaddr', data['Message'][i])
    msg = re.sub('([A-Z]{3}|[A-Z]?[\$€¥])?\s?(\d{1,3}((,\d{1,3})+)?(.\d{1,3})?(.\d{1,3})?(,\d{1,3})?)', 'moneysymb', data['Message'][i])
    msg = re.sub('\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', 'phonenumbr', data['Message'][i])
    msg = re.sub('\d+(\.\d+)?', 'numbr', data['Message'][i])

    ''' Remove all punctuations '''
    msg = re.sub('[^\w\d\s]', ' ', data['Message'][i])
    
    if i<2:
        print("\t\t\t\t MESSAGE ", i)
    
    if i<2:
        print("\n After Regular Expression - Message ", i, " : ", msg)
    
    # Each word to lower case
    msg = msg.lower()    
    if i<2:
        print("\n Lower case Message ", i, " : ", msg)
    
    # Splitting words to Tokenize
    msg = msg.split()    
    if i<2:
        print("\n After Splitting - Message ", i, " : ", msg)
    
    # Stemming with PorterStemmer handling Stop Words
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    if i<2:
        print("\n After Stemming - Message ", i, " : ", msg)
    
    # preparing Messages with Remaining Tokens
    msg = ' '.join(msg)
    if i<2:
        print("\n Final Prepared - Message ", i, " : ", msg, "\n\n")
    
    # Preparing WordVector Corpus
    corpus.append(msg)


				 MESSAGE  0

 After Regular Expression - Message  0  :  Go until jurong point  crazy   Available only in bugis n great world la e buffet    Cine there got amore wat   

 Lower case Message  0  :  go until jurong point  crazy   available only in bugis n great world la e buffet    cine there got amore wat   

 After Splitting - Message  0  :  ['go', 'until', 'jurong', 'point', 'crazy', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'there', 'got', 'amore', 'wat']

 After Stemming - Message  0  :  ['go', 'jurong', 'point', 'crazi', 'avail', 'bugi', 'n', 'great', 'world', 'la', 'e', 'buffet', 'cine', 'got', 'amor', 'wat']

 Final Prepared - Message  0  :  go jurong point crazi avail bugi n great world la e buffet cine got amor wat 


				 MESSAGE  1

 After Regular Expression - Message  1  :  Ok lar    Joking wif u oni   

 Lower case Message  1  :  ok lar    joking wif u oni   

 After Splitting - Message  1  :  ['ok', 'lar', 'joking', 'wif', '

In [11]:
cv = CountVectorizer()
x = cv.fit_transform(corpus).toarray()

Applying classification

In [12]:
y = data.Type

In [13]:
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Type, Length: 5572, dtype: object

Encoding Label

In [14]:
y = LabelEncoder().fit_transform(y)
y

array([0, 0, 1, ..., 0, 0, 0])

splliting testing & training data

In [15]:
xtrain ,xtest ,ytrain, ytest = train_test_split(x,y,test_size=0.2,random_state=0)

Applying Guassian Naive Bayes

In [16]:
Bc = GaussianNB()
Bc.fit(xtrain,ytrain) 
yp = Bc.predict(xtest)
cm = confusion_matrix(ytest, yp)
print(cm)
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, Bc.predict(xtest)))
print (classification_report(ytest, Bc.predict(xtest)))

GaussianNB()

Random Forest

In [35]:
rmf = RandomForestClassifier(n_estimators=90)
rmf.fit(xtrain,ytrain)
rmfp = rmf.predict(xtest)
rmc = confusion_matrix(ytest,rmfp)
print(rmc)
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, rmf.predict(xtest)))
print (classification_report(ytest, rmf.predict(xtest)))

[[949   0]
 [ 30 136]]
Accuracy : 0.97309 


              precision    recall  f1-score   support

           0       0.97      1.00      0.98       949
           1       1.00      0.82      0.90       166

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [39]:
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(xtrain,ytrain)
knnp = knn.predict(xtest)
kcm = confusion_matrix(ytest,knnp)
print(kcm)
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, knn.predict(xtest)))
print (classification_report(ytest, knn.predict(xtest)))

[[949   0]
 [ 62 104]]
Accuracy : 0.94439 


              precision    recall  f1-score   support

           0       0.94      1.00      0.97       949
           1       1.00      0.63      0.77       166

    accuracy                           0.94      1115
   macro avg       0.97      0.81      0.87      1115
weighted avg       0.95      0.94      0.94      1115



In [30]:
dst = DecisionTreeClassifier(random_state=50)
dst.fit(xtrain,ytrain)
dstp = dst.predict(xtest)
dmc = confusion_matrix(ytest,dstp)
print(dmc)
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, dst.predict(xtest)))
print (classification_report(ytest, dst.predict(xtest)))

[[942   7]
 [ 18 148]]
Accuracy : 0.97758 


              precision    recall  f1-score   support

           0       0.98      0.99      0.99       949
           1       0.95      0.89      0.92       166

    accuracy                           0.98      1115
   macro avg       0.97      0.94      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [40]:
lgr = LogisticRegression()
lgr.fit(xtrain,ytrain)
lgrp = lgr.predict(xtest)
lmc = confusion_matrix(ytest,lgrp)
print(lmc)
print ("Accuracy : %0.5f \n\n" % accuracy_score(ytest, lgr.predict(xtest)))
print (classification_report(ytest, lgr.predict(xtest)))

[[949   0]
 [ 24 142]]
Accuracy : 0.97848 


              precision    recall  f1-score   support

           0       0.98      1.00      0.99       949
           1       1.00      0.86      0.92       166

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.95      1115
weighted avg       0.98      0.98      0.98      1115

