#                                             SmsSpam_Filtering

In [1]:
#import Packages
import pandas as pd
import numpy as np

In [2]:
#Load data
#The file is tab-separated with no header row, so the column names are supplied here:
#"Status" holds the ham/spam label and "Message" the raw SMS text.
#NOTE(review): this is an absolute path on the author's machine - edit SMS_DATA_PATH to point at your copy.
SMS_DATA_PATH = "/home/mtech11/Documents/SMSSpam"
sms = pd.read_csv(SMS_DATA_PATH, sep="\t", names=["Status","Message"])

In [3]:
#Shape of the loaded data as (rows, columns)
sms.shape

(5572, 2)

In [4]:
#Column labels of the frame (this lists the names, it does not count them)
sms.columns

Index(['Status', 'Message'], dtype='object')

In [5]:
#Preview the first 5 rows (raw string labels, before any encoding)
sms.head()

Unnamed: 0,Status,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
#Number of messages labelled "spam"
int((sms["Status"] == "spam").sum())

747

In [7]:
#Number of messages labelled "ham"
int((sms["Status"] == "ham").sum())

4825

In [8]:
#Encode the labels in place: ham -> 1, spam -> 0
for label, code in (("ham", 1), ("spam", 0)):
    sms.loc[sms["Status"] == label, "Status"] = code

In [9]:
#Preview the first 5 rows again to confirm Status now holds 1 (ham) / 0 (spam)
sms.head()

Unnamed: 0,Status,Message
0,1,"Go until jurong point, crazy.. Available only ..."
1,1,Ok lar... Joking wif u oni...
2,0,Free entry in 2 a wkly comp to win FA Cup fina...
3,1,U dun say so early hor... U c already then say...
4,1,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
#Separate the message text from the status labels
sms_x, sms_y = sms["Message"], sms["Status"]

In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [12]:
#Build the cleaned corpus: one normalised string per message.
#Per message: replace every non-letter with a space, lower-case, split into
#words, drop English stop words, stem what is left, and re-join with spaces.
#
#The stop-word set and the stemmer are the same for every message, so they are
#created once here rather than once per word / once per row.
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()
corpus = []
#Iterate the column itself so the loop follows the data instead of a hard-coded row count
for text in sms['Message']:
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    corpus.append(' '.join(ps.stem(word) for word in words if word not in stop_words))

In [13]:
#Sanity check: number of cleaned documents in the corpus
len(corpus)

5572

# Feature Extraction with CountVectorizer

In [14]:
#Turn the cleaned corpus into a bag-of-words count matrix
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
count_matrix = cv.fit_transform(corpus)
#Dense ndarray copy of the sparse count matrix
X = count_matrix.toarray()
#Labels come straight from the Status column
y = sms['Status']

In [15]:
#Feature matrix shape as (messages, vocabulary terms)
X.shape

(5572, 6296)

In [16]:
#Feature vector of the first message
X[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [17]:
#Show which vocabulary terms occur in the first message.
#inverse_transform works on a 2-D (n_samples, n_features) input, so pass a
#one-row slice X[:1] rather than the 1-D row X[0].
cv.inverse_transform(X[:1])

[array(['amor', 'avail', 'buffet', 'bugi', 'cine', 'crazi', 'go', 'got',
        'great', 'jurong', 'la', 'point', 'wat', 'world'], dtype='<U34')]

In [18]:
#Label vector shape; its length should equal the number of rows in X
y.shape

(5572,)

In [19]:
#Hold out 30% of the rows for testing; random_state pins the shuffle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, random_state=0, test_size=0.30)
X_train, X_test, y_train, y_test = split_parts

In [20]:
#Cast both label vectors to integers
y_train, y_test = (labels.astype("int") for labels in (y_train, y_test))

In [21]:
#Naive bayes classifier with BernoulliNB
#Fit on the training split, predict the held-out split, and print the accuracy.
from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB()
classifier.fit(X_train,y_train)
prediction = classifier.predict(X_test)
#accuracy_score is documented as (y_true, y_pred); the score is the same either way
print(accuracy_score(y_test, prediction))

0.9748803827751196


In [22]:
#Naive bayes classifier with MultinomialNB
#Fit on the training split, predict the held-out split, and print the accuracy.
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train,y_train)
prediction = model.predict(X_test)
#accuracy_score is documented as (y_true, y_pred); the score is the same either way
print(accuracy_score(y_test, prediction))

0.9796650717703349


In [23]:
#Naive bayes classifier with GaussianNB
#Fit on the training split, predict the held-out split, and print the accuracy.
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train,y_train)
pred = model.predict(X_test)
#accuracy_score is documented as (y_true, y_pred); the score is the same either way
print(accuracy_score(y_test, pred))

0.8642344497607656


In [24]:
#KNN classifier
#n_neighbors is not set, so scikit-learn's default of 5 neighbours is used (not K=2).
#n_jobs=-1 asks scikit-learn to use all available CPU cores.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_jobs=-1)
neigh.fit(X_train,y_train)
prediction = neigh.predict(X_test)
#accuracy_score is documented as (y_true, y_pred); the score is the same either way
print(accuracy_score(y_test, prediction))

0.9204545454545454


# Feature Extraction with TfidfVectorizer

In [25]:
#Turn the cleaned corpus into a TF-IDF weighted matrix
#Note: cv, X and y are rebound here, replacing the count-based versions
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
tfidf_matrix = cv.fit_transform(corpus)
#Dense ndarray copy of the sparse TF-IDF matrix
X = tfidf_matrix.toarray()
#Labels come straight from the Status column
y = sms['Status']

In [26]:
#Hold out 30% of the TF-IDF rows for testing; random_state pins the shuffle
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
split_parts = train_test_split(X, y, random_state=0, test_size=0.30)
X_train, X_test, y_train, y_test = split_parts

In [27]:
#Cast both label vectors to integers
y_train, y_test = (labels.astype("int") for labels in (y_train, y_test))

In [28]:
#Naive bayes classifier with BernoulliNB
#Fit on the training split, predict the held-out split, and print the accuracy.
#NOTE(review): BernoulliNB appears to binarise its inputs (default binarize=0.0), which
#would reduce TF-IDF weights to present/absent - confirm before comparing with other runs.
from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB()
classifier.fit(X_train,y_train)
prediction = classifier.predict(X_test)
#accuracy_score is documented as (y_true, y_pred); the score is the same either way
print(accuracy_score(y_test, prediction))

0.9748803827751196


In [29]:
#KNN classifier
#n_neighbors is not set, so scikit-learn's default of 5 neighbours is used.
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier()
neigh.fit(X_train,y_train)
prediction = neigh.predict(X_test)
#accuracy_score is documented as (y_true, y_pred); the score is the same either way
print(accuracy_score(y_test, prediction))

0.9144736842105263


In [30]:
#Naive bayes classifier with GaussianNB
#Fit on the training split, predict the held-out split, and print the accuracy.
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train,y_train)
pred = model.predict(X_test)
#accuracy_score is documented as (y_true, y_pred); the score is the same either way
print(accuracy_score(y_test, pred))

0.8606459330143541
