In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
df=pd.read_csv('sms_spam.csv')
df

Unnamed: 0,type,text
0,ham,Hope you are having a good week. Just checking in
1,ham,K..give back my thanks.
2,ham,Am also doing in cbe only. But have to pay.
3,spam,"complimentary 4 STAR Ibiza Holiday or £10,000 ..."
4,spam,okmail: Dear Dave this is your final notice to...
...,...,...
5554,ham,You are a great role model. You are giving so ...
5555,ham,"Awesome, I remember the last time we got someb..."
5556,spam,"If you don't, your prize will go to another cu..."
5557,spam,"SMS. ac JSco: Energy is high, but u may not kn..."


In [43]:
df.isnull().sum()

type    0
text    0
dtype: int64

In [44]:
df.describe()

Unnamed: 0,type,text
count,5559,5559
unique,2,5156
top,ham,"Sorry, I'll call later"
freq,4812,30


In [45]:
df.groupby('type').describe()

Unnamed: 0_level_0,text,text,text,text
Unnamed: 0_level_1,count,unique,top,freq
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4812,4503,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [46]:
text=df.iloc[:,-1]
text

0       Hope you are having a good week. Just checking in
1                                 K..give back my thanks.
2             Am also doing in cbe only. But have to pay.
3       complimentary 4 STAR Ibiza Holiday or £10,000 ...
4       okmail: Dear Dave this is your final notice to...
                              ...                        
5554    You are a great role model. You are giving so ...
5555    Awesome, I remember the last time we got someb...
5556    If you don't, your prize will go to another cu...
5557    SMS. ac JSco: Energy is high, but u may not kn...
5558                      Shall call now dear having food
Name: text, Length: 5559, dtype: object

In [47]:
corpus=[]
for i in range(0,5559):
  text=re.sub('[^a-zA-Z]',' ',df['text'][i])
  text=text.lower()
  text=text.split()
  ps=PorterStemmer()
  all_stopwords=stopwords.words('english')
  text=[ps.stem(word) for word in text if not word in set(all_stopwords)]
  text=' '.join(text)
  corpus.append(text)

print(corpus)

['hope good week check', 'k give back thank', 'also cbe pay', 'complimentari star ibiza holiday cash need urgent collect landlin lose box sk wp ppm', 'okmail dear dave final notic collect tenerif holiday cash award call landlin tc sae box cw wx ppm', 'aiya discuss later lar pick u', 'much buzi', 'pleas ask mummi call father', 'marvel mobil play offici ultim spider man game ur mobil right text spider game send u free ball wallpap', 'fyi usf swing room whenev', 'sure thing big man hockey elect go longer hour though', 'anyth lor', 'march end readi call sure problem capit never complet far work ladi', 'hmm well night night', 'k sure get noon see', 'ha ha cool cool chikku chikku db', 'darren say dat u meet da ge den dun meet dinner co later u leav xy feel awkward den u meet lunch lor', 'dint tell anyth angri told abi', 'u u wan come come lor din c stripe skirt', 'u win music gift voucher everi week start txt word draw tsc www ldew com skillgam winaweek age ppermesssubscript', 'mro come gym 

In [48]:
cv=CountVectorizer()
X=cv.fit_transform(corpus).toarray()
y=df.iloc[:,0]
y

0        ham
1        ham
2        ham
3       spam
4       spam
        ... 
5554     ham
5555     ham
5556    spam
5557    spam
5558     ham
Name: type, Length: 5559, dtype: object

In [49]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

##Naive Bayes

In [50]:
mb=MultinomialNB()
mb.fit(X_train,y_train)

MultinomialNB()

In [51]:
y_pred=mb.predict(X_test)
y_pred

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], dtype='<U4')

In [None]:
accuracies=cross_val_score(estimator=mb,X=X_train,y=y_train,cv=10) # cv isthe number of folds we eant in our training set
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
# print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

In [52]:
cm=confusion_matrix(y_test,y_test)
print(cm)
accuracy_score(y_pred,y_test)


[[979   0]
 [  0 133]]


0.9802158273381295

##Decision Tree

In [56]:
dt=DecisionTreeClassifier(criterion='entropy',random_state=0)
dt.fit(X_train,y_train)

DecisionTreeClassifier(criterion='entropy', random_state=0)

In [57]:
y_pred=dt.predict(X_test)
y_pred
cm=confusion_matrix(y_test,y_test)
print(cm)
accuracy_score(y_pred,y_test)

[[979   0]
 [  0 133]]


0.9703237410071942

In [59]:
accuracies=cross_val_score(estimator=dt,X=X_train,y=y_train,cv=10) # cv isthe number of folds we eant in our training set
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))

Accuracy: 96.87 %
Standard Deviation: 0.97 %


##Random Forest Classifier

In [53]:
rd=RandomForestClassifier(n_estimators=10,random_state=0)
rd.fit(X_train,y_train)

RandomForestClassifier(n_estimators=10, random_state=0)

In [63]:
accuracies=cross_val_score(estimator=rd,X=X_train,y=y_train,cv=25) # cv isthe number of folds we eant in our training set
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
# print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 97.03 %


In [55]:
y_pred=rd.predict(X_test)
y_pred
cm=confusion_matrix(y_test,y_test)
print(cm)
accuracy_score(y_pred,y_test)

[[979   0]
 [  0 133]]


0.9757194244604317