In [1]:
import pandas as pd

In [9]:
df = pd.read_csv('smsspamcollection/SMSSpamCollection', sep='\t', names=['label', 'message'])

In [10]:
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.shape

(5572, 2)

 ### Data Cleaning

In [12]:
import re 
import nltk 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer

In [13]:
ps = PorterStemmer()

In [19]:
corpus = []
for i in range(len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['message'][i])
    review = review.lower()
    review = nltk.word_tokenize(review)
    review = [ps.stem(word) for word in review if word not in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer

In [28]:
cv = CountVectorizer(max_features=5000)

In [29]:
X = cv.fit_transform(corpus).toarray()

In [30]:
X.shape

(5572, 5000)

In [33]:
y = pd.get_dummies(df['label'])

In [35]:
y = y.iloc[:, 1]

In [37]:
y.head()

0    0
1    0
2    1
3    0
4    0
Name: spam, dtype: uint8

In [39]:
from sklearn.model_selection import train_test_split

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [41]:
from sklearn.naive_bayes import MultinomialNB

In [42]:
spam_detect_model = MultinomialNB().fit(X_train, y_train)

In [43]:
y_pred = spam_detect_model.predict(X_test)

In [45]:
a = {'Actual': y_test, 'Predicted': y_pred}

In [48]:
pd.DataFrame(a).tail(50)

Unnamed: 0,Actual,Predicted
3428,0,0
1852,0,0
2925,0,0
5375,0,0
556,0,0
2686,1,1
1855,0,0
1002,1,1
2928,0,0
1721,0,0


In [52]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [51]:
confusion_matrix(y_test, y_pred)

array([[945,  10],
       [  8, 152]], dtype=int64)

In [53]:
accuracy_score(y_test, y_pred)

0.9838565022421525