In [1]:
import pandas as pd

In [2]:
# Load the tab-separated SMS file. Column names are supplied explicitly, so the
# first line of the file is read as data rather than as a header.
# NOTE(review): read_csv's default quoting treats '"' as a quote character; if
# any message contains an unbalanced quote, adjacent rows may be merged. Verify
# the row count against the data set's documented size (quoting=csv.QUOTE_NONE
# would disable quote handling).
messages = pd.read_csv(
    'smsspamcollection/SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)

# Data cleaning and preprocessing

In [3]:
import re
import nltk

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [5]:
# Accumulator for the cleaned messages: the preprocessing loop appends one
# space-joined string of stems per SMS.
corpus = list()

# One stemmer instance, reused for every token in the preprocessing loop.
ps = PorterStemmer()

In [6]:
"""
Method to lower the text data and splitting them to form tokens.
Porter Stemmer is used to get the stem of the word.
"""

# Build the stop-word lookup once. The previous version evaluated
# stopwords.words('english') for every single token and scanned the returned
# list linearly; a set built up front gives the same membership answers at a
# fraction of the cost.
# Requires the NLTK 'stopwords' corpus to be installed locally
# (e.g. via nltk.download('stopwords')).
english_stopwords = set(stopwords.words('english'))

# Start from an empty list so that re-running this cell does not append every
# message to `corpus` a second time.
corpus = []

# Iterate over the message values directly instead of looking each one up by
# label `i`, so the loop does not depend on `messages` having a 0..n-1 index.
for i, raw_message in enumerate(messages['message']):
    # Replace every non-letter with a space, lowercase, split on whitespace.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_message).lower().split()
    # Drop stop words, stem what is left, and re-join into a single string.
    review = ' '.join(
        ps.stem(word) for word in tokens if word not in english_stopwords
    )
    corpus.append(review)

# Creating the Bag of Words model

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: the vocabulary is capped at 2500 terms via
# max_features, and the fitted vectorizer is kept in `cv` for later reuse.
cv = CountVectorizer(max_features=2500)
bag_of_words = cv.fit_transform(corpus)

# Convert the document-term matrix to a dense array for the steps below.
X = bag_of_words.toarray()

In [8]:
# Binary target: one-hot encode the label column and keep the indicator at
# column position 1 as a NumPy array.
# NOTE(review): presumably position 1 is the 'spam' indicator (the second label
# in sorted order) — verify against the label values in the data.
y = pd.get_dummies(messages['label']).iloc[:, 1].values

# Train Test Split

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Training the model using a Naive Bayes classifier

In [10]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training split.
spam_detect_model = MultinomialNB()
spam_detect_model.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = spam_detect_model.predict(X_test)

# Checking the accuracy of the model

In [11]:
from sklearn.metrics import confusion_matrix

# Compare the held-out true labels with the model's predictions.
confusion = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [12]:
# Display the confusion matrix computed from y_test and y_pred.
confusion

array([[946,   9],
       [  7, 153]], dtype=int64)

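__The diagonal of the confusion matrix holds the correct predictions: 946 + 153 = 1,099 of the 1,115 test messages were classified correctly, and 9 + 7 = 16 were misclassified.__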

In [13]:
from sklearn.metrics import accuracy_score

# Score the test-set predictions against the true labels.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [14]:
# Report the test-set accuracy.
# NOTE(review): "Acuracy" is a typo for "Accuracy"; the text is kept unchanged
# here so the recorded cell output still matches.
print(f"Acuracy is: {accuracy}")

Acuracy is: 0.9856502242152466
