In [1]:
# Load the dataset: a tab-separated file read with explicit column names
# ("label" first, "message" second).
# NOTE(review): pandas' default quote handling can merge lines when a message
# contains an unbalanced double quote -- confirm the row count against the raw file.

import pandas as pd
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
)

In [2]:
#Data cleaning and preprocessing
# Builds `corpus`: one cleaned, stemmed, space-joined string per message.

import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

# Fetch the stop-word list once and keep it as a set. Calling
# stopwords.words('english') for every token repeats the lookup and tests
# membership against a list (linear scan) each time.
english_stopwords = set(stopwords.words('english'))
# Anything that is not an ASCII letter is treated as a separator.
non_letters = re.compile('[^a-zA-Z]')

corpus = []
# Iterate over the column values directly so the loop does not rely on the
# DataFrame having a default 0..n-1 index.
for message in messages['message']:
    # Replace non-letters with spaces, lowercase, and split on whitespace.
    tokens = non_letters.sub(' ', message).lower().split()
    # Drop stop words, then reduce the remaining tokens to their Porter stems.
    review = [ps.stem(word) for word in tokens if word not in english_stopwords]
    corpus.append(' '.join(review))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Creating the Bag of Words model

from sklearn.feature_extraction.text import CountVectorizer
# Keep only the 3000 most frequent terms as features.
cv = CountVectorizer(max_features=3000)
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split performed in a later cell, so term selection also sees the
# test messages. Consider fitting on the training portion only.
X = cv.fit_transform(corpus).toarray()

# One-hot encode the labels and keep the second indicator column as the binary
# target. dtype is pinned to uint8 because newer pandas versions return bool
# columns by default, which would change the dtype of `y` and of predictions.
# The second column corresponds to the second label in sorted order
# (presumably 'spam' -- confirm against the data).
y = pd.get_dummies(messages['label'], dtype='uint8')
y = y.iloc[:, 1].values

In [4]:
# Train Test Split: hold out 20% of the samples for evaluation

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,  # fixed seed so the split is the same on every run
)

## Naive Bayes Classifier

In [5]:
# Fit a multinomial Naive Bayes classifier (default settings) on the training split

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB().fit(
    X=X_train,
    y=y_train,
)

In [6]:
# Predict labels for the held-out test set with the Naive Bayes model and show them
y_pred_nb = nb.predict(X=X_test)
y_pred_nb

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [7]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for Naive Bayes: rows are true classes, columns are predicted classes
cfusn_nb = confusion_matrix(y_true=y_test, y_pred=y_pred_nb)
cfusn_nb

array([[948,   7],
       [  7, 153]], dtype=int64)

In [8]:
from sklearn.metrics import accuracy_score
# Fraction of test messages that Naive Bayes labelled correctly
acc_nb = accuracy_score(y_true=y_test, y_pred=y_pred_nb)
acc_nb

0.9874439461883409

## Logistic Regression

In [9]:
# Fit a Logistic Regression classifier (default settings) on the training split

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression().fit(
    X=X_train,
    y=y_train,
)

In [10]:
# Predict labels for the held-out test set with Logistic Regression and show them
y_pred_lr = lr.predict(X=X_test)
y_pred_lr

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [11]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for Logistic Regression: rows are true classes, columns are predicted classes
cfusn_lr = confusion_matrix(y_true=y_test, y_pred=y_pred_lr)
cfusn_lr

array([[954,   1],
       [ 16, 144]], dtype=int64)

In [12]:
from sklearn.metrics import accuracy_score
# Fraction of test messages that Logistic Regression labelled correctly
acc_lr = accuracy_score(y_true=y_test, y_pred=y_pred_lr)
acc_lr

0.9847533632286996

## KNN

In [13]:
# Fit a K-Nearest Neighbours classifier (default settings) on the training split

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier().fit(
    X=X_train,
    y=y_train,
)

In [14]:
# Predict labels for the held-out test set with KNN and show them
y_pred_knn = knn.predict(X=X_test)
y_pred_knn

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [15]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for KNN: rows are true classes, columns are predicted classes
cfusn_knn = confusion_matrix(y_true=y_test, y_pred=y_pred_knn)
cfusn_knn

array([[955,   0],
       [ 86,  74]], dtype=int64)

In [16]:
from sklearn.metrics import accuracy_score
# Fraction of test messages that KNN labelled correctly
acc_knn = accuracy_score(y_true=y_test, y_pred=y_pred_knn)
acc_knn

0.9228699551569507

## SVM

In [17]:
# Fit a Support Vector Classifier (default settings) on the training split

from sklearn import svm
svc = svm.SVC().fit(
    X=X_train,
    y=y_train,
)

In [18]:
# Predict labels for the held-out test set with the SVC and show them
y_pred_svc = svc.predict(X=X_test)
y_pred_svc

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [19]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the SVC: rows are true classes, columns are predicted classes
cfusn_svc = confusion_matrix(y_true=y_test, y_pred=y_pred_svc)
cfusn_svc

array([[955,   0],
       [ 17, 143]], dtype=int64)

In [20]:
from sklearn.metrics import accuracy_score
# Fraction of test messages that the SVC labelled correctly
acc_svc = accuracy_score(y_true=y_test, y_pred=y_pred_svc)
acc_svc

0.9847533632286996

## Decision Tree

In [21]:
# Training model using Decision Tree Classifier

from sklearn.tree import DecisionTreeClassifier
# Seed the tree: without random_state the fitted tree (and therefore the
# reported confusion matrix and accuracy) can differ from one run to the next.
# The seed 0 matches the one used for the train/test split.
dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

In [22]:
# Predict labels for the held-out test set with the Decision Tree and show them
y_pred_dt = dt.predict(X=X_test)
y_pred_dt

array([0, 1, 0, ..., 0, 1, 0], dtype=uint8)

In [23]:
from sklearn.metrics import confusion_matrix
# Confusion matrix for the Decision Tree: rows are true classes, columns are predicted classes
cfusn_dt = confusion_matrix(y_true=y_test, y_pred=y_pred_dt)
cfusn_dt

array([[946,   9],
       [ 13, 147]], dtype=int64)

In [24]:
from sklearn.metrics import accuracy_score
# Fraction of test messages that the Decision Tree labelled correctly
acc_dt = accuracy_score(y_true=y_test, y_pred=y_pred_dt)
acc_dt

0.9802690582959641

### Naive Bayes achieves the highest test accuracy of the five models (about 98.7%), so we choose it as the best model.