In [1]:
import pandas as pd
import numpy as np
import nltk
import re

In [2]:
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

In [3]:
# Single Porter stemmer instance, reused for every token during text cleaning.
stemmer = PorterStemmer()

In [4]:
# Load the tab-separated SMS corpus. Column names are supplied explicitly via
# `names`, so the first line of the file is read as data, not as a header.
data = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    names=["label", "message"],
)

In [5]:
# Sanity check: (rows, columns) of the loaded frame.
data.shape

(5572, 2)

In [6]:
# Preview the first rows to confirm the label/message columns parsed as expected.
data.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Build the cleaned corpus: one string of stemmed, stop-word-free tokens per SMS.
#
# The stop-word list is materialised once as a set. Previously the NLTK corpus
# was re-read (and scanned linearly) for every single token, which dominated
# the runtime of this cell without changing the result.
english_stopwords = set(stopwords.words("english"))

sentences = []
# Iterate over the column values directly instead of data['message'][i], which
# is a label lookup and only works while the index is the default 0..n-1 range.
for message in data['message']:
    # Replace everything except ASCII letters with spaces, then lower-case.
    line = re.sub("[^a-zA-Z]", " ", message).lower()
    words = [
        stemmer.stem(word)
        for word in nltk.word_tokenize(line)
        if word not in english_stopwords
    ]
    sentences.append(" ".join(words))

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Bag-of-words features: keep only the 2500 most frequent terms of the corpus.
cv = CountVectorizer(max_features=2500)
bow_sparse = cv.fit_transform(sentences)
# Densify so the later cells can work with a plain NumPy array.
X = bow_sparse.toarray()

In [10]:
# Expect (number of messages, vocabulary size capped by max_features).
X.shape

(5572, 2500)

In [11]:
# One-hot encode the label column: one indicator column per class (ham, spam).
y = pd.get_dummies(data['label'])
y.head()

Unnamed: 0,ham,spam
0,1,0
1,1,0
2,0,1
3,1,0
4,1,0


In [12]:
# Binary target: 1 for spam, 0 for ham.
# The column is selected by name from a fresh one-hot encoding rather than with
# `y = y.iloc[:, 1]`: the positional, self-referential form raises when the
# cell is run a second time (y is already a 1-D Series by then) and silently
# depends on the alphabetical column order.
y = pd.get_dummies(data['label'])['spam']
y.head()

0    0
1    0
2    1
3    0
4    0
Name: spam, dtype: uint8

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [15]:
# Shapes of the split arrays, in the order:
# train features, test features, train labels, test labels.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

(4457, 2500)
(1115, 2500)
(4457,)
(1115,)


In [16]:
from sklearn.naive_bayes import MultinomialNB

In [17]:
# Multinomial Naive Bayes with default hyper-parameters.
nm = MultinomialNB()

In [18]:
# Train on the bag-of-words counts of the training split.
nm.fit(x_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [19]:
# Predicted labels for the held-out split (1 = spam, 0 = ham).
prediction = nm.predict(x_test)

In [20]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [21]:
# Fraction of held-out messages classified correctly.
accuracy_score(y_test,prediction)

0.9856502242152466

In [22]:
# Per-class precision / recall / F1 (class 1 is spam, class 0 is ham).
print(classification_report(y_test,prediction))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       955
           1       0.94      0.96      0.95       160

    accuracy                           0.99      1115
   macro avg       0.97      0.97      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [23]:
# Rows are true classes, columns are predicted classes (order: ham, spam).
print(confusion_matrix(y_test,prediction))

[[946   9]
 [  7 153]]


In [24]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline for comparison with the Naive Bayes model.
# random_state pins the bootstrap samples and per-split feature sub-sampling,
# so the metrics printed below are reproducible across Restart & Run All.
rfc = RandomForestClassifier(n_estimators=30, random_state=0)
rfc.fit(x_train, y_train)
prediction = rfc.predict(x_test)
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))
print(confusion_matrix(y_test,prediction))

0.9856502242152466
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       1.00      0.90      0.95       160

    accuracy                           0.99      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.99      0.99      0.99      1115

[[955   0]
 [ 16 144]]


In [25]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree baseline. It is stored under its own name instead of reusing
# `rfc`, which would silently replace the random forest with a different model.
# random_state fixes the feature permutation used when searching for splits,
# making the reported metrics reproducible.
dtc = DecisionTreeClassifier(random_state=0)
dtc.fit(x_train, y_train)
prediction = dtc.predict(x_test)
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))
print(confusion_matrix(y_test,prediction))

0.9739910313901345
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       955
           1       0.93      0.88      0.91       160

    accuracy                           0.97      1115
   macro avg       0.96      0.94      0.95      1115
weighted avg       0.97      0.97      0.97      1115

[[945  10]
 [ 19 141]]


In [26]:
from sklearn.linear_model import LogisticRegression

# Logistic regression baseline. It is stored under its own name instead of
# reusing `rfc`, which is a misleading name for a linear model and would shadow
# the random forest trained earlier.
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)
prediction = log_reg.predict(x_test)
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))
print(confusion_matrix(y_test,prediction))

0.9847533632286996
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       955
           1       0.99      0.90      0.94       160

    accuracy                           0.98      1115
   macro avg       0.99      0.95      0.97      1115
weighted avg       0.98      0.98      0.98      1115

[[954   1]
 [ 16 144]]


In [27]:
# Vectorise an unseen message for a manual prediction.
# The text must go through the same cleaning as the training corpus (strip
# non-letters, lower-case, drop stop words, stem) before it reaches the
# vectoriser: the vocabulary in `cv` consists of stemmed tokens, so raw words
# such as "having" would otherwise never match their training-time form.
sample_message = "I HAVE A DATE ON SUNDAY WITH WILL!!"
sample_line = re.sub("[^a-zA-Z]", " ", sample_message).lower()
english_stopwords = set(stopwords.words("english"))
sample_words = [
    stemmer.stem(word)
    for word in nltk.word_tokenize(sample_line)
    if word not in english_stopwords
]
a = cv.transform([" ".join(sample_words)]).toarray()

In [28]:
# Classify the vectorised sample with the Naive Bayes model (0 = ham, 1 = spam).
nm.predict(a)

array([0], dtype=uint8)

In [29]:
# Show only the rows labelled as spam.
is_spam = data['label'] == "spam"
data[is_spam]

Unnamed: 0,label,message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [30]:
import pickle

In [31]:
# Persist the trained Naive Bayes model. The context manager guarantees the
# file handle is flushed and closed even if pickling raises; the bare
# open(...) used before left the handle open until garbage collection.
with open("model.pkl", "wb") as model_file:
    pickle.dump(nm, model_file)

In [32]:
# Persist the fitted CountVectorizer next to the model so that inference can
# reuse exactly the same vocabulary. The context manager closes the file
# handle deterministically instead of leaking it.
with open("cv_transform.pkl", "wb") as vectorizer_file:
    pickle.dump(cv, vectorizer_file)