In [1]:
# importing required modules
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the SMS dataset from CSV (decoded as Latin-1) and preview the first rows.
dataset = pd.read_csv(
    filepath_or_buffer='sms_spam_dataset.csv',
    encoding='latin-1',
)
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
# removing unwanted columns
# errors="ignore" keeps this cell re-runnable: `dataset` is rebound here, so on a
# second run the "Unnamed: N" columns are already gone and drop() would
# otherwise raise KeyError.
dataset = dataset.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1, errors="ignore")

# renaming columns (rename skips names that are absent, so this line is
# already safe to re-run)
dataset = dataset.rename(columns={"v1":"label", "v2":"text"})
dataset.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# class balance: how many messages carry each label
dataset["label"].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [5]:
# Encode the string label as an integer column: ham -> 0, spam -> 1.
dataset['numerical_label'] = dataset['label'].map(
    {
        'ham': 0,
        'spam': 1,
    }
)
dataset.head()

Unnamed: 0,label,text,numerical_label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [6]:
# for splitting dataset into train set and test set
from sklearn.model_selection import train_test_split

In [7]:
# Hold out 20% of the messages for evaluation; the fixed random_state makes
# the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset["text"],
    dataset["label"],
    test_size=0.2,
    random_state=10,
)

In [8]:
# Sanity check: one shape per line, in the order X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(4457,)
(1115,)
(4457,)
(1115,)


In [9]:
# for vectorizing words
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
# Learn the bag-of-words vocabulary from the training messages only, with
# English stop words removed. fit() returns the vectorizer itself.
vect = CountVectorizer(stop_words='english').fit(X_train)
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [11]:
# Peek at both ends of the learned vocabulary.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`, so use whichever this installation offers.
# The names are fetched once and sliced twice instead of rebuilding the list
# for each print.
if hasattr(vect, "get_feature_names_out"):
    feature_names = list(vect.get_feature_names_out())
else:
    feature_names = vect.get_feature_names()
print(feature_names[:20])
print(feature_names[-20:])

['00', '000', '000pes', '008704050406', '0089', '0121', '01223585236', '01223585334', '0125698789', '02', '0207', '02072069400', '02073162414', '02085076972', '021', '03', '04', '0430', '05', '050703']
['zyada', 'åð', 'åòharry', 'åòit', 'åômorrow', 'åôrents', 'ì_', 'ì¼1', 'ìä', 'ìï', 'ó_', 'û_', 'û_thanks', 'ûªm', 'ûªt', 'ûªve', 'ûï', 'ûïharry', 'ûò', 'ûówell']


In [12]:
# Turn both splits into document-term matrices using the fitted vocabulary
# (training split first, then test split).
X_train_df, X_test_df = map(vect.transform, (X_train, X_test))
type(X_test_df)

scipy.sparse.csr.csr_matrix

In [13]:
# Predictions from each fitted model are collected in this dict.
prediction = {}
# Naive Bayes Machine Learning Model
from sklearn.naive_bayes import MultinomialNB
# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train_df, y_train)
model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [14]:
# Store the Naive Bayes predictions for the held-out messages.
prediction.update({"naive_bayes": model.predict(X_test_df)})

In [15]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [16]:
# fraction of held-out messages whose predicted label equals the true label
accuracy_score(y_true=y_test, y_pred=prediction["naive_bayes"])

0.98834080717488793

In [17]:
# Per-class precision / recall / F1 on the held-out messages.
print(
    classification_report(
        y_true=y_test,
        y_pred=prediction['naive_bayes'],
        target_names=["Ham", "Spam"],
    )
)

             precision    recall  f1-score   support

        Ham       0.99      1.00      0.99       965
       Spam       0.97      0.94      0.96       150

avg / total       0.99      0.99      0.99      1115



In [35]:
def classify(user_input, classifier=None, vectorizer=None):
    """Predict the label of a single message.

    Parameters
    ----------
    user_input : str
        Raw message text to classify.
    classifier : object, optional
        Fitted estimator exposing ``predict``. When omitted, the
        notebook-level ``model`` is used.
    vectorizer : object, optional
        Fitted vectorizer exposing ``transform``. When omitted, the
        notebook-level ``vect`` is used.

    Returns
    -------
    Whatever ``classifier.predict`` returns for a one-document batch
    (for the notebook's model, a length-1 array such as ``['spam']``).
    """
    if classifier is None:
        classifier = model
    if vectorizer is None:
        vectorizer = vect
    # transform() takes an iterable of documents, so the single message is
    # wrapped in a list. This replaces the former round trip through
    # pd.DataFrame.from_items, which was deprecated in pandas 0.23 and removed
    # in pandas 1.0.
    return classifier.predict(vectorizer.transform([user_input]))

# Interactive check: read one message from the user and print the predicted label.
text = input('Type here something and see if it belongs to spam: ')
print(classify(user_input=text))

Type here something and see if it belongs to spam: IMPORTANT - You could be entitled up to £3,160 in compensation from mis-sold PPI on a credit card or loan. Please reply PPI for info or STOP to opt out.
['spam']


In [37]:
# Second interactive check with a different message; prints the predicted label.
text = input('Type here something and see if it belongs to spam: ')
print(classify(user_input=text))

Type here something and see if it belongs to spam: Hey there! I hope you're free tonight, let's catch up at dinner!
['ham']
