Installing necessary libraries

In [None]:
!pip install pandas
!pip install numpy
!pip install chardet
!pip install sklearn
!pip install gensim



Making necessary imports



In [None]:
import pandas as pd
import numpy as np
import chardet as cd
from gensim import parsing
from sklearn import naive_bayes, svm
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer

Reading from CSV file and Preprocessing the data

In [None]:
# Sniff the text encoding from the raw bytes so read_csv can decode the file.
with open('data.csv', 'rb') as file:
    data = cd.detect(file.read())

dataframe = pd.read_csv('data.csv', encoding = data['encoding'])
# Keep only the label column (v1) and the message column (v2).
dataframe = dataframe.drop(["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], axis=1)
# Encode the labels numerically: ham -> 0, spam -> 1.
dataframe['v1'] = dataframe.v1.map({'ham':0, 'spam':1})

dataset = dataframe.values

print("The first ten rows are: \n")
print(dataframe.head(10))

# Shuffle the rows with a fixed seed so that the train/test split, and therefore
# every metric reported below, is reproducible across kernel restarts.
# A dedicated RandomState is used so the global NumPy RNG state is left untouched.
np.random.RandomState(42).shuffle(dataset)

X = dataset[:,1]
Y = dataset[:,0]
Y = Y.astype('int')

# Lower-case and stem every message in place before vectorizing.
for i in range(X.shape[0]):
    X[i] = parsing.stem_text(X[i].lower())

# Bag-of-words counts; the fitted vectorizer is reused later for new messages.
# NOTE(review): the vocabulary is fitted on all rows, including those later used
# as test data -- consider fitting on the training portion only.
vectorizer = CountVectorizer()
X_processed = vectorizer.fit_transform(X)

The first ten rows are: 

   v1                                                 v2
0   0  Go until jurong point, crazy.. Available only ...
1   0                      Ok lar... Joking wif u oni...
2   1  Free entry in 2 a wkly comp to win FA Cup fina...
3   0  U dun say so early hor... U c already then say...
4   0  Nah I don't think he goes to usf, he lives aro...
5   1  FreeMsg Hey there darling it's been 3 week's n...
6   0  Even my brother is not like to speak with me. ...
7   0  As per your request 'Melle Melle (Oru Minnamin...
8   1  WINNER!! As a valued network customer you have...
9   1  Had your mobile 11 months or more? U R entitle...


Splitting the dataset into Training data and Test data

In [None]:
# The first 4500 rows form the training set; every remaining row is held out for testing.
X_train, X_test = X_processed[:4500], X_processed[4500:]
Y_train, Y_test = Y[:4500], Y[4500:]

print("The shape of training data is :", X_train.shape)
print("The shape of test data is :", X_test.shape)

The shape of training data is : (4500, 8265)
The shape of test data is : (1072, 8265)


Function to print various metrics of different models

In [None]:
def scores(Y_actual, Y_predicted, model):
    """Print a heading followed by evaluation metrics for one classifier.

    Parameters
    ----------
    Y_actual
        True labels, passed as the first argument to every metric.
    Y_predicted
        Predicted labels, passed as the second argument to every metric.
    model
        Selects the heading: 1 prints the Naive Bayes heading, 2 prints the
        Support Vector Classifier heading; any other value prints no heading.

    Nothing is returned; all results are written to standard output.
    """
    headings = (
        (1, "\nThe metrics of Naive Bayes classifier are :"),
        (2, "\nThe metrics of Support Vector Classifier are:"),
    )
    for code, heading in headings:
        if model == code:
            print(heading)

    # Each metric is computed and printed in turn, in the listed order.
    labelled_metrics = (
        ("\nAccuracy score is :", accuracy_score),
        ("\nPrecision score is :", precision_score),
        ("\nRecall score is :", recall_score),
        ("\nROC AUC score  is :", roc_auc_score),
    )
    for label, metric in labelled_metrics:
        print(label, metric(Y_actual, Y_predicted))

    print("\nConfusion Matrix : \n")
    print(confusion_matrix(Y_actual, Y_predicted))
    print("\n")

Fitting the Multinomial Naive Bayes model and SVM

In [None]:
# Train each classifier on the training split (fit returns the estimator itself).
bayes_classifier = naive_bayes.MultinomialNB().fit(X_train, Y_train)
svm_classifier = svm.SVC(C = 1000).fit(X_train, Y_train)

# Predict labels for the held-out test split with both models.
Y_predicted_bayes = bayes_classifier.predict(X_test)
Y_predicted_svm = svm_classifier.predict(X_test)

# Report the metrics: heading 1 is Naive Bayes, heading 2 is the SVM.
for predictions, model_code in ((Y_predicted_bayes, 1), (Y_predicted_svm, 2)):
    scores(Y_test, predictions, model_code)


The metrics of Naive Bayes classifier are :

Accuracy score is : 0.9841417910447762

Precision score is : 0.9411764705882353

Recall score is : 0.9343065693430657

ROC AUC score  is : 0.9628752098052227

Confusion Matrix : 

[[927   8]
 [  9 128]]



The metrics of Support Vector Classifier are:

Accuracy score is : 0.9850746268656716

Precision score is : 1.0

Recall score is : 0.8832116788321168

ROC AUC score  is : 0.9416058394160585

Confusion Matrix : 

[[935   0]
 [ 16 121]]




Function to check new messages and predict as Ham/Spam

In [None]:
def test(message):
    """Classify a single message with both trained models and print the verdicts.

    The message is lower-cased, stemmed and converted to a count vector with
    the already-fitted ``vectorizer``; each classifier's prediction is then
    reported as "Spam" when it equals 1 and as "Ham" otherwise.

    Parameters
    ----------
    message
        The raw message text to classify.

    Nothing is returned; the two verdicts are printed.
    """
    normalized = parsing.stem_text(message.lower())
    features = vectorizer.transform([normalized])

    # Obtain both verdicts before printing anything.
    bayes_result, svm_result = [
        "Spam" if classifier.predict(features) == 1 else "Ham"
        for classifier in (bayes_classifier, svm_classifier)
    ]

    print("According to Bayes Classifier the message is :", bayes_result)
    print("According to SVM Classifier the message is :", svm_result)


# Sample messages used to try out both classifiers.
message1 = "Avail exciting discounts. Click to know more."
message2 = "Claim your free rewards. Get upto $100 off."
# The backslashes are written as "\\" because "\R" and "\?" are not valid escape
# sequences; the resulting string still contains single literal backslashes.
message3 = "Have you seen the latest \\Red Barbie\\? It comes with all of K ****B****!"
message4 = "Find your special one on www.areyouunique.co.uk"
message5 = "Congrats! Special cinema pass for 10 is yours. call 09061209465 now! C Suprman V, Avngrs III, Wondrwmn 2, etc all 4 FREE! bx420-ip4-5we. 150pm. Dont miss out! "
message6 = "Safely straiten teeth 100% from home!. Exciting rewards as well. Call 1234567890 to knw mre. $$$$$ click $$$$"

samples = (message1, message2, message3, message4, message5, message6)
for position, sample in enumerate(samples):
    # Every heading except the first is preceded by a blank line.
    prefix = "" if position == 0 else "\n"
    print(prefix + "The message is : ", sample)
    test(sample)

The message is :  Avail exciting discounts. Click to know more.
According to Bayes Classifier the message is : Ham
According to SVM Classifier the message is : Ham

The message is :  Claim your free rewards. Get upto $100 off.
According to Bayes Classifier the message is : Spam
According to SVM Classifier the message is : Spam

The message is :  Have you seen the latest \Red Barbie\? It comes with all of K ****B****!
According to Bayes Classifier the message is : Ham
According to SVM Classifier the message is : Ham

The message is :  Find your special one on www.areyouunique.co.uk
According to Bayes Classifier the message is : Spam
According to SVM Classifier the message is : Spam

The message is :  Congrats! Special cinema pass for 10 is yours. call 09061209465 now! C Suprman V, Avngrs III, Wondrwmn 2, etc all 4 FREE! bx420-ip4-5we. 150pm. Dont miss out! 
According to Bayes Classifier the message is : Spam
According to SVM Classifier the message is : Spam

The message is :  Safely str