In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import feature_extraction, model_selection, naive_bayes, metrics, svm
from sklearn.model_selection import train_test_split
from IPython.display import Image
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
data=pd.read_csv("/home/ccoew/Downloads/SMSSpamCollection", delimiter='\t', names=['label','sms'])
data.head()

Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
f=feature_extraction.text.CountVectorizer(stop_words='english')
X=f.fit_transform(data['sms'])
np.shape(X)



(5572, 8444)

In [3]:
data['label']=data['label'].map({'spam':1,'ham':0})
X_train, X_test, y_train, y_test = model_selection.train_test_split(X,data['label'],test_size=.2, random_state=42)
print([np.shape(X_train), np.shape(X_test)])

[(4457, 8444), (1115, 8444)]


Naive Bayes

It is an supervised method that uses the probabilities of each attribute belonging to each class to make a prediction. 

Naive bayes simplifies the calculation of probabilities by assuming that the probability of each attribute belonging to a given class value is independent of all other attributes. This is a strong assumption but results in a fast and effective method. The probability of a class value given a value of an attribute is called the conditional probability. By multiplying the conditional probabilities together for each attribute for a given class value, we have a probability of a data instance belonging to that class.

To make a prediction we can calculate probabilities of the instance belonging to each class and select the class value with the highest probability.

In [4]:
from sklearn.metrics import accuracy_score



bayes = naive_bayes.MultinomialNB()
bayes.fit(X_train,y_train)
score_train=bayes.score(X_train,y_train)
score_test=bayes.score(X_test,y_test)
recall_test=metrics.recall_score(y_test,bayes.predict(X_test))
precision_test=metrics.precision_score(y_test, bayes.predict(X_test))

matrix= np.matrix(np.c_[score_train, score_test, recall_test, precision_test])
models = pd.DataFrame(data=matrix, columns=['Train accuracy','Test Accuracy', 'Test Recall', 'Test Precision'])
models.head()

Unnamed: 0,Train accuracy,Test Accuracy,Test Recall,Test Precision
0,0.993942,0.98296,0.959732,0.916667


In [5]:
m_confusion_test=metrics.confusion_matrix(y_test,bayes.predict(X_test))
pd.DataFrame(data=m_confusion_test, columns=['Predicted 0', 'Predicted 1'],index=['Actual 0', 'Actual 1'])

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,953,13
Actual 1,6,143


Support Vector Machine

In SVM, each data item is plotted as a point in n(feature count)-dimensional space with the value of each feature being the value of a particular coordinate. Then classification is performed by finding the hyper-plane that differentiate the two classes very well

In [6]:

    
svc=svm.SVC()    
svc.fit(X_train,y_train)
score_train=svc.score(X_train,y_train)
score_test=svc.score(X_test,y_test)
recall_test=metrics.recall_score(y_test,svc.predict(X_test))
precision_test=metrics.precision_score(y_test, svc.predict(X_test))

matrix= np.matrix(np.c_[score_train, score_test, recall_test, precision_test])
models = pd.DataFrame(data=matrix, columns=['Train accuracy','Test Accuracy', 'Test Recall', 'Test Precision'])
models.head()

Unnamed: 0,Train accuracy,Test Accuracy,Test Recall,Test Precision
0,0.865829,0.866368,0.0,0.0


In [7]:
m_confusion_test=metrics.confusion_matrix(y_test,bayes.predict(X_test))
pd.DataFrame(data=m_confusion_test, columns=['Predicted 0', 'Predicted 1'],index=['Actual 0', 'Actual 1'])

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,953,13
Actual 1,6,143


Logistic Regression

It measures the relationship between the variables. Relationship measure is calculated by estimating probabilities using logistic function. The probabilities are transformed into binary values in order to actually make a prediction. The logistic function used for this purpose is called Sigmoid function. Sigmoid function takes any real value input and maps it to 0 or 1. “-1 and 1” can also be the choice.

In [8]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='liblinear',penalty='l1')
lr.fit(X_train,y_train)
score_train=lr.score(X_train,y_train)
score_test=lr.score(X_test,y_test)
recall_test=metrics.recall_score(y_test,lr.predict(X_test))
precision_test=metrics.precision_score(y_test, lr.predict(X_test))

matrix= np.matrix(np.c_[score_train, score_test, recall_test, precision_test])
models = pd.DataFrame(data=matrix, columns=['Train accuracy','Test Accuracy', 'Test Recall', 'Test Precision'])
models.head()

Unnamed: 0,Train accuracy,Test Accuracy,Test Recall,Test Precision
0,0.990128,0.984753,0.892617,0.992537


In [9]:
m_confusion_test=metrics.confusion_matrix(y_test,bayes.predict(X_test))
pd.DataFrame(data=m_confusion_test, columns=['Predicted 0', 'Predicted 1'],index=['Actual 0', 'Actual 1'])

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,953,13
Actual 1,6,143
