In [22]:
%matplotlib inline
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas
import sklearn
#import cPickle
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.learning_curve import learning_curve

In [23]:
messages = pandas.read_csv('spam_data/SMSSpamCollection', sep='\t', quoting=csv.QUOTE_NONE, names=["label", "message"])

In [24]:
def split_into_tokens(message):
    words = TextBlob(message).words
    return [word.lemma for word in words]

train_data, test_data, train_labels, test_labels = train_test_split(messages['message'], messages['label'], test_size=0.2, random_state=26)

In [25]:
vectorizer = TfidfVectorizer(analyzer=split_into_tokens, stop_words='english')
messages_tfidf = vectorizer.fit_transform(train_data)
messages_tfidf_test = vectorizer.transform(test_data)

In [27]:
# use the one above. ignore this


# compare TfidfVectorizer also instead of count vectorizer
bow_transformer = CountVectorizer(analyzer=split_into_tokens).fit(train_data)

messages_bow = bow_transformer.transform(train_data)
print ('sparse matrix shape:', messages_bow.shape)
print ('number of non-zeros:', messages_bow.nnz)
print ('sparsity: %.2f%%' % (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1])))

tfidf_transformer = TfidfTransformer().fit(messages_bow)
messages_tfidf = tfidf_transformer.transform(messages_bow)

#bow_transformer1 = CountVectorizer(analyzer=split_into_tokens).fit(test_data)

messages_bow1 = bow_transformer.transform(test_data)
print ('sparse matrix shape:', messages_bow.shape)
print ('number of non-zeros:', messages_bow.nnz)
print ('sparsity: %.2f%%' % (100.0 * messages_bow.nnz / (messages_bow.shape[0] * messages_bow.shape[1])))

#tfidf_transformer1 = TfidfTransformer().fit(messages_bow1)
messages_tfidf_test = tfidf_transformer.transform(messages_bow1)

sparse matrix shape: (4459, 9675)
number of non-zeros: 65494
sparsity: 0.15%
sparse matrix shape: (4459, 9675)
number of non-zeros: 65494
sparsity: 0.15%


In [18]:
modelSVM = LinearSVC()
modelBaye = MultinomialNB()

In [19]:
modelSVM.fit(messages_tfidf,train_labels)
modelBaye.fit(messages_tfidf,train_labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [16]:
# Test the unseen mails for Spam

resultSVM = modelSVM.predict(messages_tfidf_test)
resultBaye = modelBaye.predict(messages_tfidf_test)

print(confusion_matrix(test_labels,resultSVM))
print(confusion_matrix(test_labels,resultBaye))

[[969   1]
 [ 17 128]]
[[970   0]
 [ 36 109]]


In [32]:
print(classification_report(test_labels, resultSVM))

             precision    recall  f1-score   support

        ham       0.98      1.00      0.99       970
       spam       0.98      0.90      0.94       145

avg / total       0.98      0.98      0.98      1115



In [33]:
print(classification_report(test_labels, resultBaye))

             precision    recall  f1-score   support

        ham       0.95      1.00      0.97       970
       spam       1.00      0.62      0.77       145

avg / total       0.95      0.95      0.95      1115



In [34]:
accuracy_score(test_labels, resultSVM)

0.98475336322869955

In [35]:
accuracy_score(test_labels, resultBaye)

0.95067264573991028

In [20]:
# Test the unseen mails for Spam

resultSVM = modelSVM.predict(messages_tfidf_test)
resultBaye = modelBaye.predict(messages_tfidf_test)

print(confusion_matrix(test_labels,resultSVM))
print(confusion_matrix(test_labels,resultBaye))

[[968   2]
 [ 15 130]]
[[970   0]
 [ 55  90]]


In [30]:
# training a Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=2, n_iter=7, random_state=26)
svd.fit(messages_tfidf)
final_train_data = svd.transform(messages_tfidf)
final_test_data = svd.transform(messages_tfidf_test)


gnb = GaussianNB().fit(final_train_data,train_labels)
gnb_predictions = gnb.predict(final_test_data)
print(confusion_matrix(test_labels,gnb_predictions))
print(accuracy_score(test_labels, gnb_predictions))

[[970   0]
 [145   0]]
0.869955156951


In [31]:
from sklearn.tree import DecisionTreeClassifier
modelDecision = DecisionTreeClassifier()
modelDecision.fit(messages_tfidf,train_labels)
resultDecision = modelDecision.predict(messages_tfidf_test)
print(confusion_matrix(test_labels,resultDecision))
print(accuracy_score(test_labels, resultDecision))

[[954  16]
 [ 21 124]]
0.966816143498
