In [1]:
%matplotlib inline
import csv
import pickle

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas
import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.model_selection import learning_curve
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from textblob import TextBlob

Loading the data 

In [3]:
# The corpus is a tab-separated file; column names are supplied explicitly.
# QUOTE_NONE makes the parser treat quote characters as ordinary text.
sms_path = 'C:\\Users\\rajas\\OneDrive\\Documents\\smsspamcollection\\SMSSpamCollection'
messages = pandas.read_csv(
    sms_path,
    sep='\t',
    names=["label", "message"],
    quoting=csv.QUOTE_NONE,
)
# Per-label summary (count / unique / most frequent message) of the loaded data.
messages.groupby('label').describe()

Unnamed: 0_level_0,message,message,message,message
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4827,4518,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


Data preprocessing

In [4]:
def split_into_tokens(message):
    """Split one message into word tokens.

    Args:
        message: The text of a single SMS message.

    Returns:
        The `words` collection TextBlob produces for `message`.
    """
    blob = TextBlob(message)
    return blob.words

In [5]:
# Fetch NLTK's 'punkt' data package before any tokenization is run
# (presumably required by TextBlob's tokenizer -- confirm). When the package is
# already installed the call only reports that it is up to date.
# `nltk` itself is imported in the notebook's top import cell.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rajas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Sanity check: tokenize the first five messages.
messages['message'].head().apply(split_into_tokens)

0    [Go, until, jurong, point, crazy, Available, o...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, U, c, already, t...
4    [Nah, I, do, n't, think, he, goes, to, usf, he...
Name: message, dtype: object

In [7]:
# Part-of-speech demo: `.tags` yields a list of (word, POS) pairs.
demo_blob = TextBlob("Hello world, how is it going?")
demo_blob.tags

[('Hello', 'NNP'),
 ('world', 'NN'),
 ('how', 'WRB'),
 ('is', 'VBZ'),
 ('it', 'PRP'),
 ('going', 'VBG')]

In [8]:
def split_into_lemmas(message):
    """Reduce one message to a list of word lemmas ("base forms").

    The text is not case-folded before lemmatizing, so capitalised tokens keep
    their capitalisation (e.g. "Go" stays "Go" while "goes" becomes "go").

    Args:
        message: The text of a single SMS message.

    Returns:
        A list with the lemma of every word TextBlob finds in `message`,
        in order of appearance.
    """
    lemmas = []
    for token in TextBlob(message).words:
        lemmas.append(token.lemma)
    return lemmas

# Sanity check: lemmatize the first five messages.
messages['message'].head().apply(split_into_lemmas)

0    [Go, until, jurong, point, crazy, Available, o...
1                       [Ok, lar, Joking, wif, u, oni]
2    [Free, entry, in, 2, a, wkly, comp, to, win, F...
3    [U, dun, say, so, early, hor, U, c, already, t...
4    [Nah, I, do, n't, think, he, go, to, usf, he, ...
Name: message, dtype: object

In [9]:
# Learn the bag-of-words vocabulary from every message in the corpus, using
# split_into_lemmas as the analyzer so each distinct lemma becomes one feature.
bow_transformer = CountVectorizer(analyzer=split_into_lemmas)
bow_transformer.fit(messages['message'])
# Number of distinct lemmas in the learned vocabulary.
print(len(bow_transformer.vocabulary_))

10897


In [13]:
# Pick the row labelled 3 (the fourth message under the default 0-based index)
# as a worked example.
message4 = messages.loc[3, 'message']

In [14]:
# Vectorize the single example message; transform() expects an iterable of
# documents, hence the one-element list.
bow4 = bow_transformer.transform([message4])
# Show the non-zero counts, then the (1, vocabulary size) shape.
print(bow4, bow4.shape, sep='\n')

  (0, 4166)	2
  (0, 4735)	1
  (0, 5335)	1
  (0, 6168)	1
  (0, 6192)	1
  (0, 7077)	1
  (0, 9182)	2
  (0, 9489)	1
  (0, 9955)	1
(1, 10897)


In [15]:
# Vectorize the whole corpus into a sparse count matrix (one row per message).
messages_bow = bow_transformer.transform(messages['message'])
n_rows, n_cols = messages_bow.shape
print('sparse matrix shape:', messages_bow.shape)
print('number of non-zeros:', messages_bow.nnz)
# NOTE(review): the figure printed under the label "sparsity" is the share of
# non-zero cells (the fill ratio), not the share of zero cells.
print('sparsity: %.2f%%' % (100.0 * messages_bow.nnz / (n_rows * n_cols)))

sparse matrix shape: (5574, 10897)
number of non-zeros: 81829
sparsity: 0.13%


In [16]:
# Fit the TF-IDF weighting on the corpus-wide count matrix ...
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(messages_bow)
# ... then re-weight the single example vector built earlier to inspect the result.
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

  (0, 9955)	0.22359242444808272
  (0, 9489)	0.1950971983473778
  (0, 9182)	0.4964973074995105
  (0, 7077)	0.4273835735587625
  (0, 6192)	0.3103377278723363
  (0, 6168)	0.2916597444628191
  (0, 5335)	0.28637921730908206
  (0, 4735)	0.25861924136190984
  (0, 4166)	0.39051931531495004


In [17]:
# Apply the fitted TF-IDF weighting to the count matrix of the whole corpus.
messages_tfidf = tfidf_transformer.transform(messages_bow)
# The transform only re-weights entries, so the shape matches messages_bow.
print(messages_tfidf.shape)

(5574, 10897)


Splitting the data 


In [24]:
# Keep 70% of the corpus for training and hold out the rest for validation + test.
# A fixed random_state makes the partition repeatable across kernel restarts, and
# stratifying on the label keeps the ham/spam proportions approximately equal in
# both parts (spam is only 747 of the 5574 messages).
msg_train, msg_rem, label_train, label_rem = train_test_split(
    messages['message'],
    messages['label'],
    train_size=0.7,
    random_state=42,
    stratify=messages['label'],
)

# Sizes of the two parts and their total (should equal the corpus size).
print(len(msg_train), len(msg_rem), len(msg_train) + len(msg_rem))

3901 1673 5574


In [25]:
# Halve the held-out remainder into validation and test sets. Seeded and
# stratified on the label so the split is repeatable and both halves keep a
# similar share of spam.
msg_valid, msg_test, label_valid, label_test = train_test_split(
    msg_rem,
    label_rem,
    test_size=0.5,
    random_state=42,
    stratify=label_rem,
)

In [29]:
# Report the size of each split, one per line: train, validation, test.
print(len(msg_train), len(msg_valid), len(msg_test), sep='\n')

3901
836
837


Saving the train, validation, and test sets to CSV files

In [31]:
# Write each message split to its own CSV file.
# NOTE(review): only the message Series are written here; the matching
# label_train / label_valid / label_test are not saved by this cell -- confirm
# they are persisted elsewhere or can be re-joined via the row index.
for split_series, csv_path in (
    (msg_train, 'C:\\Users\\rajas\\OneDrive\\Documents\\smsspamcollection\\train.csv'),
    (msg_valid, 'C:\\Users\\rajas\\OneDrive\\Documents\\smsspamcollection\\validation.csv'),
    (msg_test, 'C:\\Users\\rajas\\OneDrive\\Documents\\smsspamcollection\\test.csv'),
):
    split_series.to_csv(csv_path)