## Text Classification with NLTK and Scikit-Learn
** 1.IMPORTS AND LOAD DATA **

In [1]:
# imports
import sys
import numpy as np
import pandas as pd
import sklearn
import nltk

print('python: {}'.format(sys.version))
print('numpy: {}'.format(np.__version__))
print('pandas: {}'.format(pd.__version__))
print('sklearn: {}'.format(sklearn.__version__))
print('nltk: {}'.format(nltk.__version__))

python: 3.6.1 (v3.6.1:69c0db5, Mar 21 2017, 18:41:36) [MSC v.1900 64 bit (AMD64)]
numpy: 1.12.1
pandas: 0.22.0
sklearn: 0.19.1
nltk: 3.4


In [2]:
# load the data
df = pd.read_table('SMSSpamCollection', encoding='utf-8', header=None)

In [3]:
# print data details
print(df.info())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
0    5572 non-null object
1    5572 non-null object
dtypes: object(2)
memory usage: 87.1+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
# check data distribution
classes = df[0]
print(classes.value_counts())

ham     4825
spam     747
Name: 0, dtype: int64


** 2.PRE-PROCESS THE DATA **

In [5]:
# convert labels to binary
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
Y = encoder.fit_transform(classes)

print(classes[:5])
print(Y[:5])

0     ham
1     ham
2    spam
3     ham
4     ham
Name: 0, dtype: object
[0 0 1 0 0]


In [6]:
# store text messages
texts = df[1]
print(texts[:5])

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: 1, dtype: object


In [7]:
# regex to replace email-ids, urls, symbols, phone numbers, and digits

processed_texts = texts.str.replace(r'^\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,3}$', 'emailaddr')
processed_texts = processed_texts.str.replace(r'^((http:\/\/www\.)|(www\.)|(http:\/\/))[a-zA-Z0-9._-]+\.[a-zA-Z.]{2,5}$',
                                              'webaddress')
processed_texts = processed_texts.str.replace(r'£|\$', 'moneysymb')
processed_texts = processed_texts.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$', 'phonenumbr')
processed_texts = processed_texts.str.replace(r'\d+(\.\d+)?', 'numbr')

In [8]:
# remove punctuations and continuous whitespaces
processed_texts = processed_texts.str.replace(r'[^\w\s\d]', ' ')
processed_texts = processed_texts.str.replace(r'\s+', ' ')
processed_texts = processed_texts.str.replace(r'^\s+|\s+?$', '')

In [9]:
# convert all texts to lower case
processed_texts = processed_texts.str.lower()

In [10]:
print(processed_texts[:5])

0    go until jurong point crazy available only in ...
1                              ok lar joking wif u oni
2    free entry in numbr a wkly comp to win fa cup ...
3          u dun say so early hor u c already then say
4    nah i don t think he goes to usf he lives arou...
Name: 1, dtype: object


In [11]:
# remove stopwords
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))
processed_texts = processed_texts.apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

In [12]:
# do stemming using porter stemmer
stemmer = nltk.PorterStemmer()

processed_texts = processed_texts.apply(lambda x: ' '.join(stemmer.stem(word) for word in x.split()))

In [13]:
print(processed_texts[:5])

0    go jurong point crazi avail bugi n great world...
1                                ok lar joke wif u oni
2    free entri numbr wkli comp win fa cup final tk...
3                  u dun say earli hor u c alreadi say
4                 nah think goe usf live around though
Name: 1, dtype: object


In [14]:
# create bag of words
from nltk.tokenize import word_tokenize

all_words = []
for text in processed_texts:
    words = word_tokenize(text)
    for word in words:
        all_words.append(word)
        
all_words = nltk.FreqDist(all_words)

In [15]:
# print number of all words and most common 15 words
print('Vocab size: {}'.format(len(all_words)))
print('most common 15 words: {}'.format(all_words.most_common(15)))

Vocab size: 6584
most common 15 words: [('numbr', 2648), ('u', 1207), ('call', 674), ('go', 456), ('get', 452), ('ur', 391), ('gt', 318), ('lt', 316), ('come', 304), ('moneysymbnumbr', 303), ('ok', 293), ('free', 284), ('day', 276), ('know', 275), ('love', 266)]


In [16]:
# we will take most common 2000 words as features
word_features = [word[0] for word in all_words.most_common(2000)]

In [17]:
# function to convert texts to feature representation
def find_feature(message):
    words = word_tokenize(message)
    features = {}
    for word in word_features:
        features[word] = (word in words)
    return features

In [18]:
# make data ready to feed to the model
messages = list(zip(processed_texts, Y))

seed = 1
np.random.seed = seed
np.random.shuffle(messages)

feature_sets = [(find_feature(text), label) for (text, label) in messages]

In [19]:
# split data into train and test set
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(feature_sets, test_size=0.25, random_state=seed)

print('Train len: {}'.format(len(train_data)))
print('Test len: {}'.format(len(test_data)))

Train len: 4179
Test len: 1393


** 3.DEPLOY SKLEARN CLASSIFIERS **

In [20]:
# import models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [21]:
# define models to train
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression',
         'SGD Classifier', 'Naive Bayes', 'SVM Linear']
classifier = [KNeighborsClassifier(), DecisionTreeClassifier(), RandomForestClassifier(),
              LogisticRegression(), SGDClassifier(max_iter=100), MultinomialNB(), SVC(kernel='linear')]

models = list(zip(names, classifier))
for model in models:
    print(model)

('K Nearest Neighbors', KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))
('Decision Tree', DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'))
('Random Forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))
('

In [22]:
# wrap models in NLTK
from nltk.classify.scikitlearn import SklearnClassifier

for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(train_data)
    accuracy = nltk.classify.accuracy(nltk_model, test_data) * 100
    print('{}: Accuracy = {}'.format(name, accuracy))

K Nearest Neighbors: Accuracy = 94.25699928212491
Decision Tree: Accuracy = 97.4156496769562
Random Forest: Accuracy = 97.77458722182341
Logistic Regression: Accuracy = 98.7078248384781
SGD Classifier: Accuracy = 98.42067480258436
Naive Bayes: Accuracy = 98.49246231155779
SVM Linear: Accuracy = 98.77961234745155


In [23]:
# ensemble method - voting classifier
from sklearn.ensemble import VotingClassifier

names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'Logistic Regression',
         'SGD Classifier', 'Naive Bayes', 'SVM Linear']
classifier = [KNeighborsClassifier(), DecisionTreeClassifier(), RandomForestClassifier(),
              LogisticRegression(), SGDClassifier(max_iter=100), MultinomialNB(), SVC(kernel='linear')]

models = list(zip(names, classifier))

nltk_ensemble = SklearnClassifier(VotingClassifier(estimators=models, voting='hard', n_jobs=-1))
nltk_ensemble.train(train_data)
accuracy = nltk.classify.accuracy(nltk_ensemble, test_data) * 100
print('Ensemble method: Accuracy = {}'.format(accuracy))

Ensemble method: Accuracy = 98.77961234745155


In [24]:
# make predictions separately
test_texts, test_labels = zip(*test_data)

predictions = nltk_ensemble.classify_many(test_texts)

In [25]:
# print confusion matrix and classification report
print(classification_report(test_labels, predictions))

pd.DataFrame(confusion_matrix(test_labels, predictions), 
             index=[['actual', 'actual'], ['ham', 'spam']],
             columns=[['predicted', 'predicted'], ['ham', 'spam']])

             precision    recall  f1-score   support

          0       0.99      1.00      0.99      1209
          1       1.00      0.91      0.95       184

avg / total       0.99      0.99      0.99      1393



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,ham,spam
actual,ham,1209,0
actual,spam,17,167
