## Comparing Classifiers

To make an informed decision about which classifier to use for a given problem, it helps to run a quick comparison of several classifiers on the dataset before building a full-fledged model. The dataset used in this example is the <a href='https://www.kaggle.com/uciml/sms-spam-collection-dataset'>SMS Spam Collection Dataset</a> from Kaggle.

In [1]:
# Scikit-Learn modules~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
from sklearn.naive_bayes import *
from sklearn.dummy import *
from sklearn.ensemble import *
from sklearn.neighbors import *
from sklearn.tree import *
from sklearn.svm import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.calibration import *
from sklearn.linear_model import *
from sklearn.multiclass import *
from sklearn.model_selection import *
from sklearn.svm import *

# Other Modules
from IPython.display import display
import pandas as pd
import numpy as np
import csv

In [2]:
# Read the raw SMS export, discard the three unnamed columns, and give the
# two remaining columns descriptive names.
data = (
    pd.read_csv('data/spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'label', 'v2': 'message'})
)
# Encode the string labels as integers: ham -> 0, spam -> 1.
data['label'] = data['label'].map({'spam': 1, 'ham': 0})

In [3]:
# Preview the first ten rows of `data`.
data.head(10)

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
5,1,FreeMsg Hey there darling it's been 3 week's n...
6,0,Even my brother is not like to speak with me. ...
7,0,As per your request 'Melle Melle (Oru Minnamin...
8,1,WINNER!! As a valued network customer you have...
9,1,Had your mobile 11 months or more? U R entitle...


In [4]:
# The whole frame is used as X so the label and message columns are split
# together; only the 'message' column is vectorized later on.
X = data
n_features = X.shape[1]

# Integer targets and the class name each integer indexes (0 -> ham, 1 -> spam).
target_names = np.array(['ham', 'spam'])
target = np.array(X['label'])
labels = target_names[target]
is_spam = labels == 'spam'

y = target
n_classes = target_names.shape[0]

# Hold out 33% of the rows; the fixed seed keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Integer targets and string labels for the training partition.
tr_target_names = np.array(['ham', 'spam'])
tr_target = np.array(X_train['label'])
tr_labels = tr_target_names[tr_target]

# Integer targets and string labels for the test partition.
te_target_names = np.array(['ham', 'spam'])
te_target = np.array(X_test['label'])
te_labels = te_target_names[te_target]

In [5]:
# Show the first ten training rows, the training targets, and the training-set size.
display(X_train.head(10), y_train, len(X_train))

Unnamed: 0,label,message
3235,0,Aight text me when you're back at mu and I'll ...
945,0,I cant wait to see you! How were the photos we...
5319,0,Kothi print out marandratha.
5528,0,Its just the effect of irritation. Just ignore it
247,0,Kallis wont bat in 2nd innings.
3986,1,Ringtone Club: Gr8 new polys direct to your mo...
360,0,Ha ha cool cool chikku chikku:-):-DB-)
5347,0,No sir. That's why i had an 8-hr trip on the b...
3781,0,How r Ì_ going to send it to me?
5188,0,Okie


array([0, 0, 0, ..., 0, 0, 0])

3733

In [6]:
# Show the first ten test rows, the test targets, and the test-set size.
display(X_test.head(10), y_test, len(X_test))

Unnamed: 0,label,message
3245,0,"Funny fact Nobody teaches volcanoes 2 erupt, t..."
944,0,I sent my scores to sophas and i had to do sec...
1044,1,We know someone who you know that fancies you....
2484,0,Only if you promise your getting out as SOON a...
812,1,Congratulations ur awarded either å£500 of CD ...
2973,0,"I'll text carlos and let you know, hang on"
2991,0,K.i did't see you.:)k:)where are you now?
2942,0,No message..no responce..what happend?
230,0,Get down in gandhipuram and walk to cross cut ...
1181,0,You flippin your shit yet?


array([0, 0, 1, ..., 0, 0, 1])

1839

In [8]:
# Performance comparison between available classifiers~~~~~~~~~~~~~~~~~~~~~~~~~
def performance(classifiers, vectorizers, train_data, test_data, train_label, test_label):
    """Score every classifier/vectorizer pairing on a held-out set.

    Each vectorizer is fitted on ``train_data`` only once; its train and test
    feature matrices are then reused for every classifier, instead of being
    recomputed for each pairing.

    Parameters
    ----------
    classifiers : sequence
        Estimator objects exposing ``fit(X, y)`` and ``score(X, y)``.
    vectorizers : sequence
        Objects exposing ``fit_transform(docs)`` and ``transform(docs)``.
    train_data, test_data
        Raw documents handed to the vectorizers.
    train_label, test_label
        Targets aligned with ``train_data`` and ``test_data``.

    Returns
    -------
    pandas.DataFrame
        One row per pairing with the columns ``classifier``, ``vectorizer``
        and ``score`` (the value returned by the classifier's ``score``),
        ordered classifier by classifier.
    """
    columns = ['classifier', 'vectorizer', 'score']
    # Position in `vectorizers` -> (train features, test features).
    # NOTE(review): reusing the matrices assumes the estimators do not modify
    # their input in place, which is the usual scikit-learn contract.
    features_by_vectorizer = {}
    records = []

    for clf in classifiers:
        clf_name = clf.__class__.__name__
        for position, vec in enumerate(vectorizers):
            vec_name = vec.__class__.__name__

            # Train Models
            print('Training {} with {}...'.format(clf_name, vec_name))
            if position not in features_by_vectorizer:
                features_by_vectorizer[position] = (
                    vec.fit_transform(train_data),
                    vec.transform(test_data),
                )
            train_features, test_features = features_by_vectorizer[position]
            clf.fit(train_features, train_label)

            # Score Models
            records.append([clf_name, vec_name, clf.score(test_features, test_label)])

    # Build the frame once rather than growing it row by row.
    return pd.DataFrame(records, columns=columns)

In [9]:
print('Measuring Model Performances...')
print('~' * 80)

# Every classifier below is trained and scored once per vectorizer.
candidate_classifiers = [
    BernoulliNB(),
    RandomForestClassifier(n_estimators=100, n_jobs=-1),
    AdaBoostClassifier(),
    BaggingClassifier(),
    ExtraTreesClassifier(),
    GradientBoostingClassifier(),
    DecisionTreeClassifier(),
    CalibratedClassifierCV(),
    DummyClassifier(),
    PassiveAggressiveClassifier(),
    RidgeClassifier(),
    RidgeClassifierCV(),
    SGDClassifier(),
    SVC(kernel='rbf', C=10000),
    OneVsRestClassifier(SVC(kernel='linear')),
    OneVsRestClassifier(LogisticRegression()),
    KNeighborsClassifier(),
]

# Text-to-feature transformers applied to the raw message column.
candidate_vectorizers = [
    CountVectorizer(),
    TfidfVectorizer(),
    HashingVectorizer(),
]

scores = performance(
    classifiers=candidate_classifiers,
    vectorizers=candidate_vectorizers,
    train_data=X_train['message'],
    test_data=X_test['message'],
    train_label=tr_target,
    test_label=te_target,
)

Measuring Model Performances...
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Training BernoulliNB with CountVectorizer...
Training BernoulliNB with TfidfVectorizer...
Training BernoulliNB with HashingVectorizer...
Training RandomForestClassifier with CountVectorizer...
Training RandomForestClassifier with TfidfVectorizer...
Training RandomForestClassifier with HashingVectorizer...
Training AdaBoostClassifier with CountVectorizer...
Training AdaBoostClassifier with TfidfVectorizer...
Training AdaBoostClassifier with HashingVectorizer...
Training BaggingClassifier with CountVectorizer...
Training BaggingClassifier with TfidfVectorizer...
Training BaggingClassifier with HashingVectorizer...
Training ExtraTreesClassifier with CountVectorizer...
Training ExtraTreesClassifier with TfidfVectorizer...
Training ExtraTreesClassifier with HashingVectorizer...
Training GradientBoostingClassifier with CountVectorizer...
Training GradientBoostingClassifier with Tf



In [10]:
# Rank every classifier/vectorizer pairing from highest to lowest score.
scores.sort_values(by=['score'], ascending=False)

Unnamed: 0,classifier,vectorizer,score
21,CalibratedClassifierCV,CountVectorizer,0.984231
22,CalibratedClassifierCV,TfidfVectorizer,0.983687
34,RidgeClassifierCV,TfidfVectorizer,0.983143
43,OneVsRestClassifier,TfidfVectorizer,0.983143
37,SGDClassifier,TfidfVectorizer,0.982599
23,CalibratedClassifierCV,HashingVectorizer,0.982055
12,ExtraTreesClassifier,CountVectorizer,0.981512
42,OneVsRestClassifier,CountVectorizer,0.980968
30,RidgeClassifier,CountVectorizer,0.980968
33,RidgeClassifierCV,CountVectorizer,0.980968


## Building a Classification Model

After comparing the mean accuracy of every classifier and vectorizer combination, we can pick the algorithm to use for the final classification model.

In [11]:
# The pairing that tops the comparison table above.
clf = CalibratedClassifierCV()
vec = CountVectorizer()

In [12]:
# Train Model: fit the vectorizer on the training messages and fit the
# classifier on the resulting features against the string labels.
clf.fit(vec.fit_transform(X_train['message']), tr_labels)

# Score Model: transform the held-out messages with the already fitted
# vectorizer and report the classifier's score on them.
vectorize = vec.transform(X_test['message'])
score = clf.score(vectorize, te_labels)
print(score)

0.9842305600870038


In [13]:
# Save Predictions to CSV~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Confusion-matrix cell for each (predicted label, actual label) pair.
# Predictions are the strings the classifier was fitted on; actual labels
# are the integer encoding of the 'label' column (1 = spam, 0 = ham).
outcome_by_pair = {
    ('spam', 1): 'TP',
    ('ham', 0): 'TN',
    ('spam', 0): 'FP',
    ('ham', 1): 'FN',
}

# Columns are addressed by name: integer keys on a label-indexed row
# (row[0], row[1]) depend on a deprecated positional fallback in pandas.
test_messages = X_test['message']
test_answers = X_test['label']

# Vectorize and predict the whole test set in one call instead of per row.
test_predictions = clf.predict(vec.transform(test_messages))

# An unexpected (prediction, answer) pair raises KeyError here instead of
# silently reusing the outcome computed for the previous row.
results = [
    [row_number, txt, ans, pred, outcome_by_pair[(str(pred), int(ans))]]
    for row_number, (txt, ans, pred)
    in enumerate(zip(test_messages, test_answers, test_predictions))
]

# Write results to CSV
with open('data/test_results.csv', 'w', newline='', encoding="utf-8") as csvfile:
    spam_writer = csv.writer(csvfile, delimiter=',',
                             quotechar='"', quoting=csv.QUOTE_MINIMAL)

    # Header Row
    spam_writer.writerow(['#', 'text', 'actual', 'prediction', 'result'])
    spam_writer.writerows(results)

## Building the API

Now that we have a satisfactory working model, we can create an API that exposes it. For this task, we will use the `flask` package, a microframework for Python.

In [None]:
from sklearn.svm import *
from sklearn.multiclass import *
from sklearn.model_selection import *
from sklearn.feature_extraction.text import TfidfVectorizer
from flask import Flask, render_template, request, redirect, url_for, jsonify
import os
import csv
import pandas as pd
import numpy as np

# Imported explicitly: the code below uses both names, and the import lines
# of this cell do not otherwise provide them.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

app = Flask(__name__)

# Load data: drop the three unnamed columns, rename the remaining two, and
# encode the labels as integers (ham -> 0, spam -> 1).
data = (
    pd.read_csv('data/spam.csv', encoding='latin-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
    .rename(columns={'v1': 'label', 'v2': 'message'})
)
data['label'] = data['label'].map({'spam': 1, 'ham': 0})

# Split data into training/testing sets
X = data
n_features = X.shape[1]
target = np.array(X['label'])
target_names = np.array(['ham', 'spam'])
labels = target_names[target]
is_spam = (labels == 'spam')
y = target
n_classes = target_names.shape[0]

train_data, test_data, train_label, test_label = train_test_split(
    X, y, test_size=0.25, random_state=77
)

# String labels ('ham' / 'spam') for the training rows, so the served model
# predicts readable class names.
tr_target = np.array(train_data['label'])
tr_target_names = np.array(['ham', 'spam'])
tr_labels = tr_target_names[tr_target]

# Train the Model
# clf and vec are module-level names; the request handler reads them.
# NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and later
# releases dropped the 'log' spelling -- update it when upgrading.
clf = SGDClassifier(loss='log')
vec = CountVectorizer()
vector = vec.fit_transform(train_data['message'])
clf.fit(vector, tr_labels)

# Get prediction API
@app.route('/', methods=['GET'])
def index():
    """Classify the ``message`` query parameter and return the result as JSON.

    The response always carries the keys ``message``, ``predict_proba``,
    ``predict`` and ``error``. Fields that were not computed (no message
    supplied, or a failure before they were produced) remain empty strings.
    When classification raises, ``error`` holds the exception type name
    followed by its text.
    """
    message = request.args.get('message', '')
    error = ''
    prob = ''
    pred = ''

    # clf and vec are module-level objects that are only read here, so no
    # ``global`` declaration is required.
    try:
        if message:
            vec_msg = vec.transform([message])
            pred = clf.predict(vec_msg).tolist()
            prob = clf.predict_proba(vec_msg).tolist()
    except Exception as inst:
        # Catch Exception rather than BaseException so KeyboardInterrupt and
        # SystemExit still propagate and can stop the server.
        error = str(type(inst).__name__) + ' ' + str(inst)
    return jsonify(message=message,
                   predict_proba=prob,
                   predict=pred,
                   error=error)

if __name__ == '__main__':
    # Import only the one name that is used instead of star-importing the
    # whole werkzeug.serving namespace.
    from werkzeug.serving import run_simple

    # The PORT environment variable overrides the default port 8111.
    port = int(os.environ.get('PORT', 8111))
    run_simple('localhost', port, app)

 * Running on http://localhost:8111/ (Press CTRL+C to quit)
127.0.0.1 - - [12/Aug/2020 13:50:20] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/Aug/2020 13:50:21] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -
127.0.0.1 - - [12/Aug/2020 13:51:40] "[37mGET /?message=%27Hey,%20how%27s%20it%20going?%27 HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/Aug/2020 13:53:17] "[37mGET /?message=Text%20me%20for%20some%20dolla%20dolla%20billz%20bb%20$$$ HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/Aug/2020 13:53:40] "[37mGET /?message=Win%20some%20money!%20Cash%20cash%20cash!%20$$$ HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/Aug/2020 13:55:11] "[37mGET /?message=Wat%20u%20want?%20Cash%20money!%20Text%20to%20win HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/Aug/2020 13:55:24] "[37mGET /?message=Moneeeeeeey HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/Aug/2020 13:55:49] "[37mGET /?message=Text%20for%20cash%20-%20$1000%20now%20-%20Win%20win%20winner HTTP/1.1[0m" 200 -


{
     "error":"",
     "message":"Text me for a fun time sexy",
     "predict":"spam",
     "predict_proba":[0.4869716854827332,
                       0.5130283145172668]
}

{
    "error":"",
    "message":"Where's my money? You owe me $150",
    "predict":"ham",
    "predict_proba":[0.999834421228003,
                     0.00016557877199700202]
}