# Testing Classifiers using advanced NLP techniques

## Import Dependencies

In [163]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
import operator
% matplotlib inline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from collections import defaultdict
from sklearn.naive_bayes import GaussianNB, MultinomialNB

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

## Import data/models

In [173]:
with open('../data/processed/tokenized_test_b.pickle', 'rb') as file:
    tokenized_test = pickle.load(file)
with open('../data/processed/y_test_b.pickle', 'rb') as file:
    test_y = pickle.load(file)
    
with open('../data/processed/tokenized_train_a.pickle', 'rb') as file:
    tokenized_train = pickle.load(file)
with open('../data/processed/y_train_a.pickle', 'rb') as file:
    train_y = pickle.load(file)


with open('../data/processed/w2v_dict.pickle', 'rb') as file:
    w2v = pickle.load(file)

## Define Functions

In [106]:
# Generate a confusion matrix plot
def plot_confusion_matrix(cm, title='Confusion Matrix', cmap=plt.cm.Greys):
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [102]:
def find_auc(y_test, y_score):
    
    fpr, tpr, _ = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(6, 6))

    plt.plot([0,1],[0,1], linestyle=":", color='grey', linewidth=4) # this is our baseline
    plt.plot(fpr, tpr, color='purple', alpha=.7, linewidth=3, label="AUC="+str(round(roc_auc, 3))) # this is our ROC curve
    
    plt.xlabel('FPR', fontsize=20)
    plt.ylabel('TPR', fontsize=20)
    plt.legend(loc=0)

    plt.show()

    print('AUC: ', roc_auc)

In [160]:
def find_best_classifier(features, target, seed, k_folds, crossval_scoring, models_list):

    """
    Test multiple classifiers using cross validation.
    Evaluate performance to find model with highest score as defined by the 'crossval_scoring' argument 
    ('roc_auc', 'f1', log_loss', precision', 'recall', etc)
    """

    # Test options and evaluation metric
    scoring=crossval_scoring

    # Spot Check Algorithms
    models = models_list

    # Evaluate each model in turn
    results = []
    names = []

    for name, model in models:
        kfold = StratifiedKFold(n_splits=k_folds, random_state=seed)
        cv_results = cross_val_score(model, features, target, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %s (%f), std (%f)" % (name, scoring, cv_results.mean(), cv_results.std())
        print(msg)

    zipped_eval = zip(models, [i.mean() for i in results])
    model_eval = sorted(zipped_eval, key=operator.itemgetter(1))
    
    best_clf = model_eval[-1][0][1]
    clf_name = model_eval[-1][0][0]
    print("\n Model with best {} is {}".format(scoring, clf_name))
    print('\n', best_clf)
        
    return best_clf

In [89]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.values())

    def fit(self, X):
        return self

    def transform(self, X):
        return [
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ]


class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(word2vec.items())

    def fit(self, X):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return [
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ]

In [136]:
def generate_features(vectorizer, list_tokens, set_dim=None):
    """
    Utility function to generate word vectors for tweets
    """
    vectorizer_ = vectorizer

    if set_dim:
        # Convert list of embeddings to dataframe and reshape relevant dimensions into multi-dimensional array
        embeddings = vectorizer_.fit(list_tokens).transform(list_tokens)
        df_tokens = pd.DataFrame(embeddings)
        relevant = df.iloc[:, 0:set_dim]
        features = np.array(relevant)
        
    else:
        features = vectorizer_.fit(list_tokens).transform(list_tokens).toarray()
    
    return features
    

# Test Models

## Test models with Count Vectorizer

In [None]:
seed = 42

In [None]:
features = generate_features(CountVectorizer(analyzer=lambda x: x), tokenized_test)

models = [('NaiveBayesGaussian', GaussianNB()),
          ('NaiveBayesBernoulli', BernoulliNB()),
          ('NaiveBayesMultinomial', MultinomialNB()),
          ('RandomForest', RandomForestClassifier(random_state=seed)),
          ('ExtraTrees', ExtraTreesClassifier(random_state=seed))]

In [None]:
best_clf = find_best_classifier(features=features, 
                                target=test_y, 
                                seed=42, 
                                k_folds=5, 
                                crossval_scoring='roc_auc', 
                                models_list = models)

## Test models with Tf-IDF Count Vectorizer

In [170]:
features = generate_features(TfidfVectorizer(analyzer=lambda x: x), tokenized_test)

models = [('NaiveBayesGaussian', GaussianNB()),
          ('NaiveBayesBernoulli', BernoulliNB()),
          ('NaiveBayesMultinomial', MultinomialNB()),
          ('RandomForest', RandomForestClassifier(random_state=seed)),
          ('ExtraTrees', ExtraTreesClassifier(random_state=seed))]

In [171]:
best_clf = find_best_classifier(features=features, 
                                target=y, 
                                seed=42, 
                                k_folds=5, 
                                crossval_scoring='roc_auc',
                                models_list = models)

NaiveBayesGaussian: roc_auc (0.661248), std (0.012899)
NaiveBayesBernoulli: roc_auc (0.826842), std (0.020672)
NaiveBayesMultinomial: roc_auc (0.827583), std (0.020732)
RandomForest: roc_auc (0.767739), std (0.019510)
ExtraTrees: roc_auc (0.783671), std (0.020281)

 Model with best roc_auc is NaiveBayesMultinomial

 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


## Test models with W2V Mean Embedding Vectorizer

In [164]:
features = generate_features(MeanEmbeddingVectorizer(w2v), tokenized_test, set_dim=100)

models = [('RandomForest', RandomForestClassifier(random_state=seed)),
          ('ExtraTrees', ExtraTreesClassifier(random_state=seed))]

In [165]:
best_clf = find_best_classifier(features=features, 
                                target=y, 
                                seed=42, 
                                k_folds=5, 
                                crossval_scoring='roc_auc',
                                models_list = models)

RandomForest: roc_auc (0.669129), std (0.032995)
ExtraTrees: roc_auc (0.666367), std (0.029718)

 Model with best roc_auc is RandomForest

 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)


## Test models with W2V Tf-IDF Embedding Vectorizer

In [166]:
features = generate_features(TfidfEmbeddingVectorizer(w2v), tokenized_test, set_dim=100)

models = [('RandomForest', RandomForestClassifier(random_state=seed)),
          ('ExtraTrees', ExtraTreesClassifier(random_state=seed))]

In [167]:
best_clf = find_best_classifier(features=features, 
                                target=y, 
                                seed=42, 
                                k_folds=5, 
                                crossval_scoring='roc_auc',
                                models_list = models)

RandomForest: roc_auc (0.669129), std (0.032995)
ExtraTrees: roc_auc (0.666367), std (0.029718)

 Model with best roc_auc is RandomForest

 RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)
