# cat-AI-log. An AI-based product group allocation system

Capstone project.

Sebastian Thomas @ neue fische Bootcamp Data Science<br />
(datascience at sebastianthomas dot de)

# Part 4: Predictive analysis

We vectorize our text data and fit some predictive models.

## Imports

### Modules, classes and functions

In [None]:
# import of modules
from importlib import import_module

# python object persistence
import joblib

# regular expressions
import re

# data
import numpy as np
import pandas as pd

# plots
import matplotlib.pyplot as plt
import seaborn as sns

# machine learning
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.preprocessing import LabelEncoder, StandardScaler, MaxAbsScaler, Normalizer
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# custom modules
from modules.ds_rough import Vectorizer, Scaler
from modules.quotient_extraction import QuotientCountVectorizer, QuotientTfidfVectorizer

### Helpers

Some helper functions for exploration and a list for estimator selection.

In [None]:
def plot_similarities_and_certainties(classifier, corpus, y, normal_vectorizer=None, test_size=0.1):
    """Fit ``classifier`` on a stratified split and compare its correct with its incorrect predictions.

    The data is split with a fixed seed, ``classifier`` is fitted on the train part and used to predict
    the held-out part. For the correctly and for the incorrectly predicted held-out documents, two
    quantities are computed, their means printed and their binned distributions drawn in a 2x2 grid:

    * similarity: the largest dot product between the document's vector and any train document's vector,
      both produced by ``normal_vectorizer`` (a cosine similarity if that vectorizer yields unit-length
      rows, as the default pipeline ending in ``Normalizer`` is meant to),
    * certainty: the largest class probability returned by ``classifier.predict_proba``.

    Parameters
    ----------
    classifier : estimator
        Unfitted estimator offering ``fit``, ``predict`` and ``predict_proba``. It is fitted in place.
    corpus : array-like
        Documents; must support boolean-mask indexing after the split.
    y : array-like
        Targets, also used for stratification.
    normal_vectorizer : transformer, optional
        Transformer used for the similarity computation only. Defaults to a fresh
        ``make_pipeline(CountVectorizer(), Normalizer())`` per call.
    test_size : float, default 0.1
        Share of the data that is held out.
    """
    # build the default per call instead of sharing one estimator instance between all calls
    if normal_vectorizer is None:
        normal_vectorizer = make_pipeline(CountVectorizer(), Normalizer())

    corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=test_size, stratify=y,
                                                                  random_state=0)

    # vectors of the train documents, the reference set for the similarities
    ndt = normal_vectorizer.fit_transform(corpus_train)

    classifier.fit(corpus_train, y_train)

    # predict once and derive both subsets from the same mask
    is_correct = y_test == classifier.predict(corpus_test)
    corpus_test_correct = corpus_test[is_correct]
    corpus_test_incorrect = corpus_test[~is_correct]

    bins = np.linspace(0., 1., 6)

    def max_similarities(documents):
        # per document: largest dot product with any train document
        return np.max(ndt.dot(normal_vectorizer.transform(documents).transpose()), axis=0).toarray().flatten()

    def max_certainties(documents):
        # per document: probability of the most likely class
        return np.max(classifier.predict_proba(documents), axis=1)

    def binned(values):
        # round first so that values like 0.9999 do not end up outside of the closed bins
        return pd.cut(np.round(values, 2), bins, include_lowest=True)

    similarities_correct = max_similarities(corpus_test_correct)
    similarities_incorrect = max_similarities(corpus_test_incorrect)
    certainties_correct = max_certainties(corpus_test_correct)
    certainties_incorrect = max_certainties(corpus_test_incorrect)

    print('mean similarity of correctly classified:   {:.3f}'.format(np.mean(similarities_correct)))
    print('mean similarity of incorrectly classified: {:.3f}'.format(np.mean(similarities_incorrect)))
    print('mean certainty of correctly classified:    {:.3f}'.format(np.mean(certainties_correct)))
    print('mean certainty of incorrectly classified:  {:.3f}'.format(np.mean(certainties_incorrect)))

    (fig, ax) = plt.subplots(2, 2, figsize=(13.5, 9), dpi=300)

    # top row: similarities, bottom row: certainties; left: correct, right: incorrect
    # (x= is passed by keyword because newer seaborn versions treat the first positional argument as `data`)
    sns.countplot(x=binned(similarities_correct), ax=ax[0, 0])
    sns.countplot(x=binned(similarities_incorrect), ax=ax[0, 1])
    sns.countplot(x=binned(certainties_correct), ax=ax[1, 0])
    sns.countplot(x=binned(certainties_incorrect), ax=ax[1, 1])

    plt.show()

In [None]:
# Candidates for the randomized hyperparameter searches below. Each entry is a tuple
#     (module name, class name, name of the pipeline step, hyperparameter grid)
# where the grid maps a constructor argument of the class to the list of values to try.
classifier_selection = [
    ('sklearn.naive_bayes', 'BernoulliNB', 'bernoulli_naive_bayes_classifier',
     {
         'alpha':     [1.0e-10, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
         'fit_prior': [True, False]
     }),
    ('sklearn.naive_bayes', 'MultinomialNB', 'multinomial_naive_bayes_classifier',
     {
         # smallest value is 1.0e-10 as for BernoulliNB, not 0.0: an exact zero switches smoothing off in
         # recent scikit-learn releases (older ones replace it by 1.0e-10 and warn)
         'alpha':     [1.0e-10, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
         'fit_prior': [True, False]
     }),
    ('sklearn.svm', 'SVC', 'linear_support_vector_classifier',
     {
         'kernel':       ['linear'],
         'C':            [10**r for r in range(-5, 1)],
         'shrinking':    [True, False],
         'random_state': [0]
     }),
    ('sklearn.svm', 'SVC', 'poly_support_vector_classifier',
     {
         'kernel':       ['poly'],
         'degree':       list(range(2, 4)),
         'C':            [10**r for r in range(-5, 1)],
         'gamma':        [10**r for r in range(-5, 1)],
         'shrinking':    [True, False],
         'random_state': [0]
     }),
    ('sklearn.svm', 'SVC', 'sigmoid_support_vector_classifier',
     {
         'kernel':       ['sigmoid'],
         'C':            [10**r for r in range(-5, 6)],
         'gamma':        [10**r for r in range(-5, 4)],
         'shrinking':    [True, False],
         'random_state': [0]
     }),
    ('sklearn.svm', 'SVC', 'rbf_support_vector_classifier',
     {
         'kernel':       ['rbf'],
         'C':            [10**r for r in range(-5, 4)],
         'gamma':        [10**r for r in range(-5, 4)],
         'shrinking':    [True, False],
         'random_state': [0]
     }),
    ('sklearn.ensemble', 'RandomForestClassifier', 'random_forest_classifier',
     {
         'criterion':         ['gini', 'entropy'],
         'n_estimators':      [300, 400, 500, 600, 1000],
         'max_depth':         list(range(5, 30, 2)), # with None, we get an overfit!
         'min_samples_split': list(range(2, 11)),
         'min_samples_leaf':  list(range(1, 11)),
         'random_state':      [0]
     }),
    # The following candidates are currently disabled.
#    ('sklearn.linear_model', 'LogisticRegression', 'logistic_discrimination_classifier',
#     {
#         'C':             [10**r for r in range(-5, 4)],
#         'fit_intercept': [True, False],
#         'penalty':       ['l1', 'l2'],
#         'random_state':  [0]
#     }),
#    ('sklearn.naive_bayes', 'GaussianNB', 'gaussian_naive_bayes_classifier',
#     {
#         'var_smoothing': [10**r for r in range(-15, 2)]
#     }),
#    ('sklearn.neighbors', 'KNeighborsClassifier', 'k_nearest_neighbors_classifier',
#     {
#         'n_neighbors': list(range(1, 21)),
#         'algorithm':   ['ball_tree', 'kd_tree', 'brute'],
#         'leaf_size':   list(range(15, 55, 5)),
#         'p':           list(range(1, 6))
#     }),
#    ('sklearn.tree', 'DecisionTreeClassifier', 'decision_tree_classifier',
#     {
#         'criterion':         ['gini', 'entropy'],
#         'splitter':          ['best', 'random'],
#         'max_depth':         list(range(5, 21, 2)) + [None],
#         'min_samples_split': list(range(2, 11)),
#         'min_samples_leaf':  list(range(1, 22, 3)),
#         'max_features':      [r*0.1 for r in range(1, 11)] + ['auto', 'sqrt', 'log2', None],
#         'random_state':      [0]
#     }),
#    ('xgboost', 'XGBClassifier', 'xgb_classifier',
#     {
#         'learning_rate':     [0.05, 0.10, 0.20, 0.30],
#         'n_estimators':      [100, 150, 200, 250],
#         'max_depth':         [5, 6, 7, 8, 9, 10],
#         'min_child_weight':  [1, 3, 5],
#         'gamma':             [0.0, 0.1, 0.2, 0.3],
#         'colsample_bytree':  [0.4, 0.5, 0.6, 0.7],
#         'reg_lambda':        [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
#    })
]

### Data

We import our cleaned data with engineered features.

In [None]:
# load the pickled data set produced by the previous parts of the project
mira = pd.read_pickle('data/mira_2.pickle')
# show five rows; the fixed seed makes the sample reproducible
mira.sample(5, random_state=0)

## Preparation

We define the data set that we use to construct the classifiers.

In [None]:
# restrict to the instances that carry a product group; the copy keeps later
# modifications of this working set away from `mira`
has_product_group = mira['product group'].notna()
mira_clfd = mira[has_product_group].copy()
mira_clfd[['article', 'article base', 'product group']].sample(5, random_state=0)

Since the four least frequently occurring product groups have very few instances, we change the value of the target `'product group'` of these instances to `'Varia'`.

In [None]:
# number of articles per product group, ordered from the most to the least frequent group
articles_per_group = mira_clfd.groupby(['product group'])['article'].count()
least_frequent_product_groups = articles_per_group.sort_values(ascending=False).index[-4:]


def reduce_to_most_frequent(product_group):
    """Return 'Varia' for one of the four least frequent product groups, otherwise the group itself."""
    if product_group in least_frequent_product_groups:
        return 'Varia'
    return product_group


labels = mira_clfd['product group'].apply(reduce_to_most_frequent)
labels.unique()

There remain 13 labels. We encode these labels as non-negative integers.

In [None]:
# learn the label -> integer mapping first, then encode the labels with it
label_encoder = LabelEncoder().fit(labels)
y = label_encoder.transform(labels)

## Approach with `'article'`

As a first approach, we work with the unprocessed feature `'article'`.

In [None]:
# documents: the unprocessed article texts
corpus = mira_clfd['article'].values

# stratified split with a fixed seed; 10 % of the instances are held out for testing
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y,
    test_size=0.1,
    stratify=y,
    random_state=0,
)

We explore some facts about the train and test set.

In [None]:
def count_tokens(documents):
    """Return the number of distinct tokens a default CountVectorizer finds in ``documents``."""
    return CountVectorizer().fit_transform(documents).shape[1]


# sizes of the two parts of the split
print('train instances:   {:<5}'.format(corpus_train.shape[0]))
print('test instances:    {:<5}'.format(corpus_test.shape[0]))
# vocabulary sizes: train part, all labelled instances, all instances
print('tokens in X_train: {:<5}'.format(count_tokens(corpus_train)))
print('tokens in X:       {:<5}'.format(count_tokens(corpus)))
print('all tokens:        {:<5}'.format(count_tokens(mira['article'])))

We make a rough analysis with some vectorizers and scalers.

In [None]:
# split the train set once more to obtain a validation set for this rough comparison
corpus_fit, corpus_valid, y_fit, y_valid = train_test_split(
    corpus_train, y_train, test_size=0.1, stratify=y_train, random_state=0)

# (abbreviation used in the report, estimator class, constructor arguments)
rough_candidates = [
    ('NB', MultinomialNB, {}),
    ('SVC', SVC, {'random_state': 0}),
    ('RF', RandomForestClassifier, {'random_state': 0}),
    ('XGB', XGBClassifier, {'random_state': 0}),
]

# one line per combination: vectorizer, scaler, estimator, accuracy on the validation set
for vec_type in ['count', 'tfidf']:
    for sc_type in [None, 'standard', 'maxabs', 'norm']:
        for abbreviation, estimator_class, estimator_arguments in rough_candidates:
            clf = make_pipeline(Vectorizer(vec_type), Scaler(sc_type), estimator_class(**estimator_arguments))
            clf.fit(corpus_fit, y_fit)
            print('{:<5} {:<8} {:<3} {:.3f}'.format(vec_type, str(sc_type), abbreviation,
                                                    clf.score(corpus_valid, y_valid)))

Next, we make a systematic randomized search with various machine learning algorithms (Naïve Bayes, Support Vector Classifier, Random Forest) and save the best results to a pandas dataframe. As the computation takes a couple of minutes, we persist the results.

In [None]:
# Randomized hyperparameter search based on the feature 'article'. Persisted results of an earlier run
# are reused; otherwise every entry of `classifier_selection` is searched and the results are saved.
try:
    classifier_comparison = pd.read_pickle('results/based_on_article.pickle')
except FileNotFoundError:
    # one dict per classifier; collected in a list since DataFrame.append no longer exists in pandas 2
    search_results = []

    for (module_name, class_name, classifier_name, parameters) in classifier_selection:
        print('Randomized search for {}.{:<80}'.format(module_name, class_name), end='\r')
        module_of_classifier = import_module(module_name)
        class_of_classifier = getattr(module_of_classifier, class_name)
        # vectorizer and scaler are placeholders that the search replaces by the candidates below
        classifier = Pipeline([('vct', 'passthrough'),
                               ('scl', 'passthrough'),
                               (classifier_name, class_of_classifier())])
        # hyperparameters of the classifier are addressed via the name of its pipeline step
        search_space = {classifier_name + '__' + param: values
                        for param, values in parameters.items()}
        search_space.update({
            'vct': [CountVectorizer(), TfidfVectorizer()],
            'scl': ['passthrough', StandardScaler(with_mean=False), MaxAbsScaler(), Normalizer()],
        })
        rs = RandomizedSearchCV(estimator=classifier, n_iter=200, param_distributions=search_space,
                                scoring='accuracy', cv=10, return_train_score=True, verbose=0, n_jobs=-1,
                                random_state=0)
        rs.fit(corpus_train, y_train)

        search_results.append({'Module': module_name,
                               'Class': class_name,
                               'Name': classifier_name,
                               'BestParameters': rs.best_params_,
                               'BestScore': rs.best_score_})

    # rows are numbered 0, 1, ... in the order of `classifier_selection`
    classifier_comparison = pd.DataFrame(search_results,
                                         columns=['Module', 'Class', 'Name', 'BestParameters', 'BestScore'])
    classifier_comparison.to_pickle(path='results/based_on_article.pickle')
    classifier_comparison.to_csv(path_or_buf='results/based_on_article.csv', index=False)

classifier_comparison.sort_values(by='BestScore', ascending=False)

The best classifier found is a support vector classifier with rbf kernel. It has a mean accuracy on the validation sets of roughly 73.0%.

We analyse the similarities and certainties of the best found classifier.

In [None]:
def load_estimator(idx):
    """Rebuild the unfitted pipeline described by row ``idx`` of ``classifier_comparison``.

    The pipeline consists of the vectorizer ('vct'), the scaler ('scl') and the classifier stored in
    that row, configured with the row's best parameters. Support vector classifiers are additionally
    created with ``probability=True`` so that ``predict_proba`` is available.

    Parameters
    ----------
    idx : index label
        Row label of ``classifier_comparison``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The configured, unfitted pipeline.
    """
    module_name = classifier_comparison.loc[idx, 'Module']
    class_name = classifier_comparison.loc[idx, 'Class']
    classifier_name = classifier_comparison.loc[idx, 'Name']
    # copy, so that the additional entry below does not end up in the stored search results
    parameters = classifier_comparison.loc[idx, 'BestParameters'].copy()
    if classifier_name.endswith('support_vector_classifier'):
        parameters[classifier_name + '__probability'] = True

    module_of_classifier = import_module(module_name)
    class_of_classifier = getattr(module_of_classifier, class_name)
    classifier = Pipeline([('vct', parameters['vct']),
                           ('scl', parameters['scl']),
                           (classifier_name, class_of_classifier())])
    classifier.set_params(**parameters)

    return classifier


# row 5 of the results table is inspected here
plot_similarities_and_certainties(load_estimator(5), corpus_train, y_train)

The mean similarities and the mean certainties of the correctly and incorrectly classified validation instances do not differ much.

## Approach with `'article base'`

Next, we work with the processed feature `'article base'`. The steps are the same as in the first approach.

In [None]:
# documents: the processed article texts
corpus = mira_clfd['article base'].values

# stratified split with a fixed seed; 10 % of the instances are held out for testing
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y,
    test_size=0.1,
    stratify=y,
    random_state=0,
)

We explore some facts about the train and test set.

In [None]:
def count_tokens(documents):
    """Return the number of distinct tokens a default CountVectorizer finds in ``documents``."""
    return CountVectorizer().fit_transform(documents).shape[1]


# sizes of the two parts of the split
print('train instances:   {:<5}'.format(corpus_train.shape[0]))
print('test instances:    {:<5}'.format(corpus_test.shape[0]))
# vocabulary sizes: train part, all labelled instances, all instances
print('tokens in X_train: {:<5}'.format(count_tokens(corpus_train)))
print('tokens in X:       {:<5}'.format(count_tokens(corpus)))
print('all tokens:        {:<5}'.format(count_tokens(mira['article base'])))

In [None]:
# Randomized hyperparameter search based on the feature 'article base'. Persisted results of an earlier
# run are reused; otherwise every entry of `classifier_selection` is searched and the results are saved.
try:
    classifier_comparison = pd.read_pickle('results/based_on_article_base.pickle')
except FileNotFoundError:
    # one dict per classifier; collected in a list since DataFrame.append no longer exists in pandas 2
    search_results = []

    for (module_name, class_name, classifier_name, parameters) in classifier_selection:
        print('Randomized search for {}.{:<80}'.format(module_name, class_name), end='\r')
        module_of_classifier = import_module(module_name)
        class_of_classifier = getattr(module_of_classifier, class_name)
        # vectorizer and scaler are placeholders that the search replaces by the candidates below
        classifier = Pipeline([('vct', 'passthrough'),
                               ('scl', 'passthrough'),
                               (classifier_name, class_of_classifier())])
        # hyperparameters of the classifier are addressed via the name of its pipeline step
        search_space = {classifier_name + '__' + param: values
                        for param, values in parameters.items()}
        search_space.update({
            'vct': [CountVectorizer(), TfidfVectorizer()],
            'scl': ['passthrough', StandardScaler(with_mean=False), MaxAbsScaler(), Normalizer()],
        })
        rs = RandomizedSearchCV(estimator=classifier, n_iter=200, param_distributions=search_space,
                                scoring='accuracy', cv=10, return_train_score=True, verbose=0, n_jobs=-1,
                                random_state=0)
        rs.fit(corpus_train, y_train)

        search_results.append({'Module': module_name,
                               'Class': class_name,
                               'Name': classifier_name,
                               'BestParameters': rs.best_params_,
                               'BestScore': rs.best_score_})

    # rows are numbered 0, 1, ... in the order of `classifier_selection`
    classifier_comparison = pd.DataFrame(search_results,
                                         columns=['Module', 'Class', 'Name', 'BestParameters', 'BestScore'])
    classifier_comparison.to_pickle(path='results/based_on_article_base.pickle')
    classifier_comparison.to_csv(path_or_buf='results/based_on_article_base.csv', index=False)

classifier_comparison.sort_values(by='BestScore', ascending=False)

In [None]:
# rebuild the estimator stored in row 5 of the current results table and analyse it on the train set
plot_similarities_and_certainties(load_estimator(5), corpus_train, y_train)

We recognize that there is a large gap between the (mean) similarities resp. (mean) certainties of correctly and incorrectly classified validation instances. 

## Approach with `'article base'` and `'dosage form'`

Next, we work with the features `'article base'` and `'dosage form'`. This leads to a slightly more complicated pipeline since we now have to use two vectorizers.

In [None]:
# two text columns per instance; a missing entry becomes an empty document
text_columns = mira_clfd[['article base', 'dosage form']].fillna('')
corpus = text_columns.values

# stratified split with a fixed seed; 10 % of the instances are held out for testing
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y,
    test_size=0.1,
    stratify=y,
    random_state=0,
)

In [None]:
# Randomized hyperparameter search based on the features 'article base' and 'dosage form'. Persisted
# results of an earlier run are reused; otherwise every entry of `classifier_selection` is searched and
# the results are saved.
try:
    classifier_comparison = pd.read_pickle('results/based_on_article_base_and_dosage_form.pickle')
except FileNotFoundError:
    # one dict per classifier; collected in a list since DataFrame.append no longer exists in pandas 2
    search_results = []

    for (module_name, class_name, classifier_name, parameters) in classifier_selection:
        print('Randomized search for {}.{:<80}'.format(module_name, class_name), end='\r')
        module_of_classifier = import_module(module_name)
        class_of_classifier = getattr(module_of_classifier, class_name)
        # one vectorizer slot per text column (0 and 1); the vectorizers and the scaler are placeholders
        # that the search replaces by the candidates below
        classifier = Pipeline([('ct', ColumnTransformer([('vct1', 'passthrough', 0),
                                                         ('vct2', 'passthrough', 1)])),
                               ('scl', 'passthrough'),
                               (classifier_name, class_of_classifier())])
        # hyperparameters of the classifier are addressed via the name of its pipeline step
        search_space = {classifier_name + '__' + param: values
                        for param, values in parameters.items()}
        search_space.update({
            'ct__vct1': [CountVectorizer(), TfidfVectorizer()],
            'ct__vct2': [CountVectorizer(), TfidfVectorizer()],
            'scl': ['passthrough', StandardScaler(with_mean=False), MaxAbsScaler(), Normalizer()],
        })
        rs = RandomizedSearchCV(estimator=classifier, n_iter=200, param_distributions=search_space,
                                scoring='accuracy', cv=10, return_train_score=True, verbose=0, n_jobs=-1,
                                random_state=0)
        rs.fit(corpus_train, y_train)

        search_results.append({'Module': module_name,
                               'Class': class_name,
                               'Name': classifier_name,
                               'BestParameters': rs.best_params_,
                               'BestScore': rs.best_score_})

    # rows are numbered 0, 1, ... in the order of `classifier_selection`
    classifier_comparison = pd.DataFrame(search_results,
                                         columns=['Module', 'Class', 'Name', 'BestParameters', 'BestScore'])
    classifier_comparison.to_pickle(path='results/based_on_article_base_and_dosage_form.pickle')
    classifier_comparison.to_csv(path_or_buf='results/based_on_article_base_and_dosage_form.csv', index=False)

classifier_comparison.sort_values(by='BestScore', ascending=False)

In [None]:
def load_estimator(idx):
    """Rebuild the unfitted two-column pipeline described by row ``idx`` of ``classifier_comparison``."""
    row = classifier_comparison.loc[idx]
    classifier_name = row['Name']
    estimator_class = getattr(import_module(row['Module']), row['Class'])

    # work on a copy so that the stored search results stay untouched
    parameters = row['BestParameters'].copy()
    if classifier_name.endswith('support_vector_classifier'):
        parameters[classifier_name + '__probability'] = True

    # the vectorizer and scaler slots are filled by set_params from the stored parameters
    column_vectorizers = ColumnTransformer([('vct1', 'passthrough', 0),
                                            ('vct2', 'passthrough', 1)])
    classifier = Pipeline([('ct', column_vectorizers),
                           ('scl', 'passthrough'),
                           (classifier_name, estimator_class())])
    classifier.set_params(**parameters)

    return classifier


# similarities are measured on normalized token counts of both text columns
count_vectorizers = ColumnTransformer([('vct1', CountVectorizer(), 0),
                                       ('vct2', CountVectorizer(), 1)])
normal_vectorizer = make_pipeline(count_vectorizers, Normalizer())
plot_similarities_and_certainties(load_estimator(4), corpus_train, y_train, normal_vectorizer=normal_vectorizer)

The difference between the (mean) similarities resp. certainties gets smaller again.

## Approach with `'article base'` and extracted features

Next, we also work with the occurrence of some numerical features.

In [None]:
# list the columns of the data set to pick the numerical features from
mira.columns

In [None]:
# columns whose presence (value given or missing) is used as an additional feature
numerical_features = [
    'mass concentration 0 [mg/ml]',
    'mass puff concentration 0 [mg/hub]',
    'mass flow 0 [mg/h]',
    'volume flow 0 [ml/h]',
    'active ingredient percentage 0 [%]',
    'mass 0 [mg]',
    'volume 0 [ml]',
    'count puff 0 [hub]',
    'percentage 0 [%]',
    'length 0 [cm]',
    'count 0',
]

In [None]:
# two text columns (missing entries as empty documents) followed by one flag per numerical feature
# that tells whether a value is present
text_columns = mira_clfd[['article base', 'dosage form']].fillna('')
presence_flags = mira_clfd[numerical_features].notna()
corpus = np.concatenate([text_columns, presence_flags], axis=1)

# stratified split with a fixed seed; 10 % of the instances are held out for testing
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y,
    test_size=0.1,
    stratify=y,
    random_state=0,
)

In [None]:
# Randomized hyperparameter search based on 'article base', 'dosage form' and the presence flags of the
# numerical features. Persisted results of an earlier run are reused; otherwise every entry of
# `classifier_selection` is searched and the results are saved.
try:
    classifier_comparison = pd.read_pickle('results/based_on_article_base_and_extracted_features.pickle')
except FileNotFoundError:
    # one dict per classifier; collected in a list since DataFrame.append no longer exists in pandas 2
    search_results = []

    for (module_name, class_name, classifier_name, parameters) in classifier_selection:
        print('Randomized search for {}.{:<80}'.format(module_name, class_name), end='\r')
        module_of_classifier = import_module(module_name)
        class_of_classifier = getattr(module_of_classifier, class_name)
        # one vectorizer slot per text column (0 and 1), the remaining columns are passed through;
        # the vectorizers and the scaler are placeholders that the search replaces by the candidates below
        # NOTE(review): the corpus has 2 text columns plus len(numerical_features) flag columns, so the
        # upper bound 16 of the slice looks larger than needed - confirm
        classifier = Pipeline([('ct', ColumnTransformer([('vct1', 'passthrough', 0),
                                                         ('vct2', 'passthrough', 1),
                                                         ('pt', 'passthrough', slice(2, 16))])),
                               ('scl', 'passthrough'),
                               (classifier_name, class_of_classifier())])
        # hyperparameters of the classifier are addressed via the name of its pipeline step
        search_space = {classifier_name + '__' + param: values
                        for param, values in parameters.items()}
        search_space.update({
            'ct__vct1': [CountVectorizer(), TfidfVectorizer()],
            'ct__vct2': [CountVectorizer(), TfidfVectorizer()],
            'scl': ['passthrough', StandardScaler(with_mean=False), MaxAbsScaler(), Normalizer()],
        })
        rs = RandomizedSearchCV(estimator=classifier, n_iter=200, param_distributions=search_space,
                                scoring='accuracy', cv=10, return_train_score=True, verbose=0, n_jobs=-1,
                                random_state=0)
        rs.fit(corpus_train, y_train)

        search_results.append({'Module': module_name,
                               'Class': class_name,
                               'Name': classifier_name,
                               'BestParameters': rs.best_params_,
                               'BestScore': rs.best_score_})

    # rows are numbered 0, 1, ... in the order of `classifier_selection`
    classifier_comparison = pd.DataFrame(search_results,
                                         columns=['Module', 'Class', 'Name', 'BestParameters', 'BestScore'])
    classifier_comparison.to_pickle(path='results/based_on_article_base_and_extracted_features.pickle')
    classifier_comparison.to_csv(path_or_buf='results/based_on_article_base_and_extracted_features.csv', index=False)

classifier_comparison.sort_values(by='BestScore', ascending=False)

In [None]:
def load_estimator(idx):
    """Rebuild the unfitted pipeline with extracted features described by row ``idx`` of ``classifier_comparison``."""
    row = classifier_comparison.loc[idx]
    classifier_name = row['Name']
    estimator_class = getattr(import_module(row['Module']), row['Class'])

    # work on a copy so that the stored search results stay untouched
    parameters = row['BestParameters'].copy()
    if classifier_name.endswith('support_vector_classifier'):
        parameters[classifier_name + '__probability'] = True

    # the vectorizer and scaler slots are filled by set_params from the stored parameters;
    # the columns after the two text columns are handed on unchanged
    column_vectorizers = ColumnTransformer([('vct1', 'passthrough', 0),
                                            ('vct2', 'passthrough', 1),
                                            ('pt', 'passthrough', slice(2, 16))])
    classifier = Pipeline([('ct', column_vectorizers),
                           ('scl', 'passthrough'),
                           (classifier_name, estimator_class())])
    classifier.set_params(**parameters)

    return classifier


# similarities are measured on normalized token counts of both text columns plus the passed-through columns
count_vectorizers = ColumnTransformer([('vct1', CountVectorizer(), 0),
                                       ('vct2', CountVectorizer(), 1),
                                       ('pt', 'passthrough', slice(2, 16))])
normal_vectorizer = make_pipeline(count_vectorizers, Normalizer())
plot_similarities_and_certainties(load_estimator(1), corpus_train, y_train, normal_vectorizer=normal_vectorizer)

Here, the differences get even smaller.

## Approach with `'article base'` and quotient vectorizer

Later, we would like to try an approach based on the feature `'article base'` and quotient vectorizers. This has to be implemented in a more clever way since the fitting time of the quotient vectorizers is too high. (We could work with a precomputed vocabulary and a predefined quotient matrix, which could be sliced in the cross validation process.)

In [None]:
#corpus = mira_clfd['article base'].values

#corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.1, stratify=y,
#                                                              random_state=0)

In [None]:
#try:
#    classifier_comparison = pd.read_pickle('results/based_on_article_base_and_quotient_vectorizer.pickle')
#except FileNotFoundError:
#    classifier_comparison = pd.DataFrame(columns=['Module', 'Class', 'Name', 'BestParameters', 'BestScore'])

#    for (module_name, class_name, classifier_name, parameters) in classifier_selection:
#        print('Randomized search for {}.{:<80}'.format(module_name, class_name), end='\r')
#        module_of_classifier = import_module(module_name)
#        class_of_classifier = getattr(module_of_classifier, class_name)
#        classifier = Pipeline([('vct', 'passthrough'),
#                               ('scl', 'passthrough'),
#                               (classifier_name, class_of_classifier())])
#        parameters = {classifier_name + '__' + param: parameters[param]
#                      for param in parameters.keys()}
#        parameters.update({
#            'vct': [QuotientCountVectorizer(), QuotientTfidfVectorizer()],
#            'scl': ['passthrough', StandardScaler(with_mean=False), MaxAbsScaler(), Normalizer()],        
#        })
#        rs = RandomizedSearchCV(estimator=classifier, n_iter=200, param_distributions=parameters,
#                                scoring='accuracy', cv=10, return_train_score=True, verbose=0, n_jobs=-1,
#                                random_state=0)
#        rs.fit(corpus_train, y_train)

#        classifier_comparison = classifier_comparison.append({'Module': module_name,
#                                                              'Class': class_name,
#                                                              'Name': classifier_name,
#                                                              'BestParameters': rs.best_params_,
#                                                              'BestScore': rs.best_score_},
#                                                             ignore_index=True)
    
#    classifier_comparison.to_pickle(path='results/based_on_article_base_and_quotient_vectorizer.pickle')
#    classifier_comparison.to_csv(path_or_buf='results/based_on_article_base_and_quotient_vectorizer.csv', index=False)

#classifier_comparison.sort_values(by='BestScore', ascending=False)

In [None]:
#def load_estimator(idx):
#    module_name = classifier_comparison.loc[idx, 'Module']
#    class_name = classifier_comparison.loc[idx, 'Class']
#    classifier_name = classifier_comparison.loc[idx, 'Name']
#    parameters = classifier_comparison.loc[idx, 'BestParameters']
#    if classifier_name.endswith('support_vector_classifier'):
#        parameters.update({
#                classifier_name + '__probability': True
#            })
#
#    module_of_classifier = import_module(module_name)
#    class_of_classifier = getattr(module_of_classifier, class_name)
#    classifier = Pipeline([('tfidf', parameters['tfidf']),
#                           ('scl', parameters['scl']),
#                           (classifier_name, class_of_classifier())])
#    classifier.set_params(**parameters)
    
#    return classifier

#plot_similarities_and_certainties(load_estimator(5), corpus_train, y_train)

## Voting classifier based on `'article base'`

Finally, we work with the feature `'article base'` again and study the results of a voting classifier.

In [None]:
# documents: the processed article texts
corpus = mira_clfd['article base'].values

# stratified split with a fixed seed; 10 % of the instances are held out for testing
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus, y,
    test_size=0.1,
    stratify=y,
    random_state=0,
)

In [None]:
# reload the persisted search results for 'article base'; the file is written by the randomized search
# of the corresponding approach above, which therefore has to be run first
classifier_comparison = pd.read_pickle('results/based_on_article_base.pickle')
classifier_comparison.sort_values(by='BestScore', ascending=False)

In [None]:
def load_estimator(idx):
    """Rebuild the pipeline described by one row of ``classifier_comparison``.

    The row provides the module and class of the classifier, the name of its
    pipeline step and the best parameters that were found for the pipeline.

    Parameters
    ----------
    idx
        Index label of the row in ``classifier_comparison``.

    Returns
    -------
    Pipeline
        Pipeline with the steps ``'vct'``, ``'scl'`` and the classifier step
        (named after the ``'Name'`` entry), configured with the stored
        parameters.
    """
    module_name = classifier_comparison.loc[idx, 'Module']
    class_name = classifier_comparison.loc[idx, 'Class']
    classifier_name = classifier_comparison.loc[idx, 'Name']

    # work on a copy: the dict stored in the data frame must not be altered
    # as a side effect of loading an estimator
    parameters = dict(classifier_comparison.loc[idx, 'BestParameters'])
    if classifier_name.endswith('support_vector_classifier'):
        # enable probability estimates for support vector classifiers
        # (needed for soft voting and for `predict_proba`)
        parameters[classifier_name + '__probability'] = True

    module_of_classifier = import_module(module_name)
    class_of_classifier = getattr(module_of_classifier, class_name)
    classifier = Pipeline([('vct', parameters['vct']),
                           ('scl', parameters['scl']),
                           (classifier_name, class_of_classifier())])
    classifier.set_params(**parameters)

    return classifier

# rows of `classifier_comparison` whose estimators take part in the vote
estimator_indices = [5, 2, 4, 1, 0]

# compare hard and soft voting by their mean score in a 10-fold cross validation
for voting in ['hard', 'soft']:
    classifier = VotingClassifier(
        [('clf{}'.format(position), load_estimator(row))
         for position, row in enumerate(estimator_indices, start=1)],
        voting=voting, n_jobs=-1)
    cv_results = cross_validate(classifier, corpus_train, y_train, cv=10, n_jobs=-1)
    print('{} mean valid score: {:.3f}'.format(voting, cv_results['test_score'].mean()))

In [None]:
# 5 best: hard 67.4 soft 67.1
# 4 best: hard 67.4 soft 67.2
# 3 best: hard 67.0 soft 67.0
# 2 best: hard 67.4 soft 66.7
# best svc and naive bayes: hard 67.0, soft 67.1

In [None]:
# Visualize similarities and certainties for `classifier`, which at this point is
# the voting classifier created in the last iteration of the loop above (soft voting).
# The helper splits the passed data once more internally and fits the classifier itself.
plot_similarities_and_certainties(classifier, corpus_train, y_train)

## Selection of classifier

Since the voting classifier does not seem to achieve better results, we select the best-performing classifier on the feature `'article base'`, train it on the whole training set and evaluate its quality on the test set.

In [None]:
# texts of the feature 'article base' of the classified instances
corpus = mira_clfd['article base'].values

# stratified hold-out of 10 % as test set, reproducible via the fixed seed
(corpus_train, corpus_test,
 y_train, y_test) = train_test_split(corpus, y, stratify=y, test_size=0.1, random_state=0)

In [None]:
# persisted comparison table (presumably written by the search on 'article base'
# earlier in this notebook -- verify), shown with the best score first
classifier_comparison = pd.read_pickle('results/based_on_article_base.pickle')
classifier_comparison.sort_values('BestScore', ascending=False)

In [None]:
def load_estimator(idx):
    """Rebuild the pipeline described by one row of ``classifier_comparison``.

    The row provides the module and class of the classifier, the name of its
    pipeline step and the best parameters that were found for the pipeline.

    Parameters
    ----------
    idx
        Index label of the row in ``classifier_comparison``.

    Returns
    -------
    Pipeline
        Pipeline with the steps ``'vct'``, ``'scl'`` and the classifier step
        (named after the ``'Name'`` entry), configured with the stored
        parameters.
    """
    module_name = classifier_comparison.loc[idx, 'Module']
    class_name = classifier_comparison.loc[idx, 'Class']
    classifier_name = classifier_comparison.loc[idx, 'Name']

    # work on a copy: the dict stored in the data frame must not be altered
    # as a side effect of loading an estimator
    parameters = dict(classifier_comparison.loc[idx, 'BestParameters'])
    if classifier_name.endswith('support_vector_classifier'):
        # enable probability estimates for support vector classifiers
        # (needed for `predict_proba`)
        parameters[classifier_name + '__probability'] = True

    module_of_classifier = import_module(module_name)
    class_of_classifier = getattr(module_of_classifier, class_name)
    classifier = Pipeline([('vct', parameters['vct']),
                           ('scl', parameters['scl']),
                           (classifier_name, class_of_classifier())])
    classifier.set_params(**parameters)

    return classifier

In [None]:
# rebuild the estimator of row 5 and fit it on the training part
classifier = load_estimator(5)
classifier.fit(corpus_train, y_train)

# accuracy on the training part, the held-out part and all classified instances
for template, texts, targets in [('score train: {:.3f}', corpus_train, y_train),
                                 ('score test:  {:.3f}', corpus_test, y_test),
                                 ('score all:   {:.3f}', corpus, y)]:
    print(template.format(classifier.score(texts, targets)))

In [None]:
# local import: only needed to hand an independent copy to the plot helper
from sklearn.base import clone

# plot_similarities_and_certainties fits the estimator it receives on an internal
# split of the passed data. Passing a clone keeps `classifier` itself fitted on the
# whole training set, which is the state that is persisted afterwards.
plot_similarities_and_certainties(clone(classifier), corpus_train, y_train)

We save this development classifier for the purpose of later visualizations.

In [None]:
# Persist the current state of `classifier` as development classifier.
# The trailing semicolon suppresses the notebook echo of the value returned by `joblib.dump`.
joblib.dump(classifier, 'objects/dev_classifier.joblib');

## Application

Now we retrain the selected classifier on the whole corpus for the prediction on all (in particular still unclassified) instances.

In [None]:
# refit the selected estimator on every classified instance and report its
# accuracy on exactly these instances (an in-sample score)
classifier.fit(corpus, y)
score_all = classifier.score(corpus, y)
print('score all:   {:.3f}'.format(score_all))

We append the prediction and its certainty to the dataframe as well as some strings for later representation, e.g. in the web app.

In [None]:
# predicted label (decoded back to its original name) for every instance
encoded_predictions = classifier.predict(mira['article base'])
mira['prediction'] = label_encoder.inverse_transform(encoded_predictions)

# certainty of an instance: the largest class probability in its row
class_probabilities = classifier.predict_proba(mira['article base'])
mira['certainty'] = class_probabilities.max(axis=1)

In [None]:
def print_prediction(instance):
    """Return the product group to display for an instance.

    The known ``'product group'`` is used if present; if it is missing
    (``pd.isna``), the value of ``'prediction'`` is returned instead.
    """
    known_group = instance['product group']
    predicted_group = instance['prediction']
    if pd.isna(known_group):
        return predicted_group
    return known_group
def print_certainty(instance):
    """Return a verbal description of the certainty of an instance.

    Instances with a known ``'product group'`` are reported as ``'confirmed'``.
    Otherwise the value of ``'certainty'`` is mapped to the label of the first
    lower bound it reaches; ``None`` is returned if it reaches none of them
    (e.g. for a missing or negative value).
    """
    known_group = instance['product group']
    score = instance['certainty']
    if not pd.isna(known_group):
        return 'confirmed'

    # lower bounds in descending order; the first one reached determines the label
    labels_by_lower_bound = (
        (0.8, 'very certain'),
        (0.5, 'certain'),
        (0.2, 'uncertain'),
        (0, 'very uncertain'),
    )
    for lower_bound, label in labels_by_lower_bound:
        if score >= lower_bound:
            return label
    return None

# add the display strings row by row, one column per formatter
for column, formatter in [('prediction print', print_prediction),
                          ('certainty print', print_certainty)]:
    mira[column] = mira.apply(formatter, axis=1)

We illustrate the performance of the classifier by inspecting some examples.

In [None]:
# instances whose article contains the word 'Aspirin' (case-insensitive)
mira.loc[mira['article'].str.contains(r'\bAspirin\b', flags=re.IGNORECASE),
         ['article', 'article base', 'product group', 'prediction', 'certainty', 'prediction print',
          'certainty print']]

In [None]:
# instances whose article contains the word 'Symbicort' (case-insensitive)
mira.loc[mira['article'].str.contains(r'\bSymbicort\b', flags=re.IGNORECASE),
         ['article', 'article base', 'product group', 'prediction', 'certainty', 'prediction print',
          'certainty print']]

In [None]:
# instances whose article contains the word 'Ibuprofen' (case-insensitive)
mira.loc[mira['article'].str.contains(r'\bIbuprofen\b', flags=re.IGNORECASE),
         ['article', 'article base', 'product group', 'prediction', 'certainty', 'prediction print',
          'certainty print']]

In [None]:
# instances whose article contains the word 'Hydrocortison' (case-insensitive)
mira.loc[mira['article'].str.contains(r'\bHydrocortison\b', flags=re.IGNORECASE),
         ['article', 'article base', 'product group', 'prediction', 'certainty', 'prediction print',
          'certainty print']]

We can also illustrate that the classifier does not predict anything useful on instances it had no chance to learn anything about. As there are many more tokens in the whole data set than in the preclassified data set, this result is not surprising.

In [None]:
# instances whose article contains the word 'Hüft' (case-insensitive)
mira.loc[mira['article'].str.contains(r'\bHüft\b', flags=re.IGNORECASE),
         ['article', 'article base', 'product group', 'prediction', 'certainty', 'prediction print',
          'certainty print']]

In [None]:
# instances whose article contains the word 'Hüft' (case-insensitive)
# and which already have a product group
mira.loc[mira['article'].str.contains(r'\bHüft\b', flags=re.IGNORECASE)
         & mira['product group'].notna(),
         ['article', 'article base', 'product group', 'prediction', 'certainty', 'prediction print',
          'certainty print']]

## Save data set

We save the extended data frame.

In [None]:
# Persist the data frame, including the columns 'prediction', 'certainty',
# 'prediction print' and 'certainty print' that were added above.
mira.to_pickle('data/mira_processed.pickle')

## Save classifier and label encoder

We save the classifier as well as the label encoder for later usage, e.g. in the web app.

In [None]:
# persist the classifier together with the label encoder needed to decode its predictions
for persisted_object, target_path in [(classifier, 'objects/classifier.joblib'),
                                      (label_encoder, 'objects/label_encoder.joblib')]:
    joblib.dump(persisted_object, target_path)

In [None]:
# persist the train/test split, one file per array
arrays_by_path = {
    'data/corpus_train.npy': corpus_train,
    'data/corpus_test.npy': corpus_test,
    'data/y_train.npy': y_train,
    'data/y_test.npy': y_test,
}
for target_path, array in arrays_by_path.items():
    np.save(target_path, array)