# cat-AI-log. An AI-based product group allocation system

Capstone project.

Sebastian Thomas @ neue fische Bootcamp Data Science<br />
(datascience at sebastianthomas dot de)

# Part 6: Visualization

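We visualize how the classifier performs on the test set: the division of the test set into correctly and incorrectly classified instances, and how similarity to the training data and prediction certainty differ between these two groups.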

## Imports

### Modules, classes and functions

In [None]:
# data
import numpy as np
import pandas as pd

# plots
import matplotlib.pyplot as plt

# machine learning
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import Normalizer

### Data and objects

In [None]:
# Load the train/test corpora and their labels in one pass.
# NOTE: allow_pickle=True makes np.load unpickle object arrays, which can
# execute arbitrary code -- only use it on files produced by this project.
corpus_train, corpus_test, y_train, y_test = (
    np.load(path, allow_pickle=True)
    for path in (
        'data/corpus_train.npy',
        'data/corpus_test.npy',
        'data/y_train.npy',
        'data/y_test.npy',
    )
)

In [None]:
from joblib import load

# Restore the persisted label encoder and classifier from disk.
# NOTE: joblib files are pickle-based; only load files produced by this project.
label_encoder, classifier = (
    load(path)
    for path in (
        'objects/label_encoder.joblib',
        'objects/dev_classifier.joblib',
    )
)

## Visualization

### Division of test set into correctly and incorrectly classified instances

In [None]:
# Split the test corpus by whether the classifier's prediction matches the
# true label. The prediction is computed once and reused; running
# classifier.predict twice on the full test corpus doubles the inference cost
# for no benefit.
y_test_predicted = classifier.predict(corpus_test)
corpus_test_correct = corpus_test[y_test == y_test_predicted]
corpus_test_incorrect = corpus_test[y_test != y_test_predicted]

n = corpus_test.shape[0]
n_correct = corpus_test_correct.shape[0]
n_incorrect = corpus_test_incorrect.shape[0]

# A single horizontal stacked bar: the green segment spans the correctly
# classified instances, the red segment (starting where green ends) the
# incorrectly classified ones.
fig, ax = plt.subplots(figsize=(17, 4), dpi=300)
ax.barh(0, width=n_correct, left=0, color='forestgreen')
ax.barh(0, width=n_incorrect, left=n_correct, color='firebrick')

# Only the segment boundaries are shown on the x axis; everything else
# (tick marks, y axis, frame) is hidden.
ax.get_xaxis().set_tick_params(length=0)
ax.get_yaxis().set_visible(False)
ax.set_xlim(0, n)
ax.set_xticks([0, n_correct, n])
for direction in ['left', 'right', 'top', 'bottom']:
    ax.spines[direction].set_visible(False)

plt.savefig('figures/division_of_test_set.png', bbox_inches='tight', pad_inches=0)
plt.savefig('figures/division_of_test_set.svg', bbox_inches='tight', pad_inches=0)
plt.show()

### Similarities of correctly and incorrectly classified instances to learned instances

In [None]:
# Bag-of-words vectors scaled by Normalizer (l2 norm by default), so the dot
# product of two rows is their cosine similarity.
normal_vectorizer = make_pipeline(CountVectorizer(), Normalizer())
ndt = normal_vectorizer.fit_transform(corpus_train)

# Five equal-width bins covering 0.0 to 1.0; only the outermost bars get a
# textual label. There is exactly one label per bar, and the tick positions
# are pinned to the bar positions below, so the labels no longer depend on
# where matplotlib's automatic tick locator happens to place its ticks.
bins = np.linspace(0., 1., 6)
bin_descriptions = ['dissimilar', '', '', '', 'similar']
bar_positions = np.arange(len(bins) - 1)


def _max_similarity_to_train(corpus):
    """Return, per document in ``corpus``, its largest dot product with any row of ``ndt``."""
    products = ndt.dot(normal_vectorizer.transform(corpus).transpose())
    # Column-wise maximum: one value per document of ``corpus``.
    return np.max(products, axis=0).toarray().flatten()


# Values are rounded to two decimals before binning -- presumably to keep
# floating-point noise from pushing a value just outside the [0, 1] range.
similarities_correct = _max_similarity_to_train(corpus_test_correct)
similarities_correct_binned = pd.cut(np.round(similarities_correct, 2), bins, include_lowest=True)
similarities_correct_percentages = (similarities_correct_binned.value_counts()
                                    / corpus_test_correct.shape[0] * 100)

similarities_incorrect = _max_similarity_to_train(corpus_test_incorrect)
similarities_incorrect_binned = pd.cut(np.round(similarities_incorrect, 2), bins, include_lowest=True)
similarities_incorrect_percentages = (similarities_incorrect_binned.value_counts()
                                      / corpus_test_incorrect.shape[0] * 100)

(fig, (ax1, ax2)) = plt.subplots(1, 2, sharey=True, figsize=(17, 6), dpi=300)

# Both panels share the same styling; only data, colour and title differ.
similarity_panels = [
    (ax1, similarities_correct_percentages, 'forestgreen',
     r'Similarities of $\bf{correctly}$ catalogued articles'),
    (ax2, similarities_incorrect_percentages, 'firebrick',
     r'Similarities of $\bf{incorrectly}$ catalogued articles'),
]
for panel_ax, panel_percentages, panel_color, panel_title in similarity_panels:
    # zorder keeps the grid lines behind the bars.
    panel_ax.grid(axis='y', color='black', zorder=0)
    panel_ax.bar(bar_positions, panel_percentages, color=panel_color, zorder=3)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('similarity to most similar article learned by cat-AI-log')
    panel_ax.set_yticks([0, 20, 40, 60])
    # Fix the tick positions before assigning labels to them.
    panel_ax.set_xticks(bar_positions)
    panel_ax.set_xticklabels(bin_descriptions)
    panel_ax.get_xaxis().set_tick_params(length=0)
    for side in ('left', 'right', 'top'):
        panel_ax.spines[side].set_visible(False)

# The y axis is shared, so it is labelled on the left panel only.
ax1.set_ylabel('percentage share')

plt.savefig('figures/similarities.png', bbox_inches='tight', pad_inches=0)
plt.savefig('figures/similarities.svg', bbox_inches='tight', pad_inches=0)
plt.show()

### Certainties of correctly and incorrectly classified instances

In [None]:
# Five equal-width bins covering 0.0 to 1.0; only the outermost bars get a
# textual label. There is exactly one label per bar, and the tick positions
# are pinned to the bar positions below, so the labels no longer depend on
# where matplotlib's automatic tick locator happens to place its ticks.
bins = np.linspace(0., 1., 6)
bin_descriptions = ['uncertain', '', '', '', 'certain']
bar_positions = np.arange(len(bins) - 1)

# "Certainty" of an instance is the largest value in its row of the
# classifier's predict_proba output. Values are rounded to two decimals
# before binning -- presumably to absorb floating-point noise at the bin edges.
certainties_correct = np.max(classifier.predict_proba(corpus_test_correct), axis=1)
certainties_correct_binned = pd.cut(np.round(certainties_correct, 2), bins, include_lowest=True)
certainties_correct_percentages = (certainties_correct_binned.value_counts()
                                   / corpus_test_correct.shape[0] * 100)

certainties_incorrect = np.max(classifier.predict_proba(corpus_test_incorrect), axis=1)
certainties_incorrect_binned = pd.cut(np.round(certainties_incorrect, 2), bins, include_lowest=True)
certainties_incorrect_percentages = (certainties_incorrect_binned.value_counts()
                                     / corpus_test_incorrect.shape[0] * 100)

(fig, (ax1, ax2)) = plt.subplots(1, 2, sharey=True, figsize=(17, 6), dpi=300)

# Both panels share the same styling; only data, colour and title differ.
certainty_panels = [
    (ax1, certainties_correct_percentages, 'forestgreen',
     r'Certainties of $\bf{correctly}$ catalogued articles'),
    (ax2, certainties_incorrect_percentages, 'firebrick',
     r'Certainties of $\bf{incorrectly}$ catalogued articles'),
]
for panel_ax, panel_percentages, panel_color, panel_title in certainty_panels:
    # zorder keeps the grid lines behind the bars.
    panel_ax.grid(axis='y', color='black', zorder=0)
    panel_ax.bar(bar_positions, panel_percentages, color=panel_color, zorder=3)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('cat-AI-log\'s certainty of having chosen the right product group')
    panel_ax.set_yticks([0, 20, 40, 60])
    # Fix the tick positions before assigning labels to them.
    panel_ax.set_xticks(bar_positions)
    panel_ax.set_xticklabels(bin_descriptions)
    panel_ax.get_xaxis().set_tick_params(length=0)
    for side in ('left', 'right', 'top'):
        panel_ax.spines[side].set_visible(False)

# The y axis is shared, so it is labelled on the left panel only.
ax1.set_ylabel('percentage share')

plt.savefig('figures/certainties.png', bbox_inches='tight', pad_inches=0)
plt.savefig('figures/certainties.svg', bbox_inches='tight', pad_inches=0)
plt.show()