# Imports

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import random
import time
import joblib
import os
from utils import get_dataset_files, extract_random_entries, extract_first_entries, generate_pixel_columns, load_run, extract_best_entries
from IPython.display import display, Image as IPImage
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from itertools import repeat
from sklearn.metrics import accuracy_score, classification_report

# Data loading + generation

The commented-out lines in the cell below show a few alternative ways of loading the data:
- Loading all classes, specific classes, or a certain number of classes at random
- Loading all entries in a class, loading a certain number of random entries, loading a certain number of the first entries in a class, or loading a certain number of the most complex entries in a class

In [2]:
# Set to a value accepted by `load_run` to reuse a previously saved run
# (dataset and image parameters here; scaler/PCA are read from it further down);
# leave as None to build a fresh dataset from the .ndjson files.
load_existing_run = None

if load_existing_run is None:
    # num_cats = 10
    entries_per_cat = 2000
    # Fixed seed so the shuffle below (and therefore the positional train/test
    # split performed later) is the same on every Restart & Run All.
    shuffle_seed = 42
    image_gen_params = {
        'magnification': 4, # Higher values improve antialiasing, but uses more memory during drawing
        'resolution': 32,
        'invert_color': True, # True = white on black
        'stroke_width_scale': 2 # What stroke width to use to trace the lines in the drawing
    }

    # Alternative ways of choosing the categories:
    # files = get_dataset_files()
    # files = random.sample(files, num_cats)
    # names = ['power outlet', 'pickup truck', 'castle']
    names = ['ambulance','bed','bench','bread','castle','cell phone','chair','church','coffee cup','crown','cruise ship','cup','dishwasher','dresser','eye','face',
        'fan','fire hydrant','fish','hammer','hat','helicopter','ice cream','lantern','passport','pickup truck','pillow','power outlet','sailboat','sandwich',
        'snowman','star','strawberry','suitcase','table','telephone','traffic light','watermelon','wine glass']
    files = [f"./dataset/{n}.ndjson" for n in names]
    # Defined in both branches so later cells can rely on it.
    num_cats = len(files)

    # Alternative ways of choosing the entries within each category:
    df = extract_best_entries(files, entries_per_cat, recognized=True, skip_first=200)
    # df = extract_random_entries(files, entries_per_cat, recognized=True)
    # df = extract_best_entries(files, entries_per_cat, recognized=True)

    print(f'Loaded {len(df)} entries from {files}')
    # frac=1 returns every row in random order, i.e. a full shuffle.
    df = df.sample(frac=1, random_state=shuffle_seed)
    print('Done shuffling dataset')
    df = generate_pixel_columns(df, **image_gen_params).reset_index(drop=True)
    print('Done generating pixel columns')

else:
    run = load_run(load_existing_run)
    df = run['data']
    # Count once and derive everything from it; value_counts() is ordered by
    # descending frequency, so .iloc[0] is the same entry the label-based
    # lookup of the first key used to return.
    word_counts = df['word'].value_counts()
    # `names` is read by the evaluation cell, so it must exist on this path too.
    names = sorted(word_counts.index)
    num_cats = len(word_counts)
    entries_per_cat = word_counts.iloc[0]
    image_gen_params = run['img_params']

Loaded 78000 entries from ['./dataset/ambulance.ndjson', './dataset/bed.ndjson', './dataset/bench.ndjson', './dataset/bread.ndjson', './dataset/castle.ndjson', './dataset/cell phone.ndjson', './dataset/chair.ndjson', './dataset/church.ndjson', './dataset/coffee cup.ndjson', './dataset/crown.ndjson', './dataset/cruise ship.ndjson', './dataset/cup.ndjson', './dataset/dishwasher.ndjson', './dataset/dresser.ndjson', './dataset/eye.ndjson', './dataset/face.ndjson', './dataset/fan.ndjson', './dataset/fire hydrant.ndjson', './dataset/fish.ndjson', './dataset/hammer.ndjson', './dataset/hat.ndjson', './dataset/helicopter.ndjson', './dataset/ice cream.ndjson', './dataset/lantern.ndjson', './dataset/passport.ndjson', './dataset/pickup truck.ndjson', './dataset/pillow.ndjson', './dataset/power outlet.ndjson', './dataset/sailboat.ndjson', './dataset/sandwich.ndjson', './dataset/snowman.ndjson', './dataset/star.ndjson', './dataset/strawberry.ndjson', './dataset/suitcase.ndjson', './dataset/table.ndj

# Data splitting, standardization, and dimensional reduction

In [3]:
# Splitting the data
# Positional 80/20 cut: this is only a random split because `df` was shuffled
# when it was built (NOTE(review): a loaded run is presumably stored already
# shuffled - confirm against load_run).
train_amt = int(len(df) * .8)

train = df[:train_amt].reset_index(drop=True)
test = df[train_amt:].reset_index(drop=True)

print(f'Train: {len(train)} entries, test: {len(test)} entries.')

# Toggle standardization + PCA of the pixel features.
pca_on = True

# Target is the category label; features are all columns matching 'pixel.+'.
y = train['word'].to_numpy()
X = train.filter(regex='pixel.+').to_numpy()
print("Done generating features and target")

if pca_on:
    if load_existing_run is None:
        # Fit on the training rows only so no test-set statistics leak in.
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        # A float in (0, 1) asks PCA to keep as many components as needed to
        # explain that fraction of the variance.
        pca = PCA(.85)
        X = pca.fit_transform(X)
        print(f'PCA & standardization done. Keeping {pca.n_components_} features')
    else:
        # Reuse the transforms stored with the loaded run instead of refitting.
        scaler = run['scaler']
        pca = run['pca']
        X = scaler.transform(X)
        X = pca.transform(X)
        print('Applied scaler and PCA.')

# Toggle persisting the dataset / transforms (and, later, the models) under ./runs/.
save_to_disk = True

if save_to_disk:
    # One folder per run, named after the current Unix timestamp in seconds.
    stamp = str(int(time.time()))
    folder = f'./runs/{stamp}/'
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(folder, exist_ok=True)
    df.to_feather(folder + 'data')
    with open(folder + 'img_params', 'w') as f:
        # write(), not writelines(): the payload is a single string, and
        # writelines() would iterate over it one character at a time.
        f.write(str(image_gen_params))
    print('Done saving dataset to disk')
    if pca_on:
        joblib.dump(pca, folder + 'pca')
        joblib.dump(scaler, folder + 'scaler')
        print('Done saving PCA and scaler to disk')

Train: 62400 entries, test: 15600 entries.
Done generating features and target
PCA & standardization done. Keeping 180 features
Done saving dataset to disk
Done saving PCA and scaler to disk


# Model training

In [7]:
# Size the MLP from the feature matrix actually used for training. With PCA on
# this equals pca.n_components_; with PCA off `pca` does not exist, so reading
# the width from X keeps this cell runnable either way.
n_features = X.shape[1]
mlp_layer_width = int(n_features * 1.2)

# Base estimators to compare; each one is wrapped in OneVsRestClassifier below.
classifiers = {
    'LinearSVC': LinearSVC(dual=False),
    # 'NuSVC': NuSVC(nu=1e-07, tol=1e-09),
    'SGDClassifier': SGDClassifier(loss='epsilon_insensitive', penalty='elasticnet', n_jobs=-1),
    'SVC': SVC(kernel='rbf', C=2.5, gamma=.0001105),
    'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(store_covariance=True, tol=1e-06),
    'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(store_covariance=True, tol=1e-06),
    # Three hidden layers, each 1.2x the number of input features wide.
    'MLPClassifier': MLPClassifier(hidden_layer_sizes=(mlp_layer_width,) * 3, solver='lbfgs', alpha=1e-07),
    'DecisionTreeClassifier': DecisionTreeClassifier(),
    'ExtraTreeClassifier': ExtraTreeClassifier(),
    # 'KernelRidge': KernelRidge(),
    # 'GaussianProcess': GaussianProcessClassifier(1.0 * RBF(1.0)),
}

# Fitted one-vs-rest models, keyed by the same names as `classifiers`.
models = {}
# `model_name` rather than `type`: in a notebook the loop variable stays in the
# kernel namespace, so `type` would shadow the builtin for every later cell.
for model_name, classifier in classifiers.items():
    # perf_counter is monotonic, so the duration cannot be skewed by clock changes.
    start = time.perf_counter()
    models[model_name] = OneVsRestClassifier(classifier, n_jobs=-1).fit(X, y)
    elapsed = time.perf_counter() - start
    print(f"Done training {model_name} model in {elapsed:.2f}s")

if save_to_disk:
    # Stored alongside the dataset / PCA / scaler written by the previous cell.
    joblib.dump(models, folder + 'models')
    print("Done saving models to disk")

Done training LinearSVC model in 30.44s
Done training SGDClassifier model in 14.50s
Done training SVC model in 1142.38s
Done training LinearDiscriminantAnalysis model in 45.82s
Done training QuadraticDiscriminantAnalysis model in 42.64s
Done training MLPClassifier model in 681.41s
Done training DecisionTreeClassifier model in 166.19s
Done training ExtraTreeClassifier model in 2.42s
Done saving models to disk


# Model evaluation

In [8]:
# Baseline for comparison: uniform guessing over the categories in the dataset.
# Counted from `df` rather than `names`, which is not defined when an existing
# run is loaded.
n_classes = df['word'].nunique()
print(f'Random chance: {100 / n_classes:.2f}%')

# The test features and labels are identical for every model, so build them
# once instead of re-extracting and re-projecting them on each iteration.
X_test = test.filter(regex='pixel.+').to_numpy()
if pca_on:
    # Apply the transforms used for training (transform only, no refit).
    X_test = scaler.transform(X_test)
    X_test = pca.transform(X_test)
test2 = X_test  # previous name, kept in case other cells still reference it
truth = test['word'].values.tolist()

for model_type, model in models.items():
    prediction = model.predict(X_test)

    acc_score = accuracy_score(truth, prediction)
    print(f"{model_type} classifier, accuracy: {acc_score * 100:.2f}%")
    # zero_division=0 reports 0 (instead of warning) for classes never predicted.
    print(classification_report(truth, prediction, zero_division=0))

Random chance: 2.56%
LinearSVC classifier, accuracy: 40.17%
               precision    recall  f1-score   support

    ambulance       0.38      0.40      0.39       406
          bed       0.33      0.08      0.13       439
        bench       0.44      0.70      0.54       428
        bread       0.40      0.01      0.02       402
       castle       0.23      0.13      0.17       372
   cell phone       0.43      0.44      0.43       442
        chair       0.46      0.75      0.57       405
       church       0.34      0.30      0.32       380
   coffee cup       0.39      0.54      0.45       392
        crown       0.40      0.20      0.27       406
  cruise ship       0.35      0.10      0.16       380
          cup       0.37      0.14      0.20       414
   dishwasher       0.43      0.69      0.53       393
      dresser       0.38      0.24      0.29       381
          eye       0.34      0.61      0.44       365
         face       0.39      0.60      0.47       381
    