In [1]:
%load_ext autoreload
%autoreload 2
import os
import random
import time
from itertools import repeat

import joblib
import pandas as pd
from IPython.display import display, Image as IPImage
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

from utils import render_single, extract_best_entries, generate_pixel_columns, extract_random_entries

In [2]:
# Number of drawings pulled from each category file, for the test set and for
# each training set alike.
entries_per_cat = 1000

# Seed for the test-set shuffle so that "Restart & Run All" reproduces the
# same row order.
SEED = 42

files = [
    './dataset/clock.ndjson',
    './dataset/bicycle.ndjson',
    './dataset/sailboat.ndjson',
    './dataset/house.ndjson',
    './dataset/car.ndjson',
]

# --- Test set: random recognized drawings from every category file ---
# presumably extract_random_entries draws through the stdlib `random` module --
# TODO confirm in utils; seeding is harmless if it does not.
random.seed(SEED)
flat_data = [
    entry
    for file in files
    for entry in extract_random_entries(file, entries_per_cat, recognized=True)
]
df_loaded = pd.DataFrame(flat_data)
print(f'Loaded {len(df_loaded)} entries from {files}')

# Shuffle every row; the explicit random_state makes the order reproducible.
df_test = df_loaded.sample(frac=1, random_state=SEED)
print('Done shuffling dataset')

# --- Training sets ---
# data[0] uses extract_best_entries' default ordering, data[1] passes
# descending=False. Both are concatenated across all files and then sorted by
# the 'complexity' column.
# NOTE(review): training and test rows are read from the same files; whether
# they can overlap depends on the utils helpers -- confirm there is no leakage.
data = [
    pd.concat(
        [extract_best_entries(file, entries_per_cat, recognized=True, **extra_kwargs) for file in files],
        ignore_index=True,
    ).sort_values(by='complexity')
    for extra_kwargs in ({}, {'descending': False})
]

Loaded 5000 entries from ['./dataset/clock.ndjson', './dataset/bicycle.ndjson', './dataset/sailboat.ndjson', './dataset/house.ndjson', './dataset/car.ndjson']
Done shuffling dataset


In [3]:
# Rendering options forwarded unchanged to generate_pixel_columns for both the
# test frame and every training frame, so all of them are rendered alike.
image_gen_params = dict(
    magnification=4,
    resolution=64,
    invert_color=True,
    stroke_width_scale=2,
)

# Add the pixel columns to the test frame and renumber its rows from zero.
df_test = generate_pixel_columns(df_test, **image_gen_params).reset_index(drop=True)

# One dict per training set; later cells attach further keys to these dicts.
result = []
for train_frame in data:
    pixel_frame = generate_pixel_columns(train_frame, **image_gen_params)
    result.append({'data': pixel_frame.reset_index(drop=True)})
    print('Done generating pixel columns')

Done generating pixel columns
Done generating pixel columns


In [4]:
# Columns dropped before building the feature matrix; everything that remains
# in the frame is used as a feature.
non_feature_columns = ['countrycode', 'timestamp', 'recognized', 'key_id', 'drawing', 'word', 'complexity']

for i, entry in enumerate(result):
    # Deliberately not named `data`: that name holds the list of raw training
    # frames from the loading cell, and rebinding it here would break a re-run
    # of the pixel-generation cell.
    train_df = entry['data']
    print(f'Processing dataset {i + 1}/{len(result)}...')

    y = train_df['word'].to_numpy()
    X = train_df.drop(columns=non_feature_columns).to_numpy()
    print("Done generating features and target")

    # Standardize first, then fit PCA on the standardized features.
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    # A float n_components in (0, 1) asks PCA to keep as many components as
    # needed to explain that fraction of the variance.
    pca = PCA(.85)
    X = pca.fit_transform(X)
    print(f'PCA & standardization done. Keeping {pca.n_components_} features')

    # Keep the fitted transformers next to their dataset so later cells can
    # apply the identical preprocessing.
    entry['scaler'] = scaler
    entry['pca'] = pca

Processing dataset 1/2...
Done generating features and target (5000, 4096) Index(['word', 'countrycode', 'timestamp', 'recognized', 'key_id', 'drawing',
       'complexity', 'pixel0', 'pixel1', 'pixel2',
       ...
       'pixel4086', 'pixel4087', 'pixel4088', 'pixel4089', 'pixel4090',
       'pixel4091', 'pixel4092', 'pixel4093', 'pixel4094', 'pixel4095'],
      dtype='object', length=4103)
PCA & standardization done. Keeping 550 features
Processing dataset 2/2...
Done generating features and target (5000, 4096) Index(['word', 'countrycode', 'timestamp', 'recognized', 'key_id', 'drawing',
       'complexity', 'pixel0', 'pixel1', 'pixel2',
       ...
       'pixel4086', 'pixel4087', 'pixel4088', 'pixel4089', 'pixel4090',
       'pixel4091', 'pixel4092', 'pixel4093', 'pixel4094', 'pixel4095'],
      dtype='object', length=4103)
PCA & standardization done. Keeping 474 features


In [6]:
# Fixed seed for the MLP weight initialisation so that repeated runs train the
# same models.
RANDOM_STATE = 42

# Columns dropped before building the feature matrix; everything that remains
# in the frame is used as a feature.
non_feature_columns = ['countrycode', 'timestamp', 'recognized', 'key_id', 'drawing', 'word', 'complexity']

for entry in result:
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    start = time.perf_counter()

    train = entry['data']
    pca = entry['pca']
    scaler = entry['scaler']

    # Three hidden layers, each 1.2x as wide as the number of PCA components.
    hidden_units = int(pca.n_components_ * 1.2)
    classifier = MLPClassifier(
        hidden_layer_sizes=(hidden_units,) * 3,
        solver='lbfgs',
        alpha=1e-07,
        random_state=RANDOM_STATE,
    )

    y = train['word'].to_numpy()
    X = train.drop(columns=non_feature_columns).to_numpy()
    # Re-apply the transformers fitted earlier for this dataset.
    X = scaler.transform(X)
    X = pca.transform(X)

    # One binary MLP per class, trained in parallel on all available cores.
    entry['model'] = OneVsRestClassifier(classifier, n_jobs=-1).fit(X, y)

    elapsed = time.perf_counter() - start
    print(f"Done training model in {elapsed:.2f}s")
    

Done training model in 38.63s
Done training model in 60.86s


In [7]:
# The raw test features and labels are the same for every model, so build them
# once outside the loop.
# 'complexity' is listed (with errors='ignore') so the drop list matches the
# one used for training: the test frame apparently lacks that column today,
# but if it ever gains it the feature count would stop matching the scaler.
# NOTE(review): errors='ignore' also tolerates any other listed column being
# absent -- confirm that is acceptable.
X_test_raw = df_test.drop(
    columns=['countrycode', 'timestamp', 'recognized', 'key_id', 'drawing', 'word', 'complexity'],
    errors='ignore',
).to_numpy()
y_test = df_test['word'].to_numpy()

for i, entry in enumerate(result):
    scaler = entry['scaler']
    pca = entry['pca']
    model = entry['model']

    # Apply this dataset's own fitted preprocessing before predicting.
    X_test = pca.transform(scaler.transform(X_test_raw))
    prediction = model.predict(X_test)

    acc_score = accuracy_score(y_test, prediction)
    print(f"Dataset {i+1} accuracy: {acc_score}")

Dataset 1 accuracy: 0.8458
Dataset 2 accuracy: 0.7516
