In [98]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import random
import time
import joblib
import os
from utils import render_single, render_multiple, get_dataset_files, extract_random_entries, extract_first_entries, generate_pixel_columns
from IPython.display import display, Image as IPImage

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [99]:
num_cats = 25
entries_per_cat = 500

# files = get_dataset_files()
# files = random.choices(files, k=num_cats)
files = ['./dataset/shorts.ndjson', './dataset/motorbike.ndjson', './dataset/envelope.ndjson', './dataset/suitcase.ndjson', './dataset/parrot.ndjson', './dataset/t-shirt.ndjson', './dataset/lighthouse.ndjson', './dataset/flip flops.ndjson', './dataset/speedboat.ndjson', './dataset/toothpaste.ndjson', './dataset/door.ndjson', './dataset/clarinet.ndjson', './dataset/lollipop.ndjson', './dataset/raccoon.ndjson', './dataset/microphone.ndjson', './dataset/mountain.ndjson', './dataset/traffic light.ndjson', './dataset/butterfly.ndjson', './dataset/spoon.ndjson', './dataset/spider.ndjson', './dataset/campfire.ndjson', './dataset/popsicle.ndjson', './dataset/frog.ndjson', './dataset/hot dog.ndjson', './dataset/elephant.ndjson']
data = [extract_first_entries(file, entries_per_cat, recognized=True) for file in files]
flat_data = [item for sublist in data for item in sublist]
df_loaded = pd.DataFrame.from_dict(flat_data, orient='columns')
print(f'Loaded {len(df_loaded)} entries from {files}')

Loaded 12500 entries from ['./dataset/shorts.ndjson', './dataset/motorbike.ndjson', './dataset/envelope.ndjson', './dataset/suitcase.ndjson', './dataset/parrot.ndjson', './dataset/t-shirt.ndjson', './dataset/lighthouse.ndjson', './dataset/flip flops.ndjson', './dataset/speedboat.ndjson', './dataset/toothpaste.ndjson', './dataset/door.ndjson', './dataset/clarinet.ndjson', './dataset/lollipop.ndjson', './dataset/raccoon.ndjson', './dataset/microphone.ndjson', './dataset/mountain.ndjson', './dataset/traffic light.ndjson', './dataset/butterfly.ndjson', './dataset/spoon.ndjson', './dataset/spider.ndjson', './dataset/campfire.ndjson', './dataset/popsicle.ndjson', './dataset/frog.ndjson', './dataset/hot dog.ndjson', './dataset/elephant.ndjson']


In [None]:
img = random.choice(flat_data)
display(IPImage(render_single(img['drawing'])))
print(img['word'])

In [None]:
count = 1000 if entries_per_cat > 1000 else entries_per_cat
word = random.choice(df_loaded['word'].values)
imgs = df_loaded[df_loaded['word'] == word].sample(count)
display(IPImage(render_multiple(imgs['drawing'])))
print(f'{count} superimposed {word}s.')

In [100]:
df = df_loaded.sample(len(df_loaded))
print('Done shuffling dataset')
df = generate_pixel_columns(df, resolution=64, invert_color=True, stroke_width_scale=2)
print('Done generating pixel columns')
df = df.reset_index()

train_amt = int(len(df) * .9)

train = df[:train_amt]
test = df[train_amt:]
# del df

train = train.reset_index()
test = test.reset_index()

print(f'Train: {len(train)} entries, test: {len(test)} entries.')

Done shuffling dataset
Done generating pixel columns
Train: 11250 entries, test: 1250 entries.


In [101]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

pca_on = True
save_to_disk = True

if save_to_disk:
    stamp = str(int(time.time()))
    folder = f'./runs/{stamp}/'
    if not os.path.exists(folder):
        os.makedirs(folder)
    pd.DataFrame.to_feather(df, folder + 'data')
    print('Done saving dataset to disk')

y = train['word']
X = train.drop(columns=['countrycode', 'timestamp', 'recognized', 'key_id', 'drawing', 'word'])

if pca_on:
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    pca = PCA(.85)
    X = pca.fit_transform(X)
    print(f'Keeping {pca.n_components_} features')
    if save_to_disk:
        joblib.dump(pca, folder + 'pca')
        joblib.dump(pca, folder + 'scaler')



Done saving dataset to disk
Keeping 640 features


```
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

grid = GridSearchCV(
    estimator=SVC(),
    param_grid={
        'C': [1, 3, 15, 70, 200, 1000],
        'gamma': [.009, .01, .5, 1, 1.5, 3, 6]
    },
    refit=True,
    verbose=2,
    n_jobs=-1
)

grid.fit(X,y)

print('Best hyperparameters:', grid.best_params_)
```

Best hyperparameters: {'C': 3, 'gamma': 0.009}

In [103]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.linear_model import SGDClassifier

# classifier = LinearSVC(random_state=0, max_iter=100000, dual=False)
# classifier = NuSVC(nu=.1, max_iter=10000)
# classifier = SGDClassifier(loss='epsilon_insensitive', penalty='elasticnet', n_jobs=-1)
classifier = SVC(kernel='rbf', C=3, gamma=.0)
model = OneVsRestClassifier(classifier, n_jobs=-1).fit(X, y)
if save_to_disk: joblib.dump(model, folder + 'model')

In [None]:
sample = test.sample(1)
sample_predict = sample.drop(columns=['countrycode', 'timestamp', 'recognized', 'key_id', 'drawing', 'word'])

if pca_on:
    sample_predict = scaler.transform(sample_predict)
    sample_predict = pca.transform(sample_predict)

prediction = model.predict(sample_predict)
display(IPImage(render_single(sample['drawing'].iloc[0])))
print(prediction[0])
print(f"{sample['word'].iloc[0]} == {prediction[0]} ? {sample['word'].iloc[0] == prediction[0]}")

In [104]:
test2 = test.drop(columns=['countrycode', 'timestamp', 'recognized', 'key_id', 'drawing', 'word'])
if pca_on:
    test2 = scaler.transform(test2)
    test2 = pca.transform(test2)
prediction = model.predict(test2)

from sklearn.metrics import accuracy_score
acc_score = accuracy_score(test['word'].values.tolist(), prediction)
print(f"Accuracy score: {acc_score}")

Accuracy score: 0.0416
