In [6]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import random
import time
import joblib
from utils import render_single, render_multiple, get_dataset_files, extract_random_entries, generate_pixel_columns
from IPython.display import display, Image as IPImage

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
files = get_dataset_files()
files = random.choices(files, k=100)
# files = ['../dataset/bat.ndjson', '../dataset/snowman.ndjson', '../dataset/spider.ndjson', '../dataset/tshirt.ndjson', '../dataset/binoculars.ndjson', '../dataset/eyeglasses.ndjson', '../dataset/cow.ndjson', '../dataset/dog.ndjson', '../dataset/guitar.ndjson', '../dataset/house.ndjson']
data = [extract_random_entries(file, 10000, recognized=True) for file in files]
flat_data = [item for sublist in data for item in sublist]
df = pd.DataFrame.from_dict(flat_data, orient='columns')
print(f'Loaded {len(df)} entries from {files}')

Loaded 1000000 entries from ['../dataset/garden.ndjson', '../dataset/trumpet.ndjson', '../dataset/dresser.ndjson', '../dataset/car.ndjson', '../dataset/hammer.ndjson', '../dataset/skyscraper.ndjson', '../dataset/sink.ndjson', '../dataset/umbrella.ndjson', '../dataset/calendar.ndjson', '../dataset/ambulance.ndjson', '../dataset/zigzag.ndjson', '../dataset/flower.ndjson', '../dataset/smiley face.ndjson', '../dataset/skateboard.ndjson', '../dataset/bat.ndjson', '../dataset/owl.ndjson', '../dataset/tooth.ndjson', '../dataset/animal migration.ndjson', '../dataset/dresser.ndjson', '../dataset/sailboat.ndjson', '../dataset/guitar.ndjson', '../dataset/microwave.ndjson', '../dataset/lighter.ndjson', '../dataset/fireplace.ndjson', '../dataset/pineapple.ndjson', '../dataset/aircraft carrier.ndjson', '../dataset/house.ndjson', '../dataset/tennis racquet.ndjson', '../dataset/snorkel.ndjson', '../dataset/umbrella.ndjson', '../dataset/blackberry.ndjson', '../dataset/mailbox.ndjson', '../dataset/pants

In [None]:
img = random.choice(flat_data)
display(IPImage(render_single(img['drawing'])))
print(img['word'])

In [None]:
count = 1000
word = random.choice(df['word'].values)
imgs = df[df['word'] == word].sample(count)
display(IPImage(render_multiple(imgs['drawing'])))
print(f'{count} superimposed {word}s.')

In [9]:
df = df.sample(len(df))
print('Done shuffling dataset')
df = generate_pixel_columns(df, resolution=28, invert_color=True)
print('Done generating pixel columns')
df = df.reset_index()
stamp = str(int(time.time()))
pd.DataFrame.to_feather(df, '../data' + stamp)
print('Done saving dataset to disk')
train_amt = int(len(df) * .80)

train = df[:train_amt]
test = df[train_amt:]
del df

train = train.reset_index()
test = test.reset_index()

print(f'Train: {len(train)} entries, test: {len(test)} entries.')

Done saving dataset to disk
Train: 800000 entries, test: 200000 entries.


In [13]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC, NuSVC
from sklearn.decomposition import PCA

y = train['word']
X = train.drop(columns=['countrycode', 'timestamp', 'recognized', 'key_id', 'drawing', 'word'])

pca = PCA(.85)
pca.fit(X)

X = pca.transform(X)
print(f'Keeping {pca.n_components_} features')

#classifier = LinearSVC(random_state=0, max_iter=100000, dual=False)
classifier = NuSVC(nu=.01)
model = OneVsRestClassifier(classifier, n_jobs=-1).fit(X, y)
joblib.dump(model, '../model' + stamp)
joblib.dump(pca, '../pca' + stamp)



In [None]:
sample = test.sample(1)
sample_predict = sample.drop(columns=['countrycode', 'timestamp', 'recognized', 'key_id', 'drawing', 'word'])

sample_predict = pca.transform(sample_predict)

prediction = model.predict(sample_predict)
display(IPImage(render_single(sample['drawing'].iloc[0])))
print(prediction[0])
print(f"{sample['word'].iloc[0]} == {prediction[0]} ? {sample['word'].iloc[0] == prediction[0]}")

In [None]:
test = pca.transform(test.drop(columns=['countrycode', 'timestamp', 'recognized', 'key_id', 'drawing', 'word']))
prediction = model.predict(test)

from sklearn.metrics import accuracy_score
acc_score = accuracy_score(test['word'].values.tolist(), prediction)
print(f"Accuracy score: {acc_score}")