In [4]:
%load_ext autoreload
%autoreload 2
from itertools import repeat
import time, os, joblib

import pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler

from utils import generate_pixel_columns, extract_best_entries, extract_first_entries, equalize_by

# Categories to process; each one is read from ./dataset/<category>.ndjson.
# Only 'car' is enabled here. The commented-out list below is a larger set kept
# for reference and can be swapped in to process many categories in one run.
categories = ['car']
# categories = ['ambulance','bed','bench','bread','castle','cell phone','chair','church','coffee cup','crown','cruise ship','cup','dishwasher','dresser','eye','face',
    # 'fan','fire hydrant','fish','hammer','hat','helicopter','ice cream','lantern','passport','pickup truck','pillow','power outlet','sailboat','sandwich','snowman',
    # 'star','strawberry','suitcase','table','telephone','traffic light','watermelon','wine glass']

# Keyword arguments unpacked into generate_pixel_columns(); the same dict is
# also written verbatim into each category's stats file.
image_gen_params = {
    'magnification': 1,
    'resolution': 32,
    'invert_color': True,
    'stroke_width_scale': 1
}

# Root output directory; one sub-folder per category is created beneath it.
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
root = './countries'
os.makedirs(root, exist_ok=True)

# Fixed seed so the MLP weight initialisation (and therefore the saved model and
# the reported scores) is repeatable across runs.
RANDOM_SEED = 0


def _per_country_accuracy(actual, predicted, countries):
    """Return ``(country, percent_correct)`` pairs, best-scoring country first.

    Parameters:
        actual: iterable of true country codes, one per test row.
        predicted: iterable of predicted country codes, aligned with ``actual``.
        countries: country codes to report; every one must occur in ``actual``.
            Their order decides how ties are ordered, because the sort is stable.

    Returns:
        A list of ``(country, percentage)`` tuples sorted by percentage,
        descending. Percentages are plain Python floats in the range 0-100.
    """
    hits, totals = {}, {}
    for truth, guess in zip(actual, predicted):
        totals[truth] = totals.get(truth, 0) + 1
        # int() keeps the tallies as Python ints even if == yields a numpy bool.
        hits[truth] = hits.get(truth, 0) + int(truth == guess)
    scores = [(country, (hits[country] / totals[country]) * 100) for country in countries]
    scores.sort(key=lambda entry: entry[1], reverse=True)
    return scores


# For each category: build pixel features, fit scaler -> PCA -> one-vs-rest MLP
# to predict the country code, persist the fitted objects, then evaluate on the
# held-out rows and write a stats report next to the model.
for position, category in enumerate(categories, start=1):
    t0 = time.time()
    print(f'Processing "{category}" ({position}/{len(categories)}):')
    folder = f'{root}/{category}'
    os.makedirs(folder, exist_ok=True)

    df = extract_first_entries(f'./dataset/{category}.ndjson', recognized=True)
    df = equalize_by(df, 'countrycode')
    print(f"Extracted {len(df)} entries from category {category}")
    print(f"Retained {df['countrycode'].nunique()} countries")
    df = generate_pixel_columns(df, **image_gen_params)

    # 90/10 positional split.
    # NOTE(review): this is only a fair split if equalize_by() returns rows in
    # shuffled order; if rows come back grouped by country the test set would be
    # dominated by the last countries - confirm against utils.equalize_by.
    split_at = int(len(df) * .9)
    train = df[:split_at].reset_index(drop=True)
    test = df[split_at:].reset_index(drop=True)
    print(f'Train: {len(train)} entries, test: {len(test)} entries.')

    y = train['countrycode'].to_numpy()
    X = train.filter(regex='pixel.+').to_numpy()
    print("Done generating features and target")

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    # A float n_components keeps as many components as needed to explain that
    # fraction (85%) of the variance.
    pca = PCA(.85)
    X = pca.fit_transform(X)
    print(f'PCA & standardization done. Keeping {pca.n_components_} features')

    # Three hidden layers, each 1.2x the number of retained PCA components.
    hidden_units = int(pca.n_components_ * 1.2)
    classifier = MLPClassifier(
        hidden_layer_sizes=(hidden_units,) * 3,
        solver='lbfgs',
        alpha=1e-07,
        random_state=RANDOM_SEED,
    )
    start = time.time()
    model = OneVsRestClassifier(classifier, n_jobs=-1).fit(X, y)
    end = time.time()
    print(f"Done training model in {end - start:.2f}s")

    joblib.dump(model, folder + '/model')
    joblib.dump(pca, folder + '/pca')
    joblib.dump(scaler, folder + '/scaler')
    print("Done saving model, pca, and scaler to disk")

    # Apply the transforms fitted on the training data to the held-out rows.
    X_test = test.filter(regex='pixel.+').to_numpy()
    X_test = pca.transform(scaler.transform(X_test))
    prediction = model.predict(X_test)

    countries = list(test['countrycode'].value_counts().keys())
    scores = _per_country_accuracy(test['countrycode'], prediction, countries)
    # Accuracy expected from guessing uniformly among the countries in the test set.
    threshold = 100 / len(countries)

    out = [
        f'Category "{category}"\n',
        str(image_gen_params),
        str(scores) + '\n',
        f'Scores greater than {threshold:.2f}% (random chance):',
    ]
    out.extend(f'  {country}: {score:.2f}%' for country, score in scores if score > threshold)

    # accuracy_score returns a fraction in [0, 1]; scale it so the value matches
    # the '%' unit printed after it and the per-country percentages above.
    acc_score = accuracy_score(test['countrycode'].values.tolist(), prediction)
    out.append(f"Overall accuracy: {acc_score * 100:.2f}%\n")

    out = '\n'.join(out)
    with open(folder + '/stats', 'w') as f:
        f.write(out)

    print(out)

    duration = time.time() - t0
    print(f"Finished processing cateogory in {int(duration // 60)}m{int(duration % 60)}s")




The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Processing "car" (1/1):
Extracted 27000 entries from category car
Retained 27 countries
Train: 24300 entries, test: 2700 entries.
Done generating features and target
PCA & standardization done. Keeping 504 features
Done training model in 731.36s
Done saving model, pca, and scaler to disk
Category "car"

{'magnification': 1, 'resolution': 32, 'invert_color': True, 'stroke_width_scale': 1}
[('KR', 17.346938775510203), ('TW', 12.5), ('JP', 9.174311926605505), ('AE', 8.421052631578947), ('CZ', 8.421052631578947), ('VN', 7.865168539325842), ('DE', 7.8431372549019605), ('PH', 6.862745098039216), ('IN', 6.862745098039216), ('SA', 6.730769230769231), ('BR', 6.5420560747663545), ('TH', 6.25), ('FI', 5.952380952380952), ('NL', 4.901960784313726), ('AU', 4.25531914893617), ('RO', 4.2105263157894735), ('SK', 4.0), ('PL', 3.4090909090909087), ('FR', 3.3707865168539324), ('HU', 3.260869565217391), ('GB', 3.191489