In [5]:
%load_ext autoreload
%autoreload 2
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neural_network import MLPClassifier
from itertools import repeat
import time, os, joblib
import pandas as pd
from utils import generate_pixel_columns, extract_best_entries, extract_first_entries, equalize_by

# categories = ['car']
categories = ['ambulance','bed','bench','bread','castle','cell phone','chair','church','coffee cup','crown','cruise ship','cup','dishwasher','dresser','eye','face',
    'fan','fire hydrant','fish','hammer','hat','helicopter','ice cream','lantern','passport','pickup truck','pillow','power outlet','sailboat','sandwich','snowman',
    'star','strawberry','suitcase','table','telephone','traffic light','watermelon','wine glass']

image_gen_params = {
    'magnification': 1,
    'resolution': 32,
    'invert_color': True,
    'stroke_width_scale': 1
}

root = './countries'
if not os.path.exists(root):
    os.makedirs(root)

done = 0
for category in categories:
    t0 = time.time()
    print(f'Processing "{category}" ({done + 1}/{len(categories)}):')
    folder = f'{root}/{category}'
    if not os.path.exists(folder):
        os.makedirs(folder)
    df = extract_first_entries(f'./dataset/{category}.ndjson', recognized=True)
    df = equalize_by(df, 'countrycode')
    print(f"Extracted {len(df)} entries from category {category}")
    print(f"Retained {len(df['countrycode'].value_counts())} countries")
    df = generate_pixel_columns(df, **image_gen_params)

    train_amt = int(len(df) * .9)

    train = df[:train_amt]
    test = df[train_amt:]

    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    print(f'Train: {len(train)} entries, test: {len(test)} entries.')

    y = train['countrycode'].to_numpy()
    X = train.filter(regex='pixel.+').to_numpy()
    print("Done generating features and target")

    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    pca = PCA(.85)
    X = pca.fit_transform(X)
    print(f'PCA & standardization done. Keeping {pca.n_components_} features')

    classifier = MLPClassifier(hidden_layer_sizes=tuple(repeat(int(pca.n_components_ * 1.2), 3)), solver='lbfgs', alpha=1e-07)
    start = time.time()
    model = OneVsRestClassifier(classifier, n_jobs=-1).fit(X, y)
    end = time.time()
    print(f"Done training model in {'{:.2f}'.format(end - start)}s")
    joblib.dump(model, folder + '/model')
    joblib.dump(pca, folder + '/pca')
    joblib.dump(scaler, folder + '/scaler')
    print("Done saving model, pca, and scaler to disk")

    test2 = test.filter(regex='pixel.+').to_numpy()
    test2 = scaler.transform(test2)
    test2 = pca.transform(test2)
    prediction = model.predict(test2)

    countries = list(test['countrycode'].value_counts().keys())
    counts = {}
    for idx in range(len(test)):
        country = test['countrycode'].iloc[idx]
        entry_score = counts.get(country, (0, 0))
        entry_score = (entry_score[0] + 1 if prediction[idx] == country else entry_score[0], entry_score[1] + 1)
        counts[country] = entry_score
    scores = {}
    for country in countries:
        scores[country] = (counts[country][0] / counts[country][1]) * 100

    scores = [(k, v) for k, v in scores.items()]
    scores.sort(key=lambda e : e[1], reverse=True)
    threshold = 100 / len(countries)

    out = []

    out.append(f'Category "{category}"\n')
    out.append(str(image_gen_params))
    out.append(str(scores) + '\n')

    out.append(f'Scores greater than {"{:.2f}".format(threshold)}% (random chance):')
    for entry in scores:
        if entry[1] > threshold:
            out.append(f'  {entry[0]}: {"{:.2f}".format(entry[1])}%')

    from sklearn.metrics import accuracy_score
    acc_score = accuracy_score(test['countrycode'].values.tolist(), prediction)
    out.append(f"Overall accuracy: {'{:.2f}'.format(acc_score)}%\n")

    out = '\n'.join(out)
    with open(folder + '/stats', 'w') as f:
        f.write(out)

    print(out)

    duration = time.time() - t0
    print(f"Finished processing cateogory in {int(duration // 60)}m{int(duration % 60)}s")
    done += 1




The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Processing "ambulance" (1/39):
Extracted 22000 entries from category ambulance
Retained 22 countries
Train: 19800 entries, test: 2200 entries.
Done generating features and target
PCA & standardization done. Keeping 530 features
Done training model in 582.05s
Done saving model, pca, and scaler to disk
Category "ambulance"

{'magnification': 1, 'resolution': 32, 'invert_color': True, 'stroke_width_scale': 1}
[('TH', 14.285714285714285), ('KR', 12.62135922330097), ('SA', 12.612612612612612), ('IN', 10.784313725490197), ('FI', 10.1010101010101), ('HU', 9.278350515463918), ('CA', 9.090909090909092), ('NL', 8.88888888888889), ('GB', 7.8431372549019605), ('PH', 7.446808510638298), ('BR', 6.024096385542169), ('CZ', 5.555555555555555), ('FR', 5.3097345132743365), ('RU', 5.263157894736842), ('AU', 5.128205128205128), ('PL', 4.958677685950414), ('US', 4.3478260869565215), ('IT', 3.3333333333333335), ('SE', 3.1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


Done training model in 764.84s
Done saving model, pca, and scaler to disk
Category "bench"

{'magnification': 1, 'resolution': 32, 'invert_color': True, 'stroke_width_scale': 1}
[('JP', 13.541666666666666), ('KR', 13.26530612244898), ('PH', 12.195121951219512), ('NL', 12.037037037037036), ('SE', 10.377358490566039), ('FI', 8.51063829787234), ('US', 7.6923076923076925), ('TH', 7.608695652173914), ('RU', 7.000000000000001), ('IT', 6.481481481481481), ('PL', 6.0606060606060606), ('GB', 4.9504950495049505), ('CZ', 4.081632653061225), ('AU', 3.9215686274509802), ('FR', 3.125), ('CA', 2.586206896551724), ('HU', 2.127659574468085), ('DE', 2.083333333333333), ('BR', 1.0309278350515463)]

Scores greater than 5.26% (random chance):
  JP: 13.54%
  KR: 13.27%
  PH: 12.20%
  NL: 12.04%
  SE: 10.38%
  FI: 8.51%
  US: 7.69%
  TH: 7.61%
  RU: 7.00%
  IT: 6.48%
  PL: 6.06%
Overall accuracy: 0.07%

Finished processing cateogory in 12m58s
Processing "bread" (4/39):
Extracted 19000 entries from category b