In [6]:
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported project modules (e.g. utils) are picked up without a kernel restart.
# NOTE: re-running this cell in a live kernel only prints an
# "already loaded" notice; that message is harmless.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from utils import load_run, extract_random_entries, generate_pixel_columns
import pandas as pd
from sklearn.metrics import accuracy_score

# Restore the persisted run; everything below depends on it.
run = load_run('1638012544')
if run is None:
    raise Exception("Invalid run id")

# Passed to extract_random_entries once per category file.
num_entries = 1000

# The categories are the distinct 'word' labels found in the run's stored data.
og_data = run['data']
cats = list(og_data['word'].value_counts().index)
print(cats)

# One ndjson file per category.
files = [f'./dataset/{c}.ndjson' for c in cats]

# The image parameters stored with the run are required to build the pixel
# columns below, so abort early when they are missing.
try:
    img_params = run['img_params']
except KeyError:
    raise Exception("Unknown image params. Aborting")


# Gather the entries of every category file into a single flat list.
flat_data = []
for ndjson_path in files:
    flat_data.extend(extract_random_entries(ndjson_path, num_entries, recognized=True))

df = pd.DataFrame(flat_data)
print(f'Loaded {len(df)} entries from {files}')

# Add the pixel columns using the run's stored image parameters.
df = generate_pixel_columns(df, **img_params)
print('Done generating pixel columns')
data = df.reset_index(drop=True)

# Fitted artefacts stored with the run. The scaler and the PCA are optional
# and are looked up independently of each other.
models = run['models']
pca = run['pca'] if 'pca' in run else None
scaler = run['scaler'] if 'scaler' in run else None
print(f"Done loading run. PCA {'not ' if pca is None else ''}found.")

# Score on a random subset of at most 1000 rows of the freshly built dataset.
sample_size = min(1000, len(data))
sample = data.sample(sample_size).reset_index(drop=True)
target = sample['word'].values.tolist()

# Drop the label and the metadata columns; what remains is the feature matrix
# (presumably just the generated pixel columns -- verify against
# generate_pixel_columns).
test = sample.drop(columns=['countrycode', 'timestamp', 'recognized', 'key_id', 'drawing', 'word']).to_numpy()

# Apply each stored transform on its own, scaler first and PCA second.
# Previously the scaler was only applied when a PCA was present, which skipped
# scaling for scaler-only runs and raised AttributeError for PCA-only runs.
# NOTE(review): assumes a scaler saved with the run was also used when the
# models were trained -- confirm against the training notebook.
if scaler is not None:
    test = scaler.transform(test)
if pca is not None:
    test = pca.transform(test)

# Predict with every stored classifier and record its accuracy.
scores = {}
for cls_type, model in models.items():
    print(f'Predicting for classifier {cls_type}')
    prediction = model.predict(test)

    print(f"Scoring performance of classifier {cls_type}")
    acc_score = accuracy_score(target, prediction)
    scores[cls_type] = acc_score

# Summary of all classifiers once every prediction has finished.
for cls_type, score in scores.items():
    print(f"{cls_type} classifier, accuracy: {score}")

/home/chris/swd_2/aai/final-project/notebook
/home/chris/swd_2/aai/final-project/notebook/runs/1638012544
['bench', 'lighthouse', 'rhinoceros']
Loaded 3000 entries from ['./dataset/bench.ndjson', './dataset/lighthouse.ndjson', './dataset/rhinoceros.ndjson']
Done generating pixel columns
Done loading run. PCA found.
Predicting for classifier LinearSVC
Scoring performance of classifier LinearSVC
Predicting for classifier NuSVC
Scoring performance of classifier NuSVC
Predicting for classifier SGDClassifier
Scoring performance of classifier SGDClassifier
Predicting for classifier SVC
Scoring performance of classifier SVC
Predicting for classifier LinearDiscriminantAnalysis
Scoring performance of classifier LinearDiscriminantAnalysis
Predicting for classifier QuadraticDiscriminantAnalysis
Scoring performance of classifier QuadraticDiscriminantAnalysis
LinearSVC classifier, accuracy: 0.751
NuSVC classifier, accuracy: 0.879
SGDClassifier classifier, accuracy: 0.347
SVC classifier, accuracy: 0