# Train

This notebook trains several classifiers on the like/dislike playlists downloaded in the **Get Spotify Data** notebook, then saves the best one (highest accuracy) in the `/classifiers` folder.

In [None]:
from pkg.spotifyclassifier import *
from pkg.run import *
from pkg.wrangling import *
from pkg.constants import LABEL_LIST
from pkg.serialize import save_classifier

### Prepare Training Data

The metadata, feature space, and genres are read in and stored in a dictionary keyed by `track_id` (`default_song_data` in the cell below).

In [None]:
# Load the song table from the CSV and wrap the result in a FrozenMap.
# NOTE(review): the FrozenMap name suggests a read-only mapping, and the
# meaning of the positional `True` flag is not visible here — confirm both
# against pkg.wrangling.
default_song_data = FrozenMap(
    read_data('data/data.csv', True)
)
print('Song data reading complete.')

In [None]:
# Record the feature names as a tuple, taken from the 'features' entry of the
# first song the mapping yields.
# NOTE(review): assumes every song carries the same 'features' keys — confirm.
default_feature_names = tuple(next(iter(default_song_data.values()))['features'].keys())

In [None]:
# Sanity check on the freshly loaded song data.
# NOTE(review): what exactly is validated is defined in the pkg helpers, not
# in this notebook — see sanity_check's own documentation.
sanity_check(default_song_data)

In [None]:
# Run the cluster-size test helper on the unclustered song data.
# NOTE(review): the literal 10 is presumably a cluster count (or an upper
# bound on the counts tried) — confirm against test_cluster_size.
test_cluster_size(default_song_data, 10)

Compute k-means clusters over the full song data (before the training/validation split) for the algorithms that use them.

In [None]:
# Cluster the songs with the k-means helper, using NUM_CLUSTERS clusters.
# The helper returns two values: the clustered copy of the song data, and
# `songs_by_cluster`, which is later indexed by NUM_CLUSTERS.
#
# The names are deliberately NOT pre-bound to empty dicts: if the call fails,
# later cells should stop with a NameError rather than silently run against
# empty placeholders.
clustered_song_data, songs_by_cluster = get_kmeans_clusters(default_song_data, NUM_CLUSTERS)

In [None]:
# Sanity check: compare the feature-key set of the first default song with
# that of the first clustered song. The two are expected to differ; identical
# key sets are treated as a sign that the default features were altered.
# NOTE(review): presumably clustering adds extra feature keys to the clustered
# copy only — confirm against get_kmeans_clusters.
if (
    set(next(iter(default_song_data.values()))['features'].keys())
    == set(next(iter(clustered_song_data.values()))['features'].keys())
):
    raise ValueError('Default features messed up.')

In [None]:
# Split both views of the data (default and clustered) into training and
# validation parts with a single call.
(
    default_training_data,
    default_validation_data,
    clustered_training_data,
    clustered_validation_data,
) = get_experiment_split(default_song_data, clustered_song_data)

# Derive the training clusters from the clustered training data and the
# songs-by-cluster entry for the configured NUM_CLUSTERS.
training_clusters = get_train_clusters(
    clustered_training_data,
    songs_by_cluster[NUM_CLUSTERS],
)

### Train Classifiers

In [None]:
# Active suite on the default (unclustered) training/validation data, over
# every entry in SUPPORTED_ALGS and AL_STRATS.
print('----\nUnclustered\n----')
active_unclustered_results = run_active_suite(
    default_song_data,
    default_training_data,
    default_validation_data,
    SUPPORTED_ALGS,
    AL_STRATS,
)

In [None]:
# Same active suite, but fed the clustered training/validation data.
# The first argument is still the default song data, as in the unclustered run.
print('----\nClustered\n----')
active_clustered_results = run_active_suite(
    default_song_data,
    clustered_training_data,
    clustered_validation_data,
    SUPPORTED_ALGS,
    AL_STRATS,
)

In [None]:
# Cluster suite: clustered training/validation data plus the precomputed
# training clusters.
print('\n----\nClustered w/ Cluster Sampling\n----')
active_cluster_sampled_results = run_clusters_suite(
    default_song_data,
    clustered_training_data,
    clustered_validation_data,
    training_clusters,
)

In [None]:
# Pick the best classifier across the default and clustered data.
# NOTE(review): per the notebook introduction, "best" means highest
# accuracy — confirm against get_highest_benchmark.
best_classifier = get_highest_benchmark(
    default_song_data,
    default_training_data, default_validation_data,
    clustered_training_data, clustered_validation_data,
)

# Specify a filename instead of None if desired.
save_classifier(best_classifier, None)

print('Classifier saved successfully.')