In [1]:
import sys

import numpy as np
import pandas as pd

sys.path.append("../../")

from helpers.constants import BASE_FEATURES, FEATURE_SETS
from helpers.reduce import load_and_distill
from helpers.stitch import ReadFilesIntoDataframe

`load_and_distill` loads our dataset, limits it to tracks that are tagged as being in exactly one of our base genres, and returns that reduced dataset.  It also removes all genre columns other than the one-hot encoded columns for the base genres.  By default, it includes the features specified in `helpers.constants.BASE_FEATURES`.

In [2]:
# Load the distilled dataset, requesting only the `tags_genre` tag column.
# `multi_label` is not passed, so this is the default (single-genre) selection.
df = load_and_distill(tags=["tags_genre"])
# The `{expr = }` f-string form prints the expression text together with its value.
print(f"{df.shape = }")
# A bare last expression renders the frame as the cell's rich output.
df

df.shape = (21351, 24)


Unnamed: 0,tags_genre,mfcc_mean_0,mfcc_mean_1,mfcc_mean_2,mfcc_mean_3,mfcc_mean_4,mfcc_mean_5,mfcc_mean_6,mfcc_mean_7,mfcc_mean_8,...,genre_blues,genre_classical,genre_country,genre_disco,genre_hiphop,genre_jazz,genre_metal,genre_pop,genre_reggae,genre_rock
2,,-707.518066,172.924515,-9.369545,3.948954,1.980628,4.739360,-1.698852,-2.034302,-4.835680,...,0,0,0,0,0,0,0,1,0,0
6,,-679.308899,113.101357,-11.290030,27.696062,-7.500221,2.523050,-6.960320,-0.077308,1.269921,...,0,0,0,0,0,0,0,0,0,1
7,,-644.655396,92.231018,51.315403,39.117119,13.334866,6.326947,1.615414,7.394960,1.892228,...,0,0,0,0,0,0,0,1,0,0
10,,-638.950195,116.760223,-12.944778,29.781858,2.505325,6.105454,-0.822559,0.271074,2.930531,...,0,0,0,0,0,0,0,0,0,1
12,,-727.369507,161.314056,-28.047831,16.271910,5.778194,10.153515,6.292103,0.406335,-2.448078,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55202,,-663.076965,119.231514,26.110373,35.280354,11.771117,-1.575334,14.373231,-6.124588,-4.167463,...,0,0,0,0,0,0,0,1,0,0
55204,,-704.559509,137.016068,21.475670,34.512798,3.327875,3.022218,5.146840,-0.011305,-3.082650,...,0,0,0,0,0,1,0,0,0,0
55205,,-691.049377,179.724976,25.448507,28.137154,19.806662,5.833863,-3.997207,-0.852162,4.942894,...,0,0,0,0,0,1,0,0,0,0
55208,,-831.944580,215.028198,73.213364,23.283628,11.257351,-1.969033,-2.790072,16.264069,3.572612,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# Show the column labels of the frame currently bound to `df`.
df.columns

Index(['tags_genre', 'mfcc_mean_0', 'mfcc_mean_1', 'mfcc_mean_2',
       'mfcc_mean_3', 'mfcc_mean_4', 'mfcc_mean_5', 'mfcc_mean_6',
       'mfcc_mean_7', 'mfcc_mean_8', 'mfcc_mean_9', 'mfcc_mean_10',
       'mfcc_mean_11', 'mfcc_mean_12', 'genre_blues', 'genre_classical',
       'genre_country', 'genre_disco', 'genre_hiphop', 'genre_jazz',
       'genre_metal', 'genre_pop', 'genre_reggae', 'genre_rock'],
      dtype='object')

If `load_and_distill` is called with `multi_label=True`, it returns tracks tagged as being in one or more of our base genres, rather than exactly one.

In [4]:
# Reload with multi_label=True; this rebinds `df`, replacing the frame loaded earlier.
# tags="all" presumably keeps every tag column (the output shows many `tags_*`
# columns) — confirm against helpers.reduce.load_and_distill.
df = load_and_distill(multi_label=True, tags="all")
# The `{expr = }` f-string form prints the expression text together with its value.
print(f"{df.shape = }")
# A bare last expression renders the frame as the cell's rich output.
df

df.shape = (25562, 62)


Unnamed: 0,tags_file_name,tags_encoding,tags_artist,tags_date,tags_title,tags_album,tags_tracknumber,tags_bpm,tags_composer,tags_copyright,...,genre_blues,genre_classical,genre_country,genre_disco,genre_hiphop,genre_jazz,genre_metal,genre_pop,genre_reggae,genre_rock
2,1324898.mp3,[Lavf58.20.100],,,,,,,,,...,0,0,0,0,0,0,0,1,0,0
6,1415798.mp3,[Lavf58.20.100],,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
7,283598.mp3,[Lavf58.20.100],,,,,,,,,...,0,0,0,0,0,0,0,1,0,0
10,1149698.mp3,[Lavf58.20.100],,,,,,,,,...,0,0,0,0,0,0,0,0,0,1
12,285898.mp3,[Lavf58.20.100],,,,,,,,,...,0,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55205,19854.mp3,[Lavf58.20.100],,,,,,,,,...,0,0,0,0,0,1,0,0,0,0
55206,21854.mp3,[Lavf58.20.100],,,,,,,,,...,0,0,0,0,0,0,1,0,0,1
55208,24854.mp3,[Lavf58.20.100],,,,,,,,,...,0,0,0,0,0,0,0,1,0,0
55210,25554.mp3,[Lavf58.20.100],[Cool Cavemen],[2006],[Pump the Funk Up],[Fusion],[2],,[Cool Cavemen],,...,0,0,0,0,0,0,0,1,0,1


In [5]:
# Show the column labels of the frame currently bound to `df`.
df.columns

Index(['tags_file_name', 'tags_encoding', 'tags_artist', 'tags_date',
       'tags_title', 'tags_album', 'tags_tracknumber', 'tags_bpm',
       'tags_composer', 'tags_copyright', 'tags_genre', 'tags_encodedby',
       'tags_tracktotal', 'tags_artistsort', 'tags_musicbrainz_albumartistid',
       'tags_musicbrainz_albumid', 'tags_musicbrainz_artistid',
       'tags_musicbrainz_trackid', 'tags_license', 'tags_albumartist',
       'tags_discnumber', 'tags_conductor', 'tags_length', 'tags_label',
       'tags_musicbrainz_discid', 'tags_totaltracks', 'tags_disctotal',
       'tags_totaldiscs', 'tags_contentgroup', 'tags_taggingdate',
       'tags_engineer', 'tags_originalartist', 'tags_initialkey',
       'tags_originaldate', 'tags_originalfilename', 'tags_isrc',
       'tags_website', 'tags_lyricist', 'tags_albumartistsort', 'mfcc_mean_0',
       'mfcc_mean_1', 'mfcc_mean_2', 'mfcc_mean_3', 'mfcc_mean_4',
       'mfcc_mean_5', 'mfcc_mean_6', 'mfcc_mean_7', 'mfcc_mean_8',
       'mfcc_mean_

`load_and_distill` has a `features` parameter that takes either a list of features to include or `'all'`.  If the `pickle` parameter is specified, the dataset is also written to the named pickle file, with compression inferred from the file name.

In [2]:
# FEATURE_SETS is a list of (file name, feature set) tuples: the first element
# of each tuple becomes the file-name suffix, the second is passed as `features`.
# To avoid repeated loads of the reference data, we load it once and hand it to
# load_and_distill through its `data` parameter.

reference = ReadFilesIntoDataframe().read_mtg_jamendo_files()

# The trailing slash matters: the target path is built by plain string concatenation.
folder = "../../datasets/"
for count, (name, features) in enumerate(FEATURE_SETS):
    # Prefix each file with its two-digit, zero-padded position in FEATURE_SETS.
    filename = f"dataset_{count:02d}_{name}"
    # `df` is rebound on every iteration; after the loop it holds the last dataset built.
    df = load_and_distill(data=reference, features=features, pickle=folder + filename)
    print(f"Built dataset {filename} with shape = {df.shape}")

Built dataset dataset_00_all.pickle.bz2 with shape = (21351, 2753)
Built dataset dataset_01_mean.pickle with shape = (21351, 23)
Built dataset dataset_02_mean_cov.pickle with shape = (21351, 192)
Built dataset dataset_03_mean_icov.pickle with shape = (21351, 192)
Built dataset dataset_04_mean_cov_icov.pickle with shape = (21351, 361)
Built dataset dataset_05_pvtt.pickle with shape = (21351, 20)
Built dataset dataset_06_pvtt_mean.pickle with shape = (21351, 33)
Built dataset dataset_07_pvtt_mean_cov.pickle with shape = (21351, 202)
Built dataset dataset_08_pvtt_mean_icov.pickle with shape = (21351, 202)
Built dataset dataset_09_pvtt_mean_cov_icov.pickle with shape = (21351, 371)
