# Preprocessing

## Step 1: Preprocessing of tags

The file `autotagging_genre.tsv`, which contains the listing of genre tags, can be found [here](https://github.com/MTG/mtg-jamendo-dataset/blob/master/data/autotagging_genre.tsv) in the repo. It is also saved under `/data/autotagging_genre.tsv` for convenience.


In [None]:
# Install scikit-learn if necessary
# !pip install -U scikit-learn

In [8]:
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import os

# TODO: Update path
filepath = "/Users/lawrenceclai/Documents/projects/mids_w207_music_genre/src/mids_w207_music_genre/member_workspaces/lawrence/mtg-jamendo-dataset/data/autotagging_genre.tsv"

# Import and clean genre tags
# Every line is loaded as one single string (column 0) and split on tabs
# manually below; the last field keeps all remaining tab-separated tags.
# NOTE(review): this relies on the tsv containing no commas, since read_csv
# uses its default "," separator here -- confirm against the data file.
df_tags = pd.read_csv(filepath, header=None, skiprows=1)
# `n` has to be passed by keyword: pandas 2.0 made every argument after `pat`
# keyword-only, so the positional form `split("\t", 5, ...)` raises TypeError.
df_tags = df_tags[0].str.split("\t", n=5, expand=True)
cols_names = ['metadata_track_id', 'metadata_artist_id', 'metadata_album_id',
              'metadata_path', 'metadata_duration', 'metadata_tags']
df_tags.columns = cols_names
# Replace the "---" separator inside each tag literally (not as a regex),
# then turn the remaining tab-separated string into a list of tags
df_tags['metadata_tags'] = (df_tags['metadata_tags']
                            .str.replace("---", "_", regex=False)
                            .str.split("\t"))
# File name of the track without its ".mp3" extension
df_tags['metadata_filename'] = df_tags['metadata_path'].apply(lambda x: os.path.basename(x).replace(".mp3", ""))

# Get one-hot-encoding for all genre tags
mlb = MultiLabelBinarizer()
df_dummies = pd.DataFrame(mlb.fit_transform(df_tags['metadata_tags']),
                          columns=mlb.classes_,
                          index=df_tags.index)
# The list-valued tag column is replaced by its one-hot-encoded columns
df_tags = df_tags.drop(columns="metadata_tags")
df_tags = pd.concat([df_tags, df_dummies], axis=1)

df_tags.head(5)

Unnamed: 0,metadata_track_id,metadata_artist_id,metadata_album_id,metadata_path,metadata_duration,metadata_filename,genre_60s,genre_70s,genre_80s,genre_90s,...,genre_soundtrack,genre_swing,genre_symphonic,genre_synthpop,genre_techno,genre_trance,genre_tribal,genre_triphop,genre_world,genre_worldfusion
0,track_0000214,artist_000014,album_000031,14/214.mp3,124.6,214,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,track_0000215,artist_000014,album_000031,15/215.mp3,151.4,215,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,track_0000216,artist_000014,album_000031,16/216.mp3,234.9,216,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,track_0000217,artist_000014,album_000031,17/217.mp3,127.9,217,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,track_0000218,artist_000014,album_000031,18/218.mp3,180.7,218,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Step 2: Preprocessing acousticbrainz jsons

This step assumes that Step 1 from `download.ipynb` has already been executed and all acousticbrainz jsons have been unpacked from their tars. An illustration of how the acousticbrainz jsons should be organized is shown under `/datasets/acousticbrainz-json`.

In [11]:
# Get a list of filepaths with genre tags
# (distinct values of the `metadata_path` column built in Step 1; these are
# later passed to `combine_acousticbrainz_jsons` to locate the matching jsons)
filepaths = df_tags['metadata_path'].unique()

### Step 2.1: Combine and flatten acousticbrainz jsons

In [19]:
import json
import os
import pandas as pd
import numpy as np
import pickle
from typing import Dict, List


def flatten_json(child_value: Dict, parent_key:str="root") -> Dict:
    """Recursively collapse a (nested) json dictionary into a single level
    Every leaf is stored under "<parent key>_<leaf key>", where the parent key
    is the key of the dictionary that directly contains the leaf. Each value is
    wrapped in a one-element list, and list leaves are converted into numpy
    arrays first.
    Parameters:
        child_value (Dict): Dictionary (json object) to be flattened
        parent_key (str): Key under which child_value is stored in its parent
    Returns:
        flattened (Dict): A flattened json in dictionary form
    """
    flattened = {}
    for key, value in child_value.items():
        if isinstance(value, dict):
            # Nested object: its leaves are prefixed with this key only
            flattened.update(flatten_json(value, key))
            continue
        leaf = np.array(value) if isinstance(value, list) else value
        flattened[F"{parent_key}_{key}"] = [leaf]
    return flattened


def combine_acousticbrainz_jsons(filepaths: List[str],
                                 folderpath_src: str) -> pd.DataFrame:
    """Load, flatten and stack the acousticbrainz jsons of the given tracks
    Tracks whose json is missing, unreadable or not valid json are skipped.
    Parameters:
        filepaths (List[str]): A list of subfilepaths to tracks
        folderpath_src (str): Source folderpath
    Returns:
        df_result (pd.DataFrame): A dataframe with flattened acousticbrainz
            features, one row per loaded json, plus a `metadata_path` column
            holding the original subfilepath
    Raises:
        ValueError: If none of the jsons could be loaded
    """

    # Collect one single-row dataframe per successfully loaded json
    frames = []

    # Loop through all tracks with genre tags
    for filepath in filepaths:

        # Swap only the file extension; a plain replace("mp3", "json") would
        # also rewrite any other "mp3" occurrence in the path
        filepath_src = os.path.join(folderpath_src,
                                    os.path.splitext(filepath)[0] + ".json")

        try:
            with open(filepath_src, encoding="utf-8") as fp:
                content = json.load(fp)
        except (OSError, ValueError):
            # OSError: file missing/unreadable; ValueError: invalid json or
            # undecodable bytes. Anything else (e.g. KeyboardInterrupt) is
            # deliberately left to propagate.
            continue

        df_tmp = pd.DataFrame.from_dict(flatten_json(content))
        df_tmp['metadata_path'] = filepath
        frames.append(df_tmp)

    if not frames:
        # pd.concat would raise the same exception type with a message that
        # does not point to the actual cause
        raise ValueError(F"No acousticbrainz json could be loaded from {folderpath_src!r}")

    df_result = pd.concat(frames).reset_index(drop=True)

    return df_result

In [21]:
# Update folderpath_src as necessary
# (root folder that is joined with each track subfilepath to locate its json)
folderpath_src = "/Users/lawrenceclai/Documents/projects/mids_w207_music_genre/src/mids_w207_music_genre/datasets/acousticbrainz_json"
# Build one dataframe with a row of flattened features for every json that could be loaded
df_acousticbrainz = combine_acousticbrainz_jsons(filepaths, folderpath_src)
df_acousticbrainz.head(5)

Unnamed: 0,lowlevel_average_loudness,barkbands_crest_dmean,barkbands_crest_dmean2,barkbands_crest_dvar,barkbands_crest_dvar2,barkbands_crest_max,barkbands_crest_mean,barkbands_crest_median,barkbands_crest_min,barkbands_crest_var,...,hpcp_median,hpcp_min,hpcp_var,tonal_chords_histogram,tonal_thpcp,tonal_chords_key,tonal_chords_scale,tonal_key_key,tonal_key_scale,metadata_path
0,0.203354,2.287306,3.795858,4.570755,12.773065,24.830286,11.892841,11.545135,2.702297,13.075336,...,"[0.479233175516, 0.348901093006, 0.03901030123...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.200233325362, 0.107105828822, 0.02888457477...","[4.54714870453, 1.00633621216, 19.1949310303, ...","[1.0, 0.708558619022, 0.19143679738, 0.1464927...",A,major,D,minor,14/214.mp3
1,0.95807,2.81178,4.748635,9.808616,25.659473,24.480026,6.925749,5.68249,2.164261,14.92437,...,"[0.142016798258, 0.128937482834, 0.10845530033...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.090464040637, 0.0706968680024, 0.0477029457...","[2.39117097855, 0.0, 0.613120794296, 10.361741...","[1.0, 0.846840858459, 0.831799566746, 0.893409...",A,major,D,minor,15/215.mp3
2,0.898877,2.594054,4.430159,9.129704,24.725008,22.518446,7.279491,6.278421,2.492509,12.271876,...,"[0.0901899263263, 0.0737978070974, 0.051438912...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0.0529752671719, 0.051395341754, 0.0362989716...","[37.9964447021, 6.08575391769, 1.75854575634, ...","[1.0, 0.938977897167, 0.645424842834, 0.530215...",B,minor,B,minor,16/216.mp3


### Step 2.2: Flatten 1d and 2d arrays

In [22]:
def flatten_1d_arrays(df: pd.DataFrame,
                      feature_names: List[str])->pd.DataFrame:
    """Expand columns holding 1d arrays into one scalar column per element
    Each column in feature_names is replaced by columns named
    "<feature_name>_<i>", appended after the remaining columns. The array
    length is taken from the first row, so every row is expected to hold an
    array of the same length. The input dataframe is left untouched.
    Parameters:
        df (pd.DataFrame): Original dataframe with fields to be flattened
        feature_names (List[str]): List of column names with 1d arrays
    Returns:
        df (pd.DataFrame): New dataframe with flattened 1d array fields
    """
    # Nothing to flatten: pd.concat would raise on an empty list
    if len(feature_names) == 0:
        return df.copy()

    df_flattened_features = []
    for feature_name in feature_names:
        # .iloc reads the first row by position, so this also works when the
        # index does not contain the label 0 (e.g. after filtering rows)
        n_values = df[feature_name].iloc[0].shape[0]
        cols_names = [F"{feature_name}_{i}" for i in range(n_values)]
        df_tmp = pd.DataFrame(df[feature_name].tolist(),
                              index=df.index,
                              columns=cols_names)
        df_flattened_features.append(df_tmp)

    # drop() returns a new dataframe, so the caller's dataframe is not mutated
    df_remaining = df.drop(columns=feature_names)

    return pd.concat([df_remaining] + df_flattened_features, axis=1)

def flatten_2d_arrays(df: pd.DataFrame,
                      feature_names: List[str])->pd.DataFrame:
    """Expand columns holding 2d arrays into one scalar column per element
    Each column in feature_names is replaced by columns named
    "<feature_name>_<i>_<j>" (row index i, column index j, in row-major
    order), appended after the remaining columns. The array shape is taken
    from the first row, so every row is expected to hold an array of the same
    shape. The input dataframe is left untouched.
    Parameters:
        df (pd.DataFrame): Original dataframe with fields to be flattened
        feature_names (List[str]): List of column names with 2d arrays
    Returns:
        df (pd.DataFrame): New dataframe with flattened 2d array fields
    """
    # Nothing to flatten: pd.concat would raise on an empty list
    if len(feature_names) == 0:
        return df.copy()

    df_flattened_features = []
    for feature_name in feature_names:
        # .iloc reads the first row by position, so this also works when the
        # index does not contain the label 0 (e.g. after filtering rows)
        n_rows, n_cols = df[feature_name].iloc[0].shape[:2]

        cols_names = [F"{feature_name}_{i}_{j}"
                      for i in range(n_rows)
                      for j in range(n_cols)]

        df_tmp = pd.DataFrame([arr.flatten() for arr in df[feature_name].to_list()],
                              index=df.index,
                              columns=cols_names)

        df_flattened_features.append(df_tmp)

    # drop() returns a new dataframe, so the caller's dataframe is not mutated
    df_remaining = df.drop(columns=feature_names)

    return pd.concat([df_remaining] + df_flattened_features, axis=1)


In [23]:
# Flatten 1d arrays
# (one feature family per line; order is kept as it determines the order of
# the flattened columns)
features_to_flatten_1d = [
    'barkbands_dmean', 'barkbands_dmean2', 'barkbands_dvar', 'barkbands_dvar2', 'barkbands_max',
    'barkbands_mean', 'barkbands_median', 'barkbands_min', 'barkbands_var',
    'beats_loudness_band_ratio_dmean', 'beats_loudness_band_ratio_dmean2',
    'beats_loudness_band_ratio_dvar', 'beats_loudness_band_ratio_dvar2',
    'beats_loudness_band_ratio_max', 'beats_loudness_band_ratio_mean',
    'beats_loudness_band_ratio_median', 'beats_loudness_band_ratio_min',
    'beats_loudness_band_ratio_var',
    'erbbands_dmean', 'erbbands_dmean2', 'erbbands_dvar', 'erbbands_dvar2', 'erbbands_max',
    'erbbands_mean', 'erbbands_median', 'erbbands_min', 'erbbands_var',
    'gfcc_mean',
    'hpcp_dmean', 'hpcp_dmean2', 'hpcp_dvar', 'hpcp_dvar2', 'hpcp_max',
    'hpcp_mean', 'hpcp_median', 'hpcp_min', 'hpcp_var',
    'melbands_dmean', 'melbands_dmean2', 'melbands_dvar', 'melbands_dvar2', 'melbands_max',
    'melbands_mean', 'melbands_median', 'melbands_min', 'melbands_var',
    'mfcc_mean',
    'spectral_contrast_coeffs_dmean', 'spectral_contrast_coeffs_dmean2',
    'spectral_contrast_coeffs_dvar', 'spectral_contrast_coeffs_dvar2',
    'spectral_contrast_coeffs_max', 'spectral_contrast_coeffs_mean',
    'spectral_contrast_coeffs_median', 'spectral_contrast_coeffs_min',
    'spectral_contrast_coeffs_var',
    'spectral_contrast_valleys_dmean', 'spectral_contrast_valleys_dmean2',
    'spectral_contrast_valleys_dvar', 'spectral_contrast_valleys_dvar2',
    'spectral_contrast_valleys_max', 'spectral_contrast_valleys_mean',
    'spectral_contrast_valleys_median', 'spectral_contrast_valleys_min',
    'spectral_contrast_valleys_var',
    'tonal_chords_histogram', 'tonal_thpcp',
]
df_acousticbrainz = flatten_1d_arrays(df_acousticbrainz, features_to_flatten_1d)

# Flatten 2d arrays
features_to_flatten_2d = [
    'gfcc_cov',
    'gfcc_icov',
    'mfcc_cov',
    'mfcc_icov',
]
df_acousticbrainz = flatten_2d_arrays(df_acousticbrainz, features_to_flatten_2d)

df_acousticbrainz.head(5)

Unnamed: 0,lowlevel_average_loudness,barkbands_crest_dmean,barkbands_crest_dmean2,barkbands_crest_dvar,barkbands_crest_dvar2,barkbands_crest_max,barkbands_crest_mean,barkbands_crest_median,barkbands_crest_min,barkbands_crest_var,...,mfcc_icov_12_3,mfcc_icov_12_4,mfcc_icov_12_5,mfcc_icov_12_6,mfcc_icov_12_7,mfcc_icov_12_8,mfcc_icov_12_9,mfcc_icov_12_10,mfcc_icov_12_11,mfcc_icov_12_12
0,0.203354,2.287306,3.795858,4.570755,12.773065,24.830286,11.892841,11.545135,2.702297,13.075336,...,0.000121,-0.000552,0.000111,-0.00081,-0.000838,0.001762,-0.003689,0.003405,-0.012208,0.023467
1,0.95807,2.81178,4.748635,9.808616,25.659473,24.480026,6.925749,5.68249,2.164261,14.92437,...,-0.001957,0.004144,-0.002776,-0.001259,-0.003586,-0.001336,0.001763,-0.000964,-0.014582,0.031316
2,0.898877,2.594054,4.430159,9.129704,24.725008,22.518446,7.279491,6.278421,2.492509,12.271876,...,-0.001152,0.002122,-0.002031,-0.001206,-6.9e-05,-0.001388,-6.6e-05,0.003733,-0.016123,0.028036


## Step 3: Combine acousticbrainz jsons with genre tags

In [3]:
# Inner join genre tags with acousticbrainz features
df_data = pd.merge(left=df_tags,
                   right=df_acousticbrainz,
                   on="metadata_path",
                   how="inner")

# Rearrange columns in dataframe by key > non-features > features
key = "metadata_path"
non_feature_prefixes = ["metadata_", "audio_", "tags_", "genre_", "version_"]
# A column is a non-feature when its name starts with one of the prefixes.
# The key is excluded explicitly: it also starts with "metadata_" and is
# already placed first, so listing it again would duplicate the column.
non_features = sorted(c for c in df_data.columns
                      if c != key and c.startswith(tuple(non_feature_prefixes)))
# Everything that is neither the key nor a non-feature is a feature
features = sorted(set(df_data.columns) - set(non_features) - {key})
df_data = df_data[[key] + non_features + features]

# Update filepath as necessary
filepath = "/Users/lawrenceclai/Documents/projects/mids_w207_music_genre/src/mids_w207_music_genre/datasets/processed/gtm_jamendo_genre_features.pickle.bz2"

# Export data into a pickle.bz2 file
df_data.to_pickle(filepath, compression="bz2")

## (Optional) Step 4: Manage smaller chunks of pickle.bz2

This step splits the `.pickle.bz2` file into smaller chunks and/or recombines those chunks into a single file. Alternatively, you can simply download the complete file from [here](https://drive.google.com/file/d/16DJpJiOIrEGKtOy-fE95jiaZIMZCxHmf/view?usp=sharing).

### Step 4.1: Break pickle.bz2 into smaller chunks

In [9]:
import os
import shlex
# Update filepath as necessary
filepath = "/Users/lawrenceclai/Documents/projects/mids_w207_music_genre/src/mids_w207_music_genre/datasets/processed/gtm_jamendo_genre_features.pickle.bz2"
folderpath = os.path.dirname(filepath)
# Split the file into 50M pieces named "<filepath>_<suffix>" (suffix chosen by `split`).
# Paths are shell-quoted so that spaces or special characters in a user-edited
# path do not break (or alter) the command.
cmd = F"cd {shlex.quote(folderpath)} && split -b 50M {shlex.quote(filepath)} {shlex.quote(filepath + '_')}"
# The displayed value is the shell exit status (0 on success)
os.system(cmd)

0

### Step 4.2: Combine pickle.bz2 chunks into one

In [5]:
import os
import shlex
# Update filepath as necessary
filepath_prefix = "/Users/lawrenceclai/Documents/projects/mids_w207_music_genre/src/mids_w207_music_genre/datasets/processed/gtm_jamendo_genre_features.pickle.bz2_"
# Derive the folder from the prefix itself so that this cell also runs when
# the splitting cell (which defines `filepath`) has not been executed
folderpath = os.path.dirname(filepath_prefix)
# Destination is the prefix without its trailing "_"
filepath_dst = filepath_prefix[:-1]
# The prefix already ends with "_", so the glob is "<prefix>*"; an extra "_"
# would match no chunk while the redirect still truncates the destination.
# The "*" is kept outside the quotes so the shell still expands it.
cmd = F"cd {shlex.quote(folderpath)} && cat {shlex.quote(filepath_prefix)}* > {shlex.quote(filepath_dst)}"
# The displayed value is the shell exit status (0 on success)
os.system(cmd)

0

In [6]:
import pandas as pd
# Load the recombined file; `filepath_dst` is set by the chunk-combining cell above.
# NOTE(review): unpickling can execute arbitrary code -- only load a file that
# was produced by this notebook or obtained from a source you trust.
df_data = pd.read_pickle(filepath_dst)
df_data.head(5)

Unnamed: 0,metadata_path,audio_properties_analysis_sample_rate,audio_properties_bit_rate,audio_properties_codec,audio_properties_downmix,audio_properties_equal_loudness,audio_properties_length,audio_properties_lossless,audio_properties_md5_encoded,audio_properties_replay_gain,...,zerocrossingrate_dmean,zerocrossingrate_dmean2,zerocrossingrate_dvar,zerocrossingrate_dvar2,zerocrossingrate_max,zerocrossingrate_mean,zerocrossingrate_median,zerocrossingrate_min,zerocrossingrate_var,metadata_path.1
0,14/214.mp3,44100,320067,mp3,mix,0,124.551834,0,1200910aabd3fd3cdf5a849cb904132b,-11.563334,...,0.002656,0.003597,7.5e-05,0.000111,0.527832,0.028812,0.016113,0.001953,0.003941,14/214.mp3
1,14/305414.mp3,44100,320058,mp3,mix,0,144.039185,0,2d5a2d61064bee89ce049d079a6d6d8d,-7.486107,...,0.034709,0.038623,0.001546,0.002,0.501465,0.107286,0.075195,0.001465,0.012051,14/305414.mp3
2,14/976714.mp3,44100,320068,mp3,mix,0,122.044083,0,3bb66ab5ce0f392ba11f87ba37977a68,-6.426849,...,0.002515,0.003219,3.9e-05,3.4e-05,0.496582,0.023289,0.022461,0.006348,0.00012,14/976714.mp3
3,14/976814.mp3,44100,320077,mp3,mix,0,108.904488,0,9b63bc3de203d33062abdb07e2e248ba,-16.022282,...,0.007516,0.009051,9.2e-05,0.000108,0.500488,0.037016,0.033203,0.002441,0.000638,14/976814.mp3
4,14/1169714.mp3,44100,320025,mp3,mix,0,334.524078,0,6020e9512528d5984c49e81cdcaa8950,1.482885,...,0.001608,0.002085,2.5e-05,3.2e-05,0.493164,0.018905,0.01709,0.006348,0.000124,14/1169714.mp3
