# Libraries

Restart your kernel afterwards!

In [3]:
!pip install -U \
    scikit-learn imbalanced-learn scikit-multilearn \
    umap-learn pynndescent numba \
    adjustText \
    yellowbrick datascience albumentations \
    git+https://github.com/scikit-learn-contrib/hdbscan.git#egg=hdbscan
    

# Second to last line is so pip doesn't throw a fit

Collecting hdbscan
  Cloning https://github.com/scikit-learn-contrib/hdbscan.git to /tmp/pip-install-29g76izv/hdbscan_275da23193fd4e03a8853a2cb00d72fd
  Running command git clone -q https://github.com/scikit-learn-contrib/hdbscan.git /tmp/pip-install-29g76izv/hdbscan_275da23193fd4e03a8853a2cb00d72fd
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting datascience
  Downloading datascience-0.17.0.tar.gz (721 kB)
[K     |████████████████████████████████| 721 kB 5.4 MB/s 
Collecting albumentations
  Downloading albumentations-1.1.0-py3-none-any.whl (102 kB)
[K     |████████████████████████████████| 102 kB 55.1 MB/s 
Collecting yellowbrick
  Downloading yellowbrick-1.3.post1-py3-none-any.whl (271 kB)
[K     |████████████████████████████████| 271 kB 57.1 MB/s 
[?25h  Downloading yellowbrick-1.3-py3-none-any.whl (271 kB)
[K     |████████████████████████████████| 271 kB

# Preliminaries

Root path with all subdirectories:

In [4]:
# Base directory that the notebook's data, cache, and training paths are built from.
# NOTE: os.path.join discards this prefix if a later component starts with '/'.
ROOT_PATH = "./drive/MyDrive/spotify/"

Import libraries:

In [4]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from adjustText import adjust_text

import umap
import hdbscan.flat as hdflat

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer

from imblearn.under_sampling import RandomUnderSampler

from skmultilearn.model_selection import iterative_train_test_split

import os
import uuid
import base64
import pickle
import csv
from ast import literal_eval

import sys
sys.path.append(os.path.join(ROOT_PATH, 'notebooks'))
from utils import *

FileNotFoundError: ignored

# Main

First, we define a function to convert Spotify IDs (22-character alphanumeric strings) into UUIDs, for use with PostgreSQL.

In [None]:
def to_uuid(s_id):
    """Return the UUID string derived from a Spotify ID.

    The ID is padded with ``==``, decoded as URL-safe base64, and the decoded
    bytes are used as the raw bytes of the UUID.
    """
    padded_id = s_id + '=='
    raw_bytes = base64.urlsafe_b64decode(padded_id)
    return str(uuid.UUID(bytes=raw_bytes))

Then, we set the columns with numerical features:

In [None]:
# Names of the numerical features used throughout the notebook.
feature_cols = [
    'danceability',
    'energy',
    'loudness',
    'speechiness',
    'acousticness',
    'instrumentalness',
    'liveness',
    'valence',
    'tempo',
]

Our aggregated CSV generated with Postgres has columns with the pattern <feature>_avg. We define these names and read them with Pandas:

In [None]:
# Columns to load: the artist ID plus one '<feature>_avg' column per numerical feature.
artist_cols = ['id'] + [c + '_avg' for c in feature_cols]

# The path components are relative on purpose: a component starting with '/'
# would make os.path.join discard ROOT_PATH and look in the filesystem root.
df_artists = pd.read_csv(
    os.path.join(ROOT_PATH, 'raw', 'artists_aggregated.csv'),
    usecols = artist_cols
)

# Drop artists with any missing value, then split the IDs from the feature matrix.
df_artists = df_artists.dropna()
artist_ids = df_artists['id']
df_artists = df_artists.drop('id', axis=1)

We additionally read a file containing artist information such as ID, name and, most importantly, associated genres:

In [None]:
# The path components are relative on purpose: a component starting with '/'
# would make os.path.join discard ROOT_PATH and look in the filesystem root.
df_artists_genres = pd.read_csv(
    os.path.join(ROOT_PATH, 'raw', 'artists_filtered.csv'),
    usecols = ['id', 'name', 'followers', 'genres']
)

# Keep the original Spotify ID as the index and store the UUID form in 'id'.
df_artists_genres['id_orig'] = df_artists_genres['id']
df_artists_genres = df_artists_genres.set_index('id_orig')
df_artists_genres['id'] = df_artists_genres['id'].apply(to_uuid)

We standardize all of the numerical features in our DataFrame:

In [None]:
# Fit the scaler on the artist feature matrix and transform that same matrix.
artist_features = df_artists.to_numpy()
sc = StandardScaler()
df_artists_scaled = sc.fit(artist_features).transform(artist_features)

Finally, we perform dimensionality reduction using UMAP. We cache the results for given hyperparameters so that we can reuse them:

In [None]:
# Hyperparameters for the embedding and for the clustering cells that follow.
n_neighbors = 60
n_clusters = 7
min_cluster_size = 10000

n_components = 2
densmap = False
# Only relevant when densmap is True; 2.0 is UMAP's documented default, so
# passing it below does not change the embedding when densmap is False.
dens_lambda = 2.0

# The cache directory depends on whether loudness is part of the feature set.
# Components are relative on purpose: a leading '/' would make os.path.join
# discard ROOT_PATH. The trailing '' keeps a trailing separator on `root`.
cache_subdir = 'loud' if 'loudness' in feature_cols else 'no_loud'
root = os.path.join(ROOT_PATH, 'pickles', cache_subdir, '')
os.makedirs(root, exist_ok=True)

# The cache file name encodes the hyperparameters that affect the embedding.
if densmap:
    suffix = f"{n_neighbors}_{n_components}_1_{dens_lambda}.pkl"
else:
    suffix = f"{n_neighbors}_{n_components}_0.pkl"

embed_path = root + "embed_" + suffix
model_path = root + "model_" + suffix

if os.path.exists(embed_path) and os.path.exists(model_path):
    # A cache exists for these hyperparameters: reuse it.
    # NOTE: pickle.load can execute arbitrary code; only load caches written by this notebook.
    with open(model_path, 'rb') as f:
        umap_model = pickle.load(f)
    with open(embed_path, 'rb') as f:
        artists_embedded = pickle.load(f)
else:
    # No cache yet: fit UMAP and store both the embedding and the fitted model.
    umap_model = umap.UMAP(
        n_neighbors = n_neighbors,
        n_components = n_components,
        min_dist = 0.0,
        densmap = densmap,
        dens_lambda = dens_lambda,
        random_state = 42,
        low_memory = False,
        n_jobs = -1,
        verbose = True,
    )

    artists_embedded = umap_model.fit_transform(df_artists_scaled)
    with open(embed_path, 'wb') as f:
        pickle.dump(artists_embedded, f)
    with open(model_path, 'wb') as f:
        pickle.dump(umap_model, f)

Mon Nov 29 00:36:01 2021 Building and compiling search function


Afterwards, we cluster using HDBSCAN:

In [None]:
# Keyword options forwarded to the flat HDBSCAN clustering of the embedded points.
hdbscan_options = dict(
    min_cluster_size = min_cluster_size,
    min_samples = 1,
    n_clusters = n_clusters,
    memory = './cache/',
    cluster_selection_method = 'leaf',
    gen_min_span_tree = True,
)

scan = hdflat.HDBSCAN_flat(artists_embedded, **hdbscan_options)

# One label per embedded point; the cells below treat -1 as noise.
labels = scan.labels_

We create an auxiliary DataFrame with the artists' names, IDs, and projected coordinates:

In [None]:
# Attach artist metadata to the IDs, then add the projected coordinates by position.
df_embed = pd.DataFrame(artist_ids).merge(df_artists_genres, on='id')
df_embed = df_embed.assign(
    x = artists_embedded[:, 0],
    y = artists_embedded[:, 1],
)
# A third coordinate is only present for 3-component embeddings.
has_third_axis = artists_embedded.shape[1] == 3
if has_third_axis:
    df_embed = df_embed.assign(z = artists_embedded[:, 2])

Additionally, we extract the most popular artists for visualization:

In [None]:
# The 50 artists with the most followers, with their names and 2D coordinates.
by_followers = df_embed.sort_values('followers', ascending=False)
df_popular = by_followers.iloc[:50]
label_text = df_popular['name'].tolist()
label_coords = df_popular.loc[:, ['x', 'y']].to_numpy()

We create two DataFrames; `df_cluster_hit` contains all the points that HDBSCAN tagged as belonging to a cluster, and `df_cluster_miss` those that it determined to be noise:

In [None]:
# Add the cluster label of every point and index the frame by artist ID.
df_cluster = df_embed.assign(cluster=labels).set_index('id')

# A label of -1 marks noise; every other label is a cluster.
is_noise = df_cluster['cluster'] == -1
df_cluster_hit = df_cluster[~is_noise]
df_cluster_miss = df_cluster[is_noise]

We count how many times each genre appears in each cluster, and assign each genre to the cluster in which it appears most often. The resulting dictionary `genre_map` has entries of the form `(genre name, assigned cluster)`.

Additionally, we create a list with only the `k` most common genres for each cluster (`cluster_genre_map`). We will use all songs belonging to these genres for the classification task.

In [None]:
# Number of leading genres kept per cluster, and the minimum count a genre
# needs in its leading cluster before it is assigned to that cluster.
TOP_GENRES_PER_CLUSTER = 5
MIN_GENRE_COUNT = 10

# genre_clusters / cluster_genres come from the star import of `utils` (not shown here).
# Presumably both return (labels, counts) pairs ordered by descending count; verify in utils.
genre_cluster_map = genre_clusters(df_cluster_hit)
ranked_cluster_genres = cluster_genres(df_cluster_hit)
cluster_genre_map = [
    set(ranked_cluster_genres[i][0][:TOP_GENRES_PER_CLUSTER])
    for i in range(n_clusters)
]

# A comprehension keeps the loop names local, so the HDBSCAN `labels` array
# defined earlier is not overwritten and the earlier cells can be re-run.
genre_map = {
    genre: genre_labels[0]
    for genre, (genre_labels, genre_counts) in genre_cluster_map.items()
    if genre_counts[0] >= MIN_GENRE_COUNT
}


Since an artist can have multiple genres, we need a way to determine which artists belong to each cluster. We do this via the dictionary `artist_map`, which has entries of the form `(artist id, list of clusters)`.

Moreover, an artist can be representative of several genres in the same cluster. For example, Skrillex is tagged as both "EDM" and "brostep", both of which belong to the "electronica" cluster. Since the individual genre labels are important for the classification task, we create a second dictionary, `artist_map_2`, which has all the genres an artist belongs to, and their corresponding cluster.

In [None]:
# Maps each artist (the index of df_artists_genres) to the string form of its genre list.
artist_genres = df_artists_genres['genres'].to_dict()
artist_map = {}    # artist -> list of clusters reached through genre_map
artist_map_2 = {}  # artist -> list of (cluster index, genre) pairs from cluster_genre_map

for artist, genre_str in artist_genres.items():
    # The genres column stores a Python list literal as text.
    genres = literal_eval(genre_str)
    clusters, clusters_2 = set(), set()
    for genre in genres:
        if genre in genre_map:
            clusters.add(genre_map[genre])
        # Record the first cluster whose top-genre set contains this genre.
        for i, genre_set in enumerate(cluster_genre_map):
            if genre in genre_set:
                clusters_2.add((i, genre))
                break

    # Sorting makes the stored order reproducible across runs (set order is not).
    if clusters:
        artist_map[artist] = sorted(clusters)
    if clusters_2:
        artist_map_2[artist] = sorted(clusters_2)

# Files

We set the save path for our files for the given set of hyperparameters:

In [None]:
# Every per-song column that is written to the training files.
all_features = feature_cols + ['key', 'mode', 'time_signature', 'explicit', 'duration_ms']

# Components are relative on purpose: a leading '/' would make os.path.join
# discard ROOT_PATH. The trailing '' keeps the trailing separator that the
# original path string had.
loudness_dir = 'loud' if 'loudness' in feature_cols else 'no_loud'
training_path = os.path.join(
    ROOT_PATH, 'training', loudness_dir, str(min_cluster_size), str(n_clusters), ''
)

Since our songs file (`songs_merged.csv`) is too big to load into memory, we iterate through it row by row:

In [None]:
# Column layouts of the per-cluster genre files and of the combined cluster file.
fields_genres = ['id'] + all_features + ['genre']
fields = ['id'] + all_features + ['cluster']

# Sub-paths are relative on purpose: a leading '/' would make os.path.join
# discard training_path / ROOT_PATH.
genres_dir = os.path.join(training_path, 'genres')
supergenres_dir = os.path.join(training_path, 'supergenres')
os.makedirs(genres_dir, exist_ok=True)
os.makedirs(supergenres_dir, exist_ok=True)


def _numeric_features(row):
    """Return ``{feature: float}`` for ``row``, or ``None`` if any feature is missing, unparsable or NaN."""
    parsed = {}
    for feature in all_features:
        try:
            value = float(row[feature])
        except (KeyError, TypeError, ValueError):
            return None
        if np.isnan(value):
            return None
        parsed[feature] = value
    return parsed


cluster_files = []
cluster_writers = []
try:
    # One output file (with header) per cluster for the genre-level examples.
    for i in range(n_clusters):
        # newline='' is what the csv module documentation asks for on file objects.
        cluster_file = open(os.path.join(genres_dir, f'examples{i}.csv'), 'w', newline='')
        cluster_files.append(cluster_file)
        genre_writer = csv.DictWriter(cluster_file, fields_genres)
        genre_writer.writeheader()
        cluster_writers.append(genre_writer)

    with \
        open(os.path.join(ROOT_PATH, 'raw', 'songs_merged.csv'), 'r', newline='') as f, \
        open(os.path.join(supergenres_dir, 'clusters.csv'), 'w', newline='') as f_write:
        reader = csv.DictReader(f)

        writer = csv.DictWriter(f_write, fields)
        writer.writeheader()

        for row in reader:
            # Skip songs with any missing or non-numeric feature.
            to_write = _numeric_features(row)
            if to_write is None:
                continue
            to_write['id'] = row['id']

            artists = literal_eval(row['artists'])
            seen_cluster = [False] * n_clusters
            seen_genres = set()

            for artist in artists:
                # Write the song at most once per cluster.
                for cluster in artist_map.get(artist, ()):
                    if not seen_cluster[cluster]:
                        writer.writerow({**to_write, 'cluster': int(cluster)})
                        seen_cluster[cluster] = True

                # Write the song at most once per genre, into that genre's cluster file.
                for cluster, genre in artist_map_2.get(artist, ()):
                    if genre not in seen_genres:
                        cluster_writers[cluster].writerow({**to_write, 'genre': genre})
                        seen_genres.add(genre)
finally:
    # Close the per-cluster files even if reading or writing fails midway.
    for cluster_file in cluster_files:
        cluster_file.close()

In [None]:
# Union of the first five genres listed for every cluster.
genres_per_cluster = cluster_genres(df_cluster_hit)
topk = {
    genre
    for cluster_entry in genres_per_cluster
    for genre in cluster_entry[0][:5]
}

In [None]:
# Sub-paths are relative on purpose: a leading '/' would make os.path.join discard training_path.
genres_dir = os.path.join(training_path, 'genres')

# Read every per-cluster file, keep only rows whose genre is in `topk`,
# and concatenate once instead of growing a frame inside the loop.
genre_frames = []
for i in range(n_clusters):
    df_temp = pd.read_csv(os.path.join(genres_dir, f'examples{i}.csv'))
    genre_frames.append(df_temp[df_temp['genre'].isin(topk)])
df_comb = pd.concat(genre_frames)

# One row per song: the first feature row, plus the list of all its distinct genres.
group = df_comb.groupby('id')

df_genre = group.first()
group_genres = group['genre'].apply(lambda x: sorted(set(x)))
df_genre['genre'] = group_genres

X = df_genre.drop('genre', axis=1).to_numpy()

# Multi-hot encode the genre lists.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(df_genre['genre'])

# iterative_train_test_split returns (X_train, y_train, X_test, y_test), in that order.
X_train, y_train, X_test, y_test = iterative_train_test_split(X, y, test_size=0.2)

# Persist the four splits next to the per-cluster example files.
splits = {
    'X_train_all': X_train,
    'y_train_all': y_train,
    'X_test_all': X_test,
    'y_test_all': y_test,
}
for split_name, split_data in splits.items():
    with open(os.path.join(genres_dir, f'{split_name}.pkl'), 'wb') as f:
        pickle.dump(split_data, f)

# Figures

We define a custom color palette to help distinguish between each cluster:

In [None]:
def rgb_to_hex(rgb):
    """Format an ``(r, g, b)`` tuple of integers as a ``#rrggbb`` hex string."""
    hex_template = '#%02x%02x%02x'
    return hex_template % rgb

# Base RGB colors; only the first n_clusters of them are used.
palette = [
    (235, 172, 35), (189, 189, 189), (0, 140, 249), (0, 110, 0),
    (184, 0, 88), (209, 99, 230), (135, 133, 0), (255, 146, 135),
    (0, 187, 173), (89, 84, 214), (0, 198, 248), (0, 167, 108),
    (178, 69, 2),
]
palette_hex = [rgb_to_hex(p) for p in palette]
# One color per cluster: tie the palette size to n_clusters instead of a literal 7,
# so the palette follows the clustering hyperparameter.
my_palette = sns.color_palette(palette_hex, n_clusters)
my_palette

Projected points

In [None]:
sns.set(font_scale=1.2, style='white')

fig, ax = plt.subplots(figsize=(16,14))

# 2D histogram of all embedded artists (clustered or not).
sns.histplot(
    data = df_cluster,
    x = 'x',
    y = 'y',
    bins = 150,
    ax = ax
)

# get_labels comes from the star import of `utils` (not shown here); it
# returns the label texts and their coordinates.
label_text, label_coords = get_labels(df_cluster, popular=15, rep=4)

# Mark the labelled artists in red.
sns.scatterplot(
    x = label_coords[:,0],
    y = label_coords[:,1],
    color = 'red',
    s = 30,
    legend = False,
    ax = ax,
)

# Draw each name in a translucent box, then let adjust_text reposition the labels.
texts = [
    ax.text(x, y, name, ha='center', va='center', bbox=dict(boxstyle="round", fc="white", lw=0, alpha=0.6))
    for (x, y), name in zip(label_coords, label_text)
]
adjust_text(texts)

ax.set_xlim(1, 16)
ax.set_ylim(-4, 10)

# Build the output path from ROOT_PATH and make sure the folder exists.
fig_dir = os.path.join(ROOT_PATH, 'fig')
os.makedirs(fig_dir, exist_ok=True)
fig.savefig(os.path.join(fig_dir, 'projected.png'), bbox_inches='tight', dpi=150)
# Close this specific figure rather than whichever one happens to be current.
plt.close(fig)

Clustered points

In [None]:
sns.set(font_scale=1.2, style='white')
fig, ax = plt.subplots(figsize=(16,14))

# 2D histogram of the clustered (non-noise) artists, colored by cluster.
sns.histplot(
    data = df_cluster_hit,
    x = 'x',
    y = 'y',
    hue = 'cluster',
    palette = my_palette,
    bins = 350,
    alpha = 1,
    ax = ax
)

# get_labels comes from the star import of `utils` (not shown here); it
# returns the label texts and their coordinates.
label_text, label_coords = get_labels(df_cluster_hit, popular=15, rep=4)

# Mark the labelled artists in red.
sns.scatterplot(
    x = label_coords[:,0],
    y = label_coords[:,1],
    color = 'red',
    s = 30,
    legend = False,
    ax = ax,
)

# Draw each name in a translucent box, then let adjust_text reposition the labels.
texts = [
    ax.text(x, y, name, ha='center', va='center', bbox=dict(boxstyle="round", fc="white", lw=0, alpha=0.6))
    for (x, y), name in zip(label_coords, label_text)
]
adjust_text(texts)

ax.set_xlim(1, 16)
ax.set_ylim(-4, 10)

# Build the output path from ROOT_PATH and make sure the folder exists.
fig_dir = os.path.join(ROOT_PATH, 'fig')
os.makedirs(fig_dir, exist_ok=True)
fig.savefig(os.path.join(fig_dir, f"cluster_{n_clusters}.png"), bbox_inches='tight', dpi=150)
# Close this specific figure rather than whichever one happens to be current.
plt.close(fig)

`n` most common genres in each cluster.

In [None]:
sns.set(font_scale=2.5, style='white')

# Build the output folder from ROOT_PATH and make sure it exists.
fig_dir = os.path.join(ROOT_PATH, 'fig')
os.makedirs(fig_dir, exist_ok=True)

# Number of genres shown per cluster.
n = 15
temp = list(cluster_genres(df_cluster_hit))
for i, (genres, counts) in enumerate(temp):
    genres, counts = genres[:n], counts[:n]

    fig, ax = plt.subplots(figsize=(10,14))
    # Draw on the axes created above instead of relying on the implicit current axes.
    sns.barplot(
        y = genres,
        x = counts,
        palette = 'mako',
        ax = ax
    )
    ax.set_xlabel("Número de apariciones")
    ax.set_ylabel("Género")
    ax.set_title(f"Cluster {i}")
    fig.savefig(os.path.join(fig_dir, f"bar_{i}.png"), bbox_inches='tight', dpi=150)
    # Close each figure explicitly so that figures do not pile up across iterations.
    plt.close(fig)

Feature distribution for each cluster

In [None]:
sns.set(font_scale=2, style='white')

# Join the per-artist feature averages with the cluster assignment (by artist ID),
# drop rows with any missing value, and order the rows by cluster.
df_artists_merge = (
    df_artists
    .assign(id=artist_ids)
    .set_index('id')
    .join(df_cluster_hit)
    .dropna()
    .sort_values('cluster')
)
# Convert the cluster labels to strings (via int) for use as the x variable below.
df_artists_merge['cluster'] = df_artists_merge['cluster'].astype(int).astype(str)

# Build the output folder from ROOT_PATH and make sure it exists.
fig_dir = os.path.join(ROOT_PATH, 'fig')
os.makedirs(fig_dir, exist_ok=True)

for feature in feature_cols:
    fig, ax = plt.subplots(figsize=(10,8))
    sns.violinplot(
        data = df_artists_merge,
        x = 'cluster',
        y = feature + '_avg',
        palette = my_palette,
        ax = ax
    )
    ax.set_xlabel("Cluster")
    ax.set_ylabel('')
    ax.set_title(feature.title())
    fig.savefig(os.path.join(fig_dir, f"cluster_{feature}.png"), bbox_inches='tight', dpi=150)
    # Close each figure explicitly so that figures do not pile up across iterations.
    plt.close(fig)

Histograms for each feature across all points

In [None]:
sns.set(font_scale=1.5, style='white')

# Build the output folder from ROOT_PATH and make sure it exists.
fig_dir = os.path.join(ROOT_PATH, 'fig')
os.makedirs(fig_dir, exist_ok=True)

for feature in feature_cols:
    fig, ax = plt.subplots(figsize=(10,8))
    # No `palette` here: without a `hue` variable seaborn ignores it.
    sns.histplot(
        data = df_artists,
        x = feature + '_avg',
        stat = 'density',
        ax = ax
    )
    ax.set_xlabel('')
    ax.set_ylabel('Densidad')
    ax.set_title(feature.title())
    fig.savefig(os.path.join(fig_dir, f"hist_{feature}.png"), bbox_inches='tight', dpi=150)
    # Close each figure explicitly so that figures do not pile up across iterations.
    plt.close(fig)

Joint plots for each possible feature combination

In [None]:
sns.set(font_scale=1.5)

# Build the output folder from ROOT_PATH and make sure it exists.
fig_dir = os.path.join(ROOT_PATH, 'fig')
os.makedirs(fig_dir, exist_ok=True)

# One 2D histogram per unordered pair of distinct features.
for i,feature1 in enumerate(feature_cols):
    for feature2 in feature_cols[i+1:]:
        fig, ax = plt.subplots(figsize=(10,8))
        sns.histplot(
            data = df_artists,
            x = feature1 + '_avg',
            y = feature2 + '_avg',
            ax = ax
        )
        ax.set_xlabel(feature1.title())
        ax.set_ylabel(feature2.title())
        ax.set_title(f"{feature1.title()} vs. {feature2.title()}")
        fig.savefig(os.path.join(fig_dir, f"joint_{feature1}_{feature2}.png"), bbox_inches='tight', dpi=150)
        # Close each figure explicitly so that figures do not pile up across iterations.
        plt.close(fig)

In [None]:
# Cluster whose top genres are plotted.
i = 0

# Artists with at least one (cluster, genre) pair in cluster i, keyed by UUID
# so that they can be joined with df_cluster (which is indexed by the UUID 'id').
temp = {
    to_uuid(key): value
    for key, value in artist_map_2.items()
    if any(v[0] == i for v in value)
}

# One row per (artist, (cluster, genre)) pair, then split each pair into two columns.
df_temp = pd.DataFrame([temp]).T
df_temp = df_temp.explode(0)
df_temp = pd.DataFrame(df_temp[0].tolist(), index=df_temp.index, columns=['cluster', 'genre'])

# Keep only the pairs belonging to cluster i and attach the names and coordinates.
df_temp = df_temp[df_temp['cluster']==i]
df_temp = df_temp.drop('cluster', axis=1)
df_temp = df_temp.join(df_cluster)
df_temp = df_temp[['name', 'x', 'y', 'genre']]

fig, ax = plt.subplots(figsize=(14,14))
sns.scatterplot(
    data = df_temp,
    x = 'x',
    y = 'y',
    hue = 'genre',
    linewidth = 0,
    ax = ax
)
# Render the figure before closing it; closing first would discard it,
# because this cell does not save the figure to disk.
plt.show()
plt.close(fig)