In [None]:
# reload imported modules automatically before each cell runs, so edits to the
# project's src/ helpers are picked up without restarting the kernel
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PowerTransformer, RobustScaler, QuantileTransformer
from sklearn.cluster import BisectingKMeans
import matplotlib.pyplot as plt

from src.analysis.get_pca_config import get_pca_config
from src.analysis.cluster_data import cluster_data

In [None]:
# apply matplotlib's "dark_background" style sheet to all figures in this notebook
plt.style.use("dark_background")

In [None]:
# set seed for reproducibility; passed as random_state to BisectingKMeans
# in the clustering cells
seed = 123

In [None]:
# load the audio features; the first CSV column is used as the row index
audio_df = pd.read_csv('./data/df_audio_features_5000.csv', index_col=0)
audio_df.head()

Two columns, `id` and `html`, are not needed for our analysis. Furthermore, the .csv was probably saved with its index, resulting in an "Unnamed: 0" column.
Let's start by cleaning the columns.

In [None]:
# check column naming: show the raw column labels exactly as read from the CSV
audio_df.columns

## data cleaning
The column names contain a lot of whitespace, which has to be removed before we can drop any columns.

In [None]:
# strip every space character out of the column labels
audio_df.columns = [label.replace(' ', '') for label in audio_df.columns]
audio_df.columns

In [None]:
# drop the columns that are not used in the analysis (type, id, html)
audio_df = audio_df.drop(columns=['type', 'id', 'html'])
audio_df

In [None]:
# count the rows that are exact repeats of an earlier row
n_duplicates = audio_df.duplicated().sum()
print(f"Number of duplicated rows in the data is: {n_duplicates}")

In [None]:
# keep only the first occurrence of each row, then confirm none are left
audio_df = audio_df.drop_duplicates()
n_remaining = audio_df.duplicated().sum()
print(f"Number of duplicated rows in the data is after cleaning: {n_remaining}")

In [None]:
# check for empty values: number of missing entries per column
audio_df.isnull().sum()

In [None]:
# print the column dtypes and non-null counts of the cleaned frame
audio_df.info()

## Data preparations

In [None]:
# move name and artist out of the columns and into a two-level row index
audio_df = audio_df.set_index(['name', 'artist'])
audio_df.head()

Further columns to consider dropping:
duration_ms and time_signature

In [None]:
# drop duration_ms from the feature columns
# (columns= is required here: a bare label list is looked up along axis 0, i.e.
# in the (name, artist) row index, and raises a KeyError)
audio_df = audio_df.drop(columns=['duration_ms'])

In [None]:
# scalers to compare, each paired with the display name passed to the helpers
named_scalers = [
    ('standard scaler', StandardScaler()),
    ('minmax scaler', MinMaxScaler()),
    ('robust scaler', RobustScaler()),
    # one quantile per row of the data
    ('quantile scaler', QuantileTransformer(n_quantiles=audio_df.shape[0])),
    ('power scaler', PowerTransformer()),
]

# parallel lists (same order) consumed by the loops in the following cells
scalers = [transformer for _, transformer in named_scalers]
scaler_names = [label for label, _ in named_scalers]

In [None]:
# run the PCA configuration helper once per scaler, without normalization
# (get_pca_config comes from src.analysis; its output is not visible in this cell)
shared_pca_args = {'df': audio_df, 'normalize': False}

for scaler_name, scaler in zip(scaler_names, scalers):
    get_pca_config(scaler_name=scaler_name, scaler=scaler, **shared_pca_args)

In [None]:
# clustering settings for the un-normalized run
pca = True  # use pca
# number of PCA components per scaler, in the same order as scaler_names
pca_components = [3, 5, 7, 3, 10]
n_clusters = 25

for scaler_name, scaler, pca_component in zip(scaler_names, scalers, pca_components):
    # fresh clusterer for every scaler, always seeded identically
    clusterer = BisectingKMeans(n_clusters=n_clusters, n_init=3, random_state=seed)
    # the return value is deliberately discarded
    _ = cluster_data(
        df=audio_df, scaler_name=scaler_name, scaler=scaler, cluster_alg=clusterer,
        normalize=False, pca=pca, pca_comp=pca_component, verbose=True,
    )

# Normalize
Same PCA configuration and clustering steps as above, now run with `normalize=True`.

In [None]:
# run the PCA configuration helper once per scaler, this time with normalization
# (get_pca_config comes from src.analysis; its output is not visible in this cell)
shared_pca_args = {'df': audio_df, 'normalize': True}

for scaler_name, scaler in zip(scaler_names, scalers):
    get_pca_config(scaler_name=scaler_name, scaler=scaler, **shared_pca_args)

In [None]:
# clustering settings for the normalized run
pca = True  # use pca
# one entry per scaler, in the same order as scaler_names:
# int to keep x-amount of parameters or float to keep x amount of variance
pca_components = [8, 5, 7, 3, 4]
n_clusters = 25

for scaler_name, scaler, pca_component in zip(scaler_names, scalers, pca_components):
    # fresh clusterer for every scaler, always seeded identically
    clusterer = BisectingKMeans(n_clusters=n_clusters, n_init=1, random_state=seed)
    # the return value is deliberately discarded
    _ = cluster_data(
        df=audio_df, scaler_name=scaler_name, scaler=scaler, cluster_alg=clusterer,
        normalize=True, pca=pca, pca_comp=pca_component, verbose=True,
    )