# MFCC Clustering with t-SNE and K-Means

Found at: [MaSC Compendium Visualization](https://github.com/chrispla/MaSC_sim_vis/blob/master/mfcc_t-SNE.ipynb)

mfcc_t-SNE.ipynb: Compute MFCCs from audio, reduce them to two dimensions with t-SNE, cluster the embedding with K-Means, and plot the results.

In [4]:
import pickle

import altair as alt
import librosa.feature
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler

In [9]:
def get_feature(y, sr):
    """Extract the two per-clip features used by this notebook.

    Parameters
    ----------
    y
        Audio time series, forwarded unchanged to librosa.
    sr
        Sampling rate of ``y``, forwarded to the MFCC computation.

    Returns
    -------
    tuple
        ``(mfcc, mean)`` where ``mfcc`` is the 13-coefficient MFCC matrix
        flattened to one dimension and ``mean`` is the mean over all bins of
        the dB-scaled power spectrogram of ``y``.
    """
    # 13 MFCCs per frame, collapsed into a single 1-D feature vector.
    flat_mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).flatten()

    # |STFT|^2 is the power spectrogram; convert it to dB and average it.
    power_spectrogram = np.abs(librosa.core.stft(y=y)) ** 2
    mean_db = np.mean(librosa.core.power_to_db(power_spectrogram))

    return flat_mfcc, mean_db

In [None]:
# Location of the pre-processed dataset, relative to this notebook.
PROCESSED_DATA_PATH = '../pickles/processed_data.pkl'

# SECURITY NOTE: unpickling can execute arbitrary code embedded in the file,
# so only load pickles that come from a trusted source.
# The feature-extraction loop below unpacks each entry as (label, y, sr).
with open(PROCESSED_DATA_PATH, 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [None]:
# Every retained clip must yield a flattened MFCC vector of exactly this
# length so that the vectors can be stacked into one feature matrix.
# 16809 = 13 * 1293, i.e. the 13 MFCCs requested in get_feature times
# 1293 frames.
# NOTE(review): presumably 1293 frames corresponds to the standard clip
# duration of the dataset -- confirm.
EXPECTED_MFCC_LEN = 16809

all_mfcc = []  # flattened MFCC vector of every kept clip
all_mean = []  # mean dB level of every kept clip
labels = []    # label of every kept clip, aligned with the two lists above
n_skipped = 0
for label, y, sr in data:
    mfcc, mean = get_feature(y, sr)
    if len(mfcc) != EXPECTED_MFCC_LEN:
        # A vector of a different length cannot be stacked with the others.
        n_skipped += 1
        continue
    labels.append(label)
    all_mfcc.append(mfcc)
    all_mean.append(mean)

# Make the filtering visible instead of silently dropping clips.
print(f'kept {len(labels)} clips, skipped {n_skipped} with unexpected MFCC length')

In [None]:
# t-SNE is stochastic: without a fixed seed the 2-D layout (and therefore the
# clusters and every chart below) changes on each Restart & Run All.
TSNE_SEED = 42

# Standardise every MFCC feature (StandardScaler defaults: zero mean, unit
# variance) before computing the embedding.
scl1 = StandardScaler()
all_mfcc_scaled = scl1.fit_transform(all_mfcc)

# Reduce the scaled feature vectors to two dimensions for plotting.
all_mfcc_scaled_red2 = TSNE(n_components=2, random_state=TSNE_SEED).fit_transform(all_mfcc_scaled)

In [14]:
N_CLUSTERS = 10   # change number of clusters here
KMEANS_SEED = 42  # fixed seed so cluster assignments are reproducible

# Cluster the 2-D t-SNE embedding. fit_predict fits the model and returns the
# cluster index of every sample in one call; `kmeans` stays available as the
# fitted estimator.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init='auto', random_state=KMEANS_SEED)
clusters = kmeans.fit_predict(all_mfcc_scaled_red2)

In [15]:
# x and y
# Split the 2-D embedding into its two coordinate columns with array slicing
# instead of a per-row Python loop; .tolist() keeps both names plain lists.
mfcc1 = all_mfcc_scaled_red2[:, 0].tolist()
mfcc2 = all_mfcc_scaled_red2[:, 1].tolist()

## Visualization

In [16]:
# Embedding coloured by K-Means cluster index (nominal colour scale).
# Both 'path' (used as the point's href) and 'label' (tooltip) hold the labels.
df1 = pd.DataFrame({
    'x': mfcc1,
    'y': mfcc2,
    'color': clusters,
    'path': np.asarray(labels),
    'label': np.asarray(labels),
})
chart1 = (
    alt.Chart(df1)
    .mark_circle(opacity=0.6, size=50)
    .encode(x='x', y='y', color='color:N', href='path', tooltip=['label'])
    .interactive()
)

In [17]:
# Embedding coloured by each clip's mean dB level (quantitative colour scale).
# Both 'path' (used as the point's href) and 'label' (tooltip) hold the labels.
df2 = pd.DataFrame({
    'x': mfcc1,
    'y': mfcc2,
    'color': np.asarray(all_mean),
    'path': np.asarray(labels),
    'label': np.asarray(labels),
})
chart2 = (
    alt.Chart(df2)
    .mark_circle(opacity=0.6, size=50)
    .encode(x='x', y='y', color='color:Q', href='path', tooltip=['label'])
    .interactive()
)

In [18]:
# Cluster-coloured layer with larger (size=80), fully opaque markers; it is
# layered with chart4 in the display cell.
df3 = pd.DataFrame({
    'x': mfcc1,
    'y': mfcc2,
    'color': clusters,
    'path': np.asarray(labels),
    'label': np.asarray(labels),
})
chart3 = (
    alt.Chart(df3)
    .mark_circle(size=80)
    .encode(x='x', y='y', color='color:N', href='path', tooltip=['label'])
    .interactive()
)

In [19]:
# Mean-dB-coloured layer with smaller (size=30) markers; it is layered with
# chart3 in the display cell.
df4 = pd.DataFrame({
    'x': mfcc1,
    'y': mfcc2,
    'color': np.asarray(all_mean),
    'path': np.asarray(labels),
    'label': np.asarray(labels),
})
chart4 = (
    alt.Chart(df4)
    .mark_circle(size=30)
    .encode(x='x', y='y', color='color:Q', href='path', tooltip=['label'])
    .interactive()
)

In [20]:

# Collections
# Disabled variant that colours the points by `collection`; that name is not
# defined in the cells shown in this notebook, so the code stays commented out.
# df5 = pd.DataFrame({'x': mfcc1, 'y': mfcc2, 'color': collection, 'path': np.asarray(labels),
#                     'label': np.asarray(labels)})
# chart5 = alt.Chart(df5).mark_circle(opacity=0.6, size=50).encode(x='x', y='y', color='color:N', href='path',
#                                                                  tooltip=['label']).interactive()

# Render in order: cluster colouring, mean-dB colouring, then chart3 and
# chart4 layered into a single chart.
display(chart1, chart2, chart3 + chart4)
# display(chart5)
