In [17]:
from __future__ import print_function
import librosa
import matplotlib.pyplot as plt
import numpy as np
import librosa.display
import random
from numpy import linalg as LA
import time
import seaborn as sns;sns.set()
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.cluster import SpectralClustering
import plotly
from plotly import offline as py
import plotly.graph_objs as go
plotly.offline.init_notebook_mode(connected=True)
from statistics import mode
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.kernel_approximation import RBFSampler
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
import pickle
import pandas as pd
from sklearn.manifold import TSNE
from sklearn import mixture

In [50]:
def data_gen_from_mfcc():
    hop_length = 512
    data_3d = np.zeros((1000,20,1077))
    genres = ['blues','classical','country','disco','hiphop','jazz','metal','pop','reggae','rock']
    j=0
    for genre in genres:
        for i in range(10):
            filename = './genres/%s/%s.0000%d.au'%(genre,genre,i)
            y, sr = librosa.load(filename,duration=25)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length)
            data_3d[j,:,:] = mfcc
            j= j+1

        for i in range(10,100):
            filename = './genres/%s/%s.000%d.au'%(genre,genre,i)
            y, sr = librosa.load(filename,duration=25)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length)
            data_3d[j,:,:] = mfcc
            j= j+1
    data_flattened = data_3d.reshape(1000,-1)
    return data_3d,data_flattened

def visualize_data(data):
    %matplotlib qt
    genres = ['blues','classical','country','disco','hiphop','jazz','metal','pop','reggae','rock']
    j=1
    f = plt.figure(figsize=(15,15))
    for i in range(10):
        plt_n = f.add_subplot(2,5,j)
        librosa.display.specshow(librosa.power_to_db(data_3d[i+(j-1)*100,:,:],ref=np.max), x_axis="time",y_axis="mel")
        plt.colorbar()
        title_f = 'MFCC Spectrum - For: %s'%genres[i]
        plt.title(title_f)
        j = j+1
    plt.tight_layout()
    plt.show()

def data_split(new_arr):
    data_train_new = np.zeros((700,new_arr.shape[1]))
    data_test_new = np.zeros((300,new_arr.shape[1]))
    k= 0
    l = 0
    for i in range(10):
        for j in range(70):
            data_train_new[k] = new_arr[i*100+j]
            k= k+1
        for m in range(70,100):
            data_test_new[l] = new_arr[i*100+m]
            l = l+1
    return data_train_new,data_test_new

def data_split_3d(new_arr):
    data_train_new = np.zeros((700,new_arr.shape[1],new_arr.shape[2]))
    data_test_new = np.zeros((300,new_arr.shape[1],new_arr.shape[2]))
    k= 0
    l = 0
    for i in range(10):
        for j in range(70):
            data_train_new[k] = new_arr[i*100+j]
            k= k+1
        for m in range(70,100):
            data_test_new[l] = new_arr[i*100+m]
            l = l+1
    return data_train_new,data_test_new

    
def fun_hit_top5(indices,size,size2,n):
    hit = np.zeros((n,1))
    k = 0
    for i in range(n):
        for j in range(size):
            k = i*size+j;
            C = np.where((indices[k,:]<i*size2+size2) & ((indices[k,:]>=i*size2)))
            if (len(C[0]) > 0):
                hit[i] = hit[i]+ 1;

    return hit

def fun_hit_top1(indices,size,size2,n):
    hit = np.zeros((n,1))
    k = 0
    for i in range(n):
        for j in range(size):
            k = i*size+j;
            C = np.where((indices[k,0]<i*size2+size2) & ((indices[k,0]>=i*size2)))
            if (len(C[0]) > 0):
                hit[i] = hit[i]+ 1;

    return hit

def knn_top_recommendation(data_new,true_centroid,true_centorid_t):
    [data_train_new,data_test_new] = data_split(data_new)
    neigh = KNeighborsClassifier(n_neighbors=5)
    neigh.fit(data_train_new, true_centroid.T)

    pred = neigh.predict(data_test_new)
    [dist,indices] = neigh.kneighbors(data_test_new,return_distance=True)
    hit_5 = fun_hit_top5(indices,30,70,10)
    hit_1 = fun_hit_top1(indices,30,70,10)
    trct = true_centroid_t.T
    prec = precision_score(trct, pred,average=None)
    accuracy = accuracy_score(trct, pred)
    return pred,accuracy,prec,hit_5,hit_1


In [19]:
def visualize_data_3D(data_flattened):
    pca=PCA(n_components=3)
    data_work_pca=pca.fit_transform(data_flattened)

    fig = plt.figure(figsize=(15,15))
    # ax = fig.add_subplot(1,1,1, projection='3d')
    colors = ["red", "green", "blue", "cyan","magenta","yellow","black","orange","pink","darkgreen"]
    colors = ["red", "green", "blue", "cyan","magenta","yellow","black","orange","pink", "violet"]
    labels = ['label0','label1','label2','label3','label4','label5','label6','label7','label8','label9']

    traces_scatter=[]
    for j in range(10):
        traces_scatter.append(go.Scatter3d(
            x=data_work_pca[j*100:j*100+20,0],
            y=data_work_pca[j*100:j*100+20,1],
            z=data_work_pca[j*100:j*100+20,2],
            name=genres[j],
            mode='markers',
            marker=dict(
                size=12,
                color=colors[j],                # set color to an array/list of desired values
                opacity=0.8
            )
        )
        )
    layout = go.Layout(
        margin=dict(
            l=0,
            r=0,
            b=0,
            t=0
        )
    )
    fig = go.Figure(data=traces_scatter, layout=layout)
    py.iplot(fig, filename='3d-scatter-colorscale')


    
def evaluate_mapping(x_predict,cls,k):
    accuracy=0
    mapping={}
    for i in range(0,k):
        true_labels=[]
        for (index,labels) in enumerate(x_predict):
            if labels==i:
                true_labels.append(cls[0][index])
        mapping[i]=Counter(true_labels).most_common(1)[0][0]
#     print(mapping)
    for (index,value) in enumerate(cls[0]):
        if mapping[x_predict[index]]==value:
            accuracy=accuracy+(1/len(cls))
    return accuracy,mapping


def tsne_plot(new_arr):
    colors = ["red", "green", "blue", "cyan","magenta","yellow","black","orange","pink", "violet"]
    genres = ['blues','classical','country','disco','hiphop','jazz','metal','pop','reggae','rock']
    X_embedded = TSNE(n_components=2).fit_transform(new_arr)
    fig = plt.figure(figsize=(20,10))
    ax = fig.add_subplot(111)
    for j in range(10):
        ax.scatter(X_embedded[j*100:j*100+100,0],X_embedded[j*100:j*100+100,1],color = colors[j],label=genres[j])
    chartBox = ax.get_position()
    ax.set_position([chartBox.x0, chartBox.y0, chartBox.width*0.6, chartBox.height])
    ax.legend(loc='upper center', bbox_to_anchor=(1.08, 0.8), shadow=True, ncol=1)
    plt.show()

In [104]:
def cluster_purity(predict_labels):
    count = 0
    s=0
    p = np.zeros((10,10),dtype=int)
    for j in range(10):
        k=0
        for i in predict_labels[j*30:j*30+30]:
            p[predict_labels[j*30+k],j] = p[predict_labels[j*30+k],j] +1
            k = k+1
    cp = ['C1','C2','C3','C4','C5','C6','C7','C8','C9','C10']
    df_new = pd.DataFrame(p,columns=genres,index=cp)
    df_new
    return df_new

def bar_graph_data(genres,hit,hit2,hit3):
    %matplotlib qt

    genre_df = genres+genres+genres
    genre_df = np.reshape(genre_df, (30,1))

    bt = ['Original']*10
    sv = ['AutoEncoder']*10
    si = ['Siamese-NN']*10
    classifier_df = bt+sv+si
    classifier_df = np.reshape(classifier_df,(30,1))

    # prec = prec2.tolist()
    # prec1 = precision1.tolist()
    # prec2 = precision2.tolist()

    prec_df = hit.tolist()+hit2.tolist()+ hit3.tolist()
    print(len(prec_df))
    prec_df = np.reshape(prec_df,(30,1))

    df = pd.DataFrame(genre_df)
    df1 = pd.DataFrame(classifier_df)
    df2 = pd.DataFrame(prec_df)

    df['genres'] = df
    df['classifier'] = df1
    df['Number of Hits'] = df2
    # print df

    import seaborn as sns
    import matplotlib.pyplot as plt

    sns.set(style="darkgrid", color_codes=True)

    # setting the plot size for all plots
    sns.set(rc={'figure.figsize':(11.7,8.27)})

    # create a countplot
    #columns = ['BoostedTrees','SVM','KNN']
    sns.factorplot('genres','Number of Hits','classifier', data=df, kind = "bar",palette = "BuPu_d",legend=False)
    #plt.legend(loc='bottom', bbox_to_anchor=(1.2, 0.8), shadow=True, ncol=1)
    plt.legend(loc='best')
    plt.show()

def recomm_performance(hit):
    return np.sum(hit)/300


In [43]:
#data_3d,data_flattened = data_gen_from_mfcc()
#np.save('flattened_data_dataset.npy',data_flattened)
#np.save('2D_data_dataset.npy',data_3d)
#[data_train_3d,test_3d] = data_split_3d(data_3d)
np.save('data_train_3d.npy',test_train_3d)
visualize_data(data_3d)
colors = ["red", "green", "blue", "cyan","magenta","yellow","black","orange","pink", "violet"]
genres = ['blues','classical','country','disco','hiphop','jazz','metal','pop','reggae','rock']

In [8]:
data_3d = np.load('2D_data_dataset.npy',mmap_mode='r')
data_flattened_old = np.load('flattened_data_dataset.npy',mmap_mode='r')

In [26]:
c= np.arange(10).reshape(10,1)
d = np.matlib.repmat(c,1,70)
true_centroid=d.reshape(-1,700)

c= np.arange(10).reshape(10,1)
d = np.matlib.repmat(c,1,30)
true_centroid_t=d.reshape(-1,300)

c= np.arange(10).reshape(10,1)
d = np.matlib.repmat(c,1,100)
true_centroid_old=d.reshape(-1,1000)

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 

In [34]:
## For original data with PCA

pca=PCA(n_components=512)
data_pca=pca.fit_transform(data_flattened)
tsne_plot(data_pca)
[pred1,accuracy1,prec1,hit5_1,hit1_1]= knn_top_recommendation(data_pca,true_centroid,true_centroid_t)
reco5_value = recomm_performance(hit5_1)
reco1_value = recomm_performance(hit1_1)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.



In [36]:
## For Autoencoder Data

with open('values.pkl', 'rb') as f:
  mynewlist = pickle.load(f)
new_arr = np.array(mynewlist)
tsne_plot(new_arr)
[pred2,accuracy2,prec2,hit5_2,hit1_2]= knn_top_recommendation(new_arr,true_centroid,true_centroid_t)
reco5_value2 = recomm_performance(hit5_2)
reco1_value2 = recomm_performance(hit1_2)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


Precision is ill-defined and being set to 0.0 in labels with no predicted samples.



In [72]:
## For Siamese network Encodings
new_arr_2_f = np.load('siamese_encodings_finale.npy',mmap_mode='r')
new_arr_2 = new_arr_2_f.T
tsne_plot(new_arr_2)
[pred3,accuracy3,prec3,hit5_3,hit1_3]= knn_top_recommendation(new_arr_2,true_centroid,true_centroid_t)
reco5_value3 = recomm_performance(hit5_3)
reco1_value3 = recomm_performance(hit1_3)


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



In [94]:
print(reco5_value)
print(reco5_value2)
print(reco5_value3)
print("\n")
print(np.mean(prec1))
print(np.mean(prec2))
print(np.mean(prec3))

0.44333333333333336
0.49666666666666665
0.5833333333333334


0.16234032321290523
0.19402098671828705
0.2818743669660545


In [78]:
kmeans = KMeans(n_clusters=10).fit(data_pca)
kmeans2 = KMeans(n_clusters=10).fit(new_arr)
kmeans3 = KMeans(n_clusters=10).fit(new_arr_2)

In [86]:
[data_train,data_test] = data_split(data_pca)
[data_new_train,data_new_test] = data_split(new_arr)
[data_new_2_train,data_new_2_test] = data_split(new_arr_2)

pred_test1 = kmeans.predict(data_test)
pred_test2 = kmeans2.predict(data_new_test)
pred_test3 = kmeans3.predict(data_new_2_test)

In [89]:
cfm=confusion_matrix(true_centroid_t.T, pred_test1)
sns.heatmap(cfm,annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x1375d3dd8>

In [90]:
cfm2=confusion_matrix(true_centroid_t.T, pred_test2)
sns.heatmap(cfm2,annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x12f3743c8>

In [91]:
cfm3=confusion_matrix(true_centroid_t.T, pred_test3)
sns.heatmap(cfm3,annot=True)

<matplotlib.axes._subplots.AxesSubplot at 0x123cfe780>

In [105]:
bar_graph_data(genres,hit5_1,hit5_2,hit5_3)

30
