In [47]:
import os
import json
import time
import pandas as pd
import numpy as np
import scipy
import math
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import eigs
from scipy.sparse import identity
from numpy import linalg
from scipy import cluster
from scipy.sparse import csgraph
import itertools

In [54]:
#User input function
def get_parameters():
    answer = input("\nDo you want to use default parameters? y/n ")
    
    if answer == "y":
        k = 2
    else:
        k = int(input("Number of clusters: "))     
    return k

In [5]:
#Function used for import of data in json format
def data_iterator(path):
    
    print("Importing data. Please wait.\n")
    
    for root, dirs, files in os.walk(path):
        for f in files:
            if f.endswith('.json'):
                fp = os.path.join(root,f)
                with open(fp) as o:
                    data = json.load(o)
                yield {"similars" : data["similars"], "track_id": data["track_id"], "tags": data["tags"]}

In [6]:
#function will return matrix W of csr format

def create_weight_matrix():
    
    row,col,data=[],[],[]
    pairs = set() #set of pairs of nodes (row, column) for which the edge between them exists
    
    tmp = dict()
    
    per_row = dict()

    
    for i in range(len(df)):
        #row will be index of the song S
        if track_id[i] not in nodes:
            all_songs.append(track_id[i])
            nodes[track_id[i]] = len(nodes)
        
        r = nodes[track_id[i]]

        #now indices of similar songs are needed
        for j in range(len(similars[i])):
            similar_song = similars[i][j][0] #similar song
            weight = similars[i][j][1] #weight
            
            #column will be index of the song which is similar to S
            if similar_song not in nodes:
                nodes[similar_song] = len(nodes)
                all_songs.append(similar_song)
            c = nodes[similar_song]

            pair = (r,c)
            sim = (c,r)
            
            if pair not in tmp:
                tmp[pair] = weight
                tmp[sim] = weight
            else:
                current_weight = tmp[pair]
                if weight > current_weight:
                    tmp[pair] = weight
                    tmp[sim] = weight

    for pair in tmp:
        row.append(pair[0])
        col.append(pair[1])
        data.append(tmp[pair])
                        
    # calculate the graph adjacency matrix as a coo_matrix
    N = len(nodes)
    A = scipy.sparse.coo_matrix((data,(row,col)),shape=(N,N))
    
    #convert to csr_matrix
    A = scipy.sparse.csr_matrix(A)
    
    return A

In [7]:
#LAPLACIAN
def create_laplacian_matrix(W, n = False):
    
    if n == False:
        L = csgraph.laplacian(W, normed=False)
    else:
        L = csgraph.laplacian(W, normed=True)
    
    return L

In [8]:
def spectral_clustering(lap, k = 2):

    vals, vecs = scipy.sparse.linalg.eigsh(lap,lap.shape[0]-1)
    
    if k == 2:
        eig_space = vecs[:,0]
    elif k == 3:
        eig_space = vecs[:,[0,1]]
    elif k == 4:
        eig_space = vecs[:,[0,1,2]]
    else:
        eig_space = vecs[:,[0,1,2,3]]

    c,labels = scipy.cluster.vq.kmeans2(eig_space, k)

    clusters = dict()
    
    for i,c in enumerate(labels):
        if c not in clusters:
            clusters[c] = []

        clusters[c].append(i)
        
    return clusters

In [52]:
def get_items(s):
    s_coo = s.tocoo()
    return set(zip(s_coo.row, s_coo.col))

def get_ratio_cut(W, clusters):
    cut = 0
    #for c in range(len(clusters)):
    for m in range(0,len(clusters)-1):   
        for n in range(m+1, len(clusters)):
            for i in clusters[m]:
                for j in clusters[n]:
                    if ((i,j) in get_items(W)) == True:
                        cut += W[i,j]
                        
    return cut

def get_min_cut(W):

    mincut = math.inf

    permutations = itertools.permutations(nodes)
    
    for p in permutations:
        ar = np.array(p)
        for i in range(1,len(ar)):
            clusters2 = dict()
            clusters2[0] = np.array(range(0,i))
            clusters2[1] = np.array(range(i,len(ar)-1))
            cut = get_ratio_cut(W, clusters2)
            #print(clusters2[0],clusters2[1],cut)
            if cut != 0 and cut < mincut:
                mincut = cut
                
    return mincut

In [74]:
path = input("Insert path to dataset: ")
df = pd.DataFrame(data_iterator(path))
#GLOBAL VARIABLES:
nodes = dict()
tags_unique = dict()
all_songs = []
track_id = df["track_id"]
similars = df["similars"]
tags = df["tags"]

W = create_weight_matrix()

L = create_laplacian_matrix(W)

clusters = spectral_clustering(L, 2)

print("Results of clustering: ")

for i in range(len(clusters)):
    print("To cluster number",i,"belong following songs:")
    sngs = []
    for j in clusters[i]:
        sngs.append(all_songs[j])
    print(sngs)

print("---------------------")
print("Min cut: ", get_min_cut(W))

print ("Min cut hard assignment: ",get_ratio_cut(W, clusters))

#clusters

Insert path to dataset: C:\Users\Maki\Desktop\testset
Importing data. Please wait.

Results of clustering: 
To cluster number 0 belong following songs:
['JAGODA', 'LUBENICA']
To cluster number 1 belong following songs:
['KRUSKA', 'JABUKA', 'SLJIVA']
---------------------
Min cut:  0.5
Min cut hard assignment:  0.2
