In [63]:
import numpy as np
from matplotlib.pyplot import axis
from numpy import linalg as la

def normalize(x):
    """Scale every row of a 2-D array to unit Euclidean length.

    Parameters
    ----------
    x : array-like, 2-D
        Rows are the vectors to normalise.

    Returns
    -------
    numpy.ndarray
        A new floating-point array with the same shape as ``x``. The input
        is left untouched (the previous implementation wrote the result back
        into ``x``, which silently truncated to 0 for integer arrays).

    Notes
    -----
    Rows whose norm is zero are returned unchanged (all zeros) instead of
    being turned into NaN by a division by zero.
    """
    rows = np.asarray(x)
    # One norm per row, kept as a column so it broadcasts over the row.
    norms = np.sqrt(np.sum(rows**2, axis=1, keepdims=True))
    # A zero row has no direction; dividing by 1 leaves it as zeros.
    norms[norms == 0] = 1
    return rows / norms

def discetizationEigenVectorData(eigenVector):
    """Turn each row into a one-hot indicator of its largest entry.

    Parameters
    ----------
    eigenVector : 2-D array
        One row per sample.

    Returns
    -------
    numpy.ndarray
        Float array of the same shape, holding 1 at the position of each
        row's maximum (first occurrence on ties) and 0 elsewhere.
    """
    n_rows = len(eigenVector)
    n_cols = len(eigenVector[0])
    indicator = np.zeros((n_rows, n_cols))
    winners = [row.argmax() for row in eigenVector]
    indicator[np.arange(n_rows), winners] = 1
    return indicator

def discretization(eigenVectors):
    """Convert continuous eigenvectors into a discrete cluster indicator matrix.

    The rows are first normalised with ``normalize``. A square matrix ``R``
    is then initialised: its first column is the middle row of the data, and
    each further column is the row with the smallest accumulated absolute
    projection onto the columns chosen so far. Up to 20 refinement rounds
    follow; each one discretises ``eigenVectors @ R`` by row-wise argmax,
    takes the SVD of ``discrete.T @ eigenVectors`` and sets ``R = V @ U.T``.
    The loop stops early once the objective ``2 * (n - sum(singular values))``
    changes by less than machine epsilon.

    Parameters
    ----------
    eigenVectors : 2-D array, shape (n, k)

    Returns
    -------
    numpy.ndarray
        The one-hot indicator matrix from the last refinement round.
    """
    vectors = normalize(eigenVectors)
    n_samples = len(vectors)
    n_groups = len(vectors[0])

    # Initial matrix R: start from the middle sample ...
    rotation = np.zeros((n_groups, n_groups))
    rotation[:, 0] = vectors[round(n_samples / 2) - 1]

    # ... then repeatedly add the sample least aligned with the columns so far.
    accumulated = np.zeros((n_samples, 1), dtype=int)
    for col in range(1, n_groups):
        previous_column = rotation[:, col - 1].reshape(n_groups, 1)
        accumulated = accumulated + np.abs(np.matmul(vectors, previous_column))
        rotation[:, col] = vectors[accumulated.argmin()]

    previous_objective = 0
    for _ in range(20):
        discrete = discetizationEigenVectorData(np.matmul(vectors, rotation))
        left, singular, right_t = la.svd(np.matmul(discrete.T, vectors))
        objective = 2 * (n_samples - np.sum(singular))
        if abs(objective - previous_objective) < np.finfo(float).eps:
            break

        previous_objective = objective
        # numpy returns V transposed, so V @ U.T is right_t.T @ left.T
        rotation = np.matmul(right_t.T, left.T)
    return discrete
    
def SpectralClustering(affinity, K, type=3):
    """Cluster samples from a pairwise affinity matrix by spectral clustering.

    Parameters
    ----------
    affinity : array-like, shape (n, n)
        Pairwise similarities between the samples.
    K : int
        Number of clusters, which is also the number of eigenvectors kept.
    type : {1, 2, 3}, default 3
        Laplacian variant, with ``L = D - affinity`` and ``D`` the diagonal
        degree matrix:
        1 -> ``L``; 2 -> ``D^-1 L``; 3 -> ``D^-1/2 L D^-1/2``.

    Returns
    -------
    list of int
        1-based cluster label for every sample.

    Raises
    ------
    ValueError
        If ``type`` is not 1, 2 or 3.
    """
    if type not in (1, 2, 3):
        raise ValueError("type must be 1, 2 or 3, got %r" % (type,))

    # Work in floating point so the eps substitution below is not truncated.
    affinity = np.asarray(affinity, dtype=float)

    d = np.sum(affinity, axis=1)
    # Isolated samples get a tiny degree so the inverses below stay finite.
    d[d == 0] = np.finfo(float).eps

    L = np.diag(d) - affinity
    if type == 1:
        NL = L
    elif type == 2:
        # Matrix product; `Di * L` would multiply element-wise and drop
        # every off-diagonal entry of L.
        NL = np.matmul(np.diag(1 / d), L)
    else:
        Di = np.diag(1 / np.sqrt(d))
        NL = np.matmul(np.matmul(Di, L), Di)

    eigenvalues, eigenvectors = la.eig(NL)
    # Keep the eigenvectors that belong to the K smallest eigenvalues.
    order = sorted(range(len(eigenvalues)), key=lambda idx: eigenvalues[idx])
    U = eigenvectors[:, order[0:K]]
    if type == 3:
        U = normalize(U)

    eigDiscrete = discretization(U)
    # The column of the 1 in each indicator row is the cluster; labels start at 1.
    return [int(row.argmax()) + 1 for row in eigDiscrete]

In [99]:
import snf
from snf import get_n_clusters
from data_convert import load_data_txt
from random import seed
from random import shuffle
import numpy as np
from sklearn.metrics import normalized_mutual_info_score

def generateCVRuns(data, N=10, K=5):
    """Build N shuffled K-way partitions of the indices ``0 .. len(data)-1``.

    The global ``random`` generator is seeded with ``N`` before shuffling, so
    the same arguments always produce the same partitions.

    Parameters
    ----------
    data : sized object
        Only its length is used.
    N : int
        Number of repeated runs.
    K : int
        Number of index groups per run.

    Returns
    -------
    list
        ``N`` entries, each the list of ``K`` index arrays returned by
        ``np.array_split`` for one shuffled ordering.
    """
    seed(N)
    n_items = len(data)
    runs = []
    for _ in range(N):
        order = np.arange(n_items)
        shuffle(order)
        runs.append(np.array_split(order, K))
    return runs

# Load the three data views plus the label file.
# NOTE(review): these are absolute, machine-specific paths - consider reading
# them from a configurable data directory instead.
datas = load_data_txt(
    r"C:\Users\tpavo\Desktop\Tirocinio\tcga_aml\OS\0\gene_tr_0_0.txt",
    r"C:\Users\tpavo\Desktop\Tirocinio\tcga_aml\OS\0\meth_tr_0_0.txt",
    r"C:\Users\tpavo\Desktop\Tirocinio\tcga_aml\OS\0\mirna_tr_0_0.txt",
    r"C:\Users\tpavo\Desktop\Tirocinio\tcga_aml\OS\0\labels_OS_tr_0_0.txt",
)

# One affinity network per view, then fuse them into a single network W.
affinity_networks = snf.make_affinity(datas.data, K=20, mu=0.5)
W = snf.snf(affinity_networks, K=20)

# Settings shared with the evaluation cells below.
lab = datas.labels
clm = 'spectral'
clustInfo = False

In [106]:
def snf_cv(W, lab, clm, infocl, K=5, N=10):
    """Median NMI of clustering sub-networks of ``W`` over repeated index splits.

    ``generateCVRuns`` produces ``N`` shuffled partitions of the sample
    indices into ``K`` groups. For every group the matching sub-matrix of
    ``W`` is clustered and compared with the true labels of that group
    through normalised mutual information (NMI).

    Parameters
    ----------
    W : numpy.ndarray, shape (n, n)
        Similarity network, indexed with ``np.ix_``.
    lab : indexable
        True label of every sample, indexed by sample position.
    clm : str
        Clustering method; only ``'spectral'`` is supported.
    infocl : bool
        If true, the number of clusters is the number of distinct values in
        ``lab``; otherwise it is the first value returned by
        ``get_n_clusters`` for each sub-network.
    K : int, default 5
        Number of groups per run.
    N : int, default 10
        Number of repeated runs.

    Returns
    -------
    float
        Median over the runs of the per-run median NMI.

    Raises
    ------
    ValueError
        If ``clm`` is not ``'spectral'``.
    """
    # Fail fast instead of hitting an unbound `group_k` inside the loop.
    if clm != 'spectral':
        raise ValueError("unsupported clustering method %r; only 'spectral' is implemented" % (clm,))

    # Independent of the split, so compute it once (and only if requested).
    n_label_clusters = len(np.unique(lab)) if infocl else None

    run_medians = []
    for run in generateCVRuns(lab, N=N, K=K):
        fold_scores = []
        for fold in run:
            W_k = W[np.ix_(fold, fold)]
            lab_k = [lab[el] for el in fold]
            if infocl:
                nclust = n_label_clusters
            else:
                nclust = get_n_clusters(W_k)[0]
            group_k = SpectralClustering(W_k, nclust)
            fold_scores.append(normalized_mutual_info_score(group_k, lab_k))
        run_medians.append(np.median(fold_scores))
    return np.median(run_medians)

In [110]:
# Score the fused network W against the labels with snf_cv, requesting
# spectral clustering; as the cell's last expression, the result is displayed.
snf_cv(W, lab, 'spectral', False)

63


0.08343009733226453

In [111]:
import snf
from snf import get_n_clusters
from data_convert import load_data_txt
from random import seed
from random import shuffle
import numpy as np
from sklearn.metrics import normalized_mutual_info_score
from joblib import Parallel, delayed
from sklearn.utils import Bunch

def generateCVRuns(data, N=10, K=5):
    """Build N shuffled K-way partitions of the indices ``0 .. len(data)-1``.

    The global ``random`` generator is seeded with ``N`` before shuffling, so
    the same arguments always produce the same partitions.

    Parameters
    ----------
    data : sized object
        Only its length is used.
    N : int
        Number of repeated runs.
    K : int
        Number of index groups per run.

    Returns
    -------
    list
        ``N`` entries, each the list of ``K`` index arrays returned by
        ``np.array_split`` for one shuffled ordering.
    """
    seed(N)
    n_items = len(data)
    runs = []
    for _ in range(N):
        order = np.arange(n_items)
        shuffle(order)
        runs.append(np.array_split(order, K))
    return runs

def snf_cv(W, lab, clm, infocl, K=5, N=10):
    """Median NMI of clustering sub-networks of ``W`` over repeated index splits.

    ``generateCVRuns`` produces ``N`` shuffled partitions of the sample
    indices into ``K`` groups. For every group the matching sub-matrix of
    ``W`` is clustered and compared with the true labels of that group
    through normalised mutual information (NMI).

    Parameters
    ----------
    W : numpy.ndarray, shape (n, n)
        Similarity network, indexed with ``np.ix_``.
    lab : indexable
        True label of every sample, indexed by sample position.
    clm : str
        Clustering method; only ``'spectral'`` is supported.
    infocl : bool
        If true, the number of clusters is the number of distinct values in
        ``lab``; otherwise it is the first value returned by
        ``get_n_clusters`` for each sub-network.
    K : int, default 5
        Number of groups per run.
    N : int, default 10
        Number of repeated runs.

    Returns
    -------
    float
        Median over the runs of the per-run median NMI.

    Raises
    ------
    ValueError
        If ``clm`` is not ``'spectral'``.
    """
    # Fail fast instead of hitting an unbound `group_k` inside the loop.
    if clm != 'spectral':
        raise ValueError("unsupported clustering method %r; only 'spectral' is implemented" % (clm,))

    # Independent of the split, so compute it once (and only if requested).
    n_label_clusters = len(np.unique(lab)) if infocl else None

    run_medians = []
    for run in generateCVRuns(lab, N=N, K=K):
        fold_scores = []
        for fold in run:
            W_k = W[np.ix_(fold, fold)]
            lab_k = [lab[el] for el in fold]
            if infocl:
                nclust = n_label_clusters
            else:
                nclust = get_n_clusters(W_k)[0]
            group_k = SpectralClustering(W_k, nclust)
            fold_scores.append(normalized_mutual_info_score(group_k, lab_k))
        run_medians.append(np.median(fold_scores))
    return np.median(run_medians)
            

def NMI_tuning(distL, K, alpha, lab, clm, infocl):
    """Score one (K, alpha) setting: build affinities, fuse them, evaluate.

    Parameters
    ----------
    distL
        Input handed to ``snf.make_affinity`` unchanged.
    K : int
        Neighbourhood size, used for both ``snf.make_affinity`` and ``snf.snf``.
    alpha : float
        Passed to ``snf.make_affinity`` as ``mu``.
    lab, clm, infocl
        Forwarded to ``snf_cv``.

    Returns
    -------
    The value returned by ``snf_cv`` for the fused network.
    """
    fused_network = snf.snf(snf.make_affinity(distL, K=K, mu=alpha), K=K)
    return snf_cv(fused_network, lab, clm=clm, infocl=infocl)

def snf_tuning(distL, lab, clm, infocl, K_values=None, alpha_values=None, n_jobs=2):
    """Grid-search K and alpha by the score returned from ``NMI_tuning``.

    Every (K, alpha) pair is evaluated in parallel with ``NMI_tuning``. The
    best K is the one whose best alpha scores highest, and the best alpha is
    the maximiser within that K's row (first occurrence on ties).

    Parameters
    ----------
    distL, lab, clm, infocl
        Forwarded to ``NMI_tuning`` for every grid point.
    K_values : iterable of int, optional
        Candidate K values; defaults to 10, 11, ..., 30.
    alpha_values : iterable of float, optional
        Candidate alpha values; defaults to 0.30, 0.35, ..., 0.80.
    n_jobs : int, default 2
        Number of joblib workers.

    Returns
    -------
    sklearn.utils.Bunch
        ``best_K``, ``best_alpha`` and ``tab_median_NMI``, a 2-D array with
        one row per K and one column per alpha.

    Raises
    ------
    ValueError
        If either candidate list is empty.
    """
    if K_values is None:
        # min and max K values
        minK = 10
        maxK = 30
        stepK = 1
        K_values = range(minK, maxK + stepK, stepK)
    if alpha_values is None:
        # min and max alpha values
        min_alpha = 0.3
        max_alpha = 0.8
        step_alpha = 0.05
        # linspace with an explicit count: arange with a float step can gain
        # or lose the end point through rounding.
        n_alpha = int(round((max_alpha - min_alpha) / step_alpha)) + 1
        alpha_values = np.round(np.linspace(min_alpha, max_alpha, n_alpha), 2)

    K_values = list(K_values)
    alpha_values = list(alpha_values)
    nk = len(K_values)
    nalpha = len(alpha_values)
    if nk == 0 or nalpha == 0:
        raise ValueError("K_values and alpha_values must both be non-empty")

    # Parallel returns one flat list in submission order: K outer, alpha inner.
    NMI_tun = Parallel(n_jobs=n_jobs)(
        delayed(NMI_tuning)(distL, k, alpha, lab, clm, infocl)
        for k in K_values
        for alpha in alpha_values
    )

    # Row i holds the scores of K_values[i] across all alphas.
    tab_median_NMI = np.asarray(NMI_tun, dtype=float).reshape(nk, nalpha)

    idx_max_alpha_fk = tab_median_NMI.argmax(axis=1)
    max_nmi_fk = tab_median_NMI.max(axis=1)

    best_K_idx = int(max_nmi_fk.argmax())
    best_K = K_values[best_K_idx]

    best_alpha_idx = int(idx_max_alpha_fk[best_K_idx])
    best_alpha = alpha_values[best_alpha_idx]

    return Bunch(best_K=best_K, best_alpha=best_alpha, tab_median_NMI=tab_median_NMI)