In [1]:
%matplotlib inline

import os
import time
import pickle
import random
import warnings
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
import data_normalization
from sklearn import metrics
import matplotlib.pyplot as plt
from copy import copy, deepcopy
from pandas import DataFrame as df
from sklearn.cluster import OPTICS
from collections import OrderedDict
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from sklearn.ensemble import IsolationForest
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import precision_recall_fscore_support


In [2]:
from sklearn.cluster import Birch
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
# from sklearn.cluster import OPTICS
from sklearn.cluster import SpectralClustering
from sklearn.cluster import AgglomerativeClustering


In [3]:
current_seed = np.random.get_state()[1][0]
with open('granary_realdata.txt', 'a+') as o:
    o.write(str(current_seed)+'\n')

In [4]:
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True, linewidth=150, precision=2)

In [5]:
def flat_ground_truth(ground_truth):
    k = 0
    tn = ['good', 'bad']
    labels_true, labels_true_indices, target_name = [], [], []
    for v in ground_truth:
        for vv in v:
            labels_true.append(int(k))
            labels_true_indices.append(vv)
            target_name.append(tn[k])
        k += 1
        
    return labels_true, labels_true_indices

In [6]:
def flat_cluster_results(cluster_results):

    labels_pred, labels_pred_indices = [], []
    for k, v in cluster_results.items():
        labels_pred += [int(k) for i in v]
        labels_pred_indices += [i for i in v]
    
    return labels_pred, labels_pred_indices

### Quetelet Coefficient and Relative Cluster Contribution (QCRCC) table

In [7]:
def QCRCC(Y, results):
    
    """
    input: the original entity-to-feature matrix and the one which  is used for clustering process.
    The original adjacency matrix and the one which is used for the clustering process.
    And finally, the sorted clustering results dictionary.
    computes and prints out grand means of data matrices, "features mean" in each cluster, 
    cluster contribution, relative_difference
    return: Dict of Dict 
    """
    
    QScaD = {}
    
    k = 0
    for v in results:
        
        TY = np.sum(np.multiply(Y, Y))
        Tv = np.sum(np.multiply(Y, Y), axis=0)
        Cent, Ck, Bcnt, Bk, Rcnt, Rk, Dcnt, Dk, RCI, Rck, Bv, \
        YExpl, YUnExpl = {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}
        
        i = 0
        for pivot, indices in v.items():
            
            Cent[str(i)] = np.mean(Y[indices, :], axis=0)
            Ck[str(i)] = np.mean(Y[indices, :], )
            B_kv = np.divide(len(indices) * np.mean(np.power(Y[indices, :], 2), axis=0), 1)  # B_kv
            Bcnt[str(i)] = B_kv
            Bk[str(i)] = np.sum(B_kv)  # B_k+
            
            B_kRv = np.divide(B_kv, Tv)  # B(k|v) 
            Rcnt[str(i)] = B_kRv
            Rk[str(i)] = np.sum(B_kRv)
            
            G_kRv = np.subtract(B_kRv, np.divide(np.sum(B_kRv), TY))  # g(K|v)
            Dcnt[str(i)] = G_kRv
            Dk[str(i)] = np.sum(G_kRv)
            
            Q_kRv = np.subtract(np.divide(np.multiply(TY, B_kv), np.multiply(Tv, np.sum(B_kv))), 1)  # q(k|v)
            RCI[str(i)] = Q_kRv
            Rck[str(i)] = np.sum(Q_kRv)
                        
            i += 1
        
            QScaD['Cent'] = Cent 
            QScaD['Ck'] = Ck
            QScaD['Bcnt'] = Bcnt
            QScaD['Rcnt'] = Rcnt
            QScaD['Dcnt'] = Dcnt
            QScaD['RCI'] = RCI

            QScaD['Bk'] = Bk
            QScaD['Rk'] = Rk
            QScaD['Dk'] = Dk
            QScaD['Rck'] = Rck

        
    k += 1
    return QScaD

In [8]:
def FNR_FPR(labels_true, labels_pred):
    tn, fp, fn, tp = confusion_matrix(labels_true, labels_pred).ravel()
    fnr = fn/(fn + tp)
    fpr = fp/(fp+tn)
    return fnr, fpr
    

In [9]:
def perf_measure(y_true, y_pred):
    
    cnf_matrix = confusion_matrix(y_true, y_pred)
    
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)  
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)

    FP = FP.astype(float)
    FN = FN.astype(float)
    TP = TP.astype(float)
    TN = TN.astype(float)

    # Specificity or true negative rate
    TNR = TN/(TN+FP) 
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    # Precision or positive predictive value
    PPV = TP/(TP+FP)
    # Negative predictive value
    NPV = TN/(TN+FN)
    # Fall out or false positive rate
    FPR = FP/(FP+TN)
    # False negative rate
    FNR = FN/(TP+FN)
    # False discovery rate
    FDR = FP/(TP+FP)
    # Overall accuracy
    ACC = (TP+TN)/(TP+FP+FN+TN)
    
    FSCORE = np.divide((2*PPV*TPR), (PPV+TPR))

    

    
    return PPV, TPR, FSCORE, FNR, FPR, TNR

#### Load and Visulize Y with Quantatitve features

In [10]:
name_of_particle = 'Egammac'

# X = np.load("matrices/" + name_of_particle + ".npy")
# _ = np.load("matrices/" + name_of_particle + ".npy")
X_train = np.load("matrices/" + name_of_particle +"_train.npy",)
y_train = np.load("matrices/" + name_of_particle +"_y_train.npy",)
X_val = np.load("matrices/" + name_of_particle +"_val.npy",)
y_val = np.load("matrices/" + name_of_particle +"_y_val.npy",)
X_test = np.load("matrices/" + name_of_particle +"_test.npy",)
y_test = np.load("matrices/" + name_of_particle +"_y_test.npy",)

X_train = X_train[:, :-3]
X_val = X_val[:, :-3]
X_test = X_test[:, :-3]
_, V = X_train.shape
K = 2
V

209

In [11]:
# y = X[:, -1]  # for pca
# Y, Y_rel_cntr, Yz, Yz_rel_cntr, Yrng, Yrng_rel_cntr, cnst_features = data_normalization.preprocess_Y(Yin=X, nscf={})

In [12]:
# from sklearn import preprocessing

In [13]:
# min_max_scaler = preprocessing.MinMaxScaler()
# Yrng =  min_max_scaler.fit_transform(X)

In [14]:
# scaler = preprocessing.StandardScaler().fit(X)
# scaler

In [15]:
# Yz = scaler.transform(Y)

### demonstrating the data sets 

Let us compute the PCA of each case and plot the first two principle components. Moreover, we plo the image plot of the corresponding entity-to-feature matrix next to it.


In [16]:
# colors = ['g', 'r']

# target_names = ['good', 'bad']
# lw = 2

# fig = plt.figure(figsize=(20.5, 12.5))
# pca = PCA(n_components=2)

# Y_r = pca.fit(Y).transform(Y)
# Y_r_z = pca.fit(Yz).transform(Yz)
# Y_r_r = pca.fit(Yrng).transform(Yrng)

# ax = fig.add_subplot(131)
# for color, i, target_name in zip(colors, [0., 1.], target_names):
#     plt.scatter(Y_r[y == i, 0], Y_r[y == i, 1], 
#                 color=color, alpha=.9, lw=lw,
#                 label=target_name)
    
# plt.legend(loc='best', shadow=False, scatterpoints=1)
# plt.title("PCA of " + name_of_particle +"-NN", fontsize=15)


# ax = fig.add_subplot(132)
# for color, i, target_name in zip(colors, [0, 1], target_names):
#     plt.scatter(Y_r_z[y == i, 0], Y_r_z[y == i, 1], 
#                 color=color, alpha=.9, lw=lw,
#                 label=target_name)

# plt.legend(loc='best', shadow=False, scatterpoints=1)
# plt.title("PCA of " + name_of_particle +"-Z-scoring", fontsize=15)


# ax = fig.add_subplot(133)
# for color, i, target_name in zip(colors, [0., 1.], target_names):
#     plt.scatter(Y_r_r[y == i, 0], Y_r_r[y == i, 1], 
#                 color=color, alpha=.9, lw=lw,
#                 label=target_name)

# plt.legend(loc='best', shadow=False, scatterpoints=1)
# plt.title("PCA of " + name_of_particle +"-MinMax", fontsize=15)


# plt.savefig("figures/" + name_of_particle + ".png" )
# plt.show()


In [17]:
# import matplotlib.pyplot as plt

# from sklearn import datasets
# from sklearn.decomposition import PCA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# iris = datasets.load_iris()

# X = iris.data
# y = iris.target
# target_names = iris.target_names

# pca = PCA(n_components=2)
# X_r = pca.fit(X).transform(X)

# lda = LinearDiscriminantAnalysis(n_components=2)
# X_r2 = lda.fit(X, y).transform(X)

# # Percentage of variance explained for each components
# print('explained variance ratio (first two components): %s'
#       % str(pca.explained_variance_ratio_))

# plt.figure()
# colors = ['navy', 'turquoise', 'darkorange']
# lw = 2

# for color, i, target_name in zip(colors, [0, 1, 2], target_names):
#     plt.scatter(X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=.8, lw=lw,
#                 label=target_name)
# plt.legend(loc='best', shadow=False, scatterpoints=1)
# plt.title('PCA of IRIS dataset')


In [18]:
# X = Yz
# fig = plt.figure(1, figsize=(9.5, 5.5))
# plt.clf()
# ax = Axes3D(fig, rect=[0, 0, .95, 1], elev=48, azim=134)

# plt.cla()
# pca = PCA(n_components=3)
# pca.fit(X)
# X = pca.transform(X)

# for name, label in [('component1', 0), ('component2', 1), ('component3', 2)]:
#     ax.text3D(X[labels_true == label, 0].mean(),
#               X[labels_true == label, 1].mean() + 1.5,
#               X[labels_true == label, 2].mean(), name,
#               horizontalalignment='center',
#               bbox=dict(alpha=.8, edgecolor='w', facecolor='w'))
# # Reorder the labels to have colors matching the cluster results
# # labels_true = np.choose(labels_true, [1, 2, 0])
# ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=labels_true,
#            edgecolor='k', cmap=plt.cm.viridis,)

# # ax.w_xaxis.set_ticklabels([])
# # ax.w_yaxis.set_ticklabels([])
# # ax.w_zaxis.set_ticklabels([])
# plt.title("PCA of " + name_of_particle, fontsize=15)
# # plt.savefig("figures/" + name_of_particle + "3d" + ".png" )


# INITIALLY BLANK






## Applying the proposed Algorithm on Real Data with Z-scoring Preprocessing

I Run it from terminal in order to be able to minitore it!

In [19]:
# out_ms_z, AMI_z, NMI_z, ARI_z, FSCORE_z, ARGMAX_z, T_z = apply_anc(data_type='z-score'.lower(),
#                                                                    Y=Y, labels_true=labels_true
#                                                                    , with_noise=False)

# #out_ms_z: stands for OUTput_MemberShip No_Preprocessing

# with open (os.path.join('../RealData_computation', "out_ms_z.pickle"), 'wb') as fp:  # Small_Quantatitive
#     pickle.dump(out_ms_z, fp)        
    
# with open (os.path.join('../RealData_computation', "AMI_z.pickle"), 'wb') as fp:  # Small_Quantatitive
#     pickle.dump(AMI_z, fp)

# with open (os.path.join('../RealData_computation', "NMI_z.pickle"), 'wb') as fp:  # Small_Quantatitive
#     pickle.dump(NMI_z, fp)
    
# with open (os.path.join('../RealData_computation', "ARI_z.pickle"), 'wb') as fp:  # Small_Quantatitive
#     pickle.dump(ARI_z, fp)

# with open (os.path.join('../RealData_computation', "FSCORE_z.pickle"), 'wb') as fp:  # Small_Quantatitive
#     pickle.dump(FSCORE_z, fp)

# with open (os.path.join('../RealData_computation', "ARGMAX_z.pickle"),'wb') as fp:  # Small_Quantatitive
#     pickle.dump(ARGMAX_z, fp)
    
# with open (os.path.join('../RealData_computation', "T_z.pickle"), 'wb') as fp:  # Small_Quantatitive
#     pickle.dump(T_z, fp)

# print("Results are saved!")

## Loading the saved Results 

Could save time for the case of large data sets

In [20]:
# with open (os.path.join('../RealData_computation', "out_ms_z.pickle"), 'rb') as fp:  # Clustering Results
#     out_ms_z = pickle.load(fp)        
    
# with open (os.path.join('../RealData_computation', "AMI_z.pickle"), 'rb') as fp:  # AMI
#     AMI_z = pickle.load(fp)

# with open (os.path.join('../RealData_computation', "NMI_z.pickle"), 'rb') as fp:  # NMI
#     NMI_z = pickle.load(fp)
    
# with open (os.path.join('../RealData_computation', "ARI_z.pickle"), 'rb') as fp:  # ARI
#     ARI_z = pickle.load(fp)

# with open (os.path.join('../RealData_computation', "FSCORE_z.pickle"), 'rb') as fp:  # FSCORE
#     FSCORE_z = pickle.load(fp)

# with open (os.path.join('../RealData_computation', "ARGMAX_z.pickle"), 'rb') as fp:  # ARGMAX, pivot which returns max score of ARI
#     ARGMAX_z = pickle.load(fp)  
    
# with open (os.path.join('../RealData_computation', "T_z.pickle"), 'rb') as fp:  # Clustering Results
#     T_z = pickle.load(fp)   


### Evaluating the Performance of Algorithm using ARI, NMI and AMI

In [21]:
# res_rep_ari = []  # ARI results of repeatation for the corresponding setting
# res_rep_nmi = []  # NMI results of repeatation for the corresponding setting
# res_rep_ami = []  # AMI results of repeatation for the corresponding setting
# for pivot, result in ARI_z.items():
#     res_rep_ari.append(result)
#     res_rep_nmi.append(NMI_z[pivot])
#     res_rep_ami.append(AMI_z[pivot])

# ave_ari = np.mean(np.asarray(res_rep_ari))
# std_ari = np.std(np.asarray(res_rep_ari))
# max_ari = max(res_rep_ari)

# ave_nmi = np.mean(np.asarray(res_rep_nmi))
# std_nmi = np.std(np.asarray(res_rep_nmi))
# max_nmi = max(res_rep_nmi)

# ave_ami = np.mean(np.asarray(res_rep_ami))
# std_ami = np.std(np.asarray(res_rep_ami))
# max_ami = max(res_rep_ami)
        
# # print("Setting: %.2f" % pivot , "ARI-max=%.2f," % max_ari ,"ARI-ave=%.2f," % ave_ari, "ARI-std=%.2f," % std_ari, "NMI max: %.2f" % max_nmi ,"NMI ave=%.2f," % ave_nmi, "NMI std=%.2f," % std_nmi,)
# print("1:", max_ari , "2:", ave_ari, "3:", std_ari, "4:", max_nmi ,"5:", ave_nmi,"6:", std_nmi,)

## Cluster Analysis - Data are preprocced with Z-scoring

In [22]:
# for pivot, result in T_z.items():
#     nc = 0
#     print("The impact of features in the Cluster Recovery with Pivot equal to: " + str(pivot))
#     for k, v in T_z[pivot]['Cent'].items():
#         print("cluster"+ str(nc+1) + ": ", 'Cent:', T_z[pivot]['Cent'][str(nc)])
#         print("cluster"+ str(nc+1) + ": ", 'Bcnt:', T_z[pivot]['Bcnt'][str(nc)] )
#         print("cluster"+ str(nc+1) + ": ", 'Rcnt:', T_z[pivot]['Rcnt'][str(nc)])
#         print("cluster"+ str(nc+1) + ": ", 'Dcnt:', T_z[pivot]['Dk'][str(nc)])
#         print("cluster"+ str(nc+1) + ": ", 'RCI: ', T_z[pivot]['RCI'][str(nc)])
#         print(" ")
#         nc += 1 
#     print( )
#     print( )

# Comparing with other works from the literature 

Since in this setting we have more than two clusters, applying One-class SVM and Isolated forest are not feasible, because those algorithms are devoted to anomaly detection though here we aim to clusterize the data.





## Hierarchical clustering (Agglomerative) - Without Noisy features

[1] ref: https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering


In [23]:
clustering_agg = AgglomerativeClustering(n_clusters=K).fit(X_test)

labels_pred_agg = clustering_agg.labels_     

AMI_agg = metrics.adjusted_mutual_info_score(y_test, labels_pred_agg)
NMI_agg = metrics.normalized_mutual_info_score(y_test, labels_pred_agg)
ARI_agg = metrics.adjusted_rand_score(y_test, labels_pred_agg)
FSCORE_agg = precision_recall_fscore_support(y_test, labels_pred_agg, average='weighted')   
PPV, TPR, FSCORE, FNR, FPR, TNR = perf_measure(y_true=y_test, y_pred=labels_pred_agg)

PPV, TPR, FSCORE, FNR, FPR, TNR

(array([0.99, 0.03]),
 array([0.85, 0.3 ]),
 array([0.91, 0.05]),
 array([0.15, 0.7 ]),
 array([0.7 , 0.15]),
 array([0.3 , 0.85]))

## Spectral Clustering - Not applicable

Ref [2]
Normalized cuts and image segmentation, 2000 Jianbo Shi, Jitendra Malik http://citeseer.ist.psu.edu/viewdoc/summary?doi=10.1.1.160.2324

A Tutorial on Spectral Clustering, 2007 Ulrike von Luxburg http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.165.9323

Multiclass spectral clustering, 2003 Stella X. Yu, Jianbo Shi https://www1.icsi.berkeley.edu/~stellayu/publication/doc/2003kwayICCV.pdf

In [24]:
# # Y, _, Yz, _, Yrng, _, = preprocess_Y(Yin=Y)

# clustering_spec = SpectralClustering(n_clusters=K, n_jobs=5).fit(X_test)

# labels_pred_spec = clustering_spec.labels_ 


# AMI_spec = metrics.adjusted_mutual_info_score(labels_true, labels_pred_spec)
# NMI_spec = metrics.normalized_mutual_info_score(labels_true, labels_pred_spec)
# ARI_spec = metrics.adjusted_rand_score(labels_true, labels_pred_spec)
# FSCORE_spec = precision_recall_fscore_support(labels_true, labels_pred_spec, average='weighted')
# fnr_spec, fpr_spec = FNR_FPR(labels_true=y_test, labels_pred=labels_pred_spec)

# print("FSCORE:", FSCORE_spec, "FNR:", fnr_spec, "FPR:", fpr_spec)

## DBSCAN - Density-Based Spatial Clustering - Without Noisy Features

Ref [3]
Ester, M., H. P. Kriegel, J. Sander, and X. Xu, “A Density-Based Algorithm for Discovering Clusters in Large Spatial Databases with Noise”. In: Proceedings of the 2nd International Conference on Knowledge Discovery and Data Mining, Portland, OR, AAAI Press, pp. 226-231. 1996

Schubert, E., Sander, J., Ester, M., Kriegel, H. P., & Xu, X. (2017). DBSCAN revisited, revisited: why and how you should (still) use DBSCAN. ACM Transactions on Database Systems (TODS), 42(3), 19.


In [25]:
# Epsilon, min_sample
setting_dbscan = [(10, 2), ]  #(20, 2), (10, 15), (20, 2), (20, 5), (20, 15)]

In [26]:
# Y, _, Yz, _, Yrng, _, = preprocess_Y(Yin=X_test)

for i in setting_dbscan:
    
    print("setting:", i)
    clustering_dbscan = DBSCAN(eps=i[0], min_samples=[1], n_jobs=-2).fit(X_test)

    labels_pred_dbscan = clustering_dbscan.labels_ 

    with open (os.path.join('RealData_computation', "clustering_dbscan" + str(i)+ ".pickle"), 'wb') as fp:  # Small_Quantatitive
        pickle.dump(clustering_dbscan, fp)  

    AMI_dbscan = metrics.adjusted_mutual_info_score(y_test, labels_pred_dbscan)
    NMI_dbscan = metrics.normalized_mutual_info_score(y_test, labels_pred_dbscan)
    ARI_dbscan = metrics.adjusted_rand_score(y_test, labels_pred_dbscan)
    FSCORE_dbscan = precision_recall_fscore_support(y_test, labels_pred_dbscan, average='weighted')
    PPV, TPR, FSCORE, FNR, FPR, TNR = perf_measure(y_true=y_test, y_pred=labels_pred_dbscan)

PPV, TPR, FSCORE, FNR, FPR, TNR

setting: (10, 2)


(array([0.99, 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  ,
        0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.  , 0.

## OPTICS ( Ordering Points To Identify the Clustering Structure) - Without Noisy Features

Ref [4]: 

Ankerst, Mihael, Markus M. Breunig, Hans-Peter Kriegel, and Jörg Sander. “OPTICS: ordering points to identify the clustering structure.” ACM SIGMOD Record 28, no. 2 (1999): 49-60.

Schubert, Erich, Michael Gertz. “Improving the Cluster Structure Extracted from OPTICS Plots.” Proc. of the Conference “Lernen, Wissen, Daten, Analysen” (LWDA) (2018): 318-329


In [27]:

# # fit the model
# clustering_optic = OPTICS(n_jobs=-2).fit(X_test)

# labels_pred_optic = clustering_optic.labels_ 


# AMI_optic = metrics.adjusted_mutual_info_score(labels_true, labels_pred_optic)
# NMI_optic = metrics.normalized_mutual_info_score(labels_true, labels_pred_optic)
# ARI_optic = metrics.adjusted_rand_score(labels_true, labels_pred_optic)
# FSCORE_optic = precision_recall_fscore_support(labels_true, labels_pred_optic, average='weighted')
# fnr_optic, fpr_optic = FNR_FPR(labels_true=y_test, labels_pred=labels_pred_optic)

# print("FSCORE:", FSCORE_optic, "fnr:", fnr_optic, "fpr:", fpr_optic)


In [28]:
# with open (os.path.join('../RealData_computation', "clustering_optic.pickle"), 'rb') as fp:  # Small_Quantatitive
#     clustering_optic = pickle.load(fp)  

# labels_pred_optic = clustering_optic.labels_ 
# AMI_optic = metrics.adjusted_mutual_info_score(labels_true, labels_pred_optic)
# NMI_optic = metrics.normalized_mutual_info_score(labels_true, labels_pred_optic)
# ARI_optic = metrics.adjusted_rand_score(labels_true, labels_pred_optic)
# FSCORE_optic = precision_recall_fscore_support(labels_true, labels_pred_optic, average='weighted')


## Birch  
Ref [4]

Tian Zhang, Raghu Ramakrishnan, Maron Livny BIRCH: An efficient data clustering method for large databases. https://www.cs.sfu.ca/CourseCentral/459/han/papers/zhang96.pdf

Roberto Perdisci JBirch - Java implementation of BIRCH clustering algorithm https://code.google.com/archive/p/jbirch

In [29]:
# # fit the model
# clustering_birch = Birch(n_clusters=K).fit(X_test)

# labels_pred_birch = clustering_birch.labels_ 


# AMI_birch = metrics.adjusted_mutual_info_score(y_test, labels_pred_birch)
# NMI_birch = metrics.normalized_mutual_info_score(y_test, labels_pred_birch)
# ARI_birch = metrics.adjusted_rand_score(y_test, labels_pred_birch)
# FSCORE_birch = precision_recall_fscore_support(y_test, labels_pred_birch, average='weighted')
# fnr_birch, fpr_birch = FNR_FPR(labels_true=y_test, labels_pred=labels_pred_birch)

# print("FSCORE:", FSCORE_birch, "fnr:", fnr_birch, "fpr:", fpr_birch)


## K-mean 
Ref[5] 

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html

In [30]:
# %%time
# Y, _, Yz, _, Yrng, _, = preprocess_Y(Yin=Y)


# fit the model
clustering_kmean = KMeans(n_clusters=K, n_jobs=-2).fit(X_test)

labels_pred_kmean = clustering_kmean.labels_ 

# with open (os.path.join('../RealData_computation', "clustering_kmean.pickle"), 'wb') as fp:  # Small_Quantatitive
#     pickle.dump(clustering_kmean, fp)  


AMI_kmean = metrics.adjusted_mutual_info_score(y_test, labels_pred_kmean)
NMI_kmean = metrics.normalized_mutual_info_score(y_test, labels_pred_kmean)
ARI_kmean = metrics.adjusted_rand_score(y_test, labels_pred_kmean)
FSCORE_kmean = precision_recall_fscore_support(y_test, labels_pred_kmean, average='weighted')
PPV, TPR, FSCORE, FNR, FPR, TNR = perf_measure(y_true=y_test, y_pred=labels_pred_kmean)

PPV, TPR, FSCORE, FNR, FPR, TNR


(array([0.99, 0.02]),
 array([0.67, 0.45]),
 array([0.8 , 0.04]),
 array([0.33, 0.55]),
 array([0.55, 0.33]),
 array([0.45, 0.67]))

### spilitting data into train and test

## Comparison 

In [31]:
# print("\t", "precision", "recall", "fscore")
# print("DBSCAN          :", "%.2f" % FSCORE_dbscan[0],  "%.2f" % FSCORE_dbscan[1],  "%.2f" % FSCORE_dbscan[2])
# print("OPTIC           :", "%.2f" % FSCORE_optic[0],   "%.2f" % FSCORE_optic[1],   "%.2f"  % FSCORE_optic[2])
# print("BIRCH           :", "%.2f" % FSCORE_birch[0],   "%.2f" % FSCORE_birch[1],   "%.2f" % FSCORE_birch[2])
# print("KMEAN           :", "%.2f" % FSCORE_kmean[0],   "%.2f" % FSCORE_kmean[1],   "%.2f" % FSCORE_kmean[2])
# print("Isolation Forest:", "%.2f" % FSCORE_IsolFor[0], "%.2f" % FSCORE_IsolFor[1], "%.2f" % FSCORE_IsolFor[2])
# print("One Class SVM   :", "%.2f" % FSCORE_ocs[0],     "%.2f" % FSCORE_ocs[1],     "%.2f" % FSCORE_ocs[2])


In [32]:
print("finish")

finish


### Conclusions:

..