In [None]:
import os
import json
import pickle
import warnings
import numpy as np
import networkx  as nx
from sklearn import metrics
import matplotlib.pyplot as plt
from copy import copy, deepcopy
from collections import OrderedDict, defaultdict
# import simultaneous_anomalous_clustering as sanc
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import precision_recall_fscore_support

In [2]:
warnings.filterwarnings('ignore')
np.set_printoptions(suppress=True, linewidth=150, precision=2)

In [3]:
def flat_ground_truth(ground_truth):
    
    labels_true, labels_true_indices = [], []
    for k, v in ground_truth.items():
        for vv in v:
            labels_true.append(int(k)+1)
            labels_true_indices.append(vv)
            
    return labels_true, labels_true_indices

## Loading data for small-size 


In [4]:
with open(os.path.join('../data', 'MC(1000, 10, 15).pickle'), 'rb') as fp:
    SAN = pickle.load(fp)

In [5]:
SETTINGS = []
for setting, repeats in SAN.items():
    SETTINGS.append(setting)
    print("setting:", setting)

setting: (0.7, 0.3, 0.7)
setting: (0.7, 0.3, 0.9)
setting: (0.7, 0.6, 0.7)
setting: (0.7, 0.6, 0.9)
setting: (0.9, 0.3, 0.7)
setting: (0.9, 0.3, 0.9)
setting: (0.9, 0.6, 0.7)
setting: (0.9, 0.6, 0.9)


### preparing data for CESNA

In [14]:
# key = 0
# name = 'MC'

# for setting, repeats in SAN.items():
#     print("setting:", setting)
            
#     for repeat, matrices in repeats.items():
#         print("repeat:", repeat)

#         GT = matrices['GT']

#         Yin  = matrices['Y']
#         N, V = Yin.shape

#         Ynin = matrices['Yn']

#         Pin  = matrices['P']
#         Np, Vp = Pin.shape

#         # This section is not needed here for the conversion
#         k = 1
#         interval = 1
#         labels_true_sorted_final, labels_true_indices = [], []
#         for v in GT:
#             tmp_indices = []
#             for vv in range(v):
#                 labels_true_sorted_final.append(str(k))
#                 tmp_indices.append(interval+vv)

#             k += 1
#             interval += v
#             labels_true_indices += tmp_indices

#         # preparing data for CESNA
#         Yc_unique = np.zeros([N, V])
#         interval = 0

#         for v in range(V):
#             for i in range(N):
#                 Yc_unique[i, v] = Yin[i, v] + interval
#             interval += max(list(set(Yin[:, v])))


#         test_feature = []
#         key += 1
# #         key = [str(i) for i in setting]
# #         key = ''.join(key)
# #         key += str(repeat)

#         with open (os.path.join("cesna_medium_data/", str(key) + ".nodefeat"), "w") as fp:
#             for v in range(V):
#                 for i in range(N):
#                     fp.write(str(i+1))
#                     fp.write("\t")
#                     fp.write(str(int(Yc_unique[i, v])))
#                     fp.write("\n")
#                     test_feature.append((str(i+1), str(int(Yc_unique[i, v]))))

#         diff_features = []
#         for v in range(1, V):
#             for i in range(N):
#                 if not (str(i+1) , str(int(Yc_unique[i, v]))) in test_feature:
#                     diff_features.append((str(i+1) , str(int(Yc_unique[i, v]))))

#         print("  ")
#         print("diff features:", diff_features, len(diff_features))

#         test_nets = []
#         with open (os.path.join("cesna_medium_data/", str(key) + ".edges"), "w") as fp:
#             for i in range(Np):
#                 for j in range(Np):
#                     if Pin[i, j] != 0 or Pin[j, i] != 0:
#                         fp.write(str(i+1))
#                         fp.write("\t")
#                         fp.write(str(j+1))
#                         fp.write("\n")
#                         test_nets.append((str(i+1), str(j+1)))
#         #                test_nets.append((str(j+1), str(i+1)))                                                                                       

# #             diff_nets = []
# #             for i in range(Np):
# #                 print("i:", i)
# #                 for j in range(Np):
# #                     if Pin[i, j] != 0:
# #                         if not (str(i+1), str(j+1)) in test_nets:
# #                             diff_nets.append((str(i+1), str(j+1)))

# #             print("test_nets:", len(test_nets))
# #             print("diff_nets:", diff_nets, len(diff_nets))
#         print("done!")




### Evaluating CESNA



#### Loading the saved result and other post processing methods

In [15]:
ARI_CESNA = {SETTINGS[j//10]:{} for j in range(80)}  # 80 data sets 
NMI_CESNA = {SETTINGS[j//10]:{} for j in range(80)}  # 80 data sets 
AMI_CESNA = {SETTINGS[j//10]:{} for j in range(80)}  # 80 data sets 

In [20]:
print("results of applying CESNA:")
print("\t", "  p", "  q", " a/e", "\t", "  ARI     ", "  NMI",)
print(" \t", " \t", " \t", " Ave", " std", " Ave", " std")


ari_cesna, nmi_cesna, ami_cesna = [], [], []
for i in range(81):  
    
    if i+1 <= 80:
        
        repeat  = (i %10) + 1
        setting = i // 10

        # load the saved result
        with open("/home/Soroosh/snap/examples/cesna/results_medium/medium_" + str(i+1)+ "cmtyvv.txt", 'r') as fp:
            cesna = fp.readlines()
        
            
        GT = SAN[SETTINGS[setting]][repeat-1]['GT']

        # This section is not needed here for the conversion
        k = 1
        interval = 1
        labels_true_final_cesna, labels_true_indices = [], []
        for v in GT:
            tmp_indices = []
            for vv in range(v):
                labels_true_final_cesna.append(str(k))
                tmp_indices.append(interval+vv)

            k += 1
            interval += v
            labels_true_indices += tmp_indices
        
        N = len(labels_true_final_cesna)

        # flattening the result   
        cesna_indices = []
        for i in cesna:
            cesna_indices.append([int(ii) for ii in i.split()])
        
        cesna_labels = np.zeros([N])
        label = 1
        for lst in cesna_indices:
            for l in lst:
                cesna_labels[l-1] = label
            label += 1


        ari_cesna.append(metrics.adjusted_rand_score(labels_true=labels_true_final_cesna, labels_pred=cesna_labels))
        nmi_cesna.append(metrics.normalized_mutual_info_score(labels_true=labels_true_final_cesna, labels_pred=cesna_labels))
        ami_cesna.append(metrics.adjusted_mutual_info_score(labels_true=labels_true_final_cesna, labels_pred=cesna_labels))

        if repeat == 10:

            ARI_CESNA[SETTINGS[setting]][str(repeat)] = ari_cesna
            NMI_CESNA[SETTINGS[setting]][str(repeat)] = nmi_cesna
            AMI_CESNA[SETTINGS[setting]][str(repeat)] = ami_cesna

            print( SETTINGS[setting],
                  "%.3f" % np.mean(np.asarray(ari_cesna)),
                  "%.3f" % np.std(np.asarray(ari_cesna)),
                  '%0.3f' % np.mean(np.asarray(nmi_cesna)),
                  '%0.3f' % np.std(np.asarray(nmi_cesna))   
                 )
            

            ari_cesna, nmi_cesna, ami_cesna = [], [], []    


results of applying CESNA:
	   p   q  a/e 	   ARI        NMI
 	  	  	  Ave  std  Ave  std
(0.7, 0.3, 0.7) 0.715 0.128 0.838 0.061
(0.7, 0.3, 0.9) 0.764 0.068 0.886 0.034
(0.7, 0.6, 0.7) 0.016 0.008 0.251 0.023
(0.7, 0.6, 0.9) 0.060 0.024 0.352 0.064
(0.9, 0.3, 0.7) 0.849 0.076 0.921 0.026
(0.9, 0.3, 0.9) 0.894 0.053 0.939 0.031
(0.9, 0.6, 0.7) 0.474 0.089 0.661 0.059
(0.9, 0.6, 0.9) 0.632 0.058 0.752 0.039


### preparing data for SIAN

In [None]:
# key = 0
# index = 0
# for setting, repeats in SAN.items():
    
# #     print("setting:", setting)
#     for repeat, matrices in repeats.items():
# #         print("repeat:", repeat)
        
#         GT = matrices['GT']
        
#         Yin  = matrices['Y']
#         N, V = Yin.shape
        
#         Ynin = matrices['Yn']
        
#         Pin  = matrices['P']
#         Np, Vp = Pin.shape
        
#         array2list = []
#         for i in range(N):
#             array2list.append(list(Yin[i, :]))
            
#         unique_features = []
#         for i in array2list:
#             if not i in unique_features:
#                 unique_features.append(i)

# #         print("unique_features:", unique_features)
        
#         labels_comb_man = ["meta" + str(i+1) for i in range(len(unique_features))]
# #         print(len(labels_comb_man))
        
#         Ggml = nx.from_numpy_array(Pin)

#         attributes_dict = {}
                
#         label = 1
#         labels_true = []
#         for uf in unique_features:  #UniqueFeatures
#             tmp_labels = []
#             for i in range(N):
#                 if uf == array2list[i]:
#                     attributes_dict[i] ={}
#                     key = labels_comb_man[label-1]
#                     attributes_dict[i][key] = key
#                     tmp_labels.append(label)
#             label += 1
        
#         index += 1
#         nx.set_node_attributes(Ggml, attributes_dict)
#         nx.write_gml(G=Ggml, path="sian_medium_data_tmp/medium_" + str(index) + "_SIAN.gml")
        
#         with open ("sian_medium_data_tmp/medium_"+ str(index) +"_SIAN.gml", 'r') as fp:
#             GML = fp.readlines()
        
#         with open ("sian_medium_data/medium_" + str(index) +".gml", 'w') as fp:
#             for i in range(len(GML)):
#                 if i ==3:
#                     fp.write(GML[0])
#                     fp.write(GML[1])
#                     fp.write(GML[2])
#                     fp.write("    ")
#                     fp.write(GML[3].split()[0])
#                     fp.write(" ")
#                     fp.write(GML[4].split()[1])
#                     fp.write("\n")
#                     fp.write(GML[5])
#                 elif "label" in GML[i] and i!=3:
#                     fp.write(GML[i-2])
#                     fp.write(GML[i-1])
#                     fp.write("    ")
#                     fp.write(GML[i].split()[0])
#                     fp.write(" ")
#                     fp.write(GML[i+1].split()[1])
#                     fp.write("\n")
#                     fp.write(GML[i+2])
#                 elif "edge" in GML[i]:
#                     fp.write(GML[i])
#                     fp.write(GML[i+1])
#                     fp.write(GML[i+2])
#                     fp.write(GML[i+4])
                    
                    
# print("Conversion is Done!")

### Evaluating the SIAN's results



In [23]:
ARI_SIAN = {SETTINGS[j//10]:{} for j in range(80)}  # 80 data sets 
NMI_SIAN = {SETTINGS[j//10]:{} for j in range(80)}  # 80 data sets 
AMI_SIAN = {SETTINGS[j//10]:{} for j in range(80)}  # 80 data sets 

In [24]:
print("results of applying SIAN:")
print("\t", "  p", "  q", " a/e", "\t", "  ARI     ", "  NMI",)
print(" \t", " \t", " \t", " Ave", " std", " Ave", " std")

# dataset_in_which_SIAN_does_not_converge = [2, 6, 10, 21, 29, 42, 44, 47, 48, 49, 61, 62, 65, 66, 67, 68, 69] 
dataset_in_which_SIAN_produces_memory_error = []
N = 1000
ari_sian, nmi_sian, ami_sian = [], [], []

for i in range(81):  #81
    
    # load the saved result
    if i+1 <= 80:  # and not i+1 in dataset_in_which_SIAN_does_not_converge:
        repeat  = (i %10) + 1
        setting = i // 10
        with open("/home/Soroosh/Newman_2016/Newman_Clauset_code/sian_medium_results/medium_result" + str(i+1)+ ".txt", 'rb') as fp:
            SIAN = fp.readlines()
        
        sian_labels = []
        if len(SIAN) == N:  # number of nodes = number of line in sians output
            for line in range(len(SIAN)):
                tmp = SIAN[line].split()
                prob = [float(tmp[t]) for t in range(len(tmp))  if t > 1]
                sian_labels.append(np.argmax(np.asarray(prob)))
        else:
            dataset_in_which_SIAN_produces_memory_error.append(i+1)
            sian_labels += ['0' for i in range(N)]
            
            
        GT = SAN[SETTINGS[setting]][repeat-1]['GT']
        k = 1
        interval = 1
        labels_true_final_sian, labels_true_indices = [], []
        for v in GT:
            tmp_indices = []
            for vv in range(v):
                labels_true_final_sian.append(str(k))
                tmp_indices.append(interval+vv)

            k += 1
            interval += v
            labels_true_indices += tmp_indices

        
        ari_sian.append(metrics.adjusted_rand_score(labels_true=labels_true_final_sian, labels_pred=sian_labels))
        nmi_sian.append(metrics.normalized_mutual_info_score(labels_true=labels_true_final_sian, labels_pred=sian_labels, average_method='arithmetic'))
        ami_sian.append(metrics.adjusted_mutual_info_score(labels_true=labels_true_final_sian, labels_pred=sian_labels,))

        if repeat == 10:

            ARI_SIAN[SETTINGS[setting]][str(repeat)] = ari_sian
            NMI_SIAN[SETTINGS[setting]][str(repeat)] = nmi_sian
            AMI_SIAN[SETTINGS[setting]][str(repeat)] = ami_sian
            
            print( SETTINGS[setting],
                  "%.3f" % np.mean(np.asarray(ari_sian)),
                  "%.3f" % np.std(np.asarray(ari_sian)),
                  '%0.3f' % np.mean(np.asarray(nmi_sian)),
                  '%0.3f' % np.std(np.asarray(nmi_sian))   
                 )

            ari_sian, nmi_sian, ami_sian = [], [], []    


results of applying SIAN:
	   p   q  a/e 	   ARI        NMI
 	  	  	  Ave  std  Ave  std
(0.7, 0.3, 0.7) 0.000 0.000 -0.000 0.000
(0.7, 0.3, 0.9) 0.026 0.077 0.059 0.178
(0.7, 0.6, 0.7) 0.000 0.000 -0.000 0.000
(0.7, 0.6, 0.9) 0.000 0.000 -0.000 0.000


FileNotFoundError: [Errno 2] No such file or directory: '/home/Soroosh/Newman_2016/Newman_Clauset_code/sian_medium_results/medium_result48.txt'