In [1]:
import pandas as pd
import numpy as np
import os
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.metrics import *
from collections import Counter

In [2]:
class HierarchicalClustering:
    """Per-row anomaly scoring with agglomerative (hierarchical) clustering.

    Every pickle in ``./ACs`` whose file name contains ``state`` is loaded as
    one "AC". For a given row index, that row is taken from every AC and the
    resulting rows are clustered together; an item's anomaly score is
    ``1 - (size of its cluster) / (number of items)``, so members of small
    clusters score close to 1.

    Column 0 of each row is treated as the ground-truth label and the
    remaining columns as features (presumably label 1 = anomalous -- confirm
    against the code that produces the pickles).
    """

    # Defaults forwarded to sklearn's AgglomerativeClustering. They can be
    # overridden on the class or on an instance (e.g. ``hc.distance_threshold = 10``).
    affinity='euclidean'
    linkage='ward'
    distance_threshold=9

    def __init__(self, state: str):
        """Load every pickle under ``./ACs`` whose file name contains ``state``.

        Raises:
            IndexError: if no file name in ``./ACs`` contains ``state``.
        """
        self.ACs = []
        # Sorted so that the order of the concatenated scores/labels does not
        # depend on the filesystem's directory enumeration order.
        for ac_name in sorted(os.listdir('./ACs')):
            if state in ac_name:
                # NOTE(security): unpickling executes arbitrary code; only
                # load files from a trusted source.
                ac_i = pd.read_pickle('./ACs/'+ac_name)
                self.ACs.append(ac_i)
        if not self.ACs:
            # Same exception type the bare ``self.ACs[0]`` used to raise, but
            # with a message that says what went wrong.
            raise IndexError("no file in './ACs' has a name containing %r" % (state,))
        self.n_rows = len(self.ACs[0])

    def get_X_Y(self, row: int) -> (pd.DataFrame, list):
        """Collect row ``row`` from every loaded AC.

        Returns:
            ``(X, Y)`` where ``X`` is a DataFrame with one line per AC holding
            columns 1.. of that row, and ``Y`` is the list of column-0 values
            cast to ``int``.
        """
        data = []
        for AC in self.ACs:
            AC = pd.DataFrame(AC)
            data.append(AC.iloc[row,:].tolist())
        data = pd.DataFrame(data).reset_index(drop=True)
        X = data.iloc[:,1:]
        Y = data.iloc[:,0]

        return X, Y.astype(int).tolist()

    def cluster(self, x_row: pd.DataFrame) -> (list):
        """Cluster the lines of ``x_row`` and return one anomaly score per line.

        The tree is cut at ``self.distance_threshold``; each line's score is
        ``1 - cluster_size / n_lines``.
        """
        # NOTE: scikit-learn 1.2 deprecated ``affinity`` in favour of ``metric``
        # and 1.4 removed it; rename this keyword when upgrading.
        clustering = AgglomerativeClustering(
            n_clusters=None,
            affinity=self.affinity,
            linkage=self.linkage,
            distance_threshold=self.distance_threshold,
        ).fit(x_row)

        labels = clustering.labels_
        cluster_sizes = Counter(labels)
        n_items = len(labels)

        return [1 - cluster_sizes[label] / n_items for label in labels]

    def cluster_all(self, do_score=False) -> (list, list):
        """Score every row index and concatenate the results.

        Args:
            do_score: currently unused; kept so existing calls keep working.

        Returns:
            ``(anomaly_score, y)``: two flat lists of equal length, aligned
            element by element.
        """
        anomaly_score, y = [], []

        for row in range(self.n_rows):
            # Use ``self`` rather than a notebook-level global so the method
            # works for any instance, whatever variable it is bound to.
            x_row, y_row = self.get_X_Y(row)
            # Flag rows where at least half of the items carry label 1: the
            # score assumes that label is the minority in a row.
            if sum(y_row) >= len(y_row)//2:
                print('oops')

            anomaly_score += self.cluster(x_row)
            y += y_row

        return anomaly_score, y

    def auc_score(self, anomaly_score: list, y: list):
        """Print the ROC AUC of ``anomaly_score`` against the labels ``y``."""
        fpr, tpr, thresholds = roc_curve(y, anomaly_score)
        AUC = auc(fpr, tpr)
        print('AUC score: %f' % AUC)

    def score(self, anomaly_score: list, y: list, threashold=0.8) -> None:
        """Binarise the scores at ``threashold`` and print classification metrics.

        A score ``>= threashold`` is predicted as 1, anything else as 0.
        (The parameter name keeps its original spelling so existing keyword
        calls continue to work.)
        """
        yhat = [1 if score_i >= threashold else 0 for score_i in anomaly_score]

        # accuracy: (tp + tn) / (p + n)
        accuracy = accuracy_score(y, yhat)
        print('Accuracy: %f' % accuracy)
        # precision tp / (tp + fp)
        precision = precision_score(y, yhat)
        print('Precision: %f' % precision)
        # recall: tp / (tp + fn) More important in this case
        recall = recall_score(y, yhat)
        print('Recall: %f' % recall)
        # f1: 2 tp / (2 tp + fp + fn)
        f1 = f1_score(y, yhat)
        print('F1 score: %f' % f1)

In [3]:
# Load every pickle whose file name contains 'group' and score all rows.
hc = HierarchicalClustering('group')
anomaly_score, y = hc.cluster_all()

# Threshold-free ranking quality first ...
hc.auc_score(anomaly_score, y)

# ... then the thresholded metrics for a sweep of decision cut-offs.
for p in (0.4, 0.5, 0.6, 0.7, 0.8, 0.9):
    print(p)
    hc.score(anomaly_score, y, threashold=p)

AUC score: 0.959907
Accuracy: 0.999440
Precision: 0.370968
Recall: 0.575000
F1 score: 0.450980


In [4]:
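# NOTE: disabled sweep over distance_threshold; the output retained below is from an earlier run of this cell.
# for distance_threshold in [8,9,10,11,12,13]: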
#     hc.distance_threshold = distance_threshold
#     anomaly_score, y = hc.cluster_all()
#     print(distance_threshold)
#     hc.auc_score(anomaly_score, y)
#     for p in [0.4,0.5,0.6,0.7,0.8,0.9]:
#         print(p)
#         hc.score(anomaly_score, y, threashold=p)
    

8
AUC score: 0.973565
0.4
Accuracy: 0.905360
Precision: 0.004000
Recall: 0.950000
F1 score: 0.007966
0.5
Accuracy: 0.935120
Precision: 0.005825
Recall: 0.950000
F1 score: 0.011578
0.6
Accuracy: 0.956900
Precision: 0.008517
Recall: 0.925000
F1 score: 0.016880
0.7
Accuracy: 0.976700
Precision: 0.014831
Recall: 0.875000
F1 score: 0.029167
0.8
Accuracy: 0.989780
Precision: 0.025145
Recall: 0.650000
F1 score: 0.048417
0.9
Accuracy: 0.999740
Precision: 0.791667
Recall: 0.475000
F1 score: 0.593750
9
AUC score: 0.981989
0.4
Accuracy: 0.960550
Precision: 0.009545
Recall: 0.950000
F1 score: 0.018901
0.5
Accuracy: 0.973750
Precision: 0.014280
Recall: 0.950000
F1 score: 0.028138
0.6
Accuracy: 0.981530
Precision: 0.019670
Recall: 0.925000
F1 score: 0.038522
0.7
Accuracy: 0.990290
Precision: 0.034965
Recall: 0.875000
F1 score: 0.067243
0.8
Accuracy: 0.996890
Precision: 0.080495
Recall: 0.650000
F1 score: 0.143251
0.9
Accuracy: 0.999790
Precision: 1.000000
Recall: 0.475000
F1 score: 0.644068
10
AUC s

In [5]:
# Sanity baseline: an all-zero score cannot rank anything, so the AUC printed here should be 0.5.
hc.auc_score([0 for _ in anomaly_score], y)

AUC score: 0.500000


In [6]:
# Same all-zero baseline at a 0.5 cut-off: no item is ever flagged, since 0 < 0.5.
hc.score([0 for _ in anomaly_score], y, threashold=0.5)

Accuracy: 0.999600
Precision: 0.000000
Recall: 0.000000
F1 score: 0.000000


  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
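# NOTE: alternative dendrogram-colour-based variant of HierarchicalClustering, left commented out (not executed).
# class HierarchicalClustering: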
#     def __init__(self, state: str):
#         self.ACs = []
#         for ac_name in os.listdir('./ACs'):
#             if state in ac_name:
#                 ac_i = pd.read_pickle('./ACs/'+ac_name)
#                 self.ACs.append(ac_i)
#         self.n_rows = len(self.ACs[0])

#     def get_X_Y(self, row: int) -> (pd.DataFrame, pd.DataFrame):
#         data = []
#         for AC in self.ACs:
#             AC = pd.DataFrame(AC)
#             data.append(AC.iloc[row,:].tolist())
#         data = pd.DataFrame(data).reset_index(drop=True)
#         X = data.iloc[:,1:]
#         Y = data.iloc[:,0]
        
#         return X, Y    
    
#     def cluster(self, X: pd.DataFrame, Y: pd.Series, method='ward') -> (list, list):
#         dendrogram = sch.dendrogram(sch.linkage(X, method=method), no_plot=True)
        
#         leaves = dendrogram['leaves']
#         leaves_color_list = dendrogram['leaves_color_list']
#         color_counts = Counter(dendrogram['leaves_color_list'])
        
#         n_leaves = len(leaves_color_list)
        
#         anomaly_score = [0] * n_leaves
#         for i, color in zip(leaves, leaves_color_list):
#             anomaly_score[i] = 1 - (color_counts[color]/n_leaves)
            
#         return anomaly_score, Y.astype(int).tolist()
    
#     def cluster_all(self, do_score=False, method='ward') -> (list, list):
        
#         anomaly_score, y = [], []
        
#         for row in range(self.n_rows):
#             X, Y = hc.get_X_Y(row)
#             if sum(Y) > len(Y)//2: continue
                
#             anomaly_score_row, y_row  = hc.cluster(X,Y,method)
            
#             anomaly_score += anomaly_score_row
#             y += y_row
                    
#         return anomaly_score, y
    
#     def score(self, anomaly_score: list, y: list, threashold: float) -> list:
        
#         yhat = [ 1 if y_i >= threashold else 0 for y_i in anomaly_score]
            
#         # accuracy: (tp + tn) / (p + n)
#         accuracy = accuracy_score(y, yhat)
#         print('Accuracy: %f' % accuracy)
#         # precision tp / (tp + fp)
#         precision = precision_score(y, yhat)
#         print('Precision: %f' % precision)
#         # recall: tp / (tp + fn)
#         recall = recall_score(y, yhat)
#         print('Recall: %f' % recall)
#         # f1: 2 tp / (2 tp + fp + fn)
#         f1 = f1_score(y, yhat)
#         print('F1 score: %f' % f1)
#         # AUC
#         fpr, tpr, thresholds = roc_curve(y, anomaly_score)
#         AUC = auc(fpr, tpr)
#         print('AUC score: %f' % AUC)
#         return yhat
# 
# 
# hc = HierarchicalClustering('harmonic_low_18_cold_fullyOpen')
# anomaly_score, y = hc.cluster_all()
# for threashold in [6/10, 7/10, 8/10, 9/10]:
#     print('Threashold : ' + str(threashold))
#     hc.score(anomaly_score, y, threashold)
#     print('--------------------')       