In [1]:
import pandas as pd
import numpy as np
import os
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.metrics import *
from collections import Counter

In [2]:
class HierarchicalClustering:
    """Row-wise agglomerative clustering used as an anomaly scorer.

    A directory of pickled tables (the "ACs") is loaded once.  For a given row
    index, that row is taken from every table; the resulting samples (one per
    table) are clustered, and each sample is scored by how small its cluster
    is: ``1 - cluster_size / n_samples``.  Column 0 of every table is read as
    the 0/1 ground-truth label, the remaining columns as features.

    The class attributes below are the clustering hyper-parameters; they can
    be overridden per instance (e.g. ``hc.distance_threshold = 20``).
    """

    # Distance measure between samples (scikit-learn only accepts 'euclidean'
    # together with ward linkage).
    affinity='euclidean'
    # Linkage criterion handed to AgglomerativeClustering.
    linkage='ward'
    # Linkage distance at or above which clusters are no longer merged.
    distance_threshold=9

    def __init__(self, ac_dir='./ACs'):
        """Load every pickled table found in ``ac_dir``.

        Args:
            ac_dir: directory containing one pickle per table.  Defaults to
                ``'./ACs'``.

        Raises:
            IndexError: if ``ac_dir`` contains no files.
        """
        self.ACs = []
        # os.listdir returns entries in arbitrary order; sort so the sample
        # order (and therefore the returned score/label order) is reproducible.
        for ac_name in sorted(os.listdir(ac_dir)):
            # NOTE(review): unpickling can execute arbitrary code -- only point
            # this at directories whose contents are trusted.
            ac_i = pd.read_pickle(os.path.join(ac_dir, ac_name))
            self.ACs.append(ac_i)
        if not self.ACs:
            raise IndexError('no AC pickles found in %r' % (ac_dir,))
        # Number of rows is taken from the first table; all tables are
        # presumably the same length -- TODO confirm.
        self.n_rows = len(self.ACs[0])

    def get_X_Y(self, row: int) -> (pd.DataFrame, list):
        """Collect row ``row`` from every table.

        Args:
            row: positional row index into each table.

        Returns:
            ``(X, Y)`` where ``X`` is a DataFrame with one line per table
            holding columns 1.. of that row, and ``Y`` is the list of column-0
            values cast to ``int``.
        """
        data = []
        for AC in self.ACs:
            AC = pd.DataFrame(AC)
            data.append(AC.iloc[row,:].tolist())
        data = pd.DataFrame(data).reset_index(drop=True)
        X = data.iloc[:,1:]
        Y = data.iloc[:,0]

        return X, Y.astype(int).tolist()

    @staticmethod
    def _anomaly_scores(labels) -> list:
        """Turn cluster labels into scores: 1 - (size of own cluster / total)."""
        cluster_sizes = Counter(labels)
        n_samples = len(labels)
        return [1 - cluster_sizes[label] / n_samples for label in labels]

    def cluster(self, x_row: pd.DataFrame) -> list:
        """Cluster the samples in ``x_row`` and score each one.

        Uses the ``affinity``, ``linkage`` and ``distance_threshold``
        attributes of this instance.

        Args:
            x_row: feature matrix, one sample per line.

        Returns:
            One anomaly score per sample, in input order; samples that end up
            in small clusters score close to 1.
        """
        # scikit-learn renamed the `affinity` keyword to `metric` (deprecated
        # in 1.2, removed in 1.4); use whichever name this version exposes.
        estimator_params = AgglomerativeClustering().get_params()
        distance_kw = 'metric' if 'metric' in estimator_params else 'affinity'

        clustering = AgglomerativeClustering(
            n_clusters=None,  # let distance_threshold decide the cluster count
            linkage=self.linkage,
            distance_threshold=self.distance_threshold,
            **{distance_kw: self.affinity}
        ).fit(x_row)

        return self._anomaly_scores(clustering.labels_)

    def cluster_all(self, do_score=False) -> (list, list):
        """Score every row index and concatenate the results.

        Args:
            do_score: currently unused; kept so existing callers keep working.

        Returns:
            ``(anomaly_score, y)``: flat lists of scores and ground-truth
            labels, aligned element by element.
        """
        anomaly_score, y = [], []

        for row in range(self.n_rows):
            x_row, y_row = self.get_X_Y(row)
            # Diagnostic: positives make up (about) half or more of this row,
            # so "small cluster == anomaly" no longer holds for it.
            if sum(y_row) >= len(y_row)//2:
                print('oops')

            anomaly_score.extend(self.cluster(x_row))
            y.extend(y_row)

        return anomaly_score, y

    def auc_score(self, anomaly_score: list, y: list) -> float:
        """Print and return the ROC AUC of ``anomaly_score`` against ``y``."""
        fpr, tpr, thresholds = roc_curve(y, anomaly_score)
        AUC = auc(fpr, tpr)
        print('AUC score: %f' % AUC)
        return AUC

    def score(self, anomaly_score: list, y: list, threashold=0.8) -> float:
        """Binarise the scores and print accuracy, precision, recall and F1.

        Args:
            anomaly_score: one score per sample.
            y: ground-truth 0/1 labels aligned with ``anomaly_score``.
            threashold: scores greater than or equal to this are predicted as
                1.  (The misspelling is part of the public keyword name.)

        Returns:
            The F1 score.
        """
        yhat = [1 if score_i >= threashold else 0 for score_i in anomaly_score]

        # accuracy: (tp + tn) / (p + n)
        accuracy = accuracy_score(y, yhat)
        print('Accuracy: %f' % accuracy)
        # precision tp / (tp + fp)
        precision = precision_score(y, yhat)
        print('Precision: %f' % precision)
        # recall: tp / (tp + fn) More important in this case
        recall = recall_score(y, yhat)
        print('Recall: %f' % recall)
        # f1: 2 tp / (2 tp + fp + fn)
        f1 = f1_score(y, yhat)
        print('F1 score: %f' % f1)
        return f1

In [3]:
hc = HierarchicalClustering()

# Sweep the clustering distance threshold; for each setting report the AUC and
# then the classification metrics at every distinct score value used as cut-off.
for d in range(18, 30):
    hc.distance_threshold = d
    print(f'distance_threshold: {d}')
    anomaly_score, y = hc.cluster_all()
    hc.auc_score(anomaly_score, y)
    distinct_scores = sorted(set(anomaly_score))
    for p in distinct_scores:
        print(p)
        hc.score(anomaly_score, y, threashold=p)

distance_threshold: 18
AUC score: 0.987760
0.0
Accuracy: 0.040000
Precision: 0.040000
Recall: 1.000000
F1 score: 0.076923
0.09999999999999998
Accuracy: 0.400000
Precision: 0.062500
Recall: 1.000000
F1 score: 0.117647
0.19999999999999996
Accuracy: 0.679000
Precision: 0.110803
Recall: 1.000000
F1 score: 0.199501
0.30000000000000004
Accuracy: 0.775000
Precision: 0.150943
Recall: 1.000000
F1 score: 0.262295
0.4
Accuracy: 0.838000
Precision: 0.198020
Recall: 1.000000
F1 score: 0.330579
0.5
Accuracy: 0.892000
Precision: 0.270270
Recall: 1.000000
F1 score: 0.425532
0.6
Accuracy: 0.912000
Precision: 0.312500
Recall: 1.000000
F1 score: 0.476190
0.7
Accuracy: 0.932000
Precision: 0.370370
Recall: 1.000000
F1 score: 0.540541
0.8
Accuracy: 0.965000
Precision: 0.533333
Recall: 1.000000
F1 score: 0.695652
0.9
Accuracy: 0.975000
Precision: 0.682927
Recall: 0.700000
F1 score: 0.691358
distance_threshold: 19
AUC score: 0.991146
0.0
Accuracy: 0.040000
Precision: 0.040000
Recall: 1.000000
F1 score: 0.0769

AUC score: 0.999687
0.0
Accuracy: 0.040000
Precision: 0.040000
Recall: 1.000000
F1 score: 0.076923
0.09999999999999998
Accuracy: 0.690000
Precision: 0.114286
Recall: 1.000000
F1 score: 0.205128
0.19999999999999996
Accuracy: 0.933000
Precision: 0.373832
Recall: 1.000000
F1 score: 0.544218
0.4
Accuracy: 0.989000
Precision: 0.784314
Recall: 1.000000
F1 score: 0.879121
0.7
Accuracy: 0.995000
Precision: 0.888889
Recall: 1.000000
F1 score: 0.941176
0.8
Accuracy: 0.998000
Precision: 0.952381
Recall: 1.000000
F1 score: 0.975610
0.9
Accuracy: 0.988000
Precision: 1.000000
Recall: 0.700000
F1 score: 0.823529


In [4]:
# Echo the scores left in the kernel by the last iteration of the sweep loop
# (distance_threshold = 29); this displays the complete list.
anomaly_score

[0.9,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.19999999999999996,
 0.19999999999999996,
 0.8,
 0.19999999999999996,
 0.19999999999999996,
 0.19999999999999996,
 0.19999999999999996,
 0.19999999999999996,
 0.19999999999999996,
 0.8,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.9,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.19999999999999996,
 0.8,
 0.19999999999999996,
 0.19999999999999996,
 0.19999999999999996,
 0.19999999999999996,
 0.19999999999999996,
 0.19999999999999996,
 0.19999999999999996,
 0.8,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.09999999999999998,
 0.9,
 0