In [1]:
import numpy as np
import pandas as pd
import random
import scipy.io as scio
from numpy import genfromtxt
from sklearn.cluster import KMeans
from scipy.spatial import distance
from sklearn.metrics import average_precision_score, precision_recall_curve

In [2]:
from KmeansPPcenters import KMeanPlusPlus
from Gonzalez_centers import Gonzalez
from KMeansOut import kmeansOutliers, cost
from Noise import add_random_noise, compute_phi_star, cost

In [3]:
susy_data = pd.read_csv('realData/SUSY/SUSY.csv', header=None)
print("SUSY loaded")


SUSY loaded


In [4]:
np.amin(susy_data)

0     0.000000e+00
1     2.548815e-01
2    -2.102927e+00
3    -1.734789e+00
4     4.285860e-01
5    -2.059306e+00
6    -1.734202e+00
7     2.598711e-04
8    -1.727117e+00
9     7.693475e-08
10   -1.671863e+01
11    2.673070e-01
12    1.041228e-03
13    2.048078e-03
14    0.000000e+00
15    2.734135e-02
16    4.452858e-03
17    3.211849e-07
18    4.172130e-08
dtype: float64

In [5]:
7.693475e-08

7.693475e-08

In [6]:
np.amax(susy_data)

0      1.000000
1     20.553450
2      2.101605
3      1.734839
4     33.035622
5      2.059721
6      1.734686
7     21.068876
8      1.740689
9     23.386438
10    20.487904
11    21.075718
12    16.166821
13     6.731210
14    20.686245
15    21.152262
16    15.613705
17     1.591660
18     1.000000
dtype: float64

In [7]:
labels= susy_data.values[:,0:1]

In [8]:
processed_data= np.delete(susy_data.values,0, axis=1)
data_all= processed_data
data_part_8= processed_data[:,0:8]
data_part_10= processed_data[:,0:10]

In [9]:
data= data_part_8[0:10000, :]
print("Using part of the data")

Using part of the data


In [10]:
num_clusters=[10, 20]
zs =[25, 50,100]
min_values=[-16, 0, 7.693475e-08]
max_values= [15, 20, 33]

for num_cluster in num_clusters:
    for z in zs:
        for min_value in min_values:
            for max_value in max_values:
                print("num_cluster:{}, z:{}, min_value:{}, max_value:{}".format(num_cluster, z, min_value, max_value))
                #print("Adding noise")
                data_with_outliers, z_indx = add_random_noise(data, z, max_value, min_value)
                data_inliers= np.delete(data, z_indx, axis=0)
                
                #print("KPP initilization to calculate phi_star")
                init= data[np.random.choice(1, len(data)-1)]
                KPP=KMeanPlusPlus(num_clusters=num_cluster, init=init)
                KPP.fit(data_with_outliers)
                phi_star= compute_phi_star(data_inliers,num_cluster, KPP.centers, z)
                print("Phi_star: {}".format(phi_star))
                
                #print("Calculating KMO")
                centers, cid, dist= kmeansOutliers(data_with_outliers, phi_star, z, num_cluster)
                costs, z_alg = cost(data_with_outliers, cid, centers, z)
                
                #print("Actual_outliers:{}, Calculated_outliers:{}". format(z_indx, z_alg))
                
                #print("Calculating precision and recall")
                precision = len(z_indx)/(len(z_indx)+ len(np.setdiff1d(z_indx, z_alg)))
                recall= precision = len(z_indx)/(len(z_indx)+ len(np.setdiff1d(z_alg, z_indx)))
                
                #x1= KPP.predict(data_with_outliers)
                #x2= cid
                #precision = len(x1)/(len(x1)+ len(np.setdiff1d(x1, x2)))
                #recall= len(x1)/(len(x1)+ len(np.setdiff1d(x2, x1)))
                #print(x1)
                #print(cid)
                print(("Precision:{}, recall:{}". format(precision, recall)))
                #print("centers: {}, cid: {}, dist: {}".format(centers, cid, dist))
                #print("Next")
                
                

num_cluster:10, z:25, min_value:-16, max_value:15
Phi_star: 22358.515520280198
Precision:0.7575757575757576, recall:0.7575757575757576
num_cluster:10, z:25, min_value:-16, max_value:20
Phi_star: 21748.042431638085
Precision:0.6756756756756757, recall:0.6756756756756757
num_cluster:10, z:25, min_value:-16, max_value:33
Phi_star: 22717.37705525755
Precision:0.6578947368421053, recall:0.6578947368421053
num_cluster:10, z:25, min_value:0, max_value:15
Phi_star: 23098.35435951625
Precision:0.625, recall:0.625
num_cluster:10, z:25, min_value:0, max_value:20
Phi_star: 24192.058262977873
Precision:0.5681818181818182, recall:0.5681818181818182
num_cluster:10, z:25, min_value:0, max_value:33
Phi_star: 24470.855413785634
Precision:0.5434782608695652, recall:0.5434782608695652
num_cluster:10, z:25, min_value:7.693475e-08, max_value:15
Phi_star: 23088.84618417039
Precision:0.5, recall:0.5
num_cluster:10, z:25, min_value:7.693475e-08, max_value:20
Phi_star: 25995.324918981387
Precision:0.51020408163