In [1]:
%matplotlib inline
import os
import time
import pickle
import random
import warnings
import itertools
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
from copy import copy, deepcopy
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support


In [2]:
def perf_measure(y_true, y_pred):
    
    cnf_matrix = confusion_matrix(y_true, y_pred)
    
    FP = cnf_matrix.sum(axis=0) - np.diag(cnf_matrix)  
    FN = cnf_matrix.sum(axis=1) - np.diag(cnf_matrix)
    TP = np.diag(cnf_matrix)
    TN = cnf_matrix.sum() - (FP + FN + TP)

    FP = FP.astype(float)
    FN = FN.astype(float)
    TP = TP.astype(float)
    TN = TN.astype(float)

    # Specificity or true negative rate
    TNR = TN/(TN+FP) 
    # Sensitivity, hit rate, recall, or true positive rate
    TPR = TP/(TP+FN)
    # Precision or positive predictive value
    PPV = TP/(TP+FP)
    # Negative predictive value
    NPV = TN/(TN+FN)
    # Fall out or false positive rate
    FPR = FP/(FP+TN)
    # False negative rate
    FNR = FN/(TP+FN)
    # False discovery rate
    FDR = FP/(TP+FP)
    # Overall accuracy
    ACC = (TP+TN)/(TP+FP+FN+TN)
    
    FSCORE = np.divide((2*PPV*TPR), (PPV+TPR))

    

    
    return PPV, TPR, FSCORE, FNR, FPR, TNR

#### Load and Visulize Y with Quantatitve features

In [3]:
name_of_particle = 'Egammac'

# X = np.load("matrices/" + name_of_particle + ".npy")
# _ = np.load("matrices/" + name_of_particle + ".npy")
X_train = np.load("matrices/" + name_of_particle +"_train.npy",)
y_train = np.load("matrices/" + name_of_particle +"_y_train.npy",)
X_val = np.load("matrices/" + name_of_particle +"_val.npy",)
y_val = np.load("matrices/" + name_of_particle +"_y_val.npy",)
X_test = np.load("matrices/" + name_of_particle +"_test.npy",)
y_test = np.load("matrices/" + name_of_particle +"_y_test.npy",)

X_train = X_train[:, :-3]
X_val = X_val[:, :-3]
X_test = X_test[:, :-3]
_, V = X_train.shape
K = 2
V

209

In [4]:
ones = np.where(y_test==1)
ones[0].shape

(263,)

In [5]:
zeros = np.where(y_test==0)
zeros[0].shape

(17551,)

# INITIALLY BLANK






### spilitting data into train and test

In [6]:
y_test = [-1 if i ==1 else 1 for i in y_test]
y_train = [-1 if i ==1 else 1 for i in y_train]

In [7]:
set(y_test)

{-1, 1}

In [8]:
n_estimator = [100, 3000]
max_samples = [500, 2000]
max_feature = [0.5, 1.]
setting_isol = list(itertools.product(n_estimator, max_samples, max_feature))

In [9]:
setting_isol = [(100, 2000, 1)]
for i in setting_isol:
    
    print("setting:", i)
    
    start = time.time()
    clf_IsolFor = IsolationForest(n_estimators=i[0], max_samples=i[1],
                                  max_features=i[2], bootstrap=False,
                                  behaviour='new', contamination='auto',
                                  n_jobs=-2)  # ,
    clf_IsolFor.fit(X_train)
    labels_pred_IsolFor_train = clf_IsolFor.predict(X_train)

    end = time.time()
    
    print(end-start)    

    AMI_IsolFor_train = metrics.adjusted_mutual_info_score(y_train, labels_pred_IsolFor_train)
    NMI_IsolFor_train = metrics.normalized_mutual_info_score(y_train, labels_pred_IsolFor_train)
    ARI_IsolFor_train = metrics.adjusted_rand_score(y_train, labels_pred_IsolFor_train)
    FSCORE_IsolFor_train = precision_recall_fscore_support(y_train, labels_pred_IsolFor_train, average='weighted')
    
    labels_pred_IsolFor = clf_IsolFor.predict(X_test)
    AMI_IsolFor_test = metrics.adjusted_mutual_info_score(y_test, labels_pred_IsolFor)
    NMI_IsolFor_test = metrics.normalized_mutual_info_score(y_test, labels_pred_IsolFor)
    ARI_IsolFor_test = metrics.adjusted_rand_score(y_test, labels_pred_IsolFor)
    FSCORE_IsolFor_test = precision_recall_fscore_support(y_test, labels_pred_IsolFor, average='weighted')
    
    

setting: (100, 2000, 1)
2.800055742263794




In [10]:
PPV, TPR, FSCORE, FNR, FPR, TNR = perf_measure(y_true=y_test, y_pred=labels_pred_IsolFor)
    
PPV, TPR, FSCORE, FNR, FPR, TNR

(array([0.10630408, 0.9895913 ]),
 array([0.3269962 , 0.95880577]),
 array([0.16044776, 0.97395532]),
 array([0.6730038 , 0.04119423]),
 array([0.04119423, 0.6730038 ]),
 array([0.95880577, 0.3269962 ]))

In [11]:
labels_pred_IsolFor

array([ 1, -1,  1, ..., -1,  1, -1])

In [12]:
# with open (os.path.join('RealData_computation', "clustering_IsolFor.pickle"), 'wb') as fp:  # Small_Quantatitive
#     pickle.dump(clf_IsolFor, fp)  

In [13]:
# labels_pred_IsolFor = clf_IsolFor.predict(X_test)

In [14]:
# AMI_IsolFor = metrics.adjusted_mutual_info_score(y_test, labels_pred_IsolFor)
# NMI_IsolFor = metrics.normalized_mutual_info_score(y_test, labels_pred_IsolFor)
# ARI_IsolFor = metrics.adjusted_rand_score(y_test, labels_pred_IsolFor)
# FSCORE_IsolFor = precision_recall_fscore_support(y_test, labels_pred_IsolFor, average='weighted')



In [15]:
# print("precision:", "%.2f" % FSCORE_IsolFor_test[0], "recall:", "%.2f" % FSCORE_IsolFor_test[1], "fscore:", "%.2f" % FSCORE_IsolFor_test[2])

In [16]:
setting_isol

[(100, 2000, 1)]

In [17]:
type(y_test)

list

In [18]:
ones_ = np.where(labels_pred_IsolFor==1)
ones_[0].shape

(17005,)

In [19]:
zeros_ = np.where(labels_pred_IsolFor==-1)
zeros_[0].shape

(809,)

In [20]:
cnf_matrix = confusion_matrix(y_test, labels_pred_IsolFor)
cnf_matrix

array([[   86,   177],
       [  723, 16828]])