# Libraries

In [82]:
# Standard library
import os
import sys
import time

#Data Structures
import numpy as np
import pandas as pd


# Learning evaluation
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score

#Algorithms
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope


# Definitions

In [83]:
# Seed for random selections. NumPy's global generator must be seeded through
# np.random.seed(); assigning to `np.seed` only creates an unused attribute on
# the numpy module and leaves every np.random.* call unseeded.
SEED = 42
np.random.seed(SEED)

# Registry of the available one-class algorithms: each estimator class is
# stored under its own class name so it can be looked up by string.
algorithms = {
    estimator_class.__name__: estimator_class
    for estimator_class in (
        LocalOutlierFactor,
        OneClassSVM,
        IsolationForest,
        EllipticEnvelope,
    )
}

# Functions

In [145]:
def get_indexes(data, split_type, number_trials, number_examples):
    """Build the training-index sets used to split the interest class.

    Parameters
    ----------
    data : array-like of int
        Indexes of the examples that belong to the interest class.
    split_type : str
        How the interest-class examples are split: "cross-validation" or "random".
    number_trials : int
        Number of folds when split_type == "cross-validation", or number of
        repetitions when split_type == "random".
    number_examples : int
        Number of examples drawn (without replacement) in each repetition when
        split_type == "random"; ignored for "cross-validation".

    Returns
    -------
    list of numpy.ndarray
        One array of training indexes (values taken from ``data``) per trial.

    Raises
    ------
    ValueError
        If ``split_type`` is not one of the supported values.
    """
    data = np.asarray(data)
    if split_type == 'cross-validation':
        kf = KFold(n_splits=number_trials, shuffle=True)
        # KFold yields positions inside `data`; map them back to the actual
        # dataset indexes stored in `data`. Only the training side is needed.
        return [data[ids_train] for ids_train, _ in kf.split(data)]
    if split_type == 'random':
        return [
            np.random.choice(data, size=number_examples, replace=False)
            for _ in range(number_trials)
        ]
    raise ValueError('Unsuported split type. Please, use split_type = {"cross-validation","random"}.')

In [164]:
def get_train_test_data(X, all_indexes, indexes_train):
    """Split ``X`` into the training rows and every remaining row.

    ``indexes_train`` selects the training rows; the test rows are all the
    indexes in ``all_indexes`` that are not in ``indexes_train``.
    Returns the pair ``(X_train, X_test)``.
    """
    held_out = set(all_indexes) - set(indexes_train)
    train_part = X[indexes_train]
    test_part = X[list(held_out)]
    return train_part, test_part

In [173]:
def get_classes_test(y, classe, all_indexes, indexes_train):
    """Build the one-class ground truth for the test examples.

    Parameters
    ----------
    y : numpy.ndarray
        Original class label of every example.
    classe : object
        Label of the interest class.
    all_indexes : iterable of int
        Indexes of every example in the dataset.
    indexes_train : iterable of int
        Indexes used for training; they are excluded from the test set.

    Returns
    -------
    numpy.ndarray of int
        ``1`` for test examples whose label equals ``classe`` and ``-1`` for
        all the others.
    """
    # Same expression as in get_train_test_data, so labels line up with X_test rows.
    indexes_test = list(set(all_indexes) - set(indexes_train))
    # The builtin int replaces the np.int alias, which was removed from NumPy.
    y_test = np.ones(len(indexes_test), dtype=int)
    y_test[np.asarray(y)[indexes_test] != classe] = -1
    return y_test

In [None]:
def get_evaluation_metrics(classifier, alg_name, parameters, fold_number, X_test, y_test, model_building_time=0):
    """Predict on ``X_test`` with a fitted classifier and collect evaluation metrics.

    Parameters
    ----------
    classifier : estimator
        Already fitted model exposing ``predict`` (and, optionally,
        ``decision_function``).
    alg_name : str
        Stored under ``'Algorithm'``.
    parameters : object
        Stored under ``'Parameters'``.
    fold_number : int
        Stored under ``'It_Number'`` and also under ``'class'``.
    X_test : array-like
        Examples to classify.
    y_test : array-like
        Ground-truth labels, e.g. the 1 / -1 labels built by get_classes_test.
    model_building_time : float, optional
        Stored unchanged under ``'Building_Time'``.

    Returns
    -------
    dict
        Keys: Algorithm, Parameters, class, It_Number, Accuracy, Precision,
        Recall, F1, ROC_AUC, Building_Time, Confusion_Matrix (nested list),
        Classification_Time (seconds) and Memory (KiB).
    """
    start_time_classification = time.perf_counter()
    predictions = classifier.predict(X_test)
    # perf_counter() differences are already in seconds; no further scaling.
    elapsed_time_classification = time.perf_counter() - start_time_classification

    # ROC AUC is a ranking metric: use continuous scores when the model offers
    # them and fall back to the hard predictions otherwise.
    if hasattr(classifier, 'decision_function'):
        ranking_scores = classifier.decision_function(X_test)
    else:
        ranking_scores = predictions

    evaluation = {}
    evaluation['Algorithm'] = alg_name
    evaluation['Parameters'] = parameters
    # NOTE(review): 'class' receives the fold number because the function has
    # no parameter for the interest class - confirm this is intended.
    evaluation['class'] = fold_number
    evaluation['It_Number'] = fold_number
    evaluation['Accuracy'] = accuracy_score(y_test, predictions)
    evaluation['Precision'] = precision_score(y_test, predictions)
    evaluation['Recall'] = recall_score(y_test, predictions)
    evaluation['F1'] = f1_score(y_test, predictions)
    evaluation['ROC_AUC'] = roc_auc_score(y_test, ranking_scores)
    evaluation['Building_Time'] = model_building_time
    evaluation['Confusion_Matrix'] = confusion_matrix(y_test, predictions).tolist()
    evaluation['Classification_Time'] = elapsed_time_classification
    # sys.getsizeof is shallow: it does not follow references held by the model.
    evaluation['Memory'] = sys.getsizeof(classifier) / 1024

    return evaluation

In [174]:
def one_class_learning(X, y, classifier, split_type="cross-validation", number_trials=10, number_examples=10, verbose=True):
    """Evaluate a one-class learning algorithm, taking each class in turn as the interest class.

    For every distinct label in ``y`` and every training split returned by
    get_indexes, the classifier is fitted on the training examples of the
    interest class only and scored with binary F1 on all remaining examples.

    Parameters
    ----------
    X : numpy.ndarray
        Data.
    y : numpy.ndarray
        Classes.
    classifier : estimator
        One-class learning algorithm exposing ``fit`` and ``predict``.
    split_type : str
        How the interest-class examples are split: "cross-validation" or "random".
    number_trials : int
        Number of folds when split_type == "cross-validation", or number of
        repetitions when split_type == "random".
    number_examples : int
        Number of labeled examples when split_type == "random".
    verbose : bool
        When True (default) print the confusion matrix and F1 of every trial.

    Returns
    -------
    list of float
        One F1 score per (class, trial) pair, in evaluation order.
    """
    f1s = []
    all_indexes = set(range(len(X)))
    for classe in np.unique(y):
        classe_indexes = np.argwhere(y == classe).reshape(-1)
        for indexes_train in get_indexes(classe_indexes, split_type, number_trials, number_examples):
            X_train, X_test = get_train_test_data(X, all_indexes, indexes_train)
            y_test = get_classes_test(y, classe, all_indexes, indexes_train)
            classifier.fit(X_train)
            # Predict once and reuse the result for both the score and the report.
            predictions = classifier.predict(X_test)
            score = f1_score(y_test, predictions, average='binary')
            if verbose:
                print(confusion_matrix(y_test, predictions))
                print(score)
            f1s.append(score)
    return f1s


# Área de Testes

In [175]:
import statistics

In [176]:
# Evaluate the selected classifier with random sampling of the training
# examples (default number of trials and examples), then show the F1 scores.
# NOTE(review): X, y and classifier are assigned in cells that appear further
# down in this notebook; those cells must run first.
resultados = one_class_learning(
    X,
    y,
    classifier,
    split_type="random",
)
resultados

[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
[[99  1]
 [ 0 40]]
0.9876543209876543
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1]
[[100   0]
 [  0  40]]
1.0
[ 1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1
  1  1  

[0.9876543209876543,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 0.963855421686747,
 1.0,
 0.9195402298850576,
 1.0,
 0.6896551724137931,
 0.6106870229007634,
 0.6153846153846153,
 0.6557377049180327,
 0.6545454545454545,
 0.6504065040650406,
 0.6611570247933884,
 0.6399999999999999,
 0.6060606060606061,
 0.6060606060606061,
 0.6015037593984962,
 0.6153846153846153,
 0.6153846153846153,
 0.6153846153846153,
 0.6153846153846153,
 0.6153846153846153,
 0.5882352941176471,
 0.6153846153846153,
 0.6153846153846153,
 0.6201550387596899]

In [177]:
# Average of all F1 scores returned by one_class_learning (every class and trial).
statistics.mean(resultados)

0.7459443694556633

In [7]:
# Location of the Iris CSV. Set the IRIS_CSV_PATH environment variable to run
# the notebook on another machine; the original local path stays the default.
IRIS_CSV_PATH = os.environ.get('IRIS_CSV_PATH', '/home/rafael/Downloads/iris.csv')
df = pd.read_csv(IRIS_CSV_PATH)
data = df.to_numpy()


In [8]:
# Every column except the last goes to X (data); the last column goes to y (classes).
X, y = data[:, :-1], data[:, -1]

In [162]:
# Build the detector by looking its class up in the `algorithms` registry;
# LocalOutlierFactor is created in novelty mode (novelty=True).
classifier = algorithms['LocalOutlierFactor'](
    novelty=True,
)
# Alternative kept for quick switching:
#classifier = OneClassSVM()

In [16]:
# 3-fold splitter; no shuffle argument is passed in this cell.
kf = KFold(n_splits=3)

In [91]:
# Exploratory check: print the two index arrays produced for each of the
# 3 shuffled folds over X.
kf = KFold(n_splits=3, shuffle=True)
teste = kf.split(X)
for t1, t2 in teste:
    print(
        f't1: {t1}',
        f't2: {t2}',
        '=================================',
        sep='\n',
    )


t1: [  0   1   4   5   6   7   8  10  11  12  14  15  16  18  19  20  21  22
  23  24  26  27  29  30  31  35  37  40  41  42  43  44  45  46  47  48
  49  50  51  52  53  55  57  58  61  63  64  66  70  71  72  74  75  77
  79  80  82  83  84  89  90  91  92  95  97  98 100 103 105 108 109 111
 112 113 118 119 120 122 123 124 125 126 127 129 130 131 132 133 134 136
 137 139 141 142 143 144 146 147 148 149]
t2: [  2   3   9  13  17  25  28  32  33  34  36  38  39  54  56  59  60  62
  65  67  68  69  73  76  78  81  85  86  87  88  93  94  96  99 101 102
 104 106 107 110 114 115 116 117 121 128 135 138 140 145]
t1: [  0   1   2   3   8   9  10  12  13  15  16  17  18  21  22  23  25  26
  28  31  32  33  34  36  38  39  41  44  45  47  51  54  55  56  59  60
  61  62  63  64  65  66  67  68  69  71  73  74  76  78  79  80  81  82
  83  85  86  87  88  89  90  91  93  94  95  96  97  99 101 102 103 104
 105 106 107 108 110 114 115 116 117 119 121 122 123 126 128 129 131 134
 135 136 138

In [95]:
# Display the object returned by kf.split(X); the recorded output shows a generator.
teste

<generator object _BaseKFold.split at 0x7f83186c5de0>

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49])

In [73]:
# Snapshot the classifier's current parameters so they can be edited.
params = classifier.get_params()

In [76]:
# Override the distance metric in the parameter snapshot and display the result.
params['metric'] = 'euclidean'
params

{'algorithm': 'auto',
 'contamination': 'auto',
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 20,
 'novelty': True,
 'p': 2}

In [79]:
# Apply the edited parameter snapshot back onto the classifier.
classifier.set_params(**params)

LocalOutlierFactor(algorithm='auto', contamination='auto', leaf_size=30,
                   metric='euclidean', metric_params=None, n_jobs=None,
                   n_neighbors=20, novelty=True, p=2)

In [80]:
# Read the parameters back to check what the classifier now reports.
classifier.get_params()

{'algorithm': 'auto',
 'contamination': 'auto',
 'leaf_size': 30,
 'metric': 'euclidean',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 20,
 'novelty': True,
 'p': 2}

In [117]:
# Scratch check of np.random.choice: draw 9 of the integers 1..10 without repetition.
a = np.arange(1, 11)
np.random.choice(a, replace=False, size=9)

array([ 9, 10,  6,  2,  8,  1,  5,  3,  4])