# Validating measures for LOF results

In [1]:
import pandas as pd
import numpy as np
import pickle as pkl
import matplotlib.pyplot as plt
import time

### Data Directory

In [2]:
# Relative folders for the interim (input) and processed (output) data sets.
# Both end in a slash because file names are appended by string concatenation.
INPUT_DIR, OUTPUT_DIR = '../data/interim/', '../data/processed/'

### Reading predicted file

In [4]:
# Load the LOF-scored rows and attach a 'labels' column that starts out as
# 0 (not an outlier) for every row; it is overwritten in a later cell.
ip_file = INPUT_DIR + 'fd001_lof_5000.csv'
df_pred = pd.read_csv(ip_file).assign(labels=0)

In [5]:
# Preview the first five rows of the loaded frame.
df_pred.head(n=5)

Unnamed: 0.1,Unnamed: 0,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,meas06,...,meas15,meas16,meas17,meas20,meas21,unit,time_cycles,lof,lrd,labels
0,0,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,...,8.4195,0.03,392,39.06,23.419,1,1,0.96058,14.109831,0
1,1,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,...,8.4318,0.03,392,39.0,23.4236,1,2,0.958267,15.404271,0
2,2,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,...,8.4178,0.03,390,38.95,23.3442,1,3,1.026987,12.961634,0
3,3,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,...,8.3682,0.03,392,38.88,23.3739,1,4,0.983711,13.88637,0
4,4,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,...,8.4294,0.03,393,38.9,23.4044,1,5,1.027018,12.726336,0


### Make labels based on the top 'n' lof scores

In [6]:
num_groundtruth_outliers = 1000

# The n rows with the highest LOF score; the smallest score among them is the
# cut-off that separates predicted outliers from normal rows.
df1 = df_pred.sort_values('lof', ascending=False).head(num_groundtruth_outliers).copy()
min_score = df1['lof'].min()

# Vectorized labelling: -1 (outlier) for every row at or above the cut-off,
# 0 otherwise. Rows tied with the cut-off score are all flagged, so slightly
# more than `num_groundtruth_outliers` rows can end up labelled -1.
df_pred['labels'] = np.where(df_pred['lof'] >= min_score, -1, 0)

In [7]:
# First rows whose LOF score reaches the outlier cut-off.
df_pred[df_pred['lof'].ge(min_score)].head()

Unnamed: 0.1,Unnamed: 0,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,meas06,...,meas15,meas16,meas17,meas20,meas21,unit,time_cycles,lof,lrd,labels
84,84,0.0025,0.0003,100.0,518.67,642.28,1590.12,1405.96,14.62,21.61,...,8.4499,0.03,392,39.01,23.284,1,85,1.045367,12.271519,-1
325,325,0.0026,0.0001,100.0,518.67,642.76,1586.44,1406.2,14.62,21.6,...,8.381,0.03,390,38.95,23.3367,2,134,1.048446,12.048258,-1
328,328,0.0044,-0.0004,100.0,518.67,642.46,1585.36,1406.79,14.62,21.61,...,8.369,0.03,391,38.98,23.3517,2,137,1.043543,12.181063,-1
340,340,-0.002,-0.0003,100.0,518.67,642.03,1587.88,1404.41,14.62,21.61,...,8.387,0.03,391,38.89,23.4024,2,149,1.048499,11.946172,-1
349,349,0.0014,0.0001,100.0,518.67,641.99,1586.95,1404.68,14.62,21.61,...,8.4324,0.03,392,39.03,23.422,2,158,1.047837,11.948611,-1


### Generating Ground Truth with heuristics

In [8]:
# Number of final time cycles of each unit that the ground-truth heuristic
# below marks as outliers.
timecycles_for_failure = 10

In [9]:
# Start with every row marked as normal (0); outlier rows are set to -1 in a
# later cell.
df_pred['ground_truth'] = 0

In [10]:
# Confirm the new 'ground_truth' column is present on the first five rows.
df_pred.head(n=5)

Unnamed: 0.1,Unnamed: 0,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,meas06,...,meas16,meas17,meas20,meas21,unit,time_cycles,lof,lrd,labels,ground_truth
0,0,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,...,0.03,392,39.06,23.419,1,1,0.96058,14.109831,0,0
1,1,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,...,0.03,392,39.0,23.4236,1,2,0.958267,15.404271,0,0
2,2,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,...,0.03,390,38.95,23.3442,1,3,1.026987,12.961634,0,0
3,3,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,...,0.03,392,38.88,23.3739,1,4,0.983711,13.88637,0,0
4,4,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,...,0.03,393,38.9,23.4044,1,5,1.027018,12.726336,0,0


In [11]:
# Distinct unit identifiers present in the data.
units = np.unique(df_pred['unit'])

In [12]:
# Sanity check: list any rows whose unit id is -1 (an empty result means none).
df_pred[df_pred['unit'].eq(-1)]

Unnamed: 0.1,Unnamed: 0,setting1,setting2,setting3,meas01,meas02,meas03,meas04,meas05,meas06,...,meas16,meas17,meas20,meas21,unit,time_cycles,lof,lrd,labels,ground_truth


In [13]:
# Assign outliers for the last 'n' time cycles for each of the unit

units = np.unique(df_pred['unit'])

# Last recorded time cycle of each unit, broadcast back onto that unit's rows,
# so the threshold test runs once over the frame instead of once per unit.
last_cycle_per_row = df_pred.groupby('unit')['time_cycles'].transform('max')
near_failure = df_pred['time_cycles'] > (last_cycle_per_row - timecycles_for_failure)

# Write through a single .loc[rows, column] call. The chained form
# df_pred['ground_truth'].loc[rows] = -1 assigns into an intermediate Series,
# which triggers SettingWithCopyWarning and may never reach df_pred.
df_pred.loc[near_failure, 'ground_truth'] = -1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [14]:
# Show the scoring-relevant columns for unit 1 only.
print(df_pred.loc[df_pred['unit'] == 1, ['unit', 'time_cycles', 'labels', 'ground_truth']])

     unit  time_cycles  labels  ground_truth
0       1            1       0             0
1       1            2       0             0
2       1            3       0             0
3       1            4       0             0
4       1            5       0             0
5       1            6       0             0
6       1            7       0             0
7       1            8       0             0
8       1            9       0             0
9       1           10       0             0
10      1           11       0             0
11      1           12       0             0
12      1           13       0             0
13      1           14       0             0
14      1           15       0             0
15      1           16       0             0
16      1           17       0             0
17      1           18       0             0
18      1           19       0             0
19      1           20       0             0
20      1           21       0             0
21      1 

In [15]:
# Count the rows flagged as outliers (-1) on each side of the comparison.
print('Number of outliers in the predicted values', int(df_pred['labels'].eq(-1).sum()))
print('Number of outliers in the ground truth', int(df_pred['ground_truth'].eq(-1).sum()))

Number of outliers in the predicted values 1000
Number of outliers in the ground truth 1000


In [16]:
# Independent copy holding only the columns needed for scoring, so the
# scoring columns added later do not touch df_pred.
df_scoring = df_pred.loc[:, ['unit', 'time_cycles', 'labels', 'ground_truth']].copy()

In [17]:
# Outcome codes returned by gen_vals:
#   0 -> true positive
#   1 -> false positive
#   2 -> false negative
#   3 -> true negative

def gen_vals(ground_truth, pred_label, pos_label, neg_label):
    """Classify one (ground truth, prediction) pair into a confusion-matrix code.

    Returns 0 (TP), 1 (FP), 2 (FN) or 3 (TN). Returns None when the pair
    matches none of the four combinations of `pos_label` / `neg_label`.
    """
    truth_is_pos = ground_truth == pos_label
    truth_is_neg = ground_truth == neg_label
    pred_is_pos = pred_label == pos_label
    pred_is_neg = pred_label == neg_label

    if truth_is_pos & pred_is_pos:
        return 0
    if truth_is_neg & pred_is_pos:
        return 1
    if truth_is_pos & pred_is_neg:
        return 2
    if truth_is_neg & pred_is_neg:
        return 3
    return None

### Find out the f-measure

In [18]:
# -1 marks an outlier (the positive class), 0 marks a normal row.
pos_label, neg_label = -1, 0

# Placeholder column; every value is replaced by the outcome code below.
df_scoring['pos'] = -1

# Row by row, turn (ground_truth, labels) into a TP/FP/FN/TN code via gen_vals.
df_scoring['pos'] = df_scoring.apply(
    lambda row: gen_vals(row['ground_truth'], row['labels'], pos_label, neg_label),
    axis=1,
)

In [19]:
from collections import Counter

# Tally of the outcome codes in df_scoring['pos']:
# 0 = true positive, 1 = false positive, 2 = false negative, 3 = true negative.
counter = Counter(df_scoring['pos'])

print(counter)

# Counter yields 0 for a code that never occurred, so any denominator below
# can be zero; guard each division instead of letting it raise.
total = counter[0] + counter[1] + counter[2] + counter[3]
accuracy = (counter[0] + counter[3]) / total if total else 0.0
print('Accuracy = ', accuracy)

predicted_pos = counter[0] + counter[1]
precision = counter[0] / predicted_pos if predicted_pos else 0.0
print('Precision = ', precision)

actual_pos = counter[0] + counter[2]
recall = counter[0] / actual_pos if actual_pos else 0.0
print('Recall = ', recall)

# Small smoothing term so the F1 denominator (precision + recall) is never zero.
precision += 0.0000000001
recall += 0.0000000001

f1_score = 2 * (precision * recall) / (precision + recall)
print('F1 score: ', f1_score)

Counter({3: 18631, 1: 1000, 2: 1000})
Accuracy =  0.9030585041927197
Precision =  0.0
Recall =  0.0
F1 score:  1e-10


### Wrapping the evaluation in a reusable function

In [20]:
def find_eval_metrics(df_pred, timecycles_for_failure):
    """Score predicted outlier labels against a heuristic ground truth.

    The ground truth marks the last `timecycles_for_failure` time cycles of
    every unit as outliers (-1) and all other rows as normal (0).

    Parameters
    ----------
    df_pred : pandas.DataFrame
        Needs the columns 'unit', 'time_cycles' and 'labels'. The frame is
        modified in place: 'labels' is normalised to 0 / -1 and the
        'ground_truth' column is (re)written on every call.
    timecycles_for_failure : int
        How many of the final time cycles of each unit count as outliers.

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1_score). Precision and recall carry a
        1e-10 smoothing term so that F1 is always defined; a ratio whose
        denominator is zero is treated as 0 before smoothing.
    """
    epsilon = 0.0000000001

    # Make everything in labels column into 0 or -1: values >= 0 become 0,
    # anything else (negative or NaN) becomes -1.
    df_pred['labels'] = np.where(df_pred['labels'] >= 0, 0, -1)

    # Last recorded time cycle of each unit, broadcast back onto its rows.
    last_cycle = df_pred.groupby('unit')['time_cycles'].transform('max')
    near_failure = df_pred['time_cycles'] > (last_cycle - timecycles_for_failure)

    # Whole-column assignment: avoids the chained df['col'].loc[rows] = value
    # form, which may write to a temporary copy instead of df_pred.
    df_pred['ground_truth'] = np.where(near_failure, -1, 0)

    actual_outlier = near_failure.values
    predicted_outlier = (df_pred['labels'] == -1).values

    # Confusion-matrix counts, with outlier (-1) as the positive class.
    tp = int((actual_outlier & predicted_outlier).sum())
    fp = int((~actual_outlier & predicted_outlier).sum())
    fn = int((actual_outlier & ~predicted_outlier).sum())
    tn = int((~actual_outlier & ~predicted_outlier).sum())

    total = tp + fp + fn + tn
    accuracy = (tp + tn) / total if total else 0.0
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0

    # Smoothing keeps the F1 denominator (precision + recall) non-zero.
    precision += epsilon
    recall += epsilon
    f1_score = 2 * (precision * recall) / (precision + recall)

    return accuracy, precision, recall, f1_score

In [21]:
import warnings

# NOTE(review): this silences every warning category for the rest of the
# session, not only the pandas ones raised while scoring.
warnings.simplefilter("ignore")

max_timecycles_for_failure = 40

# Header and separator rows of a markdown table; one data row per window size.
print('| timecycles_for_failure|', 'accuracy', '|', 'precision', '|', 'recall', '|', 'f1_score', '|')
print('|--|--|--|--|--|')

for i in range(1, max_timecycles_for_failure + 1):
    # Re-score the same predictions with the last `i` cycles treated as failures.
    accuracy, precision, recall, f1_score = find_eval_metrics(df_pred, timecycles_for_failure=i)
    print('|', i, '|', accuracy, '|', precision, '|', recall, '|', f1_score, '|')

| timecycles_for_failure| accuracy | precision | recall | f1_score |
|--|--|--|--|--|
| 1 | 0.9466821773059958 | 1e-10 | 1e-10 | 1e-10 |
| 2 | 0.9418351025156319 | 1e-10 | 1e-10 | 1e-10 |
| 3 | 0.9369880277252678 | 1e-10 | 1e-10 | 1e-10 |
| 4 | 0.9321409529349037 | 1e-10 | 1e-10 | 1e-10 |
| 5 | 0.9272938781445398 | 1e-10 | 1e-10 | 1e-10 |
| 6 | 0.9224468033541757 | 1e-10 | 1e-10 | 1e-10 |
| 7 | 0.9175997285638118 | 1e-10 | 1e-10 | 1e-10 |
| 8 | 0.9127526537734477 | 1e-10 | 1e-10 | 1e-10 |
| 9 | 0.9079055789830837 | 1e-10 | 1e-10 | 1e-10 |
| 10 | 0.9030585041927197 | 1e-10 | 1e-10 | 1e-10 |
| 11 | 0.8982114294023557 | 1e-10 | 1e-10 | 1e-10 |
| 12 | 0.8933643546119917 | 1e-10 | 1e-10 | 1e-10 |
| 13 | 0.8885172798216276 | 1e-10 | 1e-10 | 1e-10 |
| 14 | 0.8836702050312636 | 1e-10 | 1e-10 | 1e-10 |
| 15 | 0.8788231302408996 | 1e-10 | 1e-10 | 1e-10 |
| 16 | 0.8739760554505356 | 1e-10 | 1e-10 | 1e-10 |
| 17 | 0.8691289806601716 | 1e-10 | 1e-10 | 1e-10 |
| 18 | 0.8642819058698076 | 1e-10 | 1e-