In [12]:
import os
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler
# becuase we're in a nested folder...
sys.path.append('../')
from utils.preprocess import *

In [13]:
data_dir ="../for_students/data_v1"

In [14]:
def load_data(data_dir, window_size, window_func):
    '''
    data_dir (str): Base directory of data 
    window_size (str): Window size for input examples
    window_func (str): Window function reference as defined in utils.preprocess
                       Option are either 'window' or 'window_func'
    '''
    train_dir = os.path.join(data_dir, 'training')
    train_str = os.path.join(train_dir, 'training_{}.csv')
    test_str = os.path.join(data_dir, 'dataset_{}.csv')

    train_xs = []
    train_ys = []
    for i in range(1,4):
        train_df_i = pd.read_csv(train_str.format(str(i)))
        train_xi = window_func(train_df_i.value.values, window_size)
        train_xs.append(train_xi)
        train_ys.append(train_df_i.label.values)
    x_train = np.concatenate(train_xs)
    y_train = np.concatenate(train_ys)
    assert len(x_train) == len(y_train)
    
    test_xs = []
    test_ys = []
    for i in range(1,7):
        test_df_i = pd.read_csv(test_str.format(str(i)))
        test_xi = window_func(test_df_i.values[:,1], window_size)
        test_xs.append(test_xi)
    x_test = np.concatenate(test_xs)
    print("Train x shape: {}\nTrain y shape: {}\n\nTest x shape: {}".format(x_train.shape, y_train.shape, x_test.shape))
    return x_train, y_train, x_test

### Window Data

In [15]:
x_train, y_train, x_test = load_data(data_dir, 100, window_offset)

Train x shape: (12096, 100)
Train y shape: (12096,)

Test x shape: (39476, 100)


In [16]:
x_train_normal = x_train[y_train == 0]
# We may be able to get min max parameters from training set. We should do cross validation also. 
minmax_scalar = MinMaxScaler()
minmax_scalar.fit(x_train_normal)
x_train_normal_min_max = minmax_scalar.transform(x_train_normal)
x_train_min_max = minmax_scalar.transform(x_train)

### OCSVM

In [17]:
# Train on normal data
ocsvm = OneClassSVM(gamma='auto').fit(x_train_normal)
# check performance on training set for sanity check. 
y_pred_ocsvm = ocsvm.predict(x_train)
y_pred_ocsvm[y_pred_ocsvm==1] = 0
y_pred_ocsvm[y_pred_ocsvm==-1] = 1
ocsvm_f1 = f1_score(y_train, y_pred_ocsvm, average='macro')

In [18]:
# Train on normal data
ocsvm_min_max = OneClassSVM(gamma='auto').fit(x_train_normal_min_max )
# check performance on training set for sanity check. 
y_pred_ocsvm_min_max = ocsvm.predict(x_train_min_max)
y_pred_ocsvm_min_max[y_pred_ocsvm_min_max==1] = 0
y_pred_ocsvm_min_max[y_pred_ocsvm_min_max==-1] = 1
ocsvm_f1_min_max = f1_score(y_train, y_pred_ocsvm_min_max, average='macro')

In [19]:
print(ocsvm_f1)
print(ocsvm_f1_min_max)

0.3431233160533406
0.4949688948269384


### iForest

In [20]:
iforest = IsolationForest().fit(x_train_normal)
# check performance on training set for sanity check. 
y_pred_iforest = iforest.predict(x_train)
y_pred_iforest[y_pred_iforest==1] = 0
y_pred_iforest[y_pred_iforest==-1] = 1
iforest_f1 = f1_score(y_train, y_pred_iforest, average='macro')

In [21]:
iforest_min_max = IsolationForest().fit(x_train_normal_min_max)
# check performance on training set for sanity check. 
y_pred_iforest_min_max = iforest.predict(x_train_min_max)
y_pred_iforest_min_max[y_pred_iforest==1] = 0
y_pred_iforest_min_max[y_pred_iforest==-1] = 1
iforest_f1_min_max = f1_score(y_train, y_pred_iforest_min_max, average='macro')

In [23]:
print(iforest_f1 )
print(iforest_f1_min_max)

0.39325842696629215
0.2819524534463278
