# Anomaly Record Detection in Sequence Data using Support Vector Machines - NASA Data (Multivariate)

In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

%matplotlib inline


In [2]:
# Load the multivariate NASA dataset from its CSV file.
nasa_csv_path = 'data/nasa/clean-std-norm-nasa.csv'
nasa_original = pd.read_csv(nasa_csv_path)

In [3]:
# Preview the first rows of the frame as read from the CSV file.
nasa_original.head()

Unnamed: 0,time,time.1,Rad Flow,Fpv Close,Fpv Open,High,Bypass,Bpv Open,Bpv Close,class,outlier
0,-4821,-4821,0.4375,0.507015,0.333333,0.638093,0.575163,0.656501,0.598071,7,1
1,-4624,-4624,0.429688,0.507401,0.358974,0.638212,0.568627,0.630819,0.575563,7,1
2,-4475,-4475,0.664062,0.5065,0.349359,0.638522,0.764706,0.686998,0.581994,7,1
3,-4184,-4184,0.664062,0.512164,0.355769,0.637878,0.764706,0.682183,0.578778,7,1
4,-4048,-4048,0.65625,0.507015,0.358974,0.638117,0.75817,0.677368,0.572347,1,0


In [4]:
# Remove the 'class' and 'time.1' columns; the remaining nine columns are used below.
# NOTE(review): 'time.1' matches 'time' in the preview rows - presumably a duplicate column.
unused_columns = ['class', 'time.1']
nasa_original = nasa_original.drop(columns=unused_columns)

In [5]:
# Row and column count after removing the 'class' and 'time.1' columns.
nasa_original.shape

(49097, 9)

In [6]:
# Independent (deep) copy of the frame; it is shifted later to provide the lagged inputs.
nasa_org_copy = nasa_original.copy()

In [7]:
# Build the lag-1 frame: after the shift every row holds the values of the previous row.
# One all-NaN row is added at the end first, so the final observation is not pushed
# out of the frame when everything moves down by one position.
# DataFrame.append was removed in pandas 2.0 (and pd.Series() without a dtype emits a
# warning), so the padding row is created by reindexing onto one extra position.
nasa_org_copy = nasa_org_copy.reset_index(drop=True)
nasa_org_copy = nasa_org_copy.reindex(range(len(nasa_org_copy) + 1)).shift(1)

  


In [8]:
# Add one trailing all-NaN row so this frame has the same number of rows as the
# padded-and-shifted copy it is merged with later.
# DataFrame.append was removed in pandas 2.0, so the extra row is created by
# reindexing onto one additional position instead.
nasa_original = nasa_original.reset_index(drop=True)
nasa_original = nasa_original.reindex(range(len(nasa_original) + 1))

  """Entry point for launching an IPython kernel.


In [9]:
# Tag every column of the target frame with a '_y' suffix so its names do not
# clash with the columns of the shifted copy when the two frames are merged.
base_columns = [
    "time",
    "Rad Flow",
    "Fpv Close",
    "Fpv Open",
    "High",
    "Bypass",
    "Bpv Open",
    "Bpv Close",
    "outlier",
]
target_names = {name: f"{name}_y" for name in base_columns}
nasa_original = nasa_original.rename(columns=target_names)


In [10]:
# Shape of the target frame after padding and renaming.
nasa_original.shape

(49098, 9)

In [11]:
# Shape of the shifted copy, for comparison with nasa_original before merging.
nasa_org_copy.shape

(49098, 9)

In [12]:
# First rows of the shifted copy; row 0 is all NaN because of the one-step shift.
nasa_org_copy.head()

Unnamed: 0,time,Rad Flow,Fpv Close,Fpv Open,High,Bypass,Bpv Open,Bpv Close,outlier
0,,,,,,,,,
1,-4821.0,0.4375,0.507015,0.333333,0.638093,0.575163,0.656501,0.598071,1.0
2,-4624.0,0.429688,0.507401,0.358974,0.638212,0.568627,0.630819,0.575563,1.0
3,-4475.0,0.664062,0.5065,0.349359,0.638522,0.764706,0.686998,0.581994,1.0
4,-4184.0,0.664062,0.512164,0.355769,0.637878,0.764706,0.682183,0.578778,1.0


In [13]:
# First rows of the target frame with its '_y'-suffixed column names.
nasa_original.head()

Unnamed: 0,time_y,Rad Flow_y,Fpv Close_y,Fpv Open_y,High_y,Bypass_y,Bpv Open_y,Bpv Close_y,outlier_y
0,-4821.0,0.4375,0.507015,0.333333,0.638093,0.575163,0.656501,0.598071,1.0
1,-4624.0,0.429688,0.507401,0.358974,0.638212,0.568627,0.630819,0.575563,1.0
2,-4475.0,0.664062,0.5065,0.349359,0.638522,0.764706,0.686998,0.581994,1.0
3,-4184.0,0.664062,0.512164,0.355769,0.637878,0.764706,0.682183,0.578778,1.0
4,-4048.0,0.65625,0.507015,0.358974,0.638117,0.75817,0.677368,0.572347,0.0


In [14]:
# Join the lagged inputs (left) and the '_y' targets (right) row by row on their index.
nasa_merged = pd.merge(
    left=nasa_org_copy,
    right=nasa_original,
    how='inner',
    left_index=True,
    right_index=True,
)

In [15]:

# First rows of the merged frame: lagged inputs next to the '_y' target columns.
nasa_merged.head()

Unnamed: 0,time,Rad Flow,Fpv Close,Fpv Open,High,Bypass,Bpv Open,Bpv Close,outlier,time_y,Rad Flow_y,Fpv Close_y,Fpv Open_y,High_y,Bypass_y,Bpv Open_y,Bpv Close_y,outlier_y
0,,,,,,,,,,-4821.0,0.4375,0.507015,0.333333,0.638093,0.575163,0.656501,0.598071,1.0
1,-4821.0,0.4375,0.507015,0.333333,0.638093,0.575163,0.656501,0.598071,1.0,-4624.0,0.429688,0.507401,0.358974,0.638212,0.568627,0.630819,0.575563,1.0
2,-4624.0,0.429688,0.507401,0.358974,0.638212,0.568627,0.630819,0.575563,1.0,-4475.0,0.664062,0.5065,0.349359,0.638522,0.764706,0.686998,0.581994,1.0
3,-4475.0,0.664062,0.5065,0.349359,0.638522,0.764706,0.686998,0.581994,1.0,-4184.0,0.664062,0.512164,0.355769,0.637878,0.764706,0.682183,0.578778,1.0
4,-4184.0,0.664062,0.512164,0.355769,0.637878,0.764706,0.682183,0.578778,1.0,-4048.0,0.65625,0.507015,0.358974,0.638117,0.75817,0.677368,0.572347,0.0


In [16]:
# Last rows of the merged frame; the final row has NaN in every '_y' target column.
nasa_merged.tail()

Unnamed: 0,time,Rad Flow,Fpv Close,Fpv Open,High,Bypass,Bpv Open,Bpv Close,outlier,time_y,Rad Flow_y,Fpv Close_y,Fpv Open_y,High_y,Bypass_y,Bpv Open_y,Bpv Close_y,outlier_y
49093,4400.0,0.664062,0.513451,0.355769,0.637806,0.261438,0.682183,0.700965,1.0,4501.0,0.664062,0.506758,0.349359,0.639238,0.248366,0.686998,0.707395,1.0
49094,4501.0,0.664062,0.506758,0.349359,0.639238,0.248366,0.686998,0.707395,1.0,4692.0,0.429688,0.507659,0.358974,0.63826,0.03268,0.630819,0.707395,1.0
49095,4692.0,0.429688,0.507659,0.358974,0.63826,0.03268,0.630819,0.707395,1.0,4903.0,0.648438,0.506629,0.358974,0.637973,0.196078,0.675762,0.710611,1.0
49096,4903.0,0.648438,0.506629,0.358974,0.637973,0.196078,0.675762,0.710611,1.0,5075.0,0.4375,0.507015,0.333333,0.638093,0.0,0.656501,0.73955,1.0
49097,5075.0,0.4375,0.507015,0.333333,0.638093,0.0,0.656501,0.73955,1.0,,,,,,,,,


In [17]:
# The first row has no lagged inputs and the last row has no targets, so both
# boundary rows contain NaN values and must be removed from the dataframe.
print(nasa_merged.shape)
# Read the boundary labels from the index instead of hard-coding the row count,
# so the cell keeps working when the dataset size changes.
nasa_merged = nasa_merged.drop(nasa_merged.index[[0, -1]])
print(nasa_merged.shape)

(49098, 18)
(49096, 18)


In [18]:
# Set the target-side outlier flag aside: it is not a forecasting input, but it is
# needed later to detect outliers.
outlier_df = nasa_merged.loc[:, ['outlier_y']]
# Drop the columns that are not relevant for forecasting the time series.
non_forecast_columns = ['outlier', 'outlier_y', 'time', 'time_y']
nasa_merged = nasa_merged.drop(columns=non_forecast_columns)

In [19]:
# Ordered 80/20 split without shuffling: the leading rows form the training set,
# the remaining rows are held out for testing.
train_size = int(0.8 * len(nasa_merged))
train_set = nasa_merged.iloc[:train_size]
test_set = nasa_merged.iloc[train_size:]

In [20]:
# Separate the training rows into features (lagged values) and targets ('_y' values).
feature_columns = ['Rad Flow', 'Fpv Close', 'Fpv Open', 'High', 'Bypass', 'Bpv Open', 'Bpv Close']
target_columns = [f'{column}_y' for column in feature_columns]
X_train = train_set[feature_columns]
y_train = train_set[target_columns]

In [22]:
# Separate the held-out rows into features (lagged values) and targets ('_y' values).
feature_columns = ['Rad Flow', 'Fpv Close', 'Fpv Open', 'High', 'Bypass', 'Bpv Open', 'Bpv Close']
target_columns = [f'{column}_y' for column in feature_columns]
X_test = test_set[feature_columns]
y_test = test_set[target_columns]

In [23]:
# Show the feature/target shapes of the training split, then of the test split.
for features, targets in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, targets.shape)

(39276, 7) (39276, 7)
(9820, 7) (9820, 7)


In [27]:
# Base estimator: a support vector regressor with a polynomial kernel; every other
# hyperparameter is left at the library default.
svm_clf = SVR(kernel='poly')

In [None]:
# Wrap the single-output SVR so that it can be trained on all target columns,
# then fit the wrapper on the training split.
multi_target_model = MultiOutputRegressor(estimator=svm_clf)
clf = multi_target_model.fit(X_train, y_train)

In [None]:
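# NOTE: inactive, hand-rolled alternative to MultiOutputRegressor, kept for reference only.
# As written, `y[:, i]` requires y to be a NumPy array rather than a DataFrame.
# class VectorRegression(sklearn.base.BaseEstimator):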
#     def __init__(self, estimator):
#         self.estimator = estimator

#     def fit(self, X, y):
#         n, m = y.shape
#         # Fit a separate regressor for each column of y
#         self.estimators_ = [sklearn.base.clone(self.estimator).fit(X, y[:, i]) for i in range(m)]
#         return self

#     def predict(self, X):
#         # Join regressors' predictions
#         res = [est.predict(X)[:, np.newaxis] for est in self.estimators_]
#         return np.hstack(res)

In [None]:
# Predict the target values for every row of the test features with the fitted model.
y_predict = clf.predict(X_test)