# Anomaly Record Detection in Sequence Data using Support Vector Machines - Yahoo Data (Univariate)

In [154]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import import_ipynb
from Deviation_Threshold import get_deviations, get_anomaly_labels_by_deviation_threshold, get_anomaly_labels_by_deviation_pctile_threshold
import glob
from sklearn.metrics import precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline


In [157]:
def predictTimeSeries(file_path, sliding_window=1, threshold=0.1, asPercentile=True, percentile=75.0):
    """
    Fit a one-step-ahead SVR forecaster per csv file and score its anomaly labels.

    For every csv matched by file_path, each observation is paired with the
    observation just before it. An RBF-kernel Support Vector Regression model is
    fitted on the first 80% of those pairs (previous value -> current value) and
    predicts the remaining 20%. The actual and predicted test values are handed
    to the Deviation_Threshold helpers to obtain anomaly labels, which are scored
    against the file's is_anomaly column. Precision, recall and F1-score are
    printed for every file.

    file_path - glob pattern matching the csv files; every file must provide
                'value' and 'is_anomaly' columns
    sliding_window - how many previous entries should be considered as inputs to
                     the current entry (NOTE: not implemented yet, exactly one
                     previous entry is always used)
    threshold - forwarded to get_anomaly_labels_by_deviation_threshold, only
                used when asPercentile is False
    asPercentile - True: label with get_anomaly_labels_by_deviation_pctile_threshold,
                   False: label with get_anomaly_labels_by_deviation_threshold
    percentile - forwarded to get_anomaly_labels_by_deviation_pctile_threshold,
                 only used when asPercentile is True

    return - dictionary keyed by csv file name (without extension); every value
             is a dictionary with 'precision', 'recall' and 'f1' entries
    """
    import os  # local import: only needed to derive each file's display name

    results = {}
    for file in glob.glob(file_path, recursive=True):

        # Read in 1 csv file
        yahoo_df = pd.read_csv(file)

        # Name the series after its file, whatever the directory depth or path separator
        fname = os.path.splitext(os.path.basename(file))[0]
        print("File name {}".format(fname))

        # Row i of `previous` holds the observation of row i - 1 (row 0 becomes NaN).
        # shift() replaces the append-a-NaN-row idiom; DataFrame.append no longer
        # exists in pandas >= 2.0.
        previous = yahoo_df.shift(1)

        # The un-shifted observations become the targets, suffixed with "_y"
        current = yahoo_df.rename(columns=createColumnDict(list(yahoo_df.columns)))

        yahoo_merged = previous.merge(current, left_index=True, right_index=True)

        # TODO put this in a for loop for sliding_window > 1
        # the 1st row has no predecessor (NaN inputs), so it cannot be a sample
        yahoo_merged = yahoo_merged.iloc[1:]

        # chronological split: first 80% for training, last 20% for testing
        train_size = int(len(yahoo_merged) * 0.8)
        train_set, test_set = yahoo_merged.iloc[:train_size], yahoo_merged.iloc[train_size:]

        # separate into features (previous value) and target (current value)
        X_train = train_set[['value']]
        y_train = train_set[['value_y']]
        X_test = test_set[['value']]
        y_test = test_set[['value_y']]

        # the label column is not needed for forecasting, but needed later to score the detection
        outlier_df = test_set[['is_anomaly_y']]

        svm_clf = SVR(kernel='rbf')
        svm_clf.fit(X_train, np.ravel(y_train))
        y_predict = svm_clf.predict(X_test)

        if asPercentile:
            predicted_anomaly = get_anomaly_labels_by_deviation_pctile_threshold(np.ravel(y_test), np.ravel(y_predict), percentile)
        else:
            predicted_anomaly = get_anomaly_labels_by_deviation_threshold(np.ravel(y_test), np.ravel(y_predict), threshold)

        metrics = precision_recall_fscore_support(np.ravel(outlier_df), predicted_anomaly, average='binary', zero_division=0)

        print("Precision :" ,metrics[0], "Recall :", metrics[1], "F1-score :" ,metrics[2])
        print()

        results[fname] = {'precision': metrics[0], 'recall': metrics[1], 'f1': metrics[2]}

    return results



In [97]:
def createColumnDict(columnNames):
    """
    Generates a dictionary which maps every column name to its target column name,
    i.e. the same name with "_y" appended.

    columnNames - current names of the columns (iterable of strings)

    return a dictionary which contains a mapping between column names
            Ex: ["timestamp", "value", "is_anomaly"]
            {"timestamp": "timestamp_y", "value": "value_y", "is_anomaly": "is_anomaly_y"}
    """

    # A comprehension avoids binding a local called `dict`, which hid the builtin.
    return {name: name + "_y" for name in columnNames}

In [158]:
# Evaluate every csv of the A1 benchmark folder with sliding_window=1; the
# remaining arguments keep their defaults (asPercentile=True, percentile=75.0).
predictTimeSeries('./data/ydata-labeled-time-series-anomalies-v1_0/A1Benchmark_processed/*.csv',1)

File name real_59
Deviation Min 0.0006506429948124337, Max 0.7468148316218403
Deviation 75.0th pctile 0.18236679889245844
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([214,  71]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name real_65
Deviation Min 0.0011367536032944464, Max 0.9140142415923129
Deviation 75.0th pctile 0.12043153740562804
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([214,  71]))
Precision : 0.2112676056338028 Recall : 0.8823529411764706 F1-score : 0.34090909090909094

File name real_64
Deviation Min 0.00037891849099117025, Max 0.80267613522134
Deviation 75.0th pctile 0.15532007826795835
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([216,  72]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name real_58
Deviation Min 0.003196930946291563, Max 0.9776214833759591
Deviation 75.0th pctile 0.022378516624040924
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), arr

Deviation Min 0.0002029994302421012, Max 0.5444771515619697
Deviation 75.0th pctile 0.09560135253253993
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([215,  72]))
Precision : 0.05555555555555555 Recall : 0.5 F1-score : 0.09999999999999999

File name real_32
Deviation Min 0.00017613312309198992, Max 0.9209078404401649
Deviation 75.0th pctile 0.812863741086755
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([214,  72]))
Precision : 0.3472222222222222 Recall : 0.5319148936170213 F1-score : 0.42016806722689076

File name real_18
Deviation Min 0.0005952302786640384, Max 0.3416889698755013
Deviation 75.0th pctile 0.08320022558138873
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([219,  73]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name real_24
Deviation Min 0.00039392443776054, Max 0.5979588317180368
Deviation 75.0th pctile 0.04834793185698888
Deviation > 75.0th pctile is_anomaly labels in data (array(

In [159]:
# Evaluate every csv of the A2 benchmark folder with sliding_window=1; the
# remaining arguments keep their defaults (asPercentile=True, percentile=75.0).
predictTimeSeries('./data/ydata-labeled-time-series-anomalies-v1_0/A2Benchmark_processed/*.csv',1)

File name synthetic_85
Deviation Min 0.0002011663925197138, Max 0.29322223765742145
Deviation 75.0th pctile 0.0956097151232741
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([213,  71]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name synthetic_91
Deviation Min 0.000416263496039293, Max 0.147605690840231
Deviation 75.0th pctile 0.12483226342600177
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([213,  71]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name synthetic_46
Deviation Min 9.437875486684177e-07, Max 0.2970984559670895
Deviation 75.0th pctile 0.15301597207142226
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([213,  71]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name synthetic_52
Deviation Min 6.41441313745661e-05, Max 0.22814617576623664
Deviation 75.0th pctile 0.09373237609822122
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([213,  71]))
Precision :

Deviation Min 0.0003380320999775699, Max 0.3293027613476629
Deviation 75.0th pctile 0.1246095492924176
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([213,  71]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name synthetic_56
Deviation Min 0.0037448596589222394, Max 0.17011451720298343
Deviation 75.0th pctile 0.1289247333005622
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([213,  71]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name synthetic_42
Deviation Min 0.008956075990138812, Max 0.39943627466965703
Deviation 75.0th pctile 0.3607712310648634
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([213,  71]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name synthetic_95
Deviation Min 0.00024576195102488096, Max 0.07222577931095414
Deviation 75.0th pctile 0.039652267031831376
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([213,  71]))
Precision : 0.0 Recall : 0.0 F1

Deviation Min 1.3965391125747573e-06, Max 0.17126948579664325
Deviation 75.0th pctile 0.06333876841664138
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([213,  71]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name synthetic_14
Deviation Min 0.0005346905497936039, Max 0.15666785338789868
Deviation 75.0th pctile 0.12628682738267938
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([213,  71]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name synthetic_28
Deviation Min 0.008732826778969471, Max 0.31185990495579197
Deviation 75.0th pctile 0.2699121162136396
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([213,  71]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name synthetic_3
Deviation Min 0.00012905410961128183, Max 0.10438793247148101
Deviation 75.0th pctile 0.06257151958430306
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([213,  71]))
Precision : 0.0 Recall : 0.0 

In [171]:
def predictTimeSeriesNew(file_path, sliding_window=1, threshold=0.1, asPercentile=True, percentile=75.0):
    """
    Fit a one-step-ahead SVR forecaster per csv file and score its anomaly labels.

    Variant of predictTimeSeries for csv files whose columns are named
    'timestamps', 'value' and 'anomaly'. Files named 'A3Benchmark_all' or
    'A4Benchmark_all' are skipped. For every other csv matched by file_path,
    each observation is paired with the observation just before it. An
    RBF-kernel Support Vector Regression model is fitted on the first 80% of
    those pairs (previous value -> current value) and predicts the remaining
    20%. The actual and predicted test values are handed to the
    Deviation_Threshold helpers to obtain anomaly labels, which are scored
    against the file's anomaly column. Precision, recall and F1-score are
    printed for every processed file.

    file_path - glob pattern matching the csv files; every file must provide
                'timestamps', 'value' and 'anomaly' columns
    sliding_window - how many previous entries should be considered as inputs to
                     the current entry (NOTE: not implemented yet, exactly one
                     previous entry is always used)
    threshold - forwarded to get_anomaly_labels_by_deviation_threshold, only
                used when asPercentile is False
    asPercentile - True: label with get_anomaly_labels_by_deviation_pctile_threshold,
                   False: label with get_anomaly_labels_by_deviation_threshold
    percentile - forwarded to get_anomaly_labels_by_deviation_pctile_threshold,
                 only used when asPercentile is True

    return - dictionary keyed by csv file name (without extension); every value
             is a dictionary with 'precision', 'recall' and 'f1' entries
    """
    import os  # local import: only needed to derive each file's display name

    results = {}
    for file in glob.glob(file_path, recursive=True):

        # Name the series after its file, whatever the directory depth or path separator
        fname = os.path.splitext(os.path.basename(file))[0]

        # Only the individual series files are evaluated; the "*_all" files are skipped
        if fname == 'A3Benchmark_all' or fname == 'A4Benchmark_all':
            continue

        print("File name {}".format(fname))

        # Read in 1 csv file
        yahoo_df = pd.read_csv(file)

        # Only the timestamps, value and anomaly columns are used; any other
        # column of these csv files is dropped
        yahoo_df = yahoo_df[['timestamps','value','anomaly']]

        # Row i of `previous` holds the observation of row i - 1 (row 0 becomes NaN).
        # shift() replaces the append-a-NaN-row idiom; DataFrame.append no longer
        # exists in pandas >= 2.0.
        previous = yahoo_df.shift(1)

        # The un-shifted observations become the targets, suffixed with "_y"
        current = yahoo_df.rename(columns=createColumnDict(list(yahoo_df.columns)))

        yahoo_merged = previous.merge(current, left_index=True, right_index=True)

        # TODO put this in a for loop for sliding_window > 1
        # the 1st row has no predecessor (NaN inputs), so it cannot be a sample
        yahoo_merged = yahoo_merged.iloc[1:]

        # chronological split: first 80% for training, last 20% for testing
        train_size = int(len(yahoo_merged) * 0.8)
        train_set, test_set = yahoo_merged.iloc[:train_size], yahoo_merged.iloc[train_size:]

        # separate into features (previous value) and target (current value)
        X_train = train_set[['value']]
        y_train = train_set[['value_y']]
        X_test = test_set[['value']]
        y_test = test_set[['value_y']]

        # the label column is not needed for forecasting, but needed later to score the detection
        outlier_df = test_set[['anomaly_y']]

        svm_clf = SVR(kernel='rbf')
        svm_clf.fit(X_train, np.ravel(y_train))
        y_predict = svm_clf.predict(X_test)

        if asPercentile:
            predicted_anomaly = get_anomaly_labels_by_deviation_pctile_threshold(np.ravel(y_test), np.ravel(y_predict), percentile)
        else:
            predicted_anomaly = get_anomaly_labels_by_deviation_threshold(np.ravel(y_test), np.ravel(y_predict), threshold)

        metrics = precision_recall_fscore_support(np.ravel(outlier_df), predicted_anomaly, average='binary', zero_division=0)

        print("Precision :" ,metrics[0], "Recall :", metrics[1], "F1-score :" ,metrics[2])
        print()

        results[fname] = {'precision': metrics[0], 'recall': metrics[1], 'f1': metrics[2]}

    return results



In [174]:
# Evaluate every csv of the A3 benchmark folder with sliding_window=1; the
# remaining arguments keep their defaults (asPercentile=True, percentile=75.0).
predictTimeSeriesNew('./data/ydata-labeled-time-series-anomalies-v1_0/A3Benchmark_processed/*.csv',1)

File name A3Benchmark-TS12
Deviation Min 0.00018006413565441193, Max 0.3898787979080486
Deviation 75.0th pctile 0.07349788333132225
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.03571428571428571 Recall : 1.0 F1-score : 0.0689655172413793

File name A3Benchmark-TS13
Deviation Min 0.00010980047303310592, Max 0.20994676122457812
Deviation 75.0th pctile 0.08229952221174366
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name A3Benchmark-TS11
Deviation Min 0.00031689608760276933, Max 0.3599226278915798
Deviation 75.0th pctile 0.12127701648967812
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.011904761904761904 Recall : 1.0 F1-score : 0.023529411764705882

File name A3Benchmark-TS39
Deviation Min 9.50423456980154e-05, Max 0.5069320837830188
Deviation 75.0th pctile 0.28996074330674715
Deviation > 

Deviation Min 0.07540769565918826, Max 0.45860240914664174
Deviation 75.0th pctile 0.33345615759344305
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name A3Benchmark-TS76
Deviation Min 3.2473097262397665e-05, Max 0.5052566215180613
Deviation 75.0th pctile 0.08044350458926847
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.011904761904761904 Recall : 1.0 F1-score : 0.023529411764705882

File name A3Benchmark-TS48
Deviation Min 0.00014955897603008417, Max 0.414638353551122
Deviation 75.0th pctile 0.11998783927847742
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.011904761904761904 Recall : 1.0 F1-score : 0.023529411764705882

File name A3Benchmark-TS60
Deviation Min 0.004906942371315892, Max 0.40869529260050835
Deviation 75.0th pctile 0.14693791939106882
Deviation > 75.0th pctile is_anomaly la

Deviation Min 0.00044710558701649417, Max 0.3903201128020036
Deviation 75.0th pctile 0.08455203603183756
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.05952380952380952 Recall : 1.0 F1-score : 0.11235955056179775

File name A3Benchmark-TS80
Deviation Min 0.01251131885705703, Max 0.4733196489580759
Deviation 75.0th pctile 0.24745402872422484
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name A3Benchmark-TS94
Deviation Min 0.00017625822823097437, Max 0.3348773955789694
Deviation 75.0th pctile 0.07444877565191799
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.011904761904761904 Recall : 1.0 F1-score : 0.023529411764705882

File name A3Benchmark-TS43
Deviation Min 0.0027468852063140536, Max 0.492790734758568
Deviation 75.0th pctile 0.23980899060161207
Deviation > 75.0th pctile is_anomaly label

Deviation Min 0.00015849609381024532, Max 0.3962970150452975
Deviation 75.0th pctile 0.09902155739309212
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name A3Benchmark-TS34
Deviation Min 0.0003108093505469478, Max 0.3630827769466205
Deviation 75.0th pctile 0.08747293038458978
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name A3Benchmark-TS22
Deviation Min 8.342849368281335e-05, Max 0.4235005548306109
Deviation 75.0th pctile 0.08025655451880964
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name A3Benchmark-TS36
Deviation Min 0.0007342086074113574, Max 0.38065134381162924
Deviation 75.0th pctile 0.07640256562498449
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Re

In [175]:
# Evaluate every csv of the A4 benchmark folder with sliding_window=1; the
# remaining arguments keep their defaults (asPercentile=True, percentile=75.0).
predictTimeSeriesNew('./data/ydata-labeled-time-series-anomalies-v1_0/A4Benchmark_processed/*.csv',1)

File name A4Benchmark-TS99
Deviation Min 1.2524610808195558e-05, Max 0.29116284443818896
Deviation 75.0th pctile 0.059275695288347155
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.023809523809523808 Recall : 1.0 F1-score : 0.046511627906976744

File name A4Benchmark-TS72
Deviation Min 0.0003370437836326401, Max 0.14116632715062122
Deviation 75.0th pctile 0.08057383148373454
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name A4Benchmark-TS66
Deviation Min 0.0001266889802882254, Max 0.5802937901593164
Deviation 75.0th pctile 0.10570000930574167
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name A4Benchmark-TS67
Deviation Min 0.00045710669733189935, Max 0.6488366759226019
Deviation 75.0th pctile 0.1528781900020197
Deviation > 75.0th pctile is_anomaly label

Deviation Min 0.00020043322837037802, Max 0.41101485018096073
Deviation 75.0th pctile 0.06786884818260075
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.023809523809523808 Recall : 1.0 F1-score : 0.046511627906976744

File name A4Benchmark-TS9
Deviation Min 0.0026769770315213104, Max 0.3250395058983929
Deviation 75.0th pctile 0.12519629690245085
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name A4Benchmark-TS100
Deviation Min 0.00038948243991332454, Max 0.6456614149362079
Deviation 75.0th pctile 0.16459016426360717
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.011904761904761904 Recall : 1.0 F1-score : 0.023529411764705882

File name A4Benchmark-TS17
Deviation Min 7.497468398343088e-05, Max 0.25949245494370504
Deviation 75.0th pctile 0.0493232219844197
Deviation > 75.0th pctile is_anomaly

Deviation Min 0.00010927335962851448, Max 0.15657641487106477
Deviation 75.0th pctile 0.07844105539738581
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name A4Benchmark-TS90
Deviation Min 0.0004896712369214518, Max 0.3888949593666785
Deviation 75.0th pctile 0.07404645927423581
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.03571428571428571 Recall : 1.0 F1-score : 0.0689655172413793

File name A4Benchmark-TS84
Deviation Min 0.0007568134223696066, Max 0.6395725428060187
Deviation 75.0th pctile 0.3131996829556892
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name A4Benchmark-TS53
Deviation Min 6.848415128857255e-05, Max 0.6073562637662704
Deviation 75.0th pctile 0.12779373099097585
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([

Precision : 0.011904761904761904 Recall : 1.0 F1-score : 0.023529411764705882

File name A4Benchmark-TS56
Deviation Min 2.060878123777421e-05, Max 0.38996616988915767
Deviation 75.0th pctile 0.08096580538103929
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.047619047619047616 Recall : 1.0 F1-score : 0.0909090909090909

File name A4Benchmark-TS42
Deviation Min 0.0005338614847999468, Max 0.45398594410934723
Deviation 75.0th pctile 0.16117344531532185
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.0 Recall : 0.0 F1-score : 0.0

File name A4Benchmark-TS43
Deviation Min 0.0006305002907208701, Max 0.4368171416686133
Deviation 75.0th pctile 0.09183965438042348
Deviation > 75.0th pctile is_anomaly labels in data (array([0, 1]), array([252,  84]))
Precision : 0.023809523809523808 Recall : 1.0 F1-score : 0.046511627906976744

File name A4Benchmark-TS57
Deviation Min 0.0001784964791583521, 