In [None]:
# Path of the CSV file that the functions and the script below read.
filename = "yahoo2.csv"

In [None]:
import numpy
import matplotlib.pyplot as plt
from pandas import read_csv
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import recall_score, precision_score, f1_score, roc_curve, auc

# Window geometry: the script uses ``period_size`` as the look-back window
# length (and ``identify_anomaly`` uses it as its chunk length), while
# ``step_size`` is the sub-window length inside each look-back window.
period_size = 234
step_size = 117

# Quantile levels passed to ``numpy.quantile`` for the upper and lower bounds.
upper_q_threshold = 0.9999
lower_q_threshold = 0.0001

# NOTE(review): not referenced anywhere else in the visible code.
anomaly_res = [
    -563.9761436,
    -2416.661669,
    -3232.353099,
    -3565.527389,
    -2560.50266,
    -4133.645629,
    -4004.058979,
    -4279.506572,
    -4804.726653,
    -5948.068253,
]

def intersection(lst1, lst2):
    """Return the entries of ``lst1`` whose first item is contained in ``lst2``.

    Each entry of ``lst1`` must be indexable; only ``entry[0]`` is tested
    for membership. The order (and any duplicates) of ``lst1`` is preserved
    and a new list is always returned.
    """
    return [entry for entry in lst1 if entry[0] in lst2]



def verify_stationarity(dataset):
    """Run an Augmented Dickey-Fuller test on ``dataset`` and report the outcome.

    Prints the test statistic, the p-value and every critical value returned
    by ``adfuller``, then compares the statistic with the first critical
    value of that mapping.

    Args:
        dataset: The series handed unchanged to ``adfuller``.

    Returns:
        bool: ``False`` when the test statistic is greater than the first
        critical value (reported as "non stationary"), ``True`` otherwise.
    """
    test_results = adfuller(dataset)
    adf_statistic = test_results[0]
    critical_values = test_results[4]

    print(f"ADF test statistic: {adf_statistic}")
    print(f"p-value: {test_results[1]}")
    print("Critical thresholds:")
    for key, value in critical_values.items():
        print(f"\t{key}: {value}")

    # The decision uses the first entry of the critical-value mapping
    # (statsmodels documents the mapping as holding the 1 %, 5 % and 10 %
    # levels; presumably the first one is the 1 % level - confirm).
    critical = next(iter(critical_values.values()))
    print('critical', critical)

    if adf_statistic > critical:
        print('non stationary')
        return False
    return True

def create_dataset(dataset, look_back=1, tw=3, *, lower_q=None, upper_q=None):
    """Build sliding-window quantile features and targets from ``dataset``.

    For every start index ``i`` in ``range(len(dataset) - look_back - 1)`` the
    window ``dataset[i + 1:i + look_back + 1]`` is cut into consecutive chunks
    of ``tw`` rows. The lower and upper quantile of each chunk become the
    feature rows, and the same quantiles of the whole window become the
    targets.

    Args:
        dataset: 2-D array-like indexed as ``dataset[row, 0]``.
        look_back: Number of rows in each window.
        tw: Number of rows in each chunk of a window; must be positive.
        lower_q: Lower quantile level; defaults to the module-level
            ``lower_q_threshold`` when ``None``.
        upper_q: Upper quantile level; defaults to the module-level
            ``upper_q_threshold`` when ``None``.

    Returns:
        A 6-tuple of ``numpy`` arrays, one row/entry per window:
        per-chunk lower quantiles, window lower quantile, per-chunk upper
        quantiles, window upper quantile, ``[lower, upper]`` window quantile
        pairs, and the value ``dataset[i + look_back, 0]``.

    Raises:
        ValueError: If ``tw`` is not positive.
    """
    if tw <= 0:
        raise ValueError("tw must be a positive integer")
    if lower_q is None:
        lower_q = lower_q_threshold
    if upper_q is None:
        upper_q = upper_q_threshold

    dataX, dataY = [], []  # per-chunk / whole-window lower quantiles
    datastdX, datastdY = [], []  # per-chunk / whole-window upper quantiles
    datacombX, datacomY = [], []  # [lower, upper] pairs / last value of the window

    for i in range(len(dataset) - look_back - 1):
        window = dataset[i + 1:i + look_back + 1]
        window_lower = numpy.quantile(window, lower_q)
        window_upper = numpy.quantile(window, upper_q)

        chunks = [window[j:j + tw] for j in range(0, len(window), tw)]
        dataX.append([numpy.quantile(chunk, lower_q) for chunk in chunks])
        datastdX.append([numpy.quantile(chunk, upper_q) for chunk in chunks])

        dataY.append(window_lower)
        datastdY.append(window_upper)
        datacombX.append([window_lower, window_upper])
        # dataset[i + look_back] is the last row of the window itself.
        datacomY.append(dataset[i + look_back, 0])

    return (
        numpy.array(dataX),
        numpy.array(dataY),
        numpy.array(datastdX),
        numpy.array(datastdY),
        numpy.array(datacombX),
        numpy.array(datacomY),
    )


def identify_anomaly(finalres_q10,finalres_q90,ts):
    """Collect the values of ``ts`` that fall outside per-period quantile bounds.

    The inputs are walked in consecutive chunks of ``period_size`` entries.
    Within each chunk the predicted lower bounds (``finalres_q10``) and upper
    bounds (``finalres_q90``) are averaged, and every entry of the matching
    ``ts`` chunk that is above the averaged upper bound or below the averaged
    lower bound is collected.

    Returns:
        list: The out-of-bounds entries of ``ts``, in their original order.
    """
    flagged = []
    for start in range(0, len(finalres_q10), period_size):
        stop = start + period_size
        lower_bound = numpy.average(finalres_q10[start:stop])
        upper_bound = numpy.average(finalres_q90[start:stop])
        flagged.extend(
            observed
            for observed in ts[start:stop]
            if observed > upper_bound or observed < lower_bound
        )
    return flagged

def find_anomalies(lowest_leaf):
    """Flag the rows of the CSV file that match one of the given values.

    Every column of ``filename`` except ``'Class'`` is loaded. A row is
    labelled 1 when, for at least one entry of ``lowest_leaf``, the largest
    absolute difference across the row's columns is at most ``1e-4``.

    Args:
        lowest_leaf: Sequence of reference values (converted with
            ``numpy.array``); each entry is compared against every row.

    Returns:
        numpy.ndarray: Integer array with one 0/1 label per CSV row.
    """
    series = read_csv(filename, usecols=lambda name: name != 'Class').values
    labels = numpy.zeros(len(series), dtype=int)

    for reference in numpy.array(lowest_leaf):
        worst_gap = numpy.abs(series - reference).max(axis=1)
        labels[worst_gap <= 1e-4] = 1

    return labels

def _build_quantile_model(time_steps):
    """Return a compiled LSTM(4) -> Dense(1) regressor for one quantile bound.

    The network takes sequences of ``time_steps`` single-feature values and is
    compiled with the ``log_cosh`` loss and the ``adam`` optimizer.
    """
    model = Sequential()
    model.add(LSTM(4, input_shape=(time_steps, 1), activation='sigmoid', recurrent_activation='sigmoid'))
    model.add(Dense(1))
    model.compile(loss='log_cosh', optimizer='adam')
    return model


def _inverse_scale(fitted_scaler, values):
    """Map scaled values back to original units as an ``(n, 1)`` array.

    An empty input is returned as an empty ``(0, 1)`` array without calling
    the scaler, so a run that collected no values does not fail here.
    """
    column = numpy.asarray(values).reshape(-1, 1)
    if column.shape[0] == 0:
        return column
    return fitted_scaler.inverse_transform(column)


if __name__ == '__main__':
    # Fix the NumPy random seed for reproducibility.
    numpy.random.seed(7)

    # ---- Training data: a 70 % sample of the rows labelled Class == 0 ----
    dataframe = read_csv(filename)
    y_true = dataframe.iloc[:, -1]  # the last column is used as ground truth
    label_0_rows = dataframe[dataframe['Class'] == 0]
    percent_to_select = 0.7
    num_to_select = int(len(label_0_rows) * percent_to_select)
    # NOTE(review): DataFrame.sample returns the rows in shuffled order, so
    # the temporal order of the series is not preserved - confirm intended.
    selected_rows = label_0_rows.sample(n=num_to_select, random_state=42)
    # 'Value' must be the name of the column holding the data points.
    selected_rows = selected_rows['Value']
    dataset = selected_rows.values.reshape(-1, 1).astype('float32')
    stationary = verify_stationarity(dataset)

    # Normalize the dataset to [0, 1].
    scaler = MinMaxScaler(feature_range=(0, 1))
    dataset = scaler.fit_transform(dataset)

    # Split into train and held-out parts (only the train part is used below).
    train_size = int(len(dataset) * 0.7)
    train, test = dataset[:train_size, :], dataset[train_size:, :]

    # Build the per-chunk quantile features and whole-window quantile targets.
    look_back = period_size
    tw = step_size
    multi = look_back // tw
    trainX, trainY, trainstdX, trainstdY, _, _ = create_dataset(train, look_back, tw)
    # Reshape input to be [samples, time steps, features].
    trainX = numpy.reshape(trainX, (trainX.shape[0], trainX.shape[1], 1))
    trainstdX = numpy.reshape(trainstdX, (trainstdX.shape[0], trainstdX.shape[1], 1))

    # One model per bound: lower quantile first, then upper quantile.
    modelq10 = _build_quantile_model(multi)
    modelq10.fit(trainX, trainY, epochs=100, batch_size=1, verbose=2)
    modelq90 = _build_quantile_model(multi)
    modelq90.fit(trainstdX, trainstdY, epochs=100, batch_size=1, verbose=2)

    # ---- Inference over the first column of the full file ----
    # Assumes the first CSV column holds the same series as 'Value' - TODO confirm.
    dataframe = read_csv(filename, usecols=[0], engine='python')
    dataset = dataframe.values
    # NOTE(review): this re-fits the scaler on the full column, so the scaling
    # differs from the one the models were trained with; ``scaler.transform``
    # may be what was intended - confirm before changing.
    dataset = scaler.fit_transform(dataset)
    ts = dataset  # numpy.delete below returns a new array, so ``ts`` keeps every row

    i = 0
    j = look_back
    steps = tw
    actual_quantile_interval = []  # collected per window; not consumed below
    anomalies = []
    finalres_q10 = []
    finalres_q90 = []
    ts_accumulate = []
    while j <= len(dataset):
        temp = dataset[i:j]
        shifted = dataset[i + 1:j + 1]
        actual_quantile_interval.append(
            numpy.absolute(numpy.quantile(shifted, lower_q_threshold) - numpy.quantile(shifted, upper_q_threshold)))

        # Per-chunk quantiles of the current window, one [value] per time step.
        q10_array = []
        q90_array = []
        for m in range(0, len(temp), steps):
            chunk = temp[m:m + steps]
            q10_array.append([numpy.quantile(chunk, lower_q_threshold)])
            q90_array.append([numpy.quantile(chunk, upper_q_threshold)])

        # Both models get an ndarray of shape (1, time steps, 1).
        final_q10_array = [q10_array]
        print('final_q10_array', final_q10_array)
        q10_predict = modelq10.predict(numpy.array(final_q10_array))
        print('q10 predict', q10_predict)
        final_q90_array = [q90_array]
        print('final_q90_array', final_q90_array)
        final_q90_array = numpy.array(final_q90_array)
        q90_predict = modelq90.predict(final_q90_array)
        print('predict', q90_predict)

        if j + 1 < len(dataset):
            candidate = dataset[j + 1, 0]
            if candidate > q90_predict[0, 0] or candidate < q10_predict[0, 0]:
                anomalies.append(candidate)
                # axis=0 removes the row and keeps the (n, 1) shape; without
                # it numpy.delete flattens the array.
                dataset = numpy.delete(dataset, j + 1, axis=0)
                print('length', len(dataset))
        finalres_q10.append(q10_predict)
        finalres_q90.append(q90_predict)
        # Re-checked on purpose: the array may have shrunk just above.
        if j + 1 < len(dataset):
            ts_accumulate.append(dataset[j + 1, 0])
        j = j + 1
        i = i + 1

    # ---- Back to original units ----
    anomalies_array = _inverse_scale(scaler, anomalies)
    prediction_array_q10 = [prediction[0] for prediction in finalres_q10]
    prediction_array_q90 = [prediction[0] for prediction in finalres_q90]
    finalres_q10 = _inverse_scale(scaler, prediction_array_q10)
    finalres_q90 = _inverse_scale(scaler, prediction_array_q90)
    ts = ts[look_back:]
    ts = scaler.inverse_transform(ts)
    ts_accumulate = _inverse_scale(scaler, ts_accumulate)
    print('lenght', len(ts_accumulate), 'actual_quantile_interval', len(finalres_q10))

    # The last prediction is dropped before the bounds are compared with ``ts``.
    finalres_q10_array = list(finalres_q10[:-1])
    finalres_q90_array = list(finalres_q90[:-1])

    # ---- Scoring against the labels ----
    anomalies = identify_anomaly(finalres_q10_array, finalres_q90_array, ts)
    y_pred = find_anomalies(anomalies)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    # Use the imported scorer instead of rebinding its name to a float; it
    # also avoids a ZeroDivisionError when precision + recall is 0.
    f1 = f1_score(y_true, y_pred)
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    auc_roc = auc(fpr, tpr)

    print("Precision:")
    print(precision)
    print("Recall:")
    print(recall)
    print("F1 Score:")
    print(f1)
    print("AUC ROC:")
    print(auc_roc)