In [1]:
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import keras
from sklearn.preprocessing import MinMaxScaler

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
df = pd.read_csv('nyc_taxi.csv', engine='python', parse_dates=['timestamp'])
df


Unnamed: 0,timestamp,value
0,2014-07-01 00:00:00,10844
1,2014-07-01 00:30:00,8127
2,2014-07-01 01:00:00,6210
3,2014-07-01 01:30:00,4656
4,2014-07-01 02:00:00,3820
...,...,...
10315,2015-01-31 21:30:00,24670
10316,2015-01-31 22:00:00,25721
10317,2015-01-31 22:30:00,27309
10318,2015-01-31 23:00:00,26591


In [4]:
anomaly_points=[
        [
            "2014-11-02 00:00:00.000000",
            "2014-11-03 00:00:00.000000"
        ],
        [
            "2014-11-27 00:00:00.000000",
            "2014-11-28 00:00:00.000000"
        ],
        [
            "2014-12-24 00:00:00.000000",
            "2014-12-26 00:00:00.000000"
        ],
        [
            "2015-01-01 00:00:00.000000",
            "2015-01-02 00:00:00.000000"
        ],
        [
            "2015-01-26 00:00:00.000000",
            "2015-01-27 00:00:00.000000"
        ]
    ]
anomaly_points

[['2014-11-02 00:00:00.000000', '2014-11-03 00:00:00.000000'],
 ['2014-11-27 00:00:00.000000', '2014-11-28 00:00:00.000000'],
 ['2014-12-24 00:00:00.000000', '2014-12-26 00:00:00.000000'],
 ['2015-01-01 00:00:00.000000', '2015-01-02 00:00:00.000000'],
 ['2015-01-26 00:00:00.000000', '2015-01-27 00:00:00.000000']]

In [5]:
#is anomaly? : True => 1, False => 0
df['anomaly'] = 0
for start, end in anomaly_points:
    df.loc[((df['timestamp'] >= start) & (df['timestamp'] <= end)), 'anomaly'] = 1
df

Unnamed: 0,timestamp,value,anomaly
0,2014-07-01 00:00:00,10844,0
1,2014-07-01 00:30:00,8127,0
2,2014-07-01 01:00:00,6210,0
3,2014-07-01 01:30:00,4656,0
4,2014-07-01 02:00:00,3820,0
...,...,...,...
10315,2015-01-31 21:30:00,24670,0
10316,2015-01-31 22:00:00,25721,0
10317,2015-01-31 22:30:00,27309,0
10318,2015-01-31 23:00:00,26591,0


In [6]:
import holoviews as hv
from holoviews import opts
from holoviews import streams
import hvplot
import hvplot.pandas
import panel as pn
hv.extension('bokeh')

In [7]:
adf=df[df['anomaly']==1]
adf

Unnamed: 0,timestamp,value,anomaly
5952,2014-11-02 00:00:00,25110,1
5953,2014-11-02 00:30:00,23109,1
5954,2014-11-02 01:00:00,39197,1
5955,2014-11-02 01:30:00,35212,1
5956,2014-11-02 02:00:00,13259,1
...,...,...,...
10076,2015-01-26 22:00:00,1783,1
10077,2015-01-26 22:30:00,866,1
10078,2015-01-26 23:00:00,297,1
10079,2015-01-26 23:30:00,189,1


In [8]:
slider_day = pn.widgets.IntSlider(name="Index", start=1, end=len(df)-1 )

def draw_series(dx):
    
    pass_line = hv.Curve(df['value'], label="Demand")
    pass_line.opts(line_width=2, line_color='b', line_dash='solid', line_alpha=1) 

    idx=df['timestamp'].iloc[dx]
    idy=df['value'].iloc[dx]
    
    plot_point= hv.Points((dx,idy), label="Day = %s, Demand = %i" %(idx, idy)) 
    plot_point.opts(color='c', marker='x', size=25, line_width=5)

    plot_anomalies= hv.VLines(adf.index, label='Anomalies').opts(color='r', alpha=0.15)
    
    overlay =  pass_line *plot_point *plot_anomalies
    overlay.opts(xlabel="Time", ylabel="Demand", width=700, height=400, tools=['hover'], 
         title="New York City Taxi Demand", legend_position='top_left', show_grid=True)
    return (overlay)

pn.Row(
    pn.pane.HoloViews(
        pn.bind(draw_series, slider_day)
    ).servable(),
    
    pn.WidgetBox(
        pn.Column(
            "Временной ряд",
            slider_day,
            height = 400,
            ).servable(target='sidebar')
    ),
)

BokehModel(combine_events=True, render_bundle={'docs_json': {'3fed1964-7872-43f9-ae66-b89bb7566ca0': {'version…

In [9]:
from sklearn.preprocessing import StandardScaler
import math
data = df.filter(['value'])
scaler = StandardScaler() 
X = scaler.fit_transform(data.values)
df['X']=X
df

Unnamed: 0,timestamp,value,anomaly,X
0,2014-07-01 00:00:00,10844,0,-0.618745
1,2014-07-01 00:30:00,8127,0,-1.010291
2,2014-07-01 01:00:00,6210,0,-1.286549
3,2014-07-01 01:30:00,4656,0,-1.510496
4,2014-07-01 02:00:00,3820,0,-1.630971
...,...,...,...,...
10315,2015-01-31 21:30:00,24670,0,1.373715
10316,2015-01-31 22:00:00,25721,0,1.525175
10317,2015-01-31 22:30:00,27309,0,1.754021
10318,2015-01-31 23:00:00,26591,0,1.650550


LSTM – сети долгой краткосрочной памяти

In [10]:

X = df.filter(['X']).values
y = df.filter(['anomaly']).values
X

array([[-0.61874487],
       [-1.01029084],
       [-1.28654908],
       ...,
       [ 1.75402085],
       [ 1.65055011],
       [ 1.60688488]])

In [11]:
Past = 48
cur_b=Past
ranges =[]
for start, finish in anomaly_points:
    id_s=df[df['timestamp'] == start].index[0]
    cur_e= id_s - Past-1
    ranges.append([cur_b, cur_e])
    id_f=df[df['timestamp'] == finish].index[0]
    cur_b = id_f+1
cur_e= len(df) - Past
ranges.append([cur_b, cur_e])
ranges

[[48, 5903],
 [6001, 7103],
 [7201, 8399],
 [8545, 8783],
 [8881, 9983],
 [10081, 10272]]

In [12]:
print("Ranges:", ranges)
print(df.head())
print(df.info())
X = df['X'].values.reshape(-1, 1)
y = df['anomaly'].values

Ranges: [[48, 5903], [6001, 7103], [7201, 8399], [8545, 8783], [8881, 9983], [10081, 10272]]
            timestamp  value  anomaly         X
0 2014-07-01 00:00:00  10844        0 -0.618745
1 2014-07-01 00:30:00   8127        0 -1.010291
2 2014-07-01 01:00:00   6210        0 -1.286549
3 2014-07-01 01:30:00   4656        0 -1.510496
4 2014-07-01 02:00:00   3820        0 -1.630971
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10320 entries, 0 to 10319
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   timestamp  10320 non-null  datetime64[ns]
 1   value      10320 non-null  int64         
 2   anomaly    10320 non-null  int64         
 3   X          10320 non-null  float64       
dtypes: datetime64[ns](1), float64(1), int64(2)
memory usage: 322.6 KB
None


In [13]:
training_data_len = len(X)
train_data = X[0:training_data_len]
x_train = []
y_train = []

for start, finish in ranges:
    for i in range(start, finish): 
        x_train.append(train_data[i-Past:i,0])
        y_train.append(train_data[i,0])

x_train, y_train = np.array(x_train), np.array(y_train)
x_train = np.reshape(x_train, (x_train.shape[0],x_train.shape[1],1)) # n_samples x timeteps x n_features

In [14]:
train_data = X[0:training_data_len]
x_test = []
y_test = []

for i in range(Past,len(train_data)): # for i in range(Past,len(train_data)-Past):
    x_test.append(train_data[i-Past:i,0])
    y_test.append(train_data[i,0])

x_test, y_test = np.array(x_test), np.array(y_test)
x_test = np.reshape(x_test, (x_test.shape[0],x_test.shape[1],1))

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Input, Dropout,RepeatVector, TimeDistributed, Activation

model = Sequential()
model.add(Input(shape=(x_train.shape[1], x_train.shape[2])))
model.add(LSTM(48))
model.add(Dropout(rate=0.2))
model.add(RepeatVector(x_train.shape[1]))
model.add(LSTM(48, return_sequences=True))
model.add(Dropout(rate=0.2))
model.add(TimeDistributed(Dense(x_train.shape[2])))
model.compile(optimizer='adam', loss='mean_squared_error')

In [16]:
model.summary()

In [17]:
history = model.fit(x_train, x_train, epochs=10, batch_size=8, shuffle=False)

Epoch 1/10
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 13ms/step - loss: 0.6410
Epoch 2/10
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 12ms/step - loss: 0.1994
Epoch 3/10
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 12ms/step - loss: 0.1706
Epoch 4/10
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 12ms/step - loss: 0.1386
Epoch 5/10
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 12ms/step - loss: 0.1350
Epoch 6/10
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 12ms/step - loss: 0.1230
Epoch 7/10
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - loss: 0.1219
Epoch 8/10
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - loss: 0.1177
Epoch 9/10
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - loss: 0.1137
Epoch 10/10
[1m1211/1211[0m [32m━━━━━━━━━━━━━━━━━━━━

In [18]:
model.save('lstm_75_75_b8_e10.keras')  # creates a file 

In [19]:
print(tf.keras.__version__) 

3.10.0


In [20]:
from keras.models import load_model
model = load_model('lstm_75_75_b8_e10.keras')

In [21]:
x_test_pred = model.predict(x_test)

[1m321/321[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step


In [22]:
p_df=pd.DataFrame(df.iloc[Past:-Past-2])
p_df['Predicted'] = x_test_pred[1:-Past-1,0]

In [23]:
pn.extension()

N = len(p_df)
slider_w = pn.widgets.IntSlider(name="Index", start=0, end=N-1)

@pn.depends(idx=slider_w)
def points(idx):
    iv=p_df.index[idx]
    ov=p_df['X'].iloc[idx]
    pv=p_df['Predicted'].iloc[idx]
    
    point=[[iv, ov]]
    return hv.Points(point, label="Ts = % s, V = % 5.4f, PV = % 5.4f, dV= %5.4f %%" %(iv,ov, pv, (ov-pv)/ov*100 ))

fig_slider = hv.DynamicMap(points)
fig_slider.opts(color='k', marker='o', size=10)

fig_line1 = hv.Curve(p_df['X'], label="Original")
fig_line1.opts(line_width=3, line_color='blue', line_dash='solid', line_alpha=1) 

fig_line2 = hv.Curve(p_df['Predicted'], label="Predicted")
fig_line2.opts(line_width=3, line_color='red', line_dash='solid', line_alpha=0.5) 


fig =  fig_line1 * fig_line2 * fig_slider

fig.opts(xlabel="Time", ylabel="Value", width=800, height=500, tools=['hover'], 
         title="Original & Predicted Points", legend_position='top', show_grid=True)

pn.Column(slider_w, fig)

BokehModel(combine_events=True, render_bundle={'docs_json': {'70464f46-0714-42af-aded-95bde3e0bc07': {'version…

In [24]:
p_df['MAE'] = np.abs(p_df['X']-p_df['Predicted'])

In [25]:
print(p_df.columns)

Index(['timestamp', 'value', 'anomaly', 'X', 'Predicted', 'MAE'], dtype='object')


In [26]:
pn.extension()

slider_w = pn.widgets.FloatSlider(name="Threshold", start=p_df['MAE'].min(), end=p_df['MAE'].max(), step=0.01)
min, max = p_df.index.min(), p_df.index.max()
scale = 5

@pn.depends(idx=slider_w)
def thresh(idx):
    fig1= hv.Curve ([[min, idx*scale],[max,idx*scale]], label=" Threshold Value = % 5.4f" %(idx)).opts(color='k', line_dash='dashed', line_width=5)
    return fig1

fig_slider = hv.DynamicMap(thresh)

fig_line1 = hv.Curve(p_df['X'], label="Original")
fig_line1.opts(line_width=3, line_color='blue', line_dash='solid', line_alpha=1) 

# Create the line for the predicted values
fig_line2 = hv.Curve(p_df['Predicted'], label="Predicted")
fig_line2.opts(line_width=3, line_color='cyan', line_dash='solid', line_alpha=0.75) 

fig_line3 = hv.Curve(p_df['MAE']*scale, label="MAE")
fig_line3.opts(line_width=3, line_color='red', line_dash='solid', line_alpha=0.5) 


fig =  fig_line3 * fig_slider
#fig =  fig_line1 * fig_line2 * fig_line3 * fig_slider

fig.opts(xlabel="Time", ylabel="Value", width=800, height=500, tools=['hover'], 
         title="Original & Predicted Points", legend_position='top', show_grid=True)

pn.Column(slider_w, fig)

BokehModel(combine_events=True, render_bundle={'docs_json': {'0f56d621-8e7a-4e80-93cd-ffb43b519a70': {'version…

In [27]:
def draw_points(df, X_name, X_label, A_name, title ):

    slider_day = pn.widgets.IntSlider(name="Index", start=1, end=len(df)-1 )

    def draw_series(dx):
        
        fig_line = hv.Curve(df[X_name], label=X_label)
        fig_line.opts(line_width=2, line_color='b', line_dash='solid', line_alpha=1) 
    
        idx=df.index[dx]
        idy=df[X_name].iloc[dx]
        
        plot_point= hv.Points((idx,idy), label="Day = %s, Value = %i" %(idx, idy)) 
        plot_point.opts(color='c', marker='x', size=25, line_width=5)
    
        anomalies = [[ind, value] for ind, value in zip(df[df[A_name]==1].index, df.loc[df[A_name]==1, X_name])]
        anom_points = hv.Points(anomalies, label="Anomaly Points")
        anom_points.opts(color='red', size=3, alpha=1 ) 
        
        overlay =  fig_line *anom_points *plot_point
        overlay.opts(xlabel="Time", ylabel=X_name, width=700, height=400, tools=['hover'], 
             title=title, legend_position='bottom_right', show_grid=True)
        return (overlay)
    
    return (
        pn.Row(
        pn.pane.HoloViews(
            pn.bind(draw_series, slider_day)
        ).servable(),
        
        pn.WidgetBox(
            pn.Column(
                "Временной ряд",
                slider_day,
                height = 400,
                ).servable(target='sidebar')
        ),
        )
    )

In [None]:
threshold =1.3
column_name='LSTM_10'
p_df[column_name] = 0
p_df.loc[(p_df['MAE'] >=threshold), column_name] = 1
draw_points(p_df, 'X','Values', column_name, column_name)

BokehModel(combine_events=True, render_bundle={'docs_json': {'a4ffe2d2-d4ad-4180-b05f-485f60d4ea4c': {'version…

In [41]:
from scipy import signal
p_df['LSTM_filter']=signal.medfilt(p_df[column_name], 7)
draw_points(p_df, 'value','Values', 'LSTM_filter', 'LSTM_filter')

BokehModel(combine_events=True, render_bundle={'docs_json': {'7481ee70-0776-4a74-ae5c-8e82c923b006': {'version…

In [42]:
results=p_df.columns.tolist()[-2:]
results

['LSTM_10', 'LSTM_filter']

In [43]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
metrics =[ accuracy_score, precision_score, recall_score, f1_score ]
res_df=pd.DataFrame()
for score in metrics:
    f_name=score.__name__
    res_df[f_name]=0.0
    for res in results:
        s = score(p_df['anomaly'], p_df[res])
        res_df.at[res, f_name] = s
res_df

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
LSTM_10,0.935825,0.080831,0.119454,0.096419
LSTM_filter,0.938857,0.065445,0.085324,0.074074
