In [None]:
import numpy as np
import os
import pandas as pd
import json
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tensorflow import keras
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, Bidirectional, SimpleRNN

In [None]:
# Load every available session file, derive a per-event duration, and stack
# all sessions into one frame. The per-file row position is kept as an
# 'index' column on purpose: a value of 0 marks the first event of a session.
session_frames = []
for i in range(1, 27):
    filename = f"rawdata/{i}.json"
    # Skip missing files outright. Falling through would re-process and
    # re-append the previous file's frame (or hit an undefined `df` when the
    # very first file is absent).
    if not os.path.exists(filename):
        continue
    with open(filename, "r") as f:
        data = json.load(f)
    df = pd.json_normalize(data.get('data'))
    if df.empty:
        # Nothing to derive durations from; `df.index[-1]` would raise.
        continue

    # Duration of an event = timestamp of the next event - its own timestamp.
    df['time'] = pd.to_datetime(df['time'])
    df['duration'] = df['time'].diff().dt.total_seconds().shift(periods=-1)
    # The last event has no successor, so its duration is filled with a random
    # integer in [5, 20]. NOTE: `random` is not seeded here, so this value
    # differs between runs.
    df.at[df.index[-1], 'duration'] = random.randint(5, 20)

    # Drop unnecessary columns; reset_index() turns the row position into the
    # 'index' column.
    df = df[['url', 'type', 'duration']].reset_index()
    session_frames.append(df)

# Concatenate once instead of growing the frame inside the loop.
if session_frames:
    dfs = pd.concat(session_frames, axis=0, ignore_index=True)
else:
    dfs = pd.DataFrame(columns=['index', 'url', 'type', 'duration'])

In [None]:
# Encode the categorical columns: integer labels for 'type', one-hot vectors for 'url'.
le = LabelEncoder()
oe = OneHotEncoder()

# 'type' is overwritten in place with the integer id of its class.
dfs['type'] = le.fit_transform(dfs['type'])

# The urls are reshaped into a single-column 2-D array before one-hot encoding.
url_column = np.array(dfs['url']).reshape(-1, 1)
onehot = oe.fit_transform(url_column)
onehot_df = pd.DataFrame.sparse.from_spmatrix(onehot)

# Put the one-hot columns next to the remaining features and drop the raw url.
dfs_encoded = pd.concat([dfs, onehot_df], axis=1).drop(columns='url')

In [None]:
# Convert the encoded events into fixed-length training windows: every run of
# WINDOW_SIZE consecutive events inside one session is an input sequence, and
# the one-hot url of the event that follows it is the target.
WINDOW_SIZE = 5
X_sequences = []
y_sequences = []

# Drop the leading 'index' column; with the column order built by the earlier
# cells each row is then [type, duration, *url one-hot].
event_rows = dfs_encoded.iloc[:, 1:].values.tolist()

# 'index' restarts at 0 on the first event of every session, so those rows are
# the session boundaries. Row 0 is always treated as a session start.
position_in_session = dfs_encoded['index'].to_numpy()
session_starts = sorted({0, *np.flatnonzero(position_in_session == 0).tolist()})
session_bounds = session_starts + [len(event_rows)]

for start_index, stop_index in zip(session_bounds[:-1], session_bounds[1:]):
    # A session is the half-open range [start_index, stop_index). The first
    # event of the next session must stay out of this one, otherwise windows
    # (and targets) would straddle two unrelated sessions.
    samples = event_rows[start_index:stop_index]
    for i in range(WINDOW_SIZE, len(samples)):
        X_sequences.append(samples[i - WINDOW_SIZE:i])
        # Target: url one-hot of the next event (skip type and duration).
        y_sequences.append(samples[i][2:])


In [None]:
# Train test split
# Convert the nested Python lists to arrays first: Keras interprets a plain
# list passed as `x` as a list of separate model inputs, not as one batch.
# The split itself (20% test, random_state=2) is unchanged.
X_train, X_test, y_train, y_test = train_test_split(
    np.asarray(X_sequences, dtype=np.float32),
    np.asarray(y_sequences, dtype=np.float32),
    test_size=0.2,
    random_state=2,
)

In [None]:
# RNN: two stacked LSTM layers followed by a softmax over the candidate urls.

# Convert defensively so this cell works whether the split produced Python
# lists or arrays (Keras needs array inputs).
train_inputs = np.asarray(X_train, dtype=np.float32)
train_targets = np.asarray(y_train, dtype=np.float32)
val_inputs = np.asarray(X_test, dtype=np.float32)
val_targets = np.asarray(y_test, dtype=np.float32)

# Derive the layer sizes from the data instead of hard-coding them, so the
# model keeps matching the encoding when the number of distinct urls changes.
_, timesteps, n_features = train_inputs.shape
n_classes = train_targets.shape[1]

model = Sequential()
model.add(LSTM(32, return_sequences=True, input_shape=(timesteps, n_features)))
model.add(Dropout(0.2))
# Only the first layer needs an input shape; later layers infer theirs.
model.add(LSTM(32))
model.add(Dropout(0.2))
# Targets are one-hot (exactly one next url per sample), so the output must be
# a probability distribution: softmax, not independent sigmoids, is what
# categorical_crossentropy expects.
model.add(Dense(n_classes, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model (announce it before the long-running fit, not after).
print("Training.....")
model.fit(
    train_inputs,
    train_targets,
    batch_size=5,
    epochs=200,
    validation_data=(val_inputs, val_targets),
)

# NOTE(review): '.json' is not a Keras model-file extension; depending on the
# Keras version this either writes a directory with that name or is rejected.
# Path left unchanged because other code may load it by this name; confirm
# and consider switching to '.keras' or '.h5' together with the loader.
model.save('model.json')