In [None]:
import numpy as np
import os
import pandas as pd
import json
import random
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
# Download the NLTK data packages required by the preprocessing cells,
# one after another in a fixed order.
for _nltk_resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger',
                       'wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Load the browsing-history exports rawdata/1.json ... rawdata/26.json into one
# frame. Every file is handled as its own session: reset_index() below adds an
# 'index' column holding each event's position inside its file, which the
# windowing cell uses (index == 0) to detect where a new session begins.
session_frames = []
for i in range(1, 27):
    filename = f"rawdata/{i}.json"
    # Skip missing files completely. Previously only the load was guarded, so
    # a missing file re-processed the previous file's frame (duplicating its
    # rows) or raised NameError when the very first file was absent.
    if not os.path.exists(filename):
        continue
    with open(filename, "r") as f:
        data = json.load(f)

    records = data.get('data')
    if not records:
        # Nothing to normalise; also keeps df.index[-1] below from failing
        # on an empty frame.
        continue
    df = pd.json_normalize(records)

    # Duration of an event = timestamp of the next event - its own timestamp.
    df['time'] = pd.to_datetime(df['time'])
    df['duration'] = (
        df['time'].diff().apply(lambda x: x.total_seconds()).shift(periods=-1)
    )
    # The last event of a file has no successor, so it receives a random
    # placeholder duration between 5 and 20 seconds.
    df.at[df.index[-1], 'duration'] = random.randint(5, 20)

    # Keep only the modelling columns; reset_index() adds the 'index' column.
    session_frames.append(df[['url', 'type', 'duration']].reset_index())

# Concatenate once instead of growing `dfs` inside the loop (quadratic copies).
if session_frames:
    dfs = pd.concat(session_frames, axis=0, ignore_index=True)
else:
    dfs = pd.DataFrame(columns=['index', 'url', 'type', 'duration'])

In [None]:
# Tokenization
punc = string.punctuation
stop_words = set(stopwords.words('english'))

# URL fragments deleted before tokenising, applied in this order. The dots are
# escaped so that e.g. '.uk' only matches a literal ".uk"; unescaped, '.' is a
# regex wildcard and also removed letters inside ordinary words
# ("duke" -> "e", "internet" -> "inte").
# NOTE(review): 'and', 'utf', 'chrome' and 'sourceid' are still removed as bare
# substrings (so "android" -> "roid"); confirm whether word boundaries are wanted.
_URL_NOISE = (r'http://', r'https://', r'www\.', r'\.edu', r'\.org', r'\.net',
              r'\.uk', r'chrome', r'sourceid', r'utf', r'and', r'\.cn')

# Ordered (pattern, replacement) clean-up steps. The order matters: the
# word-boundary rules run while punctuation still separates the words, and the
# short-token rules run after lower-casing.
_CLEAN_STEPS = (
    # Remove query parameters and special characters
    (r'\.com\b', ' '),
    (r'\.ac\b', ' '),
    (r'[\.\?\!\/\&\+\=\-\%\#]', ' '),
    (r'\bin\b', ' '),
    (r'\bthe\b', ' '),
    (r'\bfor\b', ' '),
    (r'html', ' '),
    (r'php', ' '),
    (r'\be5\b', ' '),
    # Lower-case ASCII letters only, exactly as before.
    (r'[A-Z]', lambda m: m.group(0).lower()),
    # Isolated single letters, pure numbers and short alphanumeric ids.
    (r'(?<![a-z])[a-z](?![a-z])', ' '),
    (r'\b\d+\b', ' '),
    (r'\b\w\d+\b', ' '),
    (r'\b\w{1,2}\b', ' '),
    (r'\butf\b', ' '),
    (r'[^\w\s]', ' '),
    (r'\b\w{1,2}\d+\b', ' '),
)


def _clean_url(url):
    """Return `url` with scheme/TLD noise removed and separators turned into spaces.

    Applies every pattern of `_URL_NOISE` (deleted) and then every step of
    `_CLEAN_STEPS` (replaced) in order, so the result can be split on
    whitespace into word-like tokens.
    """
    text = url
    for pattern in _URL_NOISE:
        text = re.sub(pattern, '', text)
    for pattern, replacement in _CLEAN_STEPS:
        text = re.sub(pattern, replacement, text)
    return text


# One pass per URL instead of ~17 dfs.loc[i, ...] writes per row. Missing URLs
# become empty strings so re.sub never receives NaN.
dfs['re_url'] = dfs['url'].fillna('').astype(str).map(_clean_url)
# print(dfs['re_url'][1])

# Split into tokens and drop stop words / bare punctuation. The former
# `dfs['re_url'].apply(...)` filters iterated over single characters and threw
# their result away, so no stop word was ever removed.
_punc_tokens = set(punc)
dfs['tok_url'] = dfs['re_url'].apply(
    lambda text: [word for word in text.split()
                  if word not in _punc_tokens and word not in stop_words]
)

In [None]:
# Lemmatization
from nltk import pos_tag

# Tag every token list with parts of speech; each row becomes a list of
# (token, tag) pairs.
dfs['pos_tags'] = dfs['tok_url'].apply(lambda tokens: pos_tag(tokens))

def get_wordnet_pos(tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    Tags starting with 'J' map to wordnet.ADJ, 'V' to wordnet.VERB,
    'N' to wordnet.NOUN and 'R' to wordnet.ADV. Any other tag (including
    the empty string) falls back to wordnet.NOUN.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Unknown tag classes are lemmatized as nouns.
    return wordnet.NOUN
def _to_wordnet_pairs(tagged_tokens):
    """Return [(word, wordnet_pos), ...] for a list of (word, pos_tag) pairs."""
    return [(token, get_wordnet_pos(tag)) for token, tag in tagged_tokens]


# Swap each POS tag for the WordNet POS constant the lemmatizer takes.
dfs['wordnet_pos'] = dfs['pos_tags'].apply(_to_wordnet_pairs)

# Lemmatize every token using its WordNet POS as a hint.
wnl = WordNetLemmatizer()
dfs['lemma'] = dfs['wordnet_pos'].apply(
    lambda pairs: [wnl.lemmatize(token, pos) for token, pos in pairs]
)

In [None]:
# Build the url vocabulary
from collections import Counter

# Translation table that deletes every punctuation character.
_strip_punctuation = str.maketrans("", "", string.punctuation)

# Flatten all lemma lists, cleaning each word on the way: strip punctuation,
# then blank out 1-2 character words and words mixing letters with digits.
# Words that end up empty are dropped.
vocabulary = []
for lemmas in dfs['lemma']:
    for lemma in lemmas:
        cleaned = lemma.translate(_strip_punctuation)
        cleaned = re.sub(r'\b\w{1,2}\b', '', cleaned)
        cleaned = re.sub(r'\b\w+\d+', '', cleaned)
        cleaned = re.sub(r'\b\d+\w+', '', cleaned)
        if cleaned:
            vocabulary.append(cleaned)

# Rank words by frequency (most frequent first) and keep the top `vocab_size`.
freq_dist = Counter(vocabulary)
sorted_vocab = freq_dist.most_common()
vocab_size = 800
url_vocab = [token for token, _count in sorted_vocab[:vocab_size]]

In [None]:
# Encoding
MAX_URL_TOKENS = 22  # every encoded URL is padded / truncated to this length

# Turn the event type into integer class labels.
le = LabelEncoder()
dfs['type'] = le.fit_transform(dfs['type'])

# Encode URLs with the vocabulary. Ids start at 1 because pad_sequences()
# fills with 0: starting at 0 made the most frequent word indistinguishable
# from padding.
word_to_label = {word: i for i, word in enumerate(url_vocab, start=1)}
# Membership is tested against the dict (O(1)) rather than the url_vocab list;
# lemmas outside the vocabulary are dropped.
dfs['encoded_urls'] = dfs['lemma'].apply(
    lambda lemmas: [word_to_label[word] for word in lemmas if word in word_to_label]
)

# .copy() gives an independent frame, so the assignment below no longer writes
# to a slice of `dfs` (SettingWithCopyWarning). astype() yields a real float32
# column instead of an object column of 0-d arrays.
dfs_encoded = dfs[['index', 'type', 'duration']].copy()
dfs_encoded['type'] = dfs_encoded['type'].astype(np.float32)

# One column per token position ('1' .. '22'), zero-padded at the end.
padded_urls = pad_sequences(dfs['encoded_urls'].tolist(),
                            maxlen=MAX_URL_TOKENS, padding='post')
df2 = pd.DataFrame(padded_urls,
                   columns=[f'{i}' for i in range(1, MAX_URL_TOKENS + 1)],
                   index=dfs_encoded.index)
dfs_encoded = pd.concat([dfs_encoded, df2], axis=1)


In [None]:
# Convert the events into sliding windows: 4 consecutive events of one session
# form an input sample, and the URL-token columns of the event that follows
# them are the target.
WINDOW_SIZE = 4

X_sequences = []
y_sequences = []

# A session starts wherever the 'index' column restarts at 0, so the running
# count of those restarts labels every row with its session. Grouping on it
# keeps sessions apart: the old slice `start_index:row_index+2` pulled the
# first event of the *next* session into the current one and then skipped it
# in its own session.
session_ids = (dfs_encoded['index'] == 0).cumsum().rename('session')
for _, session in dfs_encoded.groupby(session_ids, sort=False):
    # Drop the leading 'index' column; the remaining columns are the features.
    samples = session.iloc[:, 1:].values.tolist()
    for i in range(WINDOW_SIZE, len(samples)):
        X_sequences.append(samples[i - WINDOW_SIZE:i])
        # Target: the next event without its first two feature columns.
        y_sequences.append(samples[i][2:])

In [None]:
# Train test split
# Materialise the windows as float32 arrays before splitting. Nested Python
# lists (possibly holding 0-d array objects) are not reliably accepted by
# model.fit(), and the conversion fails early if the windows are ragged.
X_all = np.asarray(X_sequences, dtype=np.float32)
y_all = np.asarray(y_sequences, dtype=np.float32)
# NOTE(review): the windows overlap, so a shuffled split puts near-duplicate
# samples on both sides; consider splitting by session if that matters.
X_train, X_test, y_train, y_test = train_test_split(
    X_all, y_all, test_size=0.2, random_state=2
)


In [None]:
# RNN
from tensorflow import keras
# Import through tensorflow.keras, like the pad_sequences import at the top of
# the notebook; mixing it with the standalone `keras` package can pair two
# different Keras versions.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional, SimpleRNN

# Define the RNN model: two stacked LSTM layers with dropout, fed windows of
# 4 events x 24 features, ending in a 22-unit output layer.
model = Sequential()
model.add(LSTM(32, return_sequences=True, input_shape=(4, 24)))
model.add(Dropout(0.2))
# input_shape is only meaningful on the first layer, so it is not repeated here.
model.add(LSTM(32))
model.add(Dropout(0.2))
model.add(Dense(22, activation='sigmoid'))

# Compile the model
# NOTE(review): the targets built earlier are 22 padded token ids, while
# categorical_crossentropy expects a probability / one-hot vector and sigmoid
# outputs stay within [0, 1] - confirm the intended loss and output encoding.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model (message printed before training starts, not after it)
print("Training.....")
# NOTE(review): the test split doubles as validation data here, so no
# untouched hold-out set remains for a final evaluation.
model.fit(X_train, y_train, batch_size=10, epochs=150, validation_data=(X_test, y_test))

# NOTE(review): model.save() does not write JSON; depending on the Keras
# version this path becomes a SavedModel directory or is rejected because of
# its extension. Path kept unchanged - confirm the format the consumer expects.
model.save('modelX.json')