In [8]:
import pandas as pd
import numpy as np
import json
import sys 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import plotly.express as px
import itertools
import optuna
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from joblib import Parallel, delayed, dump, load
# sys.path.insert(0, '../DevCode')


# Do not wrap wide DataFrame reprs across multiple lines
pd.set_option('display.expand_frame_repr', False)
# Raise the row limit pandas uses before truncating displayed output
pd.options.display.max_rows = 500
# Add ../src to the module search path (presumably where DataManager lives — confirm)
sys.path.append('../src')
import pickle

<h2> Import the custom tokenizer classes used to tokenize the URL data </h2>

In [9]:
# Project-local tokenizer classes; only BetterTokenizer is instantiated in this cell
from DataManager import BetterTokenizer, Tokenizer
# Tokenizer instance used below to tokenize the URL column
tm = BetterTokenizer()

In [42]:
# Load the raw URL dataset and expose its 'type' label column under the name 'target'
df = pd.read_csv('../code/data/malicious_phish.csv').rename(columns={'type': 'target'})

<h2> Analyze the class totals for our target variable </h2>


In [43]:
# Number of rows per class label across the whole dataset
df.loc[:, 'target'].value_counts()

benign        428103
defacement     96457
phishing       94111
malware        32520
Name: target, dtype: int64

<h2> I selected the 70% mark of our data to separate the training set from the validation and test sets </h2>

In [44]:
# Positional row index marking the end of the first 70% of the frame
percent_70 = int(0.70 * len(df))

<h2> Here we can see an extreme concentration of benign samples within <br>
 the original training data </h2>

In [47]:
# Class counts within the first 70% of rows (the candidate training slice)
df['target'].iloc[:percent_70].value_counts()

benign        333047
defacement     84441
phishing       27338
malware        11007
Name: target, dtype: int64

<h2> Benign URLs in the training slice alone account for over half (about 51%) of all rows in the dataset </h2>

In [48]:
# NOTE: the numerator counts benign rows in the first 70% only, while the denominator is the
# length of the full dataset, so this is not the benign share *within* the slice.
df['target'].iloc[:percent_70].value_counts()['benign'] / len(df)

0.5114428792781227

In [33]:
# Class counts within the remaining 30% of rows
df['target'].iloc[percent_70:].value_counts()

benign        95056
phishing      66773
malware       21513
defacement    12016
Name: target, dtype: int64

<h2> I decided to reverse the row order of the data and re-analyze </h2>

In [49]:
# Sort the index in descending order so that positional slices taken afterwards
# start from the end of the original file
df = df.sort_index(ascending=False)

In [36]:
# Display the frame after the descending index sort
df

Unnamed: 0,url,target
651190,www.angelfire.com/goth/devilmaycrytonite/,phishing
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing
651188,www.gamespot.com/xbox360/action/deadspace/,phishing
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
651186,xbox360.ign.com/objects/850/850402.html,phishing
...,...,...
4,http://adventure-nicaragua.net/index.php?optio...,defacement
3,http://www.garage-pirenne.be/index.php?option=...,defacement
2,bopsecrets.org/rexroth/cr/1.htm,benign
1,mp3raid.com/music/krizz_kaliko.html,benign


In [50]:
# Positional row index marking the end of the first 70% of the re-ordered frame
percent_70 = int(0.70 * len(df))

<h2> Here we can see that the classes are split more evenly within the training data </h2>

In [51]:
# Class counts within the first 70% of rows of the re-ordered frame
df['target'].iloc[:percent_70].value_counts()

benign        285284
phishing       82390
defacement     60303
malware        27856
Name: target, dtype: int64

<h2> After reversing the data, benign samples in the training slice make up only about 43.8% of all rows in the dataset </h2>

In [53]:
# NOTE: the numerator counts benign rows in the first 70% only, while the denominator is the
# length of the full dataset, so this is not the benign share *within* the slice.
df['target'].iloc[:percent_70].value_counts()['benign'] / len(df)

0.43809573535260776

In [52]:
# Class counts within the remaining 30% of rows of the re-ordered frame
df['target'].iloc[percent_70:].value_counts()

benign        142819
defacement     36154
phishing       11721
malware         4664
Name: target, dtype: int64

<h2> Take each URL string and tokenize the data into integers </h2>

In [54]:
# Tokenize the URL column with the custom tokenizer.
# NOTE(review): the outputs further down suggest one list of integer token ids per URL — confirm against DataManager
tokenized_url = tm.tokenize_column(df['url'])

<h2> Assign the Token Dictionary Objects to a Single Dictionary Object </h2>

In [55]:
# Bundle both lookup tables of the tokenizer into one dictionary
tokenizer_state = dict(
    token_to_id=tm.token_to_id,
    id_to_token=tm.id_to_token,
)

In [7]:
# Same id -> token table, with every key coerced to int
id_to = dict(
    (int(raw_id), token)
    for raw_id, token in tokenizer_state['id_to_token'].items()
)

<h2>Save the tokenizer state dictionary to a JSON file for later use</h2>

In [44]:
# Persist both lookup tables as JSON so the same token ids can be reused later
with open('../code/data/token1.json', 'w') as token_file:
    token_file.write(json.dumps(tokenizer_state))

<h2>Add the new tokenized data as a column to the dataframe </h2>

In [8]:
# Store the tokenizer output next to the original columns
df['tokens'] = tokenized_url

In [9]:
# Display the frame with the new 'tokens' column
df

Unnamed: 0,url,target,tokens
0,br-icloud.com.br,phishing,"[1, 2, 3, 4, 5, 4, 1]"
1,mp3raid.com/music/krizz_kaliko.html,benign,"[6, 4, 5, 7, 8, 7, 9, 4, 10]"
2,bopsecrets.org/rexroth/cr/1.htm,benign,"[11, 4, 12, 7, 13, 7, 14, 7, 15, 4, 16]"
3,http://www.garage-pirenne.be/index.php?option=...,defacement,"[17, 18, 7, 7, 19, 4, 20, 2, 21, 4, 22, 7, 23,..."
4,http://adventure-nicaragua.net/index.php?optio...,defacement,"[17, 18, 7, 7, 36, 2, 37, 4, 38, 7, 23, 4, 24,..."
...,...,...,...
651186,xbox360.ign.com/objects/850/850402.html,phishing,"[62048, 4, 3247, 4, 5, 7, 3248, 7, 3758, 7, 62..."
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,"[2686, 4, 45660, 4, 5, 7, 23449, 2, 4843, 7, 1..."
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,"[19, 4, 8613, 4, 5, 7, 62048, 7, 5920, 7, 4298..."
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,"[187, 4, 188, 4, 12, 7, 189, 7, 627327, 327, 3..."


In [10]:
# Remove the raw URL text now that its tokenized form is stored in 'tokens'
df = df.drop(columns=['url'])

In [11]:
# Display the frame after the 'url' column has been dropped
df

Unnamed: 0,target,tokens
0,phishing,"[1, 2, 3, 4, 5, 4, 1]"
1,benign,"[6, 4, 5, 7, 8, 7, 9, 4, 10]"
2,benign,"[11, 4, 12, 7, 13, 7, 14, 7, 15, 4, 16]"
3,defacement,"[17, 18, 7, 7, 19, 4, 20, 2, 21, 4, 22, 7, 23,..."
4,defacement,"[17, 18, 7, 7, 36, 2, 37, 4, 38, 7, 23, 4, 24,..."
...,...,...
651186,phishing,"[62048, 4, 3247, 4, 5, 7, 3248, 7, 3758, 7, 62..."
651187,phishing,"[2686, 4, 45660, 4, 5, 7, 23449, 2, 4843, 7, 1..."
651188,phishing,"[19, 4, 8613, 4, 5, 7, 62048, 7, 5920, 7, 4298..."
651189,phishing,"[187, 4, 188, 4, 12, 7, 189, 7, 627327, 327, 3..."


In [12]:
# Display the token column on its own
df['tokens']

0                                     [1, 2, 3, 4, 5, 4, 1]
1                              [6, 4, 5, 7, 8, 7, 9, 4, 10]
2                   [11, 4, 12, 7, 13, 7, 14, 7, 15, 4, 16]
3         [17, 18, 7, 7, 19, 4, 20, 2, 21, 4, 22, 7, 23,...
4         [17, 18, 7, 7, 36, 2, 37, 4, 38, 7, 23, 4, 24,...
                                ...                        
651186    [62048, 4, 3247, 4, 5, 7, 3248, 7, 3758, 7, 62...
651187    [2686, 4, 45660, 4, 5, 7, 23449, 2, 4843, 7, 1...
651188    [19, 4, 8613, 4, 5, 7, 62048, 7, 5920, 7, 4298...
651189    [187, 4, 188, 4, 12, 7, 189, 7, 627327, 327, 3...
651190        [19, 4, 15712, 4, 5, 7, 612669, 7, 627328, 7]
Name: tokens, Length: 651191, dtype: object

<h2> Create a new column to evaluate how many tokens are within each vector </h2>

In [13]:
# Number of tokens in each row's token vector
df['sizes'] = df['tokens'].map(len)

In [14]:
# How many rows there are for each token-vector length
df.loc[:, 'sizes'].value_counts()

9      65752
11     52063
13     47651
17     40627
15     40475
7      30020
4      27825
8      22479
19     22316
6      19287
5      19248
10     17376
21     17272
23     15819
12     14892
27     14257
25     13942
29     13415
31     13166
3      10955
14      9602
33      9520
16      8188
35      7031
37      6152
18      5803
39      5525
41      4862
20      4770
28      4190
24      4187
22      4106
32      3979
26      3880
43      3591
34      3577
30      3567
36      3264
45      3008
38      2549
44      2437
51      2264
47      2068
40      1942
42      1654
57      1528
49      1440
63      1266
46      1163
52      1151
48      1085
53      1048
55      1043
50      1034
54       962
56       902
58       679
60       593
59       553
62       344
61       301
64       285
66       200
65       189
76       162
89       157
69       157
68       156
70       140
67       128
71       128
72       125
77       104
84        93
74        92
95        86
81        78

<h2> Here we drop outlier rows whose token count lies more than two standard deviations from the mean, to reduce the training size and eliminate uninformative data </h2>

In [15]:
# Keep only rows whose token count lies within two standard deviations of the mean (bounds inclusive)
mean, std = df['sizes'].mean(), df['sizes'].std()
lower_bound, upper_bound = mean - 2*std, mean + 2*std
clipped_data = df[df['sizes'].between(lower_bound, upper_bound)]
# Re-select the surviving rows, and their labels, from df by index label
data = df.loc[clipped_data.index]
targets_df = df.loc[clipped_data.index, 'target']

<h2> Here we loop through each value in the tokens column to find the length of the largest vector, so that the data can be padded to match it </h2>

In [16]:
# Length of the longest token vector; stays 0 when there are no rows
max_length = max((len(tokens) for tokens in data['tokens'].values), default=0)
# One NumPy array per row, ready to be padded
vector_data = list(map(np.array, data['tokens'].values))

<h2> Here we create a matrix of our padded and tokenized data </h2>

In [17]:
# Right-pad every token vector with zeros up to max_length; appending float zeros
# makes each padded row float64
matrix_list = [
    np.concatenate((vec, np.zeros(max_length - vec.shape[0])))
    for vec in vector_data
]

# Stack the equal-length rows into a single 2-D matrix
matrix_main = np.vstack(matrix_list)

In [18]:
# Display the first padded row of the matrix
matrix_main[0]

array([1., 2., 3., 4., 5., 4., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

<h2> Assigning integer values to each specific target type and assigning that as a column to the dataframe</h2>

In [19]:
# Integer code for every known class label
target_key_values = {
    'benign':0,
    'phishing':1,
    'defacement':2,
    'malware':3
}
# Encode through the lookup table. The previous if/elif chain ended in a bare `else`
# that silently encoded every unrecognised (or missing) label as 3 (malware).
encoded_targets = targets_df.map(target_key_values)
unknown_labels = targets_df[encoded_targets.isna()].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected target labels: {list(unknown_labels)}")
# Plain Python ints in row order; a later cell builds the label array from target_list
target_list = encoded_targets.astype(int).tolist()
data['target'] = target_list

In [20]:
# Label -> integer encoding for the four URL classes
target_key_values = dict(benign=0, phishing=1, defacement=2, malware=3)

In [21]:
# Inverse lookup: integer code -> class label
int_to_label = dict(zip(target_key_values.values(), target_key_values.keys()))

In [56]:
# Spot-check the inverse lookup for code 1
int_to_label.get(1)

'phishing'

In [22]:
# Display the clipped frame with the integer-encoded target column
data

Unnamed: 0,target,tokens,sizes
0,1,"[1, 2, 3, 4, 5, 4, 1]",7
1,0,"[6, 4, 5, 7, 8, 7, 9, 4, 10]",9
2,0,"[11, 4, 12, 7, 13, 7, 14, 7, 15, 4, 16]",11
3,2,"[17, 18, 7, 7, 19, 4, 20, 2, 21, 4, 22, 7, 23,...",31
4,2,"[17, 18, 7, 7, 36, 2, 37, 4, 38, 7, 23, 4, 24,...",25
...,...,...,...
651186,1,"[62048, 4, 3247, 4, 5, 7, 3248, 7, 3758, 7, 62...",13
651187,1,"[2686, 4, 45660, 4, 5, 7, 23449, 2, 4843, 7, 1...",16
651188,1,"[19, 4, 8613, 4, 5, 7, 62048, 7, 5920, 7, 4298...",12
651189,1,"[187, 4, 188, 4, 12, 7, 189, 7, 627327, 327, 3...",12


<h2> 
-Create our target vector <br>
-Find the total length of the matrix <br>
-Determine the 70% mark based on the total length, this will be our Training data
</h2>

In [16]:
# Labels as an array, in the same row order as target_list
targets_vector = np.asarray(target_list)
# Number of rows in the padded matrix
total_length = matrix_main.shape[0]
# Row index where the first 70% (the training portion) ends
ap = int(total_length * 0.7)

In [28]:
# Display the row index at which the training portion ends
ap

440619

In [29]:
# Display the clipped, encoded frame again for reference
data

Unnamed: 0,target,tokens,sizes
0,1,"[1, 2, 3, 4, 5, 4, 1]",7
1,0,"[6, 4, 5, 7, 8, 7, 9, 4, 10]",9
2,0,"[11, 4, 12, 7, 13, 7, 14, 7, 15, 4, 16]",11
3,2,"[17, 18, 7, 7, 19, 4, 20, 2, 21, 4, 22, 7, 23,...",31
4,2,"[17, 18, 7, 7, 36, 2, 37, 4, 38, 7, 23, 4, 24,...",25
...,...,...,...
651186,1,"[62048, 4, 3247, 4, 5, 7, 3248, 7, 3758, 7, 62...",13
651187,1,"[2686, 4, 45660, 4, 5, 7, 23449, 2, 4843, 7, 1...",16
651188,1,"[19, 4, 8613, 4, 5, 7, 62048, 7, 5920, 7, 4298...",12
651189,1,"[187, 4, 188, 4, 12, 7, 189, 7, 627327, 327, 3...",12


<h2> Take half of the remaining 30% of the data (15% of the total) to use as the validation data </h2>

In [17]:
# Half of the rows left after the training portion (rounded down)
valid_test_length = (total_length - ap) // 2

In [18]:
# End index of the validation slice: the training rows plus one half of the remainder
valid_size = valid_test_length + ap
valid_size

535037

In [32]:
# Display the (rows, columns) shape of the padded matrix
matrix_main.shape

(629456, 47)

<h2> Split the data into our tuples, each containing the data and its label </h2>

In [61]:

# Sequential split: rows [0, ap) train, [ap, valid_size) validation, the rest test.
# NOTE(review): rows are taken in stored order without shuffling or stratification — confirm this is intended.
train_rows = slice(None, ap)
valid_rows = slice(ap, valid_size)
test_rows = slice(valid_size, None)

# Each split is a (features, labels) tuple
train = (matrix_main[train_rows], targets_vector[train_rows])
valid = (matrix_main[valid_rows], targets_vector[valid_rows])
test = (matrix_main[test_rows], targets_vector[test_rows])

<h2> Save our data as compressed NumPy archives to save space </h2>

In [None]:
# Write features and labels of every split to its own compressed archive (array stored under key 'a')
for split_name, (split_features, split_labels) in (('train', train), ('valid', valid), ('test', test)):
    np.savez_compressed(f'../code/data/{split_name}_features.npz', a=split_features)
    np.savez_compressed(f'../code/data/{split_name}_labels.npz', a=split_labels)

<h2> Check the class distribution of the samples in each split </h2>

In [45]:
# Class counts among the first `ap` rows (the training portion)
data['target'].iloc[:ap].value_counts()

0    277261
1     81041
2     56635
3     25682
Name: target, dtype: int64

In [46]:
# Class counts among the validation rows
data['target'].iloc[ap:valid_size].value_counts()

0    69846
2    17077
1     5802
3     1693
Name: target, dtype: int64

In [47]:
# Class counts among the test rows
data['target'].iloc[valid_size:].value_counts()

0    70216
2    16808
1     5670
3     1725
Name: target, dtype: int64

<h2> Scale our data between 0 and 1 for our LSTM model </h2>

In [None]:
# Min-max scaler; its default feature_range is (0, 1)
scaler = MinMaxScaler()

In [None]:
# Fit the scaling parameters on the training rows only (the first `ap` rows) so that
# validation/test statistics do not leak into preprocessing, then apply that same
# transform to every row. Validation/test values may therefore fall outside the fitted range.
scaler.fit(matrix_main[:ap])
scaled_matrix = scaler.transform(matrix_main)

In [37]:

# Same sequential boundaries as the unscaled split, applied to the scaled matrix
train_rows = slice(None, ap)
valid_rows = slice(ap, valid_size)
test_rows = slice(valid_size, None)

# Each split is a (scaled features, labels) tuple
scaled_train = (scaled_matrix[train_rows], targets_vector[train_rows])
scaled_valid = (scaled_matrix[valid_rows], targets_vector[valid_rows])
scaled_test = (scaled_matrix[test_rows], targets_vector[test_rows])

<h2> Save the scaled data as compressed NumPy archives to save space </h2>

In [None]:
# Unpack the scaled training tuple and write each part to its own compressed archive
scaled_train_features, scaled_train_labels = scaled_train
np.savez_compressed('../code/data/scaled_train_features.npz', a=scaled_train_features)
np.savez_compressed('../code/data/scaled_train_labels.npz', a=scaled_train_labels)

In [None]:
# Unpack the scaled validation tuple and write each part to its own compressed archive
scaled_valid_features, scaled_valid_labels = scaled_valid
np.savez_compressed('../code/data/scaled_valid_features.npz', a=scaled_valid_features)
np.savez_compressed('../code/data/scaled_valid_labels.npz', a=scaled_valid_labels)

In [None]:
# Unpack the scaled test tuple and write each part to its own compressed archive
scaled_test_features, scaled_test_labels = scaled_test
np.savez_compressed('../code/data/scaled_test_features.npz', a=scaled_test_features)
np.savez_compressed('../code/data/scaled_test_labels.npz', a=scaled_test_labels)

<h2> Save our tuples into pickle files for later use</h2>

In [26]:
# Serialize the unscaled (features, labels) test tuple with pickle
with open('../input/test_tuple.pkl', 'wb') as pkl_file:
    pickle.dump(test, pkl_file)

In [27]:
# Serialize the unscaled (features, labels) training tuple with pickle
with open('../input/train_tuple.pkl', 'wb') as pkl_file:
    pickle.dump(train, pkl_file)

In [28]:
# Serialize the unscaled (features, labels) validation tuple with pickle
with open('../input/valid_tuple.pkl', 'wb') as pkl_file:
    pickle.dump(valid, pkl_file)