In [1]:
import pandas as pd
import numpy as np
import json
import sys 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import plotly.express as px
import itertools
import optuna
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from joblib import Parallel, delayed, dump, load
# sys.path.insert(0, '../DevCode')


pd.set_option('display.expand_frame_repr', False)
pd.options.display.max_rows = 500
sys.path.append('../src')
import pickle

In [2]:
from DataManager import BetterTokenizer, Tokenizer
tm = BetterTokenizer()

In [3]:
df = pd.read_csv('../src/data/malicious_phish.csv')
df = df.rename({'type':'target'},axis=1)

In [4]:
df = df.sort_index(ascending=False)

In [5]:
df

Unnamed: 0,url,target
651190,www.angelfire.com/goth/devilmaycrytonite/,phishing
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing
651188,www.gamespot.com/xbox360/action/deadspace/,phishing
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing
651186,xbox360.ign.com/objects/850/850402.html,phishing
...,...,...
4,http://adventure-nicaragua.net/index.php?optio...,defacement
3,http://www.garage-pirenne.be/index.php?option=...,defacement
2,bopsecrets.org/rexroth/cr/1.htm,benign
1,mp3raid.com/music/krizz_kaliko.html,benign


In [6]:
tokenized_url = tm.tokenize_column(df['url'])
temp_df = df.copy()
# nv = temp_df['tokens'].to_numpy()
# last_tokens = [i[-1] for i in nv]
# temp_df['last'] = last_tokens

In [7]:
# tokenizer_state = {
#     'token_to_id':tm.token_to_id,
#     'id_to_token':tm.id_to_token,
# }

# with open('../code/data/token1.json', 'w') as f:
#     json.dump(tokenizer_state, f)

In [8]:
df['tokens'] = tokenized_url

In [9]:
df

Unnamed: 0,url,target,tokens
651190,www.angelfire.com/goth/devilmaycrytonite/,phishing,"[1, 2, 3, 2, 4, 5, 6, 5, 7, 5]"
651189,en.wikipedia.org/wiki/Dead_Space_(video_game),phishing,"[8, 2, 9, 2, 10, 5, 11, 5, 12, 13, 14, 15]"
651188,www.gamespot.com/xbox360/action/deadspace/,phishing,"[1, 2, 16, 2, 4, 5, 17, 5, 18, 5, 19, 5]"
651187,games.teamxbox.com/xbox-360/1860/Dead-Space/,phishing,"[20, 2, 21, 2, 4, 5, 22, 23, 24, 5, 25, 5, 26,..."
651186,xbox360.ign.com/objects/850/850402.html,phishing,"[17, 2, 28, 2, 4, 5, 29, 5, 30, 5, 31, 2, 32]"
...,...,...,...
4,http://adventure-nicaragua.net/index.php?optio...,defacement,"[9500, 135, 5, 5, 97, 23, 167002, 2, 103, 5, 2..."
3,http://www.garage-pirenne.be/index.php?option=...,defacement,"[9500, 135, 5, 5, 1, 2, 71456, 23, 164934, 2, ..."
2,bopsecrets.org/rexroth/cr/1.htm,benign,"[627327, 2, 10, 5, 627328, 5, 19094, 5, 215, 2..."
1,mp3raid.com/music/krizz_kaliko.html,benign,"[169942, 2, 4, 5, 37803, 5, 280184, 2, 32]"


In [10]:
df = df.drop(['url'],axis=1)
df['sizes'] = [len(i) for i in df['tokens']]
mean = df['sizes'].mean()
std = df['sizes'].std()
clipped_data = df[(df['sizes'] >= (mean - 2*std)) & (df['sizes'] <= (mean + 2*std))]
data = df.loc[clipped_data.index]
targets_df = df['target'].loc[clipped_data.index]

max_length = 0

for i in data['tokens'].values:
    if len(i) > max_length:
        max_length = len(i)
vector_data = [np.array(i) for i in data['tokens'].values]

matrix_list = []
for i in vector_data:
    new_vector = np.zeros(max_length)
    current_vector_size = i.shape[0]
    new_vector[:current_vector_size] = i
    matrix_list.append(new_vector)

matrix_main = np.vstack(matrix_list)

target_list = []
target_key_values = {
    'benign':0,
    'phishing':1,
    'defacement':2,
    'malware':3
}
for i in targets_df.values:
    if i == 'benign':
        target_list.append(0)
    elif i == 'phishing':
        target_list.append(1)
    elif i == 'defacement':
        target_list.append(2)
    else:
        target_list.append(3)
data['target'] = target_list

targets_vector = np.array(target_list)
total_length = int(matrix_main.shape[0])
ap = int(matrix_main.shape[0] * 0.7)

valid_test_length = int((total_length - ap) / 2)

valid_size = ap + valid_test_length
valid_size

535037

In [25]:
train = (matrix_main[:ap], targets_vector[:ap])

valid = (matrix_main[ap: valid_size], targets_vector[ap: valid_size])


test = (matrix_main[valid_size : ], targets_vector[valid_size : ])

In [12]:
scaler = MinMaxScaler(feature_range=(0,1))
scaled_matrix = scaler.fit_transform(matrix_main)

scaled_train = (scaled_matrix[:ap], targets_vector[:ap])

scaled_valid = (scaled_matrix[ap: valid_size], targets_vector[ap: valid_size])

scaled_test = (scaled_matrix[valid_size : ], targets_vector[valid_size : ])

In [17]:
np.save('scaled_valid_train_features.npz', scaled_train[0])

In [22]:
# np.savez_compressed('scaled_valid_train_features.npz',scaled_train[0])
np.savez_compressed('../code/data/scaled_train_features.npz', a=scaled_train[0])
np.savez_compressed('../code/data/scaled_train_labels.npz', a=scaled_train[1])

In [23]:
np.savez_compressed('../code/data/scaled_valid_features.npz', a=scaled_valid[0])
np.savez_compressed('../code/data/scaled_valid_labels.npz', a=scaled_valid[1])

In [24]:
np.savez_compressed('../code/data/scaled_test_features.npz', a=scaled_test[0])
np.savez_compressed('../code/data/scaled_test_labels.npz', a=scaled_test[1])

In [16]:
scaler = MinMaxScaler(feature_range=(0,1))
scaled_matrix = scaler.fit_transform(matrix_main)

scaled_train = (scaled_matrix[:ap], targets_vector[:ap])

scaled_valid = (scaled_matrix[ap: valid_size], targets_vector[ap: valid_size])

scaled_test = (scaled_matrix[valid_size : ], targets_vector[valid_size : ])

with open('../code/data/scaled_test_tuple.pkl', 'wb') as file:
    pickle.dump(scaled_test, file)
with open('../code/data/scaled_train_tuple.pkl', 'wb') as file:
    pickle.dump(scaled_train, file)
with open('../code/data/scaled_valid_tuple.pkl', 'wb') as file:
    pickle.dump(scaled_valid, file)

In [26]:
np.savez_compressed('../code/data/train_features.npz', a=train[0])
np.savez_compressed('../code/data/train_labels.npz', a=train[1])

np.savez_compressed('../code/data/valid_features.npz', a=valid[0])
np.savez_compressed('../code/data/valid_labels.npz', a=valid[1])

np.savez_compressed('../code/data/test_features.npz', a=test[0])
np.savez_compressed('../code/data/test_labels.npz', a=test[1])

In [315]:
with open('../input/test_tuple.pkl', 'wb') as file:
    pickle.dump(test, file)
with open('../input/train_tuple.pkl', 'wb') as file:
    pickle.dump(train, file)
with open('../input/valid_tuple.pkl', 'wb') as file:
    pickle.dump(valid, file)

In [160]:
defaults_new = {
    'boosting_type':['gbdt'],
    'n_estimators':[150],
    'max_depth':[3, 6],
    'num_leaves':[6**2],
    'min_child_samples':[2,10,25],
    'subsample':[0.3, 0.5],
    'colsample_bytree': [0.9],
    'reg_alpha':[0.01],
    'reg_lambda':[0.01],
    'random_state': [5],
    'verbosity':[-1],
    'n_jobs':[2],
    'subsample_freq':[1],
    'learning_rate':[0.01, 0.005, 0.001],
    'class_weight':[{0:0.2, 1:3, 2:0.5, 3:0.30}]
    
}

In [161]:
combinations = itertools.product(*defaults_new.values())

sub_dicts = [dict(zip(defaults_new.keys(), combo)) for combo in combinations]

hypersearch_list = [LGBMClassifier(**params) for params in sub_dicts]

In [162]:
len(hypersearch_list)

36

In [163]:
X_train, y_train = train[0], train[1]
X_val, y_val = valid[0], valid[1]
X_test, y_test = test[0], test[1]

In [164]:
# X_train = X_train[:, ::-1].copy()
# X_val = X_val[:, ::-1].copy()
# X_test = X_test[:, ::-1].copy()

In [165]:
# X_train = X_train[:, -10:]
# X_val = X_val[:, -10:]
# X_test = X_test[:, -10:]

In [166]:
X_train[0]

array([1., 2., 3., 2., 4., 5., 6., 5., 7., 5., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [167]:
class Node:

    def __init__(self, y_true_input, y_pred_input) -> None:
        self.right = 0
        self.wrong = 0
        self.search(y_true_input=y_true_input, y_pred_input=y_pred_input)

    def search(self, y_true_input, y_pred_input):
        for t, p in zip(y_pred_input, y_pred_input):
            if (t == 2) & (p == 2):
                self.right += 1
            elif (t != 0) & (p != 0):
                self.right += 1
            elif (t != 2) & (p == 0):
                self.wrong += 1
        #print(f'right {self.right} | wrong {self.wrong}')



In [168]:
def training(input_model):
    model = input_model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    score1 = accuracy_score(y_val, y_pred)
    nd = Node(y_true_input=y_val, y_pred_input=y_pred)
    score = nd.right / (nd.right + nd.wrong)



    data_tuple = (score,  model.get_params(), model)

    return data_tuple

In [169]:
def run_multi_core():
    results = Parallel(n_jobs=16)(delayed(training)(i) for i in hypersearch_list)
    return results

In [170]:
def create_df():
    data_tuple = run_multi_core()
    data = pd.DataFrame(data_tuple,
                        columns=['total_score', 'params', 'model'])
    data = data.sort_values(by='total_score', ascending=False)
    return data

In [171]:
results = create_df()

In [172]:
results.head(10)

Unnamed: 0,total_score,params,model
35,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
8,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
17,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
23,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
14,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
11,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
20,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
26,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
29,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
5,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."


In [174]:
results

Unnamed: 0,total_score,params,model
35,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
8,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
17,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
23,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
14,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
11,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
20,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
26,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
29,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."
5,1.0,"{'boosting_type': 'gbdt', 'class_weight': {0: ...","LGBMClassifier(class_weight={0: 0.2, 1: 3, 2: ..."


In [152]:
for i in range(10):
    print(results.iloc[i].params)

{'boosting_type': 'gbdt', 'class_weight': {0: 0.2, 1: 3, 2: 0.5, 3: 0.3}, 'colsample_bytree': 0.9, 'importance_type': 'split', 'learning_rate': 0.001, 'max_depth': 34, 'min_child_samples': 25, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 75, 'n_jobs': 2, 'num_leaves': 1156, 'objective': None, 'random_state': 5, 'reg_alpha': 0.01, 'reg_lambda': 0.01, 'silent': 'warn', 'subsample': 0.5, 'subsample_for_bin': 200000, 'subsample_freq': 1, 'verbosity': -1}
{'boosting_type': 'gbdt', 'class_weight': {0: 0.2, 1: 3, 2: 0.5, 3: 0.3}, 'colsample_bytree': 0.9, 'importance_type': 'split', 'learning_rate': 0.001, 'max_depth': 6, 'min_child_samples': 25, 'min_child_weight': 0.001, 'min_split_gain': 0.0, 'n_estimators': 75, 'n_jobs': 2, 'num_leaves': 1156, 'objective': None, 'random_state': 5, 'reg_alpha': 0.01, 'reg_lambda': 0.01, 'silent': 'warn', 'subsample': 0.3, 'subsample_for_bin': 200000, 'subsample_freq': 1, 'verbosity': -1}
{'boosting_type': 'gbdt', 'class_weight': {0: 0.2

In [136]:
results.iloc[0].model.predict(X_test)

array([1, 1, 1, ..., 1, 1, 1])

In [173]:
for i in range(len(hypersearch_list)):
    y_pred = results.iloc[i].model.predict(X_test)
    nd = Node(y_true_input=y_test, y_pred_input=y_pred)
    print(accuracy_score(y_test, y_pred))
    print(f'right {nd.right} | wrong {nd.wrong}')
    print('')


0.0600514726908779
right 94419 | wrong 0

0.0600514726908779
right 94419 | wrong 0

0.0600514726908779
right 94419 | wrong 0

0.0600514726908779
right 94419 | wrong 0

0.0600514726908779
right 94419 | wrong 0

0.0600514726908779
right 94419 | wrong 0

0.0600514726908779
right 94419 | wrong 0

0.0600514726908779
right 94419 | wrong 0

0.0600514726908779
right 94419 | wrong 0

0.0600514726908779
right 94419 | wrong 0

0.0600514726908779
right 94419 | wrong 0

0.0600514726908779
right 94419 | wrong 0

0.6600472362554147
right 50184 | wrong 44235

0.6600472362554147
right 50184 | wrong 44235

0.6600472362554147
right 50184 | wrong 44235

0.6613817134263231
right 50049 | wrong 44370

0.6613817134263231
right 50049 | wrong 44370

0.6613817134263231
right 50049 | wrong 44370

0.7726199176013302
right 41780 | wrong 52639

0.7725457799807243
right 41790 | wrong 52629

0.7725457799807243
right 41790 | wrong 52629

0.7725245978034082
right 41728 | wrong 52691

0.7727470106652263
right 41710 | wro

In [177]:
y_pred = results.iloc[13].model.predict(X_test)
nd = Node(y_true_input=y_test, y_pred_input=y_pred)
print(f'right {nd.right} | wrong {nd.wrong}')

right 50184 | wrong 44235


In [275]:
y_pred = results.iloc[0].model.predict(X_test)
nd = Node(y_true_input=y_test, y_pred_input=y_pred)
print(f'right {nd.right} | wrong {nd.wrong}')

right 94397 | wrong 22


In [178]:
dump(results.iloc[13].model, '../code/data/gb_model.joblib')

['../code/data/gb_model.joblib']

In [138]:
y_pred = results.iloc[2].model.predict(X_test)
accuracy_score(y_test, y_pred)

0.36491595971149876

In [61]:
accuracy_score(y_test, y_pred)

0.40089388788273544

In [300]:
valid = pd.read_pickle('../code/data/valid_tuple.pkl')
train = pd.read_pickle('../code/data/train_tuple.pkl')
test = pd.read_pickle('../code/data/test_tuple.pkl')

In [302]:
train[0]

array([[1.00000e+00, 2.00000e+00, 3.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [6.00000e+00, 4.00000e+00, 5.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.10000e+01, 4.00000e+00, 1.20000e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [2.39386e+05, 4.00000e+00, 1.19000e+02, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.46000e+02, 2.00000e+00, 8.16300e+03, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.46000e+02, 4.00000e+00, 5.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])