In [1]:
import pandas as pd
import numpy as np
import sys 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
import plotly.express as px
import itertools
import optuna
from lightgbm import LGBMClassifier

from joblib import Parallel, delayed, dump, load
sys.path.insert(0, '../DevCode')

pd.set_option('display.expand_frame_repr', False)
pd.options.display.max_rows = 500



In [2]:
from DataManager import BetterTokenizer

In [3]:
from Simulator import TestingSampler

In [4]:
sp = TestingSampler()

In [5]:
sp.get_sample()

(array([1.90000e+01, 4.00000e+00, 6.02128e+05, 4.00000e+00, 5.00000e+00,
        7.00000e+00, 2.30000e+01, 4.00000e+00, 2.40000e+01, 2.50000e+01,
        2.22000e+02, 2.70000e+01, 6.15969e+05, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00]),
 1)

In [6]:
model = load('model.joblib')

In [7]:
test_sample = sp.sample[0].reshape(1,-1)

In [8]:
model.predict(test_sample)

array([0])

In [9]:
model.predict_proba(test_sample)

array([[0.88633711, 0.0180816 , 0.04074116, 0.05484013]])

In [6]:
df = pd.read_csv('./Data/malicious_phish.csv')
tm = BetterTokenizer()

In [7]:
tokenized_url = tm.tokenize_column(df['url'])
df['tokens'] = tokenized_url
df = df.rename({'type':'target'},axis=1)
df = df.drop(['url',],axis=1)
df['sizes'] = [len(i) for i in df['tokens']]
mean = df['sizes'].mean()
std = df['sizes'].std()
clipped_data = df[(df['sizes'] >= (mean - 2*std)) & (df['sizes'] <= (mean + 2*std))]
data = df.loc[clipped_data.index]
targets = df['target'].loc[clipped_data.index]

max_length = 0

for i in data['tokens'].values:
    if len(i) > max_length:
        max_length = len(i)
array_data = [np.array(i) for i in data['tokens'].values]

matrix_list = []
for i in array_data:
    new_vector = np.zeros(max_length)
    current_vector_size = i.shape[0]
    new_vector[:current_vector_size] = i
    matrix_list.append(new_vector)

matrix_main = np.vstack(matrix_list)

In [5]:
targets

0           phishing
1             benign
2             benign
3         defacement
4         defacement
             ...    
651186      phishing
651187      phishing
651188      phishing
651189      phishing
651190      phishing
Name: target, Length: 629456, dtype: object

In [8]:
target_list = []
for i in targets.values:
    if i == 'benign':
        target_list.append(0)
    elif i == 'phishing':
        target_list.append(1)
    elif i == 'defacement':
        target_list.append(2)
    else:
        target_list.append(3)

In [9]:
data['targets_int'] = target_list

In [9]:
data

Unnamed: 0,target,tokens,sizes,targets_int
0,phishing,"[1, 2, 3, 4, 5, 4, 1]",7,1
1,benign,"[6, 4, 5, 7, 8, 7, 9, 4, 10]",9,0
2,benign,"[11, 4, 12, 7, 13, 7, 14, 7, 15, 4, 16]",11,0
3,defacement,"[17, 18, 7, 7, 19, 4, 20, 2, 21, 4, 22, 7, 23,...",31,2
4,defacement,"[17, 18, 7, 7, 36, 2, 37, 4, 38, 7, 23, 4, 24,...",25,2
...,...,...,...,...
651186,phishing,"[62048, 4, 3247, 4, 5, 7, 3248, 7, 3758, 7, 62...",13,1
651187,phishing,"[2686, 4, 45660, 4, 5, 7, 23449, 2, 4843, 7, 1...",16,1
651188,phishing,"[19, 4, 8613, 4, 5, 7, 62048, 7, 5920, 7, 4298...",12,1
651189,phishing,"[187, 4, 188, 4, 12, 7, 189, 7, 627327, 327, 3...",12,1


In [10]:
matrix_main

array([[1.0000e+00, 2.0000e+00, 3.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [6.0000e+00, 4.0000e+00, 5.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.1000e+01, 4.0000e+00, 1.2000e+01, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [1.9000e+01, 4.0000e+00, 8.6130e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.8700e+02, 4.0000e+00, 1.8800e+02, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.9000e+01, 4.0000e+00, 1.5712e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]])

[1,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 0,
 2,
 2,
 2,
 0,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 3,
 2,
 1,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 2,
 2,
 0,
 2,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 0,
 0,
 2,
 0,
 0,
 1,
 0,
 2,
 0,
 0,
 2,
 0,
 2,
 0,
 0,
 2,
 1,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 1,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 1,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 2,
 0,
 3,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 0,
 2,
 2,
 0,
 0,


In [10]:
targets = np.array(target_list)

In [11]:
total_length = int(matrix_main.shape[0])
ap = int(matrix_main.shape[0] * 0.7)

In [30]:
total_length

629456

In [28]:
ap

440619

In [12]:
valid_test_length = int((total_length - ap) / 2)

In [13]:
valid_size = ap + valid_test_length
valid_size


535037

In [14]:

train = (matrix_main[:ap], targets[:ap])

valid = (matrix_main[ap: valid_size], targets[ap: valid_size])


test = (matrix_main[valid_size : ], targets[valid_size : ])

In [27]:
test[0]

array([[1.7000e+01, 1.8000e+01, 7.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.7000e+01, 1.8000e+01, 7.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.7000e+01, 1.8000e+01, 7.0000e+00, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       ...,
       [1.9000e+01, 4.0000e+00, 8.6130e+03, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.8700e+02, 4.0000e+00, 1.8800e+02, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00],
       [1.9000e+01, 4.0000e+00, 1.5712e+04, ..., 0.0000e+00, 0.0000e+00,
        0.0000e+00]])

In [29]:
type(test)

tuple

In [30]:
import pickle

In [31]:
with open('test_tuple.pkl', 'wb') as file:
    pickle.dump(test, file)

In [32]:
with open('test_tuple.pkl', 'rb') as file:
    loaded_tuple = pickle.load(file)

In [33]:
loaded_tuple

(array([[1.7000e+01, 1.8000e+01, 7.0000e+00, ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.7000e+01, 1.8000e+01, 7.0000e+00, ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.7000e+01, 1.8000e+01, 7.0000e+00, ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [1.9000e+01, 4.0000e+00, 8.6130e+03, ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.8700e+02, 4.0000e+00, 1.8800e+02, ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [1.9000e+01, 4.0000e+00, 1.5712e+04, ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]]),
 array([3, 3, 3, ..., 1, 1, 1]))

In [35]:
len(loaded_tuple[0])

94419

In [37]:
sample_length = len(loaded_tuple[1]) 

In [38]:
np.random.choice(sample_length)

31728

In [39]:
loaded_tuple[0][np.random.choice(sample_length)]

array([1.90000e+01, 4.00000e+00, 6.24173e+05, 4.00000e+00, 5.00000e+00,
       7.00000e+00, 6.24174e+05, 7.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
       0.00000e+00, 0.00000e+00])

In [40]:
loaded_tuple[1][np.random.choice(sample_length)]

1

In [16]:
train[0].shape

(440619, 47)

In [17]:
valid[0].shape

(94418, 47)

In [18]:
test[0].shape

(94419, 47)

In [13]:
# X_train, X_test, y_train, y_test = train_test_split(matrix_main, np.array(target_list), test_size=0.15, random_state=42)

In [15]:
defaults_new = {
    'boosting_type':['rf'],
    'n_estimators':[75],
    'max_depth':[3,5,8,13],
    'num_leaves':[13 **2],
    'min_child_samples':[75, 100],
    'subsample':[0.98],
    'colsample_bytree': [0.3, 0.5],
    'random_state': [5],
    'verbosity':[-1],
    'n_jobs':[1],
    'learning_rate':[1.0],
    'class_weight':[{0:0.2, 1:0.3, 2:0.3, 3:0.2}]
    
}

In [16]:
combinations = itertools.product(*defaults_new.values())

sub_dicts = [dict(zip(defaults_new.keys(), combo)) for combo in combinations]

hypersearch_list = [LGBMClassifier(**params) for params in sub_dicts]

In [17]:
X_train, y_train = train[0], train[1]
X_test, y_test = valid[0], valid[1]

In [25]:
X_train

array([[1.00000e+00, 2.00000e+00, 3.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [6.00000e+00, 4.00000e+00, 5.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [1.10000e+01, 4.00000e+00, 1.20000e+01, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [2.39386e+05, 4.00000e+00, 1.19000e+02, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.46000e+02, 2.00000e+00, 8.16300e+03, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.46000e+02, 4.00000e+00, 5.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [26]:
y_train

array([1, 0, 0, ..., 0, 0, 0])

In [18]:
def train_new(input_model):
    model = input_model
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    score = accuracy_score(y_test, y_pred)

    data_tuple = (score, model.get_params(), model)

    return data_tuple

def run_grid():
    results = Parallel(n_jobs=16)(delayed(train_new)(i) for i in hypersearch_list)
    return results

def create_df():
    data_tuple = run_grid()
    data = pd.DataFrame(data_tuple,
                        columns=['total_score', 'params', 'model'])
    data = data.sort_values(by='total_score', ascending=False)
    return data

In [19]:
results = create_df()

In [21]:
results

Unnamed: 0,total_score,params,model
13,0.772999,"{'boosting_type': 'rf', 'class_weight': {0: 0....","LGBMClassifier(boosting_type='rf',\n ..."
15,0.770732,"{'boosting_type': 'rf', 'class_weight': {0: 0....","LGBMClassifier(boosting_type='rf',\n ..."
9,0.761465,"{'boosting_type': 'rf', 'class_weight': {0: 0....","LGBMClassifier(boosting_type='rf',\n ..."
12,0.761253,"{'boosting_type': 'rf', 'class_weight': {0: 0....","LGBMClassifier(boosting_type='rf',\n ..."
11,0.760565,"{'boosting_type': 'rf', 'class_weight': {0: 0....","LGBMClassifier(boosting_type='rf',\n ..."
14,0.758468,"{'boosting_type': 'rf', 'class_weight': {0: 0....","LGBMClassifier(boosting_type='rf',\n ..."
8,0.737741,"{'boosting_type': 'rf', 'class_weight': {0: 0....","LGBMClassifier(boosting_type='rf',\n ..."
10,0.735771,"{'boosting_type': 'rf', 'class_weight': {0: 0....","LGBMClassifier(boosting_type='rf',\n ..."
7,0.682974,"{'boosting_type': 'rf', 'class_weight': {0: 0....","LGBMClassifier(boosting_type='rf',\n ..."
5,0.6779,"{'boosting_type': 'rf', 'class_weight': {0: 0....","LGBMClassifier(boosting_type='rf',\n ..."


In [22]:
results.iloc[0].model

In [24]:
dump(results.iloc[0].model, 'model.joblib')

['model.joblib']