- 데이터 전처리

In [2]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Read the training and test CSV files from the working directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('train_dataset.csv', 'test_dataset_v01.csv')
)

In [4]:
# Count the missing values in each column of the test set
test_data.isnull().sum()

Unnamed: 0                   0
url_len                      0
url_num_hyphens_dom          0
url_path_len               466
url_domain_len             466
url_hostname_len           463
url_num_dots                 0
url_num_underscores          0
url_query_len                0
url_num_query_para           0
url_ip_present               0
url_entropy                  0
url_chinese_present          0
url_port                     0
html_num_tags('iframe')      0
html_num_tags('script')    467
html_num_tags('embed')       0
html_num_tags('object')      0
html_num_tags('div')         0
html_num_tags('head')        0
html_num_tags('body')        0
html_num_tags('form')      467
html_num_tags('a')         461
html_num_tags('applet')      0
dtype: int64

In [6]:
# Inspect the test set: column dtypes and non-null counts
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2441 entries, 0 to 2440
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               2441 non-null   int64  
 1   url_len                  2441 non-null   int64  
 2   url_num_hyphens_dom      2441 non-null   int64  
 3   url_path_len             1975 non-null   float64
 4   url_domain_len           1975 non-null   float64
 5   url_hostname_len         1978 non-null   float64
 6   url_num_dots             2441 non-null   int64  
 7   url_num_underscores      2441 non-null   int64  
 8   url_query_len            2441 non-null   int64  
 9   url_num_query_para       2441 non-null   int64  
 10  url_ip_present           2441 non-null   int64  
 11  url_entropy              2441 non-null   float64
 12  url_chinese_present      2441 non-null   int64  
 13  url_port                 2441 non-null   int64  
 14  html_num_tags('iframe') 

In [5]:
# List the distinct values of the 'applet' tag count to see whether the column varies at all
test_data["html_num_tags('applet')"].unique()

array([0], dtype=int64)

In [6]:
# List the distinct values of url_chinese_present to see whether the column varies at all
test_data["url_chinese_present"].unique()

array([0], dtype=int64)

In [7]:
# Encode the categorical labels as integers: 'malicious' -> 0, 'benign' -> 1
# (the 0 is replaced by -1 later on)
test_data = test_data.replace(to_replace=['malicious', 'benign'], value=[0, 1])

In [8]:
# Remove the columns that carry no useful information
test_data = test_data.drop(
    columns=['Unnamed: 0', 'url_chinese_present', "html_num_tags('applet')"]
)

- 결측치 처리
    - url_path_len         
    - url_domain_len       
    - url_hostname_len     
    - html_num_tags('script')
    - html_num_tags('form')
    - html_num_tags('a')  

In [9]:
# 1. Impute each incomplete column with its own mean
test1 = test_data.fillna({
    column: test_data[column].mean()
    for column in ('url_path_len', 'url_domain_len', 'url_hostname_len',
                   "html_num_tags('script')", "html_num_tags('form')", "html_num_tags('a')")
})

In [10]:
# 2. Impute every missing value with an arbitrary sentinel (-1)
test2 = test_data.fillna(value=-1)

In [11]:
# 3. Drop every column that contains at least one missing value
test3 = test_data.dropna(axis='columns')

In [12]:
# 4. Impute with KNNImputer
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5, weights='distance')
# fit_transform returns a bare ndarray. Wrap it back into a DataFrame so the
# column names and row index survive; otherwise predicting with an estimator
# that was fitted on a named DataFrame relies purely on positional column order.
test4 = pd.DataFrame(imputer.fit_transform(test_data),
                     columns=test_data.columns, index=test_data.index)

- train data 전처리

In [13]:
# Encode the categorical labels as integers: 'malicious' -> 0, 'benign' -> 1
train_data = train_data.replace(to_replace=['malicious', 'benign'], value=[0, 1])

In [14]:
# NaN handling for the training set: drop every row with a missing value
train_data = train_data.dropna(axis='index')

In [15]:
# Remove the columns that carry no useful information
train_data = train_data.drop(
    columns=['Unnamed: 0', 'url_chinese_present', "html_num_tags('applet')"]
)

In [16]:
# Columns that had NaN values in the test set: remove them from the training
# data and keep the reduced frame under a separate name
na_cols = [
    'url_path_len',
    'url_domain_len',
    'url_hostname_len',
    "html_num_tags('script')",
    "html_num_tags('form')",
    "html_num_tags('a')",
]
train_data2 = train_data.drop(columns=na_cols)

In [17]:
# Features / target for the full training frame
x_train, y_train = train_data.drop(columns='Result_v1'), train_data['Result_v1']

# Features / target for the frame without the NaN-affected columns
x_train2, y_train2 = train_data2.drop(columns='Result_v1'), train_data2['Result_v1']


- 모델링

In [21]:
from sklearn.ensemble import RandomForestClassifier

# Baseline models with default hyper-parameters.
# random_state is pinned so that re-running the notebook rebuilds the same
# forests; without it, score differences between the preprocessing variants
# are mixed with run-to-run randomness of the forest itself.
# `model` is trained on all features.
model = RandomForestClassifier(random_state=42)
model.fit(x_train, y_train)

# `model2` is trained on the features that remain after dropping the
# NaN-affected columns.
model2 = RandomForestClassifier(random_state=42)
model2.fit(x_train2, y_train2)

In [29]:
# Preprocessing method 1: mean imputation
p1 = model.predict(test1)
# The model is trained on 0/1 labels; convert them to -1/1 for the output file
r1 = pd.DataFrame({'expected': p1}).replace(to_replace=0, value=-1)
r1.to_csv('result_v1.csv', index=True, index_label='id')  # f1-score: 0.91

In [32]:
# Preprocessing method 2: constant (-1) imputation
p2 = model.predict(test2)
# Convert the 0 label to -1 before writing the file
r2 = pd.DataFrame({'expected': p2}).replace(to_replace=0, value=-1)
r2.to_csv('result_v2.csv', index=True, index_label='id')  # f1-score: 0.86

In [36]:
# Preprocessing method 3: NaN-affected columns dropped (uses model2)
p3 = model2.predict(test3)
# Convert the 0 label to -1 before writing the file
r3 = pd.DataFrame({'expected': p3}).replace(to_replace=0, value=-1)
r3.to_csv('result_v3.csv', index=True, index_label='id')  # f1-score: 0.88

In [40]:
# Preprocessing method 4: KNNImputer
p4 = model.predict(test4)
# Convert the 0 label to -1 before writing the file
r4 = pd.DataFrame({'expected': p4}).replace(to_replace=0, value=-1)
r4.to_csv('result_v4.csv', index=True, index_label='id')  # f1-score: 0.94

In [30]:
# Random forest tuned with the hyper-parameters found by grid search.
# random_state is pinned so the fitted forest, and every prediction file
# derived from it, is reproducible across notebook runs.
m3 = RandomForestClassifier(max_features=2, n_estimators=160, random_state=42)
m3.fit(x_train, y_train)

In [43]:
# Tuned random forest + KNNImputer
p5 = m3.predict(test4)
# Convert the 0 label to -1 before writing the file
r5 = pd.DataFrame({'expected': p5}).replace(to_replace=0, value=-1)
r5.to_csv('result_v5.csv', index=True, index_label='id')  # f1-score: 0.94

In [44]:
# KNNImputer: sweep the number of neighbours K and write one prediction file per K
neighbors = range(2, 11, 2)
for i in neighbors:
    imputer = KNNImputer(n_neighbors=i, weights='distance')
    # fit_transform returns a bare ndarray; restore the column names and index
    # so m3 (fitted on a named DataFrame) receives matching feature names
    # instead of relying on positional column order.
    tmp = pd.DataFrame(imputer.fit_transform(test_data),
                       columns=test_data.columns, index=test_data.index)
    pred = m3.predict(tmp)
    # Convert the 0 label to -1 before writing the file
    result = pd.DataFrame({'expected': pred}).replace(0, -1)
    result.to_csv(f'result_v6_neighbor{i}.csv', index=True, index_label='id')
        

----

In [18]:
# LightGBM classifier with default hyper-parameters, trained on the full feature set
import lightgbm as lgb

m6 = lgb.LGBMClassifier()
m6.fit(x_train, y_train)

In [19]:
# LightGBM predictions on the KNN-imputed test set
p6 = m6.predict(test4)
# Convert the 0 label to -1 before writing the file
result = pd.DataFrame({'expected': p6}).replace(to_replace=0, value=-1)
result.to_csv('result_lgbm.csv', index=True, index_label='id')  # f1-score: 0.93

In [22]:
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier

# Base learners for the stack: tuned random forest, XGBoost and LightGBM.
# The random forests are given a fixed random_state so that the stacked
# model is reproducible across notebook runs.
estimators = [('rf', RandomForestClassifier(max_features=2, n_estimators=160, random_state=42)),
              ('xgb', XGBClassifier(learning_rate=0.17, max_depth=5, n_estimators=130)),
              ('gbm', lgb.LGBMClassifier())]

# A random forest combines the base learners' outputs into the final prediction.
model_stacking = StackingClassifier(estimators=estimators,
                                    final_estimator=RandomForestClassifier(random_state=42))

In [24]:
# Train the stacking ensemble on the full feature set
model_stacking.fit(x_train, y_train)

In [25]:
# Stacking ensemble predictions on the KNN-imputed test set
p7 = model_stacking.predict(test4)
# Convert the 0 label to -1 before writing the file
result = pd.DataFrame({'expected': p7}).replace(to_replace=0, value=-1)
result.to_csv('result_stack.csv', index=True, index_label='id')  # f1-score: 0.94

In [27]:
# 5. Impute with IterativeImputer
# IterativeImputer is still experimental: the enable_iterative_imputer import
# below must run before it can be imported from sklearn.impute.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer()
col_name = list(test_data)
# fit_transform returns a bare ndarray; restore the original column names and
# index so the frame does not end up with integer column labels 0..n-1.
test5 = pd.DataFrame(imputer.fit_transform(test_data),
                     columns=col_name, index=test_data.index)

In [31]:
# Tuned random forest predictions on the IterativeImputer-imputed test set
p8 = m3.predict(test5)
# Convert the 0 label to -1 before writing the file
result = pd.DataFrame({'expected': p8}).replace(to_replace=0, value=-1)
result.to_csv('result_real_jjinmak.csv', index=True, index_label='id')  # f1-score: 0.92 ?