In [1]:
import warnings
warnings.simplefilter('ignore')

import os
import gc
import re
import glob

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
from tqdm.auto import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, auc

from urllib.parse import quote, unquote, urlparse

import lightgbm as lgb

In [2]:
import random
def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed) # 禁止hash随机化
set_seed(2022)

In [3]:
# train

train_files = glob.glob('../data/train/*.csv')

df_train = pd.DataFrame()

for filepath in tqdm(train_files):
    df = pd.read_csv(filepath)
    df_train = pd.concat([df_train, df]).reset_index(drop=True)
    
df_train.fillna('__NaN__', inplace=True)

# 强迫症发作..
df_train = df_train.rename(columns={'lable': 'label'})
df_train
print(len(df_train))

  0%|          | 0/6 [00:00<?, ?it/s]

33219


In [12]:
# label
# 0. 白
# 1. SQL 注入
# 2. 目录历遍
# 3. 远程代码执行
# 4. 命令执行
# 5. XSS 跨站脚本

In [4]:
df_test = pd.read_csv('../data/test/test.csv')
df_test.fillna('__NaN__', inplace=True)
df_test

Unnamed: 0,id,method,user_agent,url,refer,body
0,0,GET,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,/demo/aisec/upload.php?act='%7C%7C(select+1+fr...,http://demo.aisec.cn/demo/aisec/upload.php?t=0...,GET /demo/aisec/upload.php?act='%7C%7C(select+...
1,1,GET,Dalvik/2.1.0 (Linux; U; Android 11; M2102J2SC ...,/livemsg?ad_type=WL_WK&ty=web&pu=1&openudid=5f...,__NaN__,GET /livemsg?ad_type=WL_WK&ty=web&pu=1&openudi...
2,2,GET,Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/2...,/create_user/?username=%3Cscript%3Ealert(docum...,__NaN__,__NaN__
3,3,GET,__NaN__,/mmsns/WeDwicXmkOl4kjKsBycicI0H3q41r6syFFvu46h...,__NaN__,__NaN__
4,4,PUT,Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/2...,/naizau.jsp/,__NaN__,GET /login HTTP/1.1 Host: 111.160.211.18:8088 ...
...,...,...,...,...,...,...
3995,3995,GET,Dalvik/2.1.0 (Linux; U; Android 10; POT-AL00 B...,/livemsg?ad_type=WL_WK&ty=web&pu=1&openudid=64...,__NaN__,GET /livemsg?ad_type=WL_WK&ty=web&pu=1&openudi...
3996,3996,GET,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,/runtime.js,http://121.4.111.58:3000/,GET /runtime.js HTTP/1.1 Host: 121.4.111.58:30...
3997,3997,POST,Mozilla/4.0,/query?493521812,__NaN__,__NaN__
3998,3998,GET,Wget/1.11.4,/stats.php?rand=JtmT4wBtrpNy5RJnNX9wCUo,__NaN__,__NaN__


In [5]:
df = pd.concat([df_train, df_test]).reset_index(drop=True)
df.shape

(37219, 7)

In [6]:
def get_url_query(s):
    li = re.split('[=&]', urlparse(s)[4])
    return [li[i] for i in range(len(li)) if i % 2 == 1]


def find_max_str_length(x):
    max_ = 0
    li = [len(i) for i in x]
    return max(li) if len(li) > 0 else 0


def find_str_length_std(x):
    max_ = 0
    li = [len(i) for i in x]
    return np.std(li) if len(li) > 0 else -1


df['url_unquote'] = df['url'].apply(unquote)
df['url_query'] = df['url_unquote'].apply(lambda x: get_url_query(x))
df['url_query_num'] = df['url_query'].apply(len)
df['url_query_max_len'] = df['url_query'].apply(find_max_str_length)
df['url_query_len_std'] = df['url_query'].apply(find_str_length_std)

In [16]:
def find_url_filetype(x):
    try:
        return re.search(r'\.[a-z]+', x).group()
    except:
        return '__NaN__'
    
    
df['url_path'] = df['url_unquote'].apply(lambda x: urlparse(x)[2])
df['url_filetype'] = df['url_path'].apply(lambda x: find_url_filetype(x))

df['url_path_len'] = df['url_path'].apply(len)
df['url_path_num'] = df['url_path'].apply(lambda x: len(re.findall('/',  x)))

In [17]:
df['ua_short'] = df['user_agent'].apply(lambda x: x.split('/')[0])
df['ua_first'] = df['user_agent'].apply(lambda x: x.split(' ')[0])

In [18]:
# %%time

def add_tfidf_feats(df, col, n_components=16):
    text = list(df[col].values)
    tf = TfidfVectorizer(min_df=1, 
                         analyzer='char_wb',
                         ngram_range=(1,3),
                         stop_words='english')
    tf.fit(text)
    X = tf.transform(text)
    svd = TruncatedSVD(n_components=n_components)
    svd.fit(X)
    X_svd = svd.transform(X)
    for i in range(n_components):
        df[f'{col}_tfidf_{i}'] = X_svd[:, i]
    return df


df = add_tfidf_feats(df, 'url_unquote', n_components=16)
df = add_tfidf_feats(df, 'user_agent', n_components=16)
df = add_tfidf_feats(df, 'body', n_components=32)

Wall time: 4min 35s


In [19]:
for col in tqdm(['method', 'refer', 'url_filetype', 'ua_short', 'ua_first']):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])

  0%|          | 0/5 [00:00<?, ?it/s]

In [20]:
not_use_feats = ['id', 'user_agent', 'url', 'body', 'url_unquote', 'url_query', 'url_path', 'label']
use_features = [col for col in df.columns if col not in not_use_feats]

In [21]:
train = df[df['label'].notna()]
test = df[df['label'].isna()]

train.shape, test.shape

((33219, 82), (4000, 82))

In [22]:
NUM_CLASSES = 6
FOLDS = 5
TARGET = 'label'

from sklearn.preprocessing import label_binarize

def run_lgb(df_train, df_test, use_features):
    
    target = TARGET
    oof_pred = np.zeros((len(df_train), NUM_CLASSES))
    y_pred = np.zeros((len(df_test), NUM_CLASSES))
    
    folds = StratifiedKFold(n_splits=FOLDS)
    for fold, (tr_ind, val_ind) in enumerate(folds.split(train, train[TARGET])):
        print(f'Fold {fold + 1}')
        x_train, x_val = df_train[use_features].iloc[tr_ind], df_train[use_features].iloc[val_ind]
        y_train, y_val = df_train[target].iloc[tr_ind], df_train[target].iloc[val_ind]
        train_set = lgb.Dataset(x_train, y_train)
        val_set = lgb.Dataset(x_val, y_val)
        
        params = {
            'learning_rate': 0.1,
            'metric': 'multiclass',
            'objective': 'multiclass',
            'num_classes': NUM_CLASSES,
            'feature_fraction': 0.75,
            'bagging_fraction': 0.75,
            'bagging_freq': 2,
            'n_jobs': -1,
            'seed': 2022,
            'max_depth': 10,
            'num_leaves': 100,
            'lambda_l1': 0.5,
            'lambda_l2': 0.8,
            'verbose': -1
        }
        
        model = lgb.train(params, 
                          train_set, 
                          num_boost_round=500,
                          early_stopping_rounds=100,
                          valid_sets=[train_set, val_set],
                          verbose_eval=100)
        oof_pred[val_ind] = model.predict(x_val)
        y_pred += model.predict(df_test[use_features]) / folds.n_splits
        
        print("Features importance...")
        gain = model.feature_importance('gain')
        feat_imp = pd.DataFrame({'feature': model.feature_name(), 
                         'split': model.feature_importance('split'), 
                         'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
        print('Top 50 features:\n', feat_imp.head(50))
        
        del x_train, x_val, y_train, y_val, train_set, val_set
        gc.collect()
        
    return y_pred, oof_pred
    

y_pred, oof_pred = run_lgb(train, test, use_features)

Fold 1
Training until validation scores don't improve for 100 rounds
[100]	training's multi_logloss: 0.00566234	valid_1's multi_logloss: 0.0691244
Early stopping, best iteration is:
[60]	training's multi_logloss: 0.0109684	valid_1's multi_logloss: 0.067304
Features importance...
Top 50 features:
                  feature  split       gain
42          body_tfidf_0    269  14.131562
11   url_unquote_tfidf_1    672  10.942426
43          body_tfidf_1    220  10.542240
23  url_unquote_tfidf_13    665   6.556286
13   url_unquote_tfidf_3    550   5.560482
30    user_agent_tfidf_4    217   4.519808
26    user_agent_tfidf_0    292   3.755170
17   url_unquote_tfidf_7    638   2.995091
16   url_unquote_tfidf_6    556   2.834933
15   url_unquote_tfidf_5    548   2.578374
21  url_unquote_tfidf_11    642   2.298545
12   url_unquote_tfidf_2    456   1.769005
56         body_tfidf_14    218   1.683001
3      url_query_max_len    297   1.587594
27    user_agent_tfidf_1    209   1.576739
47          bo

In [23]:
print(accuracy_score(np.argmax(oof_pred, axis=1), df_train['label']))

0.980071645744905

In [25]:
sub = pd.read_csv('../data/submit_example.csv')
sub['predict'] = np.argmax(y_pred, axis=1)
sub

Unnamed: 0,id,predict
0,0,1
1,1,1
2,2,5
3,3,0
4,4,3
...,...,...
3995,3995,1
3996,3996,1
3997,3997,0
3998,3998,0


In [26]:
sub['predict'].value_counts()

2    898
0    814
1    810
3    652
4    428
5    398
Name: predict, dtype: int64

In [27]:
sub.to_csv('baseline-22-9-1.csv', index=False)