In [1]:
# -*- coding: utf-8 -*-

In [2]:
import warnings

warnings.simplefilter('ignore')

import os
import gc
import re
import glob

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import random
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from torch import nn
from transformers import RobertaTokenizer, RobertaModel, RobertaConfig
from transformers import AdamW,get_linear_schedule_with_warmup,logging
# from torch.utils.data import TensorDataset,SequentialSampler,RandomSampler,DataLoader
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [3]:
def setup_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)  # 禁止hash随机化
    torch.manual_seed(seed)
setup_seed(2022)

In [4]:
# train

train_files = glob.glob('../data/train/*.csv')

df_train = pd.DataFrame()

for filepath in tqdm(train_files):
    df = pd.read_csv(filepath)
    df_train = pd.concat([df_train, df]).reset_index(drop=True)

df_train.fillna('__NaN__', inplace=True)

# 强迫症发作..
df_train = df_train.rename(columns={'lable': 'label'})
df_train.info()
print(len(df_train))

100%|██████████| 6/6 [00:00<00:00,  6.88it/s]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33219 entries, 0 to 33218
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          33219 non-null  int64 
 1   method      33219 non-null  object
 2   user_agent  33219 non-null  object
 3   url         33219 non-null  object
 4   refer       33219 non-null  object
 5   body        33219 non-null  object
 6   label       33219 non-null  int64 
dtypes: int64(2), object(5)
memory usage: 1.8+ MB
33219


In [5]:
type(df_train['label'][0])

numpy.int64

In [6]:
df_train.head(5)

Unnamed: 0,id,method,user_agent,url,refer,body,label
0,17902,GET,Dalvik/2.1.0 (Linux; U; Android 11; SM-G9860 B...,/livemsg?ad_type=WL_WK&ty=web&pu=0&openudid=ed...,__NaN__,GET /livemsg?ad_type=WL_WK&ty=web&pu=0&openudi...,1
1,190,GET,Dalvik/2.1.0 (Linux; U; Android 11; Mi 10 Buil...,/livemsg?ad_type=WL_WK&ty=web&pu=0&openudid=d6...,__NaN__,GET /livemsg?ad_type=WL_WK&ty=web&pu=0&openudi...,1
2,8799,GET,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,/(select%20extractvalue(xmltype('%3c%3fxml%20v...,__NaN__,GET /(select%20extractvalue(xmltype('%3c%3fxml...,1
3,8788,GET,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,/ftp/quarantine/?(select%20load_file('%5c%5c%5...,__NaN__,GET /ftp/quarantine/?(select%20load_file('%5c%...,1
4,16030,GET,Dalvik/2.1.0 (Linux; U; Android 9; MI 9 SE MIU...,/livemsg?ad_type=WL_WK&oadid=&ty=web&pu=0&adap...,__NaN__,GET /livemsg?ad_type=WL_WK&oadid=&ty=web&pu=0&...,1


In [7]:
df_train['label'].value_counts()


1    14038
2     9939
0     6489
3     1397
4      697
5      659
Name: label, dtype: int64

In [9]:
df_test = pd.read_csv('../data/test/test.csv')
df_test.fillna('__NaN__', inplace=True)
print(len(df_test))


4000


In [10]:
df_test.head(5)

Unnamed: 0,id,method,user_agent,url,refer,body
0,0,GET,Mozilla/5.0 (Windows NT 10.0; Win64; x64) Appl...,/demo/aisec/upload.php?act='%7C%7C(select+1+fr...,http://demo.aisec.cn/demo/aisec/upload.php?t=0...,GET /demo/aisec/upload.php?act='%7C%7C(select+...
1,1,GET,Dalvik/2.1.0 (Linux; U; Android 11; M2102J2SC ...,/livemsg?ad_type=WL_WK&ty=web&pu=1&openudid=5f...,__NaN__,GET /livemsg?ad_type=WL_WK&ty=web&pu=1&openudi...
2,2,GET,Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/2...,/create_user/?username=%3Cscript%3Ealert(docum...,__NaN__,__NaN__
3,3,GET,__NaN__,/mmsns/WeDwicXmkOl4kjKsBycicI0H3q41r6syFFvu46h...,__NaN__,__NaN__
4,4,PUT,Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/2...,/naizau.jsp/,__NaN__,GET /login HTTP/1.1 Host: 111.160.211.18:8088 ...


In [11]:
class My_Dataset(Dataset):
    def __init__(self, df, tokenizer, max_len, test_mode):
        self.method = df['method'].values
        self.user_agent = df['user_agent'].values
        self.url = df['url'].values
        self.refer = df['refer'].values
        self.body = df['body'].values
        if not test_mode:
            self.label = df['label'].values

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.test_mode = test_mode

    def __len__(self):
        return len(self.method)


    def tokenize_text(self, text: str, max_len=512) -> tuple:

        encoded_inputs = self.tokenizer(text, max_length=max_len, padding='max_length', truncation=True)
        input_ids = torch.LongTensor(encoded_inputs['input_ids'])
        mask = torch.LongTensor(encoded_inputs['attention_mask'])
        return input_ids, mask

    def __getitem__(self,idx):

        method_maxlen = 4
        user_agent_maxlen = 60
        url_maxlen = 128
        refer_maxlen = 60
        body_maxlen = 256

        method = self.method[idx]
        user_agent = self.user_agent[idx]
        url = self.url[idx]
        refer = self.refer[idx]
        body = self.body[idx]

        # sep: '</s>'

        if len(method)>method_maxlen:
            method= method[:method_maxlen//2]+method[-(method_maxlen//2):]

        if len(user_agent)>user_agent_maxlen:
            user_agent = user_agent[:user_agent_maxlen//2]+user_agent[-(user_agent_maxlen//2):]

        if len(url)>url_maxlen:
            url = url[:url_maxlen//2]+url[-(url_maxlen//2):]


        if len(refer)>refer_maxlen:
            refer = refer[:refer_maxlen//2]+refer[-(refer_maxlen//2):]

        if len(body)>body_maxlen:
            body = body[:body_maxlen//2]+body[-(body_maxlen//2):]


        cat_text = method+'</s>'+body+'</s>'+user_agent+'</s>'+url+'</s>'+refer
        cat_input, cat_mask = self.tokenize_text(cat_text, max_len=self.max_len)

        sample = dict(
            input_ids=cat_input,
            attention_mask=cat_mask
        )


        if not self.test_mode:
            sample['label'] = torch.LongTensor([self.label[idx]])

        return sample


In [12]:
def create_data_loader(df, tokenizer, max_len, batch_size, test_mode=False):
    ds=My_Dataset(
        df=df,
        tokenizer=tokenizer,
        max_len=max_len,
        test_mode=test_mode
    )
    if test_mode:
        return DataLoader(ds, batch_size=batch_size, shuffle=False)
    else:
        return DataLoader(ds, batch_size=batch_size, shuffle=True)






In [13]:
# len(train_data_loader)

In [14]:
# data = next(iter(train_data_loader))
# data.keys()
# print(data['input_ids'].shape)
# print(data['attention_mask'].shape)

In [15]:
# for inputs in train_data_loader:
#     print()

In [16]:
# inputs

In [17]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print('use device: ', device)

use device:  cuda:0


In [18]:
class WebAttack_Classfier(nn.Module):
    def __init__(self):
        super().__init__()
        self.bert_cnofig = RobertaConfig.from_pretrained(PRE_TRAINED_MODEL_PATH + './config.json')
        self.bert = RobertaModel.from_pretrained(PRE_TRAINED_MODEL_PATH, config=self.bert_cnofig)
        self.fc = nn.Linear(768, 6)


    def forward(self, inputs):
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        output, pooled_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False
        )

        # out = self.fc(pooled_output)

        mean_output = output.mean(1)
        out = self.fc(mean_output)

        return out


In [19]:
# y = model(data)


In [20]:
# y.shape


In [21]:


def build_optimizer(model, learning_rate, num_total_steps):
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    print('learning_rate: ', learning_rate)
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=1e-6)
    # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
    #                                             num_training_steps=args.max_steps)
    print('num_training_steps: ', num_total_steps)
    print('warmup_steps: ', num_total_steps*0.1)
    # print('num_training_steps: ', args.max_steps)
    # print('warmup_steps: ', args.warmup_steps)
    # scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps,
    #                                             num_training_steps= args.max_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_total_steps*0.1,
                                                num_training_steps= num_total_steps)
    return optimizer, scheduler




In [22]:
from sklearn.metrics import accuracy_score, auc, f1_score
# print('f1: ', f1_score(np.argmax(oof_pred, axis=1), df_train['label'], average='macro'))
def train_epoch(args, model, data_loader, loss_fn, optimizer, device, scheduler):
    model = model.train()
    losses = []

    pred_list = []
    target_list = []

    for inputs in tqdm(data_loader):
        targets = inputs["label"].to(device)
        targets = targets.squeeze(1)

        outputs = model(inputs)
        _, preds = torch.max(outputs, dim=1)


        pred_list.extend(preds.cpu().numpy().tolist())
        target_list.extend(targets.cpu().numpy().tolist())


        loss = loss_fn(outputs, targets)
        losses.append(loss.item())
        loss.backward()


        # -----------------------------------对抗攻击------------------------------------------------
        if args.use_fgm:
            # 对抗训练
            fgm.attack()  # 在embedding上添加对抗扰动
            outputs = model(inputs)
            loss_adv = loss_fn(outputs, targets)
            loss_adv.backward()  # 反向传播，并在正常的grad基础上，累加对抗训练的梯度
            fgm.restore()  # 恢复embedding参数

        if args.use_pgd:
            pgd.backup_grad()
            for t in range(K):
                pgd.attack(is_first_attack=(t == 0))
                if t != K - 1:
                    model.zero_grad()
                else:
                    pgd.restore_grad()

                outputs = model(inputs)
                loss_adv = loss_fn(outputs, targets)
                loss_adv.backward()  # 反向传播，并在正常的grad基础上，累加对抗训练的梯度

            pgd.restore()


        # ----------------------------------------------------------------------------------------





        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()
        scheduler.step()

        if args.ema != False:
            args.ema.update()


    acc =  accuracy_score(y_true=target_list, y_pred=pred_list)
    f1 = f1_score(y_true=target_list, y_pred=pred_list, average='macro')
    mean_loss = np.mean(losses)

    return acc, f1, mean_loss

In [23]:
def eval_model(args, model, data_loader, loss_fn, device):
    model = model.eval() # 验证预测模式
    if args.ema!=False:
        args.ema.apply_shadow()


    losses = []


    pred_list = []
    target_list = []

    with torch.no_grad():
        for inputs in tqdm(data_loader):
            targets = inputs["label"].to(device)
            targets = targets.squeeze(1)

            outputs = model(inputs)
            _, preds = torch.max(outputs, dim=1)


            pred_list.extend(preds.cpu().numpy().tolist())
            target_list.extend(targets.cpu().numpy().tolist())


            loss = loss_fn(outputs, targets)


            losses.append(loss.item())



    acc =  accuracy_score(y_true=target_list, y_pred=pred_list)
    f1 = f1_score(y_true=target_list, y_pred=pred_list, average='macro')
    mean_loss = np.mean(losses)


    return acc, f1, mean_loss

In [24]:
class Args:
    def __init__(self):
        self.ema = True
        self.use_fgm = True
        self.use_pgd = False

args = Args()

In [25]:
PRE_TRAINED_MODEL_PATH = 'E:/打工/预训练模型/hfl/roberta-base/'
print('use pretrain model: ', PRE_TRAINED_MODEL_PATH)

use pretrain model:  E:/打工/预训练模型/hfl/roberta-base/


In [26]:
print('-'*20)
print('！！！！！！！！！！！kfold开始冲啊！！！！！！！！！！！')
print('-' * 20)
from sklearn.model_selection import KFold, StratifiedKFold
from collections import defaultdict
skf = StratifiedKFold(n_splits=5)
for fold, (train_idx, val_idx) in enumerate(skf.split(df_train, df_train['label'])):


    print('**' * 10, '第', fold + 1, '折', 'ing....', '**' * 10)
    train_data_df = df_train.iloc[train_idx]
    train_data_df.index = range(len(train_data_df)) # 重置索引
    val_data_df = df_train.iloc[val_idx]
    val_data_df.index = range(len(val_data_df)) # 重置索引
    print('train_set_len: ',len(train_data_df), 'dev_set_len: ', len(val_data_df))




    tokenizer = RobertaTokenizer.from_pretrained(PRE_TRAINED_MODEL_PATH)
    train_data_loader = create_data_loader(df=train_data_df,
                       tokenizer=tokenizer,
                       max_len=32,
                       batch_size=32,
                       test_mode=False)

    val_data_loader = create_data_loader(df=val_data_df,
                       tokenizer=tokenizer,
                       max_len=32,
                       batch_size=32,
                       test_mode=False)


    # 每一折实例化新模型
    model = WebAttack_Classfier()
    model = model.to(device)

    EPOCHS = 1 # 训练轮数
    print('EPOCH: ', EPOCHS)
    total_steps = len(train_data_loader) * EPOCHS
    optimizer, scheduler = build_optimizer(model, learning_rate=2e-5, num_total_steps=total_steps)



    loss_fn = nn.CrossEntropyLoss().to(device)


    if args.ema==True:
        print('-'*10,'采用EMA机制训练','-'*10)
        from tricks import EMA
        args.ema = EMA(model, 0.999)
        args.ema.register()

    if args.use_fgm==True:
        print('-' * 10, '采用FGM对抗训练', '-' * 10)
        from tricks import FGM
        # 初始化
        fgm = FGM(model)

    if args.use_pgd==True:
        print('-' * 10, '采用PGD对抗训练', '-' * 10)
        from tricks import PGD
        # 初始化
        pgd = PGD(model=model)
        K = 3




    history = defaultdict(list) # 记录10轮loss和acc
    best_f1 = 0


    # -------------------控制早停--------------
    early_stop_epochs = 2
    no_improve_epochs = 0


    for epoch in range(EPOCHS):

        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print('-' * 10)

        train_acc, train_f1, train_loss = train_epoch(
            args,
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            device,
            scheduler
        )

        print(f'train_loss: {train_loss} \n train_acc: {train_acc} \n train_f1: {train_f1}')

        val_acc, val_f1, val_loss = eval_model(
            args,
            model,
            val_data_loader,
            loss_fn,
            device
        )

        print(f'val_loss: {val_loss} \n val_acc: {val_acc} \n val_f1: {val_f1}')
        print()

        history['train_acc'].append(train_acc)
        history['train_f1'].append(train_f1)
        history['train_loss'].append(train_loss)
        history['val_acc'].append(val_acc)
        history['val_f1'].append(val_f1)
        history['val_loss'].append(val_loss)

        if val_f1 >= best_f1:
            print('best model saved!!!!!!!!!!!!!')
            torch.save(model.state_dict(), f'./save model/best_model_{fold+1}fold.bin')
            best_f1 = val_f1

            no_improve_epochs = 0

        else:
            no_improve_epochs += 1



        if no_improve_epochs == early_stop_epochs:
            print('no improve score !!! stop train !!!')
            break


        if args.ema != False:
            args.ema.restore()

--------------------
！！！！！！！！！！！kfold开始冲啊！！！！！！！！！！！
--------------------
******************** 第 1 折 ing.... ********************
train_set_len:  1600 dev_set_len:  400
EPOCH:  1
learning_rate:  2e-05
num_training_steps:  50
warmup_steps:  5.0
---------- 采用EMA机制训练 ----------
---------- 采用FGM对抗训练 ----------
Epoch 1/1
----------
train_loss: 0.2930969069886487 
 train_acc: 0.89875 
 train_f1: 0.47333772218564846
val_loss: 1.7749531544171846 
 val_acc: 0.0 
 val_f1: 0.0

best model saved!!!!!!!!!!!!!
******************** 第 2 折 ing.... ********************
train_set_len:  1600 dev_set_len:  400
EPOCH:  1
learning_rate:  2e-05
num_training_steps:  50
warmup_steps:  5.0
---------- 采用FGM对抗训练 ----------
Epoch 1/1
----------
train_loss: 0.30377004902402405 
 train_acc: 0.911875 
 train_f1: 0.23847662634847988
val_loss: 0.00023503872440554775 
 val_acc: 1.0 
 val_f1: 1.0

best model saved!!!!!!!!!!!!!
******************** 第 3 折 ing.... ********************
train_set_len:  1600 dev_set_len:  400
E

Some weights of the model checkpoint at E:/打工/预训练模型/hfl/roberta-base/ were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 50/50 [00:11<00:00,  4.42it/s]
100%|██████████| 13/13 [00:00<00:00, 20.22it/s]
Some weights of the model checkpoint at E:/打工/预训练模型/hfl/roberta-base/ were not used when initializing RobertaModel: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weig