In [None]:
!pip install  -q  tqdm
!pip install -q torch torchvision
!pip install -q pytorch-transformers
!pip install -q sklearn

In [53]:
from pytorch_transformers import BertConfig,BertForSequenceClassification,BertTokenizer
import pandas as pd
import numpy as np
import collections
from tqdm import tqdm_notebook  as tqdm
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
import torch
from sklearn.model_selection import train_test_split
from pytorch_transformers import AdamW, WarmupLinearSchedule
import argparse

In [67]:
# Hyper-parameters, declared as (flag, default, type, help) rows and registered
# on the parser in exactly this order.
_ARGUMENT_TABLE = (
    ("--max_seq_length", 384, int,
     "The maximum total input sequence length after WordPiece tokenization. Sequences "
     "longer than this will be truncated, and sequences shorter than this will be padded."),
    ("--max_content_length", 192, int,
     "The maximum number of tokens for the content. Content longer than this will "
     "be truncated to this length."),
    ("--weight_decay", 0.0, float,
     "Weight deay if we apply some."),
    ("--warmup_steps", 0, int,
     "Linear warmup over warmup_steps."),
    ("--learning_rate", 5e-5, float,
     "The initial learning rate for Adam."),
    ("--adam_epsilon", 1e-8, float,
     "Epsilon for Adam optimizer."),
    ("--max_grad_norm", 1.0, float,
     "Max gradient norm."),
    ("--model_name", "bert-base-chinese", str,
     'bert pretrianed model'),
    ("--train_batch_size", 8, int,
     "Batch size for training."),
    ("--eval_batch_size", 8, int,
     "Batch size for evaluation."),
    ("--num_train_epochs", 3.0, float,
     "Total number of training epochs to perform."),
)

parser = argparse.ArgumentParser()
for _flag, _default, _type, _help in _ARGUMENT_TABLE:
    parser.add_argument(_flag, default=_default, type=_type, help=_help)

# Parse an empty argument list so only the defaults declared above apply;
# the kernel's own command line is never consulted.
args = parser.parse_args([])

# Use the GPU when torch can see one, otherwise run on the CPU. The choice is
# stored both as a notebook global and on the argument namespace.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
args.device = device

In [31]:
# Load the pretrained configuration, tokenizer and sequence-classification
# model. The checkpoint name is taken from `args.model_name` (default
# "bert-base-chinese") instead of being hard-coded three times, so changing
# that one argument switches all three objects consistently.
config = BertConfig.from_pretrained(args.model_name)
tokenizer = BertTokenizer.from_pretrained(args.model_name)
model = BertForSequenceClassification.from_pretrained(args.model_name, config=config)
# Move the model to the device chosen in the configuration cell.
model = model.to(args.device)

100%|██████████| 520/520 [00:00<00:00, 95421.01B/s]


In [32]:
# Sanity check: the loaded config must report exactly two labels; the cell
# stops with an AssertionError otherwise.
assert config.num_labels==2

In [3]:
# Read the labelled dataset from a CSV file; the path is relative to the
# notebook's working directory.
data = pd.read_csv('./data/data_with_label.csv')

In [4]:
# Replace missing values in both text columns with empty strings.
for _text_column in ('content', 'root_content'):
    data[_text_column] = data[_text_column].fillna('')

In [5]:
# Preview the first two rows.
# NOTE(review): the rendered preview shows the `uid` and `mobile` columns —
# consider redacting or clearing this output before sharing the notebook.
data.head(2)

Unnamed: 0,uid,is_orig,content,root_content,wb_id,mobile,label
0,5f2731634875ed63960e73e7b7318188,0,转发微博,柠檬水，你喝对了吗？ 不同的搭配有不同的功效哦，​ ​​​快来试试吧 ~ ​​,1746505677,18688728723,1.0
1,1d4c605c286b8413fbda3da0855524ec,0,[爱你],“微博第三个表情是你今年下半年的状态” 我是[爱你] ​​,1873781221,13837811122,1.0


In [6]:
# Preview the last two rows.
# NOTE(review): the rendered preview shows the `uid` and `mobile` columns —
# consider redacting or clearing this output before sharing the notebook.
data.tail(2)

Unnamed: 0,uid,is_orig,content,root_content,wb_id,mobile,label
43108,182f95d15b85539fb9209c1572eaf44e,1,#微博豪车抽奖# http:,,6701158809,18479698848,1.0
43109,182f95d15b85539fb9209c1572eaf44e,1,微博@借钱 居然抽奔驰，好想被宠幸哦，今年的好运都指着它了！为了豪车，冲呀！！#微博豪车抽奖...,,6701158809,18479698848,1.0


In [62]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
train_data, eval_data = train_test_split(
    data,
    test_size=0.33,
    random_state=42,
)

In [7]:
class InputFeatures(object):
    """Plain container for the model inputs of one encoded example.

    The four constructor arguments are stored unchanged as attributes of the
    same names: ``input_ids``, ``input_mask``, ``segment_ids`` and ``label``.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label):
        # Attribute creation order matches the parameter order.
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label = segment_ids, label

In [64]:
def convert_data(data,
                 max_seq_length,
                 max_content_length,
                 sep_token='[SEP]',
                 pad_token=0,
                 sequence_a_segment_id=0,
                 sequence_b_segment_id=1,
                 pad_token_segment_id=0,
                 mask_padding_with_zero=True,
                 cls_token='[CLS]'
                 ):
    """Encode (content, root_content, label) rows as a ``TensorDataset``.

    Every row becomes the token sequence
    ``[CLS] content [SEP] root_content [SEP]``, converted to ids with the
    notebook-level ``tokenizer`` and right-padded to ``max_seq_length``.

    Args:
        data: Object exposing ``content``, ``root_content`` and ``label``
            iterables of equal length and supporting ``len()``
            (the notebook passes a pandas DataFrame).
        max_seq_length: Exact length of every encoded sequence.
        max_content_length: Maximum number of ``content`` tokens kept.
        sep_token: Separator token placed after each of the two segments.
        pad_token: Id used to pad ``input_ids``.
        sequence_a_segment_id: Segment id for the leading classification
            token, the content tokens and the first separator.
        sequence_b_segment_id: Segment id for the root-content tokens and the
            second separator.
        pad_token_segment_id: Segment id used for padding positions.
        mask_padding_with_zero: If true, real tokens get mask 1 and padding 0;
            otherwise the values are swapped.
        cls_token: Classification token prepended to every sequence. Pass
            ``None`` to omit it and reproduce the previous two-special-token
            layout.

    Returns:
        ``TensorDataset`` of four ``torch.long`` tensors, in order:
        input ids, attention mask, segment ids, labels.

    Raises:
        AssertionError: If an encoded row does not end up exactly
            ``max_seq_length`` long (e.g. the content alone is too long).
    """
    # Number of special tokens that must fit next to the two text segments:
    # two separators plus the optional leading classification token.
    special_token_count = 2 if cls_token is None else 3

    features = []
    rows = zip(data.content, data.root_content, data.label)
    # `tqdm` is the progress-bar name bound by the notebook's import cell
    # (`from tqdm import tqdm_notebook as tqdm`); `tqdm_notebook` itself is
    # never bound, so referring to it raised NameError on a fresh kernel.
    for content, root_content, label in tqdm(rows, total=len(data)):
        content_tokens = tokenizer.tokenize(content)[:max_content_length]

        # Whatever room is left after the content and the special tokens goes
        # to the root content. Clamp at zero so a negative budget cannot turn
        # into a slice counted from the end of the list.
        root_budget = max(
            0, max_seq_length - len(content_tokens) - special_token_count)
        root_content_tokens = tokenizer.tokenize(root_content)[:root_budget]

        tokens = []
        segment_ids = []

        # Leading classification token (segment A).
        if cls_token is not None:
            tokens.append(cls_token)
            segment_ids.append(sequence_a_segment_id)

        # Content followed by its separator (segment A).
        tokens.extend(content_tokens)
        tokens.append(sep_token)
        segment_ids.extend(
            [sequence_a_segment_id] * (len(content_tokens) + 1))

        # Root content followed by its separator (segment B).
        tokens.extend(root_content_tokens)
        tokens.append(sep_token)
        segment_ids.extend(
            [sequence_b_segment_id] * (len(root_content_tokens) + 1))

        # The mask marks real tokens versus padding so that only real tokens
        # are attended to.
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Right-pad all three lists up to the fixed sequence length. A
        # negative padding length adds nothing and is caught by the asserts.
        padding_length = max_seq_length - len(input_ids)
        input_ids.extend([pad_token] * padding_length)
        input_mask.extend(
            [0 if mask_padding_with_zero else 1] * padding_length)
        segment_ids.extend([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        features.append(InputFeatures(
            input_ids, input_mask, segment_ids, label))

    all_input_ids = torch.tensor(
        [f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor(
        [f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor(
        [f.segment_ids for f in features], dtype=torch.long)
    all_label = torch.tensor([f.label for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask,
                            all_segment_ids, all_label)
    return dataset

In [38]:
# Encode the training split using the sequence-length limits from `args`.
train_dataset = convert_data(
    train_data,
    max_seq_length=args.max_seq_length,
    max_content_length=args.max_content_length,
)

HBox(children=(IntProgress(value=0, max=28883), HTML(value='')))




In [65]:
# Encode the evaluation split using the sequence-length limits from `args`.
eval_dataset = convert_data(
    eval_data,
    max_seq_length=args.max_seq_length,
    max_content_length=args.max_content_length,
)

HBox(children=(IntProgress(value=0, max=14227), HTML(value='')))

In [68]:
# Training examples are drawn in random order; evaluation examples are
# visited in dataset order.
train_sampler = RandomSampler(train_dataset)
eval_sampler = SequentialSampler(eval_dataset)

train_dataloader = DataLoader(
    train_dataset,
    sampler=train_sampler,
    batch_size=args.train_batch_size,
)
eval_dataloader = DataLoader(
    eval_dataset,
    sampler=eval_sampler,
    batch_size=args.eval_batch_size,
)

In [43]:
# Parameters whose name contains one of these substrings are exempt from
# weight decay. The list was previously never defined in the notebook, so this
# cell raised NameError on a fresh kernel; the values follow the
# pytorch-transformers example scripts.
no_decay = ['bias', 'LayerNorm.weight']

# Two optimizer groups: everything else gets `args.weight_decay`, the exempt
# parameters get no decay at all.
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

In [52]:
# Total number of optimizer steps. The training loop runs
# int(args.num_train_epochs) epochs, so the same truncated epoch count is used
# here; otherwise a fractional setting (e.g. 2.5) would size the schedule for
# steps that are never taken and leave t_total a float.
t_total = len(train_dataloader) * int(args.num_train_epochs)
optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
# Linear warmup for `warmup_steps`, then linear decay over the remaining steps.
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

In [None]:
# Fine-tuning loop: one training pass followed by one evaluation pass per epoch.
global_step = 0
model.zero_grad()

# Report roughly ten times per epoch. The max() guard keeps the modulus
# non-zero when an epoch has fewer than ten batches.
log_every = max(1, len(train_dataloader) // 10)

for _ in tqdm(range(int(args.num_train_epochs)), desc="Epoch"):
    # Running per-example correctness flags and per-batch losses since the
    # last report.
    tr_acc = []
    tr_loss = []
    # Evaluation below switches the model to eval mode, so re-enable training
    # mode at the start of every epoch.
    model.train()
    for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
        batch = tuple(t.to(args.device) for t in batch)
        inputs = {'input_ids':       batch[0],
                  'attention_mask':  batch[1],
                  'token_type_ids':  batch[2],
                  'labels': batch[3], }
        labels = batch[3]
        loss, logits = model(**inputs)[:2]
        _, indices = torch.max(logits, dim=1)

        # .tolist()/.item() copy plain Python values off the device and out of
        # the autograd graph. Calling .numpy() directly fails on CUDA tensors
        # and on tensors that require grad, and flat per-example flags average
        # correctly even when the last batch is smaller.
        tr_acc.extend((indices == labels).tolist())
        tr_loss.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

        optimizer.step()
        scheduler.step()  # Update learning rate schedule
        model.zero_grad()
        global_step += 1

        if step % log_every == 0:
            print(f'train loss:{np.mean(tr_loss)} on {step} step')
            print(f'train acc:{np.mean(tr_acc)} on {step} step')
            tr_loss = []
            tr_acc = []

    # The same accumulator names are reused for the evaluation pass;
    # `tr_result` collects one array of predicted class indices per batch.
    tr_acc = []
    tr_loss = []
    tr_result = []
    print("***** Running evaluation *****")
    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {'input_ids':       batch[0],
                      'attention_mask':  batch[1],
                      'token_type_ids':  batch[2],
                      'labels': batch[3], }
            labels = batch[3]
            loss, logits = model(**inputs)[:2]
            _, indices = torch.max(logits, dim=1)
            tr_acc.extend((indices == labels).tolist())
            tr_loss.append(loss.item())
            # Move predictions to the CPU before converting to numpy.
            tr_result.append(indices.cpu().numpy())
    print(f'eval loss: {np.mean(tr_loss)}')
    print(f'eval acc: {np.mean(tr_acc)}')