In [42]:
import argparse
import ast
import json
import os
from collections import Counter

import jieba
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
import tqdm
from sklearn import metrics
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, AdamW, BertModel, BertPreTrainedModel, BertConfig

In [2]:
torch.version

<module 'torch.version' from '/usr/local/miniconda3/envs/dl10/lib/python3.6/site-packages/torch/version.py'>

# dataset

In [3]:
df_train = pd.read_csv("./corpus/df_train.csv")
df_dev = pd.read_csv("./corpus/df_dev.csv")
df_test = pd.read_csv("./corpus/df_test.csv")

In [4]:
df_train.dtypes

query_id          object
context           object
question          object
answer            object
input_ids         object
token_type_ids    object
attention_mask    object
start_position     int64
end_position       int64
dtype: object

In [5]:
class Sentence_Pair_Dataset(Dataset):
    """Map-style dataset that serves one row of a pre-tokenised DataFrame per index.

    Every sample is a dict of raw cell values (nothing is parsed or converted to
    tensors here). Training samples additionally carry the answer span labels.
    """

    def __init__(self, df, is_train = True):
        """Copy ``df`` and keep it entirely in memory.

        Args:
            df: DataFrame holding ``query_id``, ``input_ids``, ``token_type_ids`` and
                ``attention_mask`` columns, plus ``start_position`` /
                ``end_position`` when ``is_train`` is true.
            is_train: when true, rows whose ``end_position`` exceeds 512 are dropped
                and the label columns are included in every sample.
        """
        frame = df.copy()
        if is_train:
            # NOTE(review): 512 equals max_position_embeddings in the printed config;
            # if end_position is used directly as a class index, the value 512 itself
            # would be out of range for a 512-token sequence -- confirm `<=` vs `<`.
            frame = frame[frame.end_position <= 512]
        # Renumber rows 0..n-1 so positional lookups by label work in __getitem__.
        self.df = frame.reset_index()
        self.is_train = is_train

    def __len__(self):
        """Number of rows kept after the optional training filter."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the row at ``idx`` as a dict keyed by column name."""
        columns = ("query_id", "input_ids", "token_type_ids", "attention_mask")
        if self.is_train:
            columns = columns + ("start_position", "end_position")
        return {column: self.df.loc[idx, column] for column in columns}

In [6]:
def collate_fn(batch):
    """Collate training samples into a dict of per-field Python lists.

    The ``input_ids``, ``attention_mask`` and ``token_type_ids`` fields arrive as the
    textual form of a list (e.g. ``"[101, 7, 102]"``) and are parsed back into lists.
    Nothing is converted to tensors here; the caller does that.

    Args:
        batch: list of sample dicts as produced by ``Sentence_Pair_Dataset`` in
            training mode.

    Returns:
        dict with keys ``query_id``, ``input_ids``, ``attention_mask``,
        ``token_type_ids``, ``start_position`` and ``end_position``; each value is a
        list with one entry per sample.

    Raises:
        ValueError / SyntaxError: if a serialized field is not a plain Python literal.
    """
    # ast.literal_eval only accepts literals, so text coming from the CSV can never
    # execute code the way the previous eval() call could.
    return {
        'query_id': [item['query_id'] for item in batch],
        'input_ids': [ast.literal_eval(item['input_ids']) for item in batch],
        'attention_mask': [ast.literal_eval(item['attention_mask']) for item in batch],
        'token_type_ids': [ast.literal_eval(item['token_type_ids']) for item in batch],
        'start_position': [item['start_position'] for item in batch],
        'end_position': [item['end_position'] for item in batch],
    }

def collate_dev_test_fn(batch):
    """Collate dev/test samples (no answer labels) into a dict of per-field lists.

    The ``input_ids``, ``attention_mask`` and ``token_type_ids`` fields arrive as the
    textual form of a list and are parsed back into lists. Nothing is converted to
    tensors here; the caller does that.

    Args:
        batch: list of sample dicts as produced by ``Sentence_Pair_Dataset`` with
            ``is_train=False``.

    Returns:
        dict with keys ``query_id``, ``input_ids``, ``attention_mask`` and
        ``token_type_ids``; each value is a list with one entry per sample.

    Raises:
        ValueError / SyntaxError: if a serialized field is not a plain Python literal.
    """
    # ast.literal_eval only accepts literals, so text coming from the CSV can never
    # execute code the way the previous eval() call could.
    return {
        'query_id': [item['query_id'] for item in batch],
        'input_ids': [ast.literal_eval(item['input_ids']) for item in batch],
        'attention_mask': [ast.literal_eval(item['attention_mask']) for item in batch],
        'token_type_ids': [ast.literal_eval(item['token_type_ids']) for item in batch],
    }

In [7]:
# Training split: labelled samples, reshuffled every epoch.
train = Sentence_Pair_Dataset(df_train)
train_iter = DataLoader(train, batch_size=2, shuffle=True, collate_fn=collate_fn)

# Dev split: unlabelled samples in file order, so predictions line up with df_dev rows.
dev = Sentence_Pair_Dataset(df_dev, is_train=False)
dev_iter = DataLoader(dev, batch_size=4, shuffle=False, collate_fn=collate_dev_test_fn)

# Test split: same handling as dev.
test = Sentence_Pair_Dataset(df_test, is_train=False)
test_iter = DataLoader(test, batch_size=4, shuffle=False, collate_fn=collate_dev_test_fn)

# model

In [38]:
import torch
from torch import nn
from torch.nn import CrossEntropyLoss

class Bert_QA(nn.Module):
    """BERT encoder with a small MLP head that scores every token as span start/end.

    The head is Linear -> ReLU -> Linear -> ReLU -> Linear(hidden_size, num_labels);
    ``forward`` splits the last dimension into a start score and an end score.
    """

    def __init__(self, num_labels,
                 pretrained_dir='/home/zhoujx/Pretrained_models/chinese_roberta_wwm_large_ext_pytorch'):
        """Load config, vocabulary and encoder weights from ``pretrained_dir``.

        Args:
            num_labels: output width of the final linear layer. ``forward`` unpacks
                exactly two per-token scores, so this must be 2.
            pretrained_dir: directory containing ``bert_config.json``, ``vocab.txt``
                and the encoder weights. Defaults to the path previously hard-coded
                here, so existing callers are unaffected.
        """
        super(Bert_QA, self).__init__()
        self.bert_config = BertConfig.from_pretrained(os.path.join(pretrained_dir, 'bert_config.json'), output_hidden_states=True)
        self.tokenizer  = BertTokenizer.from_pretrained(os.path.join(pretrained_dir, 'vocab.txt'), config=self.bert_config)
        self.bertmodel = BertModel.from_pretrained(pretrained_dir, config=self.bert_config)
        # Attribute names below are also the state_dict key prefixes; renaming them
        # would break loading of existing checkpoints.
        self.fc1 = nn.Linear(self.bert_config.hidden_size, self.bert_config.hidden_size)
        self.activation1 = nn.ReLU()
        self.fc2 = nn.Linear(self.bert_config.hidden_size, self.bert_config.hidden_size)
        self.activation2 = nn.ReLU()
        self.classifier = nn.Linear(self.bert_config.hidden_size, num_labels)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Score every token position as a potential answer start and end.

        Returns:
            ``(start_logits, end_logits)``, each of shape (B, T).
        """
        outputs = self.bertmodel(input_ids, token_type_ids = token_type_ids, attention_mask = attention_mask)
        # Only the per-token hidden states are used. Indexing instead of unpacking a
        # fixed-length tuple keeps this independent of how many extra outputs the
        # encoder returns.
        sequence_output = outputs[0]
        fc1_out    = self.activation1(self.fc1(sequence_output))
        fc2_out    = self.activation2(self.fc2(fc1_out))

        logits = self.classifier(fc2_out)  # (B, T, 2)
        start_logits, end_logits = logits.split(1, dim=-1)  # ((B, T, 1), (B, T, 1))
        start_logits = start_logits.squeeze(-1)  # (B, T)
        end_logits = end_logits.squeeze(-1)  # (B, T)

        return start_logits, end_logits

    def predict(self, question, context):
        """Print the answer span predicted for a single question/context pair.

        The start and end indices are the independent argmax of the start and end
        scores; the tokens in ``[start, end)`` are joined, '#' characters are removed
        and the result is printed. Nothing is printed beyond an empty line when
        ``end <= start``.

        Returns:
            None. The answer is only printed.
        """
        # Use the device the weights live on rather than a notebook-level global that
        # may not exist yet (or may be stale) when this method is called.
        device = next(self.parameters()).device

        input_ids = self.tokenizer.encode(question, context)
        token_type_ids = self.tokenizer.create_token_type_ids_from_sequences(self.tokenizer.tokenize(question), self.tokenizer.tokenize(context))
        input_ids = torch.LongTensor(input_ids).unsqueeze(0).to(device)
        token_type_ids = torch.LongTensor(token_type_ids).unsqueeze(0).to(device)

        # Inference must run with dropout disabled and without building an autograd
        # graph; otherwise the answer varies from call to call. The previous
        # train/eval mode is restored afterwards.
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                start, end = self.forward(input_ids, token_type_ids=token_type_ids)
        finally:
            self.train(was_training)

        start = torch.max(start, dim=1)[1].cpu().numpy().tolist()[0]
        end = torch.max(end, dim=1)[1].cpu().numpy().tolist()[0]

        tt = input_ids.cpu().numpy().tolist()[0]

        result = self.tokenizer.convert_ids_to_tokens(tt[start: end])
        print("".join(result).replace('#', ''))
        return None

In [9]:
# Pick the GPU when one is visible, then build the QA model directly on that device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Bert_QA(num_labels = 2).to(device)

I0417 05:06:28.604979 140126615160576 configuration_utils.py:281] loading configuration file /home/zhoujx/Pretrained_models/chinese_roberta_wwm_large_ext_pytorch/bert_config.json
I0417 05:06:28.607290 140126615160576 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "m

In [None]:
torch.cuda.empty_cache()

# Auxiliary Functions

## evaluate auxiliary function

In [None]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Evaluation script for LIC2020 DuReader_robust
"""

from __future__ import print_function
from collections import OrderedDict
import io
import json
import six
import sys
if six.PY2:
    reload(sys)
    sys.setdefaultencoding('utf8')
import argparse


def _tokenize_chinese_chars(text):
    """
    :param text: input text, unicode string
    :return:
        tokenized text, list
    """

    def _is_chinese_char(cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block:
        #     https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
        #
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
        # despite its name. The modern Korean Hangul alphabet is a different block,
        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
        # space-separated words, so they are not treated specially and handled
        # like the all of the other languages.
        if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
            (cp >= 0x3400 and cp <= 0x4DBF) or  #
            (cp >= 0x20000 and cp <= 0x2A6DF) or  #
            (cp >= 0x2A700 and cp <= 0x2B73F) or  #
            (cp >= 0x2B740 and cp <= 0x2B81F) or  #
            (cp >= 0x2B820 and cp <= 0x2CEAF) or
            (cp >= 0xF900 and cp <= 0xFAFF) or  #
            (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
            return True

        return False

    output = []
    buff = ""
    for char in text:
        cp = ord(char)
        if _is_chinese_char(cp) or char == "=":
            if buff != "":
                output.append(buff)
                buff = ""
            output.append(char)
        else:
            buff += char

    if buff != "":
        output.append(buff)

    return output


def _normalize(in_str):
    """
    normalize the input unicode string
    """
    in_str = in_str.lower()
    sp_char = [
        u':', u'_', u'`', u'，', u'。', u'：', u'？', u'！', u'(', u')',
        u'“', u'”', u'；', u'’', u'《', u'》', u'……', u'·', u'、', u',',
        u'「', u'」', u'（', u'）', u'－', u'～', u'『', u'』', '|'
    ]
    out_segs = []
    for char in in_str:
        if char in sp_char:
            continue
        else:
            out_segs.append(char)
    return ''.join(out_segs)


def find_lcs(s1, s2):
    """Find the longest run of consecutive items shared by s1 and s2.

    Matches must be contiguous in both sequences. When several runs share the
    maximum length, the one found first while scanning s1 from the left is returned.

    Returns:
        (run, length): the matching slice of s1 and its length; an empty slice and
        0 when nothing matches.
    """
    best_len = 0
    best_end = 0  # exclusive end of the best run, as an index into s1
    width = len(s2) + 1
    prev_row = [0] * width  # run lengths ending at the previous item of s1
    for row, left in enumerate(s1, start=1):
        cur_row = [0] * width
        for col, right in enumerate(s2, start=1):
            if left != right:
                continue
            # Extend the run that ended on the previous item of both sequences.
            run = prev_row[col - 1] + 1
            cur_row[col] = run
            if run > best_len:
                best_len = run
                best_end = row
        prev_row = cur_row
    return s1[best_end - best_len:best_end], best_len


def evaluate(ref_ans, pred_ans, verbose=False):
    """
    Score predictions against the reference answers.

    Only the paragraphs under ``ref_ans['data'][0]`` are scored. A question whose id
    is missing from ``pred_ans`` is counted in ``skip_count`` and contributes 0 to
    both scores (it still counts towards ``total_count``).

    ref_ans: reference answers, dict
    pred_ans: predicted answer, dict keyed by ``str(query_id)``
    verbose: when true, print every question with its references and prediction
    return:
        f1_score: averaged F1 score (0-100)
        em_score: averaged EM score (0-100)
        total_count: number of samples in the reference dataset
        skip_count: number of samples skipped because no prediction was found
    """
    f1 = 0
    em = 0
    total_count = 0
    skip_count = 0
    datas = ref_ans['data'][0]["paragraphs"]
    for document in datas:
        para = document['context'].strip()
        for qa in (document['qas']):
            total_count += 1
            query_id = qa['id']
            query_text = qa['question'].strip()
            answers = [a['text'] for a in qa['answers']]
            try:
                prediction = pred_ans[str(query_id)]
            except KeyError:
                # Only a missing prediction is a "skip". A bare except here would
                # also hide KeyboardInterrupt and genuine programming errors.
                skip_count += 1
                if verbose:
                    print("para: {}".format(para))
                    print("query: {}".format(query_text))
                    print("ref: {}".format('#'.join(answers)))
                    print("Skipped")
                    print('----------------------------')
                continue
            _f1 = calc_f1_score(answers, prediction)
            f1 += _f1
            em += calc_em_score(answers, prediction)
            if verbose:
                print("para: {}".format(para))
                print("query: {}".format(query_text))
                print("ref: {}".format('#'.join(answers)))
                print("cand: {}".format(prediction))
                print("score: {}".format(_f1))
                print('----------------------------')

    if total_count == 0:
        # No reference questions at all: report zeros instead of dividing by zero.
        return 0.0, 0.0, total_count, skip_count

    f1_score   = 100.0 * f1 / total_count
    em_score = 100.0 * em / total_count
    return f1_score, em_score, total_count, skip_count


def calc_f1_score(answers, prediction):
    """Return the best F1 between the prediction and any of the reference answers.

    Both sides are normalised and tokenised; the overlap is the length of the
    longest common contiguous token run returned by ``find_lcs``. Precision divides
    that length by the prediction's token count and recall by the reference's.

    Args:
        answers: list of reference answer strings.
        prediction: predicted answer string.

    Returns:
        The maximum F1 over all references; 0 when ``answers`` is empty.
    """
    if not answers:
        # max() of an empty list would raise; no reference means no credit.
        return 0

    # The prediction does not change between references, so tokenise it once.
    prediction_segs = _tokenize_chinese_chars(_normalize(prediction))

    f1_scores = []
    for ans in answers:
        ans_segs = _tokenize_chinese_chars(_normalize(ans))
        lcs, lcs_len = find_lcs(ans_segs, prediction_segs)
        if lcs_len == 0:
            f1_scores.append(0)
            continue
        prec = 1.0*lcs_len/len(prediction_segs)
        rec   = 1.0*lcs_len/len(ans_segs)
        f1     = (2 * prec * rec) / (prec + rec)
        f1_scores.append(f1)
    return max(f1_scores)


def calc_em_score(answers, prediction):
    """Return 1 if the normalised prediction equals any normalised reference, else 0."""
    matched = any(_normalize(ans) == _normalize(prediction) for ans in answers)
    return 1 if matched else 0

## evaluate_dev_test

In [None]:
# def evaluate_dev_test(df, iterator, is_dev=True, dev_path=None):
#     all_start_predicionts = []
#     all_end_predicionts = []
#     model.eval()

#     with torch.no_grad():
#         par = tqdm.tqdm(enumerate(iterator),
#                         total=len(iterator),
#                         miniters = 10)
#         with par as t:
#             for idx, batch_samples  in t:
#                 query_id = batch_samples['query_id']             
#                 input_ids = batch_samples['input_ids']
#                 token_type_ids = batch_samples['token_type_ids']
#                 attention_mask = batch_samples['attention_mask']
#                 #
#                 input_ids = torch.LongTensor(input_ids).to(device)
#                 token_type_ids = torch.LongTensor(token_type_ids).to(device)
#                 attention_mask = torch.LongTensor(attention_mask).to(device)

#                 # predict
#                 start_predictions, end_predictions = model(input_ids, 
#                                                            token_type_ids = token_type_ids, 
#                                                            attention_mask = attention_mask) 
#                 start_predictions = torch.max(start_predictions, dim=1)[1].cpu().numpy().tolist()
#                 end_predictions = torch.max(end_predictions, dim=1)[1].cpu().numpy().tolist()

#                 # 
#                 all_start_predicionts.extend(start_predictions)
#                 all_end_predicionts.extend(end_predictions)
#             #
#             print(len(all_start_predicionts))
#             print(len(all_end_predicionts))
#             df_result = df.copy()
#             df_result['start'] = all_start_predicionts
#             df_result['stop'] = all_end_predicionts
#             #
#             df_result['prediction'] = df_result.apply(lambda x:eval(x['input_ids'])[x['start']:x['stop']], axis=1)
#             df_result['prediction'] = df_result['prediction'].apply(lambda x:tokenizer.convert_ids_to_tokens(x))
#             df_result['prediction'] = df_result['prediction'].apply(lambda x:''.join(x))
#             df_result['prediction'] = df_result.prediction.str.replace('#', '')
#             # 转dict格式
#             dict_result = dict(zip(df_result['query_id'], df_result['prediction']))
#             # 验证集
#             if is_dev:
#                 # 
#                 ref_ans = json.load(io.open(dev_path))
#                 # 
#                 F1, EM, TOTAL, SKIP = evaluate(ref_ans, dict_result)
#                 output_result = OrderedDict()
#                 output_result['F1'] = '%.3f' % F1
#                 output_result['EM'] = '%.3f' % EM
#                 output_result['TOTAL'] = TOTAL
#                 output_result['SKIP'] = SKIP
#                 # 
#                 print(output_result)
#                 return output_result 
#             else:
#                 return dict_result

## find_best_answer_for_passage

In [None]:
def find_best_answer_for_passage(list_start_probs, list_end_probs, context_span, max_answer_len=24):
    """
    Pick, for every passage, the (start, end) pair with the highest combined score.

    The score of a pair is ``abs(start_probs[start] + end_probs[end])`` -- a sum,
    not a product. Candidates satisfy
    ``idx_start <= start < end < idx_end`` and ``end - start < max_answer_len``,
    where ``(idx_start, idx_end)`` is the passage's entry in ``context_span``.
    On ties the first candidate in scan order (ascending start, then end) wins.

    Args:
        list_start_probs: per passage, a sequence of start scores indexed by token.
        list_end_probs: per passage, a sequence of end scores indexed by token.
        context_span: per passage, an ``(idx_start, idx_end)`` pair bounding the search.
        max_answer_len: exclusive upper bound on ``end - start``.

    Returns:
        list of ``(best_start, best_end)`` tuples, one per passage; ``(-1, -1)`` when
        a passage has no candidate with a score above 0.
    """
    result = []
    for idx in range(len(context_span)):
        best_start, best_end, max_prob = -1, -1, 0
        idx_start = context_span[idx][0]
        idx_end  = context_span[idx][1]
        start_probs = list_start_probs[idx]
        end_probs  = list_end_probs[idx]
        for start_idx in range(idx_start, idx_end):
            # Stop at whichever comes first: the length limit or the passage end.
            # (Previously the loop kept iterating past the passage end and skipped
            # every remaining candidate one by one.)
            end_limit = min(start_idx + max_answer_len, idx_end)
            for end_idx in range(start_idx + 1, end_limit):
                prob = abs(start_probs[start_idx] + end_probs[end_idx])
                if prob > max_prob:
                    best_start = start_idx
                    best_end = end_idx
                    max_prob = prob
        result.append((best_start, best_end))
    return result


In [None]:
def evaluate_dev_test(df, iterator, is_dev=True, dev_path=None):
    """Run the global ``model`` over ``iterator`` and decode one answer per row of ``df``.

    For every sample the answer span is searched between the first and second
    separator token with ``find_best_answer_for_passage``; the tokens in
    ``[start, stop)`` are joined and '#' characters are removed.

    Args:
        df: DataFrame whose rows correspond, in order, to the samples yielded by
            ``iterator``; must have ``query_id`` and serialized ``input_ids`` columns.
        iterator: DataLoader yielding batches with ``input_ids``, ``token_type_ids``
            and ``attention_mask`` lists.
        is_dev: when true, score the predictions against ``dev_path`` and return the
            metrics; when false, return the predictions themselves.
        dev_path: path of the reference JSON file; required when ``is_dev`` is true.

    Returns:
        ``OrderedDict`` with ``F1``, ``EM``, ``TOTAL`` and ``SKIP`` when ``is_dev`` is
        true (it is also printed); otherwise a ``{query_id: prediction}`` dict.

    Raises:
        ValueError: if ``dev_path`` is missing while ``is_dev`` is true, or if the
            number of decoded spans does not match the batch or ``df``.
    """
    if is_dev and dev_path is None:
        # Fail before the (long) inference pass rather than after it.
        raise ValueError("dev_path is required when is_dev=True")

    # Token id used to locate passage boundaries.
    # NOTE(review): presumably the [SEP] id of this vocabulary -- confirm against vocab.txt.
    sep_token_id = 102

    prediction_span = []

    model.eval()
    with torch.no_grad():
        progress = tqdm.tqdm(iterator, total=len(iterator), miniters = 10)
        with progress as batches:
            for batch_samples in batches:
                input_ids = torch.LongTensor(batch_samples['input_ids']).to(device)
                token_type_ids = torch.LongTensor(batch_samples['token_type_ids']).to(device)
                attention_mask = torch.LongTensor(batch_samples['attention_mask']).to(device)

                start_logits, end_logits = model(input_ids,
                                                 token_type_ids = token_type_ids,
                                                 attention_mask = attention_mask)

                # Normalise over the token axis. The axis is given explicitly;
                # omitting it is deprecated and only happens to pick this axis for 2-D input.
                start_probs = F.softmax(start_logits, dim=-1).cpu().numpy().tolist()
                end_probs = F.softmax(end_logits, dim=-1).cpu().numpy().tolist()

                # Column indices of every separator token, in row-major order. With
                # exactly two separators per row, the even entries are the first
                # separator of each row and the odd entries the second.
                sep_columns = np.argwhere(input_ids.cpu().numpy() == sep_token_id)[:, 1]
                context_span = list(zip(sep_columns[::2].tolist(), sep_columns[1::2].tolist()))
                if len(context_span) != len(start_probs):
                    raise ValueError(
                        "expected two separator tokens per sample: found {} spans for {} samples".format(
                            len(context_span), len(start_probs)))

                # Decode the spans batch by batch so the per-token probabilities of
                # the whole split never have to be held in memory at once.
                prediction_span.extend(
                    find_best_answer_for_passage(start_probs, end_probs, context_span))

    if len(prediction_span) != len(df):
        raise ValueError("got {} predictions for {} rows".format(len(prediction_span), len(df)))

    # Turn every (start, stop) pair back into text. The serialized id lists come
    # from the CSV, so they are parsed with literal_eval rather than eval.
    predictions = []
    for serialized_ids, (start, stop) in zip(df['input_ids'], prediction_span):
        answer_ids = ast.literal_eval(serialized_ids)[start:stop]
        answer_tokens = model.tokenizer.convert_ids_to_tokens(answer_ids)
        predictions.append(''.join(answer_tokens).replace('#', ''))

    # Convert to dict format.
    dict_result = dict(zip(df['query_id'], predictions))

    if not is_dev:
        return dict_result

    # Dev set: score against the reference answers.
    with io.open(dev_path, encoding='utf-8') as ref_file:
        ref_ans = json.load(ref_file)
    F1, EM, TOTAL, SKIP = evaluate(ref_ans, dict_result)
    output_result = OrderedDict()
    output_result['F1'] = '%.3f' % F1
    output_result['EM'] = '%.3f' % EM
    output_result['TOTAL'] = TOTAL
    output_result['SKIP'] = SKIP
    print(output_result)
    return output_result

## count_parameters

In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        # Frozen parameters (requires_grad False) are not counted.
        if param.requires_grad:
            total += param.numel()
    return total

In [None]:
count_parameters(model)

# train

In [None]:
# Device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Loss function
criterion = nn.CrossEntropyLoss().to(device)
# Optimizer
optimizer = AdamW(params=model.parameters(), lr=1e-6)
# Best dev F1 seen so far; a checkpoint is written whenever it improves.
best_f1 = 0

# The training references are the same for every epoch: read them once and close
# the file, instead of reopening it (and leaking the handle) at the end of each epoch.
with io.open('./corpus/train.json', encoding='utf-8') as ref_file:
    ref_ans = json.load(ref_file)

# Make sure the checkpoint directory exists before spending an epoch of compute.
os.makedirs('./output', exist_ok=True)

for epoch_id in range(100):
    model.train()
    tr_loss = 0
    nb_tr_steps = 0
    # query_id -> decoded answer, built from the argmax predictions of each training batch.
    dict_result = {}

    pbar = tqdm.tqdm(enumerate(train_iter),
                                 desc='epoch：' + str(epoch_id)  + ' train ',
                                 total=len(train_iter),
                                 miniters = 50,
                                 ncols=50)
    with pbar as t:
        for idx, batch_samples  in t:

            query_id = batch_samples['query_id']
            raw_input_ids = batch_samples['input_ids']

            input_ids = torch.LongTensor(raw_input_ids).to(device)
            token_type_ids = torch.LongTensor(batch_samples['token_type_ids']).to(device)
            attention_mask = torch.LongTensor(batch_samples['attention_mask']).to(device)
            start_position = torch.LongTensor(batch_samples['start_position']).to(device)
            end_position = torch.LongTensor(batch_samples['end_position']).to(device)

            model.zero_grad()
            # Forward pass
            start_predictions, end_predictions = model(input_ids,
                                                       token_type_ids = token_type_ids,
                                                       attention_mask = attention_mask)

            # Independent argmax over the token axis for start and end.
            pre_start_position = torch.max(start_predictions, dim=1)[1].cpu().numpy().tolist()
            pre_end_position = torch.max(end_predictions, dim=1)[1].cpu().numpy().tolist()

            # Decode this batch's predictions right away; the id lists are already
            # Python lists, so no str()/eval() round trip is needed.
            for qid, ids, start, stop in zip(query_id, raw_input_ids,
                                             pre_start_position, pre_end_position):
                tokens = model.tokenizer.convert_ids_to_tokens(ids[start:stop])
                dict_result[qid] = ''.join(tokens).replace('#', '')

            # Average of the start and end cross-entropy losses.
            loss1 = criterion(start_predictions, start_position)
            loss2 = criterion(end_predictions, end_position)
            loss = (loss1 + loss2) / 2
            # Backward pass
            loss.backward()
            optimizer.step()
            tr_loss += loss.item()
            nb_tr_steps += 1
            # Report the running mean loss
            if idx % 50 == 0:
                print("Train loss: {}".format(tr_loss / nb_tr_steps))

    # Training-set metrics for this epoch.
    F1, EM, TOTAL, SKIP = evaluate(ref_ans, dict_result)
    output_result = OrderedDict()
    output_result['F1'] = '%.3f' % F1
    output_result['EM'] = '%.3f' % EM
    output_result['TOTAL'] = TOTAL
    output_result['SKIP'] = SKIP
    print(output_result)

    # Report dev-set performance after every epoch.
    # (.format was missing here, so the banner used to print a literal "{}".)
    print('epoch{}：dev----------------------------------'.format(epoch_id))
    dev_result = evaluate_dev_test(df_dev, dev_iter, dev_path='./corpus/dev.json')
    dev_f1_score = float(dev_result['F1'])
    if dev_f1_score > best_f1:
        print('最好的成绩为{}'.format(dev_f1_score))
        print('正在保存模型')
        best_f1 = dev_f1_score
        torch.save(model.state_dict(), r'./output/large_parameter_v1.pkl')

# predict_test

In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Bert_QA(num_labels = 2)
# Deserialize the checkpoint onto the CPU first. Without map_location the tensors are
# restored to the device they were saved from, which fails on a CPU-only machine and
# otherwise places a second full copy of the weights on the GPU (presumably what led
# to the CUDA out-of-memory error recorded for this cell).
state_dict = torch.load('./output/large_parameter_v1.pkl', map_location='cpu')
load_status = model.load_state_dict(state_dict)
del state_dict
model = model.to(device)
# This cell only feeds inference, so switch dropout off.
model.eval()
load_status

I0417 04:44:55.645853 139900671473408 configuration_utils.py:281] loading configuration file /home/zhoujx/Pretrained_models/chinese_roberta_wwm_large_ext_pytorch/bert_config.json
I0417 04:44:55.647903 139900671473408 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "m

RuntimeError: CUDA error: out of memory

In [49]:
# Score the dev split. With is_dev=True the call returns the F1/EM/TOTAL/SKIP summary.
dev_result = evaluate_dev_test(df_dev, dev_iter, dev_path='./corpus/dev.json', is_dev=True)
# Alternative form: with is_dev=False the call returns the {query_id: prediction}
# dict instead. NOTE(review): the later cells that pass dev_result to evaluate() and
# index it by query id need this dict form, not the summary above.
# dev_result = evaluate_dev_test(df_dev, dev_iter, is_dev=False)

100%|██████████| 355/355 [01:04<00:00,  5.54it/s]


OrderedDict([('F1', '85.105'), ('EM', '73.394'), ('TOTAL', 1417), ('SKIP', 0)])


In [24]:
# Read the dev references with an explicit encoding and close the file afterwards
# (the handle returned by a bare io.open() call was never closed).
with io.open('./corpus/dev.json', encoding='utf-8') as ref_file:
    ref_ans = json.load(ref_file)
# dev_result must be the {query_id: prediction} dict here.
F1, EM, TOTAL, SKIP = evaluate(ref_ans, dev_result)
output_result = OrderedDict()
output_result['F1'] = '%.3f' % F1
output_result['EM'] = '%.3f' % EM
output_result['TOTAL'] = TOTAL
output_result['SKIP'] = SKIP
# 

In [25]:
output_result

OrderedDict([('F1', '85.171'), ('EM', '73.536'), ('TOTAL', 1417), ('SKIP', 0)])

In [72]:
dev_result['40b0e1f5d4823443e39ed1ecc7222456']

'710点券'

In [51]:
test_result = evaluate_dev_test(df_test, test_iter, is_dev=False)

100%|██████████| 12500/12500 [38:09<00:00,  5.47it/s]


In [37]:
df_test[df_test.query_id=='f9a31c67c8b7a1f9c9b8d8070bbe011d']

Unnamed: 0,query_id,context,question,input_ids,token_type_ids,attention_mask
48699,f9a31c67c8b7a1f9c9b8d8070bbe011d,"据了解,目前白云机场官网已授权可办理网上值机的航空公司仅限于以下航空公司的国内航班,包括中国...",白云机场 值机流程客服,"[101, 4635, 756, 3322, 1767, 966, 3322, 3837, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


In [52]:
test_result

{'3c41636fb3f3a1bca8dbf60e1d9a8d18': '橙色',
 '8edd3333dcff47508ebba5a6249fa8e9': '10年以上',
 '386169bb13528eb53a923e3f068cf0db': '5万元',
 '362d218b681886d4644b6c9ca991675a': '实木颗粒板',
 '9754ed445e8745360b75665c315804f9': '胡桃木',
 'd9554f0cb5e1734f2851d80b896c87c4': '38码',
 'f8f80bf67d1843f6922139b150eb9f66': '86.3万人',
 '3a3e49397f9eb555c2035b98dddefb95': '86.3万人',
 '41daac805992de358a1aa00d44d05a27': '86.3万人',
 'b93b8c2a5f088bd7474617044ff72c7d': '86.3万人',
 '673f816d6674f6b3b5986bf9d6603b69': '86.3万人',
 '4c4041532e3ef04eb21480e3fbe1ae6b': '86.3万人',
 '9789245b3deac143ef554bd648b3a1df': '86.3万人',
 '03276b91aa5745909bd14b127bd9b506': '86.3万人',
 '9484d7d9f4d130260cf53859f3e16cba': '86.3万人',
 '66c810df4e04a1885ca66e499b931a6c': '86.3万人',
 'd9fc999fe63a862f2fdfeb19a994a0e9': '86.3万人',
 '1948b22507731093bbdb2941fee215c7': '86.3万人',
 'fe7e7fde6c9aa0ff18e3556a4c8227fa': '86.3万人',
 '237278e9cc2d9e9317a3a33cf9abb8a2': '86.3万人',
 '9e2328cdd18ac61e8f2772051d8c2470': '86.3万人',
 '511da775a003a4b0e760f4029e

In [53]:
# ensure_ascii=False writes the Chinese answers verbatim, so the file must be opened
# as UTF-8 explicitly; the locale default encoding may not be able to represent them.
with open('result.json', 'w', encoding='utf-8') as file_obj:
    json.dump(test_result, file_obj, ensure_ascii=False)

In [262]:
len(test_result)

50000

# predict


In [39]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = Bert_QA(num_labels = 2)
# Deserialize the checkpoint onto the CPU first. Without map_location the tensors are
# restored to the device they were saved from, which fails on a CPU-only machine and
# otherwise places a second full copy of the weights on the GPU.
state_dict = torch.load('./output/large_parameter_v1.pkl', map_location='cpu')
load_status = model.load_state_dict(state_dict)
del state_dict
model = model.to(device)
# The following cells call model.predict(); switch dropout off so the printed
# answers are deterministic.
model.eval()
load_status

I0417 07:30:17.594708 139999693768448 configuration_utils.py:281] loading configuration file /home/zhoujx/Pretrained_models/chinese_roberta_wwm_large_ext_pytorch/bert_config.json
I0417 07:30:17.596580 139999693768448 configuration_utils.py:319] Model config BertConfig {
  "_num_labels": 2,
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "directionality": "bidi",
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "min_length": 0,
  "m

<All keys matched successfully>

In [40]:
# text1
text1 = '周杰伦（Jay Chou），1979年1月18日出生于台湾省新北市，\
祖籍福建省泉州市永春县，中国台湾流行乐男歌手、原创音乐人、演员、导演等，毕业于淡江中学。'

In [41]:
model.predict('周杰伦',  text1)




In [24]:
# text2
text2 = '4月14日晚间，格力电器发布了2019年度业绩快报，实现营业总收入2005亿元，\
同比上涨0.24%。归母净利润246.72亿元，同比下降5.84%。\
智通财经APP获悉，此前雷军与董明珠开始新的五年赌约，2019年小米(01810)全年实现营收达2058亿，同比增长17.7%，\
小米集团总体营收已超过格力电器。根据中金观点，目前格力电器开启去库存周期，看好公司明年价值。\
而据此前小米财报显示，小米2019年即有意降低库存，2020年全面备战5G。'

In [37]:
model.predict('小米', text2)

205##8亿
