A QA example for ELECTRA using Google Colab.  
This example is from https://github.com/ymcui/Chinese-ELECTRA.

In [None]:
# Mount Google Drive at /content/drive; the later cells change into a project
# folder that lives on the mounted drive.
from google.colab import drive
drive.mount('/content/drive')

Preprocess the data the same way the original source code does.

In [None]:
%tensorflow_version 1.x

import os
import sys
import warnings
import json
import pickle
import tensorflow as tf

# Change into the project folder using an absolute path. Drive is mounted at
# /content/drive, and a relative './drive/...' only resolves on the first run:
# once the working directory has moved, re-running this cell would fail.
os.chdir('/content/drive/My Drive/Python/Research/bert')
# Relative to the project folder entered above; makes the task utilities importable.
sys.path.append('tasks/datasets/cmrc_2018/utils')
warnings.filterwarnings('ignore')

from tasks.datasets.cmrc_2018.utils import configure_finetuning
from tasks.datasets.cmrc_2018.utils.finetune import task_builder

In [3]:
def data_processing(model, data, param, save, training=True):
  """Featurize one split of the task and pickle the result as a dict of lists.

  Args:
    model: model name forwarded to `configure_finetuning.FinetuningConfig`.
    data: data directory forwarded to `configure_finetuning.FinetuningConfig`.
    param: path of a JSON file whose entries are passed to the config as
      keyword arguments.
    save: path of the pickle file to write.
    training: when true the 'train' split is featurized and the start/end
      positions are kept; otherwise the 'dev' split is featurized (with
      `for_eval=True`) and the token, token-to-original map and max-context
      fields are kept.

  The pickled dict always has the keys 'id', 'qid', 'input', 'seg', 'mask',
  'start', 'end', 'token', 'map' and 'max'; lists that do not apply to the
  chosen split stay empty.
  """
  with tf.io.gfile.GFile(param, 'r') as handle:
    overrides = json.load(handle)

  config = configure_finetuning.FinetuningConfig(model, data, **overrides)
  task = task_builder.get_tasks(config)[0]
  examples = task.get_examples('train' if training else 'dev')

  # Output column -> key of the feature dict it is copied from.
  fields = [
    ('id', 'cmrc2018_eid'),
    ('input', 'input_ids'),
    ('seg', 'segment_ids'),
    ('mask', 'input_mask')]

  if training:
    fields += [
      ('start', 'cmrc2018_start_positions'),
      ('end', 'cmrc2018_end_positions')]
  else:
    fields += [
      ('token', 'cmrc2018_tokens'),
      ('map', 'cmrc2018_token_to_orig_map'),
      ('max', 'cmrc2018_token_is_max_context')]

  names = ('id', 'qid', 'input', 'seg', 'mask', 'start', 'end', 'token', 'map', 'max')
  columns = {name: [] for name in names}

  for seen, example in enumerate(examples, 1):
    features = task.featurize(example, training, for_eval=not training)

    # featurize may hand back a single feature dict or a list of them.
    if not isinstance(features, list):
      features = [features]

    for feature in features:
      columns['qid'].append(example.qas_id)

      for name, key in fields:
        columns[name].append(feature[key])

    if seen % 1000 == 0:
      print(str(seen)+' samples processed.')

  with open(save, 'wb') as handle:
    pickle.dump(columns, handle)


# Model name, data directory and JSON parameter file handed to data_processing.
model_1 = 'electra_small_ch'
data_1 = 'tasks/datasets/cmrc_2018'
param_1 = 'tasks/datasets/cmrc_2018/utils/params_cmrc2018.json'
# Write one pickle per split: training features (True) and dev features (False).
data_processing(model_1, data_1, param_1, 'tasks/datasets/cmrc_2018/train.pkl', True)
data_processing(model_1, data_1, param_1, 'tasks/datasets/cmrc_2018/dev.pkl', False)

Build a QA model based on ELECTRA.

In [None]:
%tensorflow_version 2.x

import os
import warnings
import time
import json
import pickle
import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras

# Change into the project folder using an absolute path. Drive is mounted at
# /content/drive, and a relative './drive/...' only resolves on the first run:
# once the working directory has moved, re-running this cell would fail.
os.chdir('/content/drive/My Drive/Python/Research/bert')
warnings.filterwarnings('ignore')
nltk.download('punkt')

import mymodels as mm

In [2]:
# Model identity and file locations (relative to the working directory).
MODEL = 'electra'
VOCAB = 'models/electra_small_ch/vocab.txt'
CONFIG = 'models/electra_small_ch/electra_config.json'
CKPT = 'models/electra_small_ch/electra_small'
SAVE = 'tasks/models/cmrc_2018/model'
PRED = 'tasks/datasets/cmrc_2018/pred.json'

# MAXLEN is handed to the tokenizer when encoding free text; ModelQA.searching
# rejects answer spans longer than ANSLEN tokens.
MAXLEN = 512
ANSLEN = 30

# Optimisation settings. BEAM is the number of candidates ModelELECTRA keeps
# in each top_k call. LMODE and LDECAY are forwarded to mm.AdamW as `lmode`
# and `ldecay`.
LRATE = 3e-4
BATCH = 32
EPOCH = 2
BEAM = 20
LMODE = 2

# 0.8**13 for the embeddings, then one power less per encoder layer, ending at
# 0.8**1 for layer 11. Presumably per-layer learning-rate factors — confirm
# against mm.AdamW.
LDECAY = {'embedding': 0.8**13}
LDECAY.update({'encoder/layer_{}'.format(depth): 0.8**(12-depth) for depth in range(12)})

In [3]:
def data_processing(path, batch, training=True):
  """Load a pickled feature file and expose it as a batched tf.data.Dataset.

  Args:
    path: pickle file holding a dict with at least the keys 'input', 'seg'
      and 'mask' (plus 'start' and 'end' when `training` is true).
    batch: batch size.
    training: when true the dataset also yields the start/end labels and is
      shuffled with a buffer covering every sample; otherwise it yields only
      the three inputs, in file order.

  Returns:
    (dataset, raw, count): the batched dataset, the unpickled dict and the
    number of entries in raw['input'].
  """
  # The pickle is expected to be the file written by the preprocessing cell;
  # do not point this at files from untrusted sources.
  with open(path, 'rb') as handle:
    raw = pickle.load(handle)

  count = len(raw['input'])
  names = ['input', 'seg', 'mask']

  if training:
    names = names+['start', 'end']

  arrays = tuple(np.array(raw[name]) for name in names)
  dataset = tf.data.Dataset.from_tensor_slices(arrays)

  if training:
    # arrays[3] is the 'start' column; its length is the shuffle buffer size.
    dataset = dataset.shuffle(len(arrays[3]))

  return dataset.batch(batch), raw, count


# Each call returns (batched dataset, raw unpickled dict, number of samples).
# The training set is shuffled; the dev set keeps file order, which the
# prediction cell relies on when it slices file_2 batch by batch.
training_1, file_1, len_1 = data_processing('tasks/datasets/cmrc_2018/train.pkl', BATCH, True)
dev_1, file_2, len_2 = data_processing('tasks/datasets/cmrc_2018/dev.pkl', BATCH, False)

In [4]:
class ModelELECTRA(keras.Model):
  """Encoder from `mm.BERT` with a start-position head and an end-position head.

  The end head scores every position from the concatenation of the encoder
  output at a chosen start position and the encoder output at that position.
  """

  def __init__(self, model, config, beam):
    """Create the encoder and the heads.

    Args:
      model: forwarded to `mm.BERT` as its second argument.
      config: forwarded to `mm.BERT` as its first argument.
      beam: number of candidates kept by each top_k call at inference time.
    """
    super().__init__()
    self.beam = beam
    self.bert = mm.BERT(config, model, 'seq')
    self.dense1 = keras.layers.Dense(1)
    self.dense2 = keras.layers.Dense(512, activation=mm.gelu_activating)
    self.dense3 = keras.layers.Dense(1)

  def propagating(self, text, segment, mask, training=True, start=None):
    """Score start and end positions.

    Args:
      text, segment, mask: model inputs; `mask` must have a static second
        dimension. Assumes mask is 1 on real tokens and segment is 1 on
        context tokens — TODO confirm against the feature files.
      training: selects the teacher-forced branch (true) or the beam branch.
      start: gold start positions, required when `training` is true.

    Returns:
      Training: (start log-probabilities, end log-probabilities).
      Otherwise: (start log-probabilities, end log-probabilities per start
      candidate, top-k start scores, top-k start indices, top-k end scores,
      top-k end indices).
    """
    seq_len = mask.shape[1]
    # The encoder receives the inverted mask.
    hidden = self.bert.propagating(text, segment, 1-mask, training)

    # Selectable positions: where mask and segment are both 1, plus position 0.
    allowed = tf.cast(mask*segment, tf.float32)+tf.one_hot(0, seq_len)
    # 0 on allowed positions, -1000 elsewhere; added to the logits before log_softmax.
    penalty = 1000.0*(allowed-1)

    start_logits = self.dense1(hidden)[:, :, 0]+penalty
    start_logp = tf.nn.log_softmax(start_logits)

    if not training:
      # Keep the `beam` best starts and score an end distribution for each.
      start_prob, start_idx = tf.nn.top_k(start_logp, k=self.beam)
      context = tf.tile(tf.expand_dims(hidden, 1), [1, self.beam, 1, 1])
      picked = tf.one_hot(start_idx, depth=seq_len, axis=-1, dtype=tf.float32)
      # One-hot weighted sum picks the encoder output at each candidate start.
      start_feat = tf.reduce_sum(tf.expand_dims(hidden, 1)*tf.expand_dims(picked, -1), axis=-2)
      start_feat = tf.tile(tf.expand_dims(start_feat, 2), [1, 1, seq_len, 1])
      pair = tf.concat([start_feat, context], -1)
      end_logits = self.dense3(self.dense2(pair))[:, :, :, 0]
      end_logits = end_logits+tf.expand_dims(penalty, 1)
      end_logp = tf.nn.log_softmax(end_logits)
      end_prob, end_idx = tf.nn.top_k(end_logp, k=self.beam)
      return start_logp, end_logp, start_prob, start_idx, end_prob, end_idx

    # Teacher forcing: condition the end head on the gold start position.
    picked = tf.one_hot(start, depth=seq_len, axis=-1, dtype=tf.float32)
    start_feat = tf.reduce_sum(tf.expand_dims(picked, -1)*hidden, axis=1)
    start_feat = tf.tile(tf.expand_dims(start_feat, 1), [1, seq_len, 1])
    pair = tf.concat([start_feat, hidden], -1)
    end_logits = self.dense3(self.dense2(pair))[:, :, 0]
    end_logits = end_logits+penalty
    end_logp = tf.nn.log_softmax(end_logits)
    return start_logp, end_logp


class ModelQA(object):
  """Inference helper: encodes text, searches the best answer span and decodes it."""

  def __init__(self, tokenizer, model, maxlen, anslen):
    """Store the collaborators and limits.

    Args:
      tokenizer: object exposing `vocab` (a mapping) and `encoding(a, b, maxlen)`.
      model: object exposing `propagating(input, seg, mask, training)`.
      maxlen: maximum length forwarded to `tokenizer.encoding`.
      anslen: longest accepted answer span, in tokens.
    """
    self.maxlen = maxlen
    self.anslen = anslen
    self.model = model
    self.tokenizer = tokenizer
    # Position i is used as the token for id i when decoding; assumes the
    # vocab mapping iterates in id order — TODO confirm against mm.Tokenizer.
    self.vocab = list(self.tokenizer.vocab.keys())

  def processing(self, data, label=False):
    """Encode (first text, second text[, start, end]) items into arrays.

    Returns:
      (text, segment, mask, start, end) as numpy arrays. The mask is
      1 minus the mask produced by the tokenizer; start and end are empty
      unless `label` is true.
    """
    texts, segments, masks = [], [], []
    starts, ends = [], []

    for item in data:
      ids, segs, pads = self.tokenizer.encoding(item[0], item[1], self.maxlen)
      texts.append(ids)
      segments.append(segs)
      masks.append(pads)

      if label:
        starts.append(item[2])
        ends.append(item[3])

    return (
      np.array(texts),
      np.array(segments),
      1-np.array(masks),
      np.array(starts),
      np.array(ends))

  def searching(self, input, seg, mask, constraint=None):
    """Pick the best-scoring admissible span for every sample in the batch.

    Args:
      input, seg, mask: model inputs for one batch.
      constraint: optional [tokens, token_to_orig_map, token_is_max_context],
        each indexed by position in the batch. When given, spans must lie
        inside the token list, both ends must be keys of the map, and the
        start must have a truthy max-context entry; the answer text is then
        joined from the token list instead of the vocabulary.

    Returns:
      One dict per sample with 'reply' (the answer text, or 'empty' when no
      span qualified) and 'probability' (summed start and end scores, or
      -1000 when no span qualified).
    """
    _, _, start_prob, start_idx, end_prob, end_idx = self.model.propagating(input, seg, mask, False)
    start_prob, start_idx = start_prob.numpy(), start_idx.numpy()
    end_prob, end_idx = end_prob.numpy(), end_idx.numpy()

    constrained = constraint is not None

    if constrained:
      tokens, origin, maxctx = constraint[0], constraint[1], constraint[2]

    replies = []

    for row in range(start_idx.shape[0]):
      answer, best = 'empty', -1000

      for rank in range(start_idx.shape[1]):
        begin, begin_score = start_idx[row, rank], start_prob[row, rank]

        for cand in range(end_idx.shape[2]):
          finish = end_idx[row, rank, cand]
          score = begin_score+end_prob[row, rank, cand]

          # Reject: worse than the current best, start on position 0,
          # reversed span, or span longer than the configured limit.
          rejected = (
            score < best
            or begin == 0
            or begin > finish
            or finish-begin+1 > self.anslen)

          if rejected:
            continue

          if constrained:
            limit = len(tokens[row])
            usable = (
              begin < limit
              and finish < limit
              and begin in origin[row]
              and finish in origin[row]
              and maxctx[row].get(begin, False))

            if not usable:
              continue

            text = ''.join(tokens[row][begin:finish+1])
          else:
            text = ''.join([self.vocab[piece] for piece in input[row][begin:finish+1]])

          answer, best = text, score

      # Strip the word-piece continuation markers.
      answer = answer.replace(' ##', '').replace('##', '')
      replies.append({'reply': answer, 'probability': best})
      # TODO: final post-processing of the predicted answers is not implemented.

    return replies

  def replying(self, context, question):
    """Answer one question about one context and return the answer text."""
    encoded = self.processing([[question, context]])
    ids, segs, mask = encoded[0], encoded[1], encoded[2]
    return self.searching(ids, segs, mask)[0]['reply']


def loss_computing(predstart, predend, start, end):
  """Average of the start and end negative log-likelihoods.

  Args:
    predstart, predend: per-position scores with a static second dimension;
      the training step passes the log_softmax outputs of the model.
    start, end: gold position indices.

  Returns:
    Scalar: the mean over the batch of minus the score at the gold start,
    plus the same quantity for the gold end, divided by two.
  """
  # Both one-hot encodings use the width of the start scores.
  depth = predstart.shape[1]

  def picked_mean(scores, positions):
    # Batch mean of the score found at each gold position.
    onehot = tf.one_hot(positions, depth=depth, dtype=tf.float32)
    return tf.reduce_mean(tf.reduce_sum(onehot*scores, axis=-1))

  start_nll = -picked_mean(predstart, start)
  end_nll = -picked_mean(predend, end)
  return (start_nll+end_nll)/2.0

In [5]:
# Load the vocabulary, build the reader and load the checkpoint into its encoder.
tokenizer_1 = mm.Tokenizer()
tokenizer_1.loading(VOCAB)
model_1 = ModelELECTRA(MODEL, CONFIG, BEAM)
model_1.bert.loading(CKPT)
# The first argument is EPOCH times a per-epoch batch count, presumably the
# total number of optimizer steps for mm.AdamW's schedule — TODO confirm.
# NOTE(review): int(len_1/BATCH)+1 is one more than the actual number of
# batches per epoch whenever len_1 is an exact multiple of BATCH.
optimizer_1 = mm.AdamW(EPOCH*(int(len_1/BATCH)+1), LRATE, lmode=LMODE, ldecay=LDECAY)
# Running mean of the batch losses; the training loop resets it after each report.
loss_1 = tf.keras.metrics.Mean(name='training_loss')


@tf.function
def step_training(text, segment, mask, start, end):
  """Run one optimisation step on a batch and record its loss in `loss_1`.

  Gradients are clipped to a global norm of 1.0 before being applied by
  `optimizer_1`.
  """
  with tf.GradientTape() as tape:
    start_logp, end_logp = model_1.propagating(text, segment, mask, True, start)
    batch_loss = loss_computing(start_logp, end_logp, start, end)

  # Read the variable list only after the forward pass, so that variables
  # created lazily during the first call are part of it.
  variables = model_1.trainable_variables
  gradients = tape.gradient(batch_loss, variables)
  clipped, _ = tf.clip_by_global_norm(gradients, 1.0)
  optimizer_1.apply_gradients(zip(clipped, variables))
  loss_1(batch_loss)

In [6]:
# Progress message template. The reported step cost is the wall-clock time of
# the most recent step only, not an average.
temp_1 = 'Epoch {} running, training loss is {}, and step cost is {}.'
# Step counter that keeps running across epochs.
count_1 = 0

for e_1 in range(EPOCH):
  # Each batch is (input ids, segment ids, mask, start positions, end positions).
  for x_1, x_2, x_3, y_1, y_2 in training_1:
    time_1, count_1 = time.time(), count_1+1
    step_training(x_1, x_2, x_3, y_1, y_2)

    # Every 100 steps: print the mean loss since the previous report, then
    # reset the metric so the next report covers a fresh window.
    if count_1 % 100 == 0:
      o_1 = round(float(loss_1.result()), 4)
      print(temp_1.format(e_1+1, o_1, round(time.time()-time_1, 4)))
      loss_1.reset_states()

Epoch 1 running, training loss is 3.292, and step cost is 0.5161.
Epoch 1 running, training loss is 1.9053, and step cost is 0.5154.
Epoch 1 running, training loss is 1.6242, and step cost is 0.5158.
Epoch 1 running, training loss is 1.4967, and step cost is 0.5157.
Epoch 1 running, training loss is 1.3801, and step cost is 0.5158.
Epoch 2 running, training loss is 1.2286, and step cost is 0.5191.
Epoch 2 running, training loss is 1.1254, and step cost is 0.5184.
Epoch 2 running, training loss is 1.1072, and step cost is 0.514.
Epoch 2 running, training loss is 1.0529, and step cost is 0.5176.
Epoch 2 running, training loss is 1.0585, and step cost is 0.5173.


In [7]:
# Persist the fine-tuned weights, then wrap the model for span search.
# ModelQA keeps the model passed to its constructor, so no further
# assignment of `qamodel_1.model` is needed.
model_1.save_weights(SAVE)
qamodel_1 = ModelQA(tokenizer_1, model_1, MAXLEN, ANSLEN)

# Smoke test on a hand-written passage and question.
context_1 = '陈某生于一九九三年，上海浦东新区人氏。他英俊潇洒、骁勇善战、足智多谋，是不可多得的猛将。'
question_1 = '陈某是哪里人？'
print(qamodel_1.replying(context_1, question_1))

上海浦东新区人氏


In [8]:
# Collect one frame per batch and concatenate once at the end: growing a
# DataFrame inside the loop is quadratic, and DataFrame.append no longer
# exists in pandas 2.x.
replies_1 = []

for i_1, x_1 in enumerate(dev_1):
  # dev_1 is not shuffled, so batch i_1 lines up with this slice of the raw features.
  span_1 = slice(i_1*BATCH, (i_1+1)*BATCH)
  id_1 = file_2['id'][span_1]
  qid_1 = file_2['qid'][span_1]
  tok_1 = file_2['token'][span_1]
  map_1 = file_2['map'][span_1]
  max_1 = file_2['max'][span_1]

  reply_1 = pd.DataFrame(qamodel_1.searching(x_1[0], x_1[1], x_1[2], [tok_1, map_1, max_1]))
  reply_1['id'] = id_1
  reply_1['qid'] = qid_1
  replies_1.append(reply_1)

df_1 = pd.concat(replies_1, ignore_index=True)

# A question can appear in several rows; keep the row with the highest score
# for each qid, then restore the order given by the feature ids.
df_1 = df_1.sort_values(['probability']).reset_index(drop=True).drop_duplicates(['qid'], keep='last')
df_1 = df_1.sort_values(['id']).reset_index(drop=True)

# Write {qid: reply}; the context manager closes the file so the evaluation
# script reads a fully flushed file.
with open(PRED, 'w') as out_1:
  json.dump(df_1.set_index('qid')['reply'].to_dict(), out_1)

Evaluate the QA model with the provided script.

In [9]:
# Run the evaluation script on the dev set file (first argument) and the
# predictions file (second argument, the same path as PRED).
!python \
  tasks/datasets/cmrc_2018/utils/cmrc2018_drcd_evaluate.py \
  tasks/datasets/cmrc_2018/dev.json \
  tasks/datasets/cmrc_2018/pred.json

{"AVERAGE": "71.159", "F1": "80.528", "EM": "61.789", "TOTAL": 3219, "SKIP": 0}
