In [None]:
# Colab-only setup: install google-drive-ocamlfuse, authorise it against the user's
# Google account, mount Drive at ./drive and switch into drive/bert4keras.
# NOTE(review): the recorded output shows apt rejecting 'python-software-properties'
# ("has no installation candidate") - confirm whether that package is still needed.
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
# NOTE(review): with `2>&1 > /dev/null` only stdout is discarded; stderr still reaches the
# notebook. `> /dev/null 2>&1` would silence both - confirm which was intended.
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the Colab user and fetch the application-default credentials whose
# client id / secret are handed to google-drive-ocamlfuse below.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First headless run: stdin is empty, so the tool only reports the authorisation URL
# (grep keeps just the line containing "URL").
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code without echoing it, then pipe it into a second headless run.
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

# Mount Google Drive at ./drive.
!mkdir -p drive
!google-drive-ocamlfuse drive  -o nonempty

# Work from the project folder so the relative paths used later ('uer/...', 'ape210k/...')
# resolve inside drive/bert4keras. `os` imported here is also used by a later cell.
import os
os.chdir('drive/bert4keras')
import sys
# Make the Python 3.6 user site-packages directory importable.
sys.path.append('/root/.local/lib/python3.6/site-packages')

E: Package 'python-software-properties' has no installation candidate
Selecting previously unselected package google-drive-ocamlfuse.
(Reading database ... 145480 files and directories currently installed.)
Preparing to unpack .../google-drive-ocamlfuse_0.7.23-0ubuntu1~ubuntu18.04.1_amd64.deb ...
Unpacking google-drive-ocamlfuse (0.7.23-0ubuntu1~ubuntu18.04.1) ...
Setting up google-drive-ocamlfuse (0.7.23-0ubuntu1~ubuntu18.04.1) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope

In [None]:
%tensorflow_version 1.x
import tensorflow as tf
!pip install bert4keras

TensorFlow 1.x selected.
Collecting bert4keras
[?25l  Downloading https://files.pythonhosted.org/packages/38/bd/acb933644c7c205a487f2982e073a16553db3428b8fa903bd74151931000/bert4keras-0.9.7.tar.gz (45kB)
[K     |████████████████████████████████| 51kB 7.3MB/s 
Building wheels for collected packages: bert4keras
  Building wheel for bert4keras (setup.py) ... [?25l[?25hdone
  Created wheel for bert4keras: filename=bert4keras-0.9.7-cp36-none-any.whl size=43299 sha256=f549508b9b16902f907d68bd821f951ad5f0d2d2c9dc51af6e48df82dfce3392
  Stored in directory: /root/.cache/pip/wheels/71/7b/06/4b4bca2005cfccd3a157cb012d1f91a83c252442c9358c238c
Successfully built bert4keras
Installing collected packages: bert4keras
Successfully installed bert4keras-0.9.7


In [None]:
from __future__ import division
import json, re
import numpy as np
import pandas as pd
from tqdm import tqdm
from bert4keras.backend import keras, K
from bert4keras.layers import Loss
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer, load_vocab
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, open
from bert4keras.snippets import DataGenerator, AutoRegressiveDecoder
from keras.models import Model
from sympy import Integer

# Restrict CUDA to device 0. `os` is not imported in this cell: it comes from the
# Drive-mount setup cell, which therefore has to be run first.
os.environ["CUDA_VISIBLE_DEVICES"]="0"

Using TensorFlow backend.


# Experiment

In [None]:
# Basic parameters
maxlen = 192  # passed as maxlen to tokenizer.encode for every sample
batch_size = 32
epochs = 25

# BERT base configuration (paths are relative to the current working directory)
config_path = 'uer/mixed_corpus_bert_base_model/bert_config.json'
checkpoint_path = 'uer/mixed_corpus_bert_base_model/bert_model.ckpt'
dict_path = 'uer/mixed_corpus_bert_base_model/vocab.txt'
# Weights file loaded by the "Load and Predict" cell through model.load_weights(weights_path).
# NOTE(review): Evaluator saves to 'best_model.weights' in the working directory, not to this
# path - confirm how the file gets here.
weights_path = 'ape210k/bestmodel/best_model.weights'


## Definition

In [None]:
def is_equal(a, b):
  """Return True when ``a`` and ``b`` agree after rounding to 6 decimal places.

  Both arguments are converted with ``float`` first, so anything ``float`` accepts
  (ints, floats, numeric strings) may be passed; other values raise whatever
  ``float`` raises.
  """
  lhs, rhs = (round(float(value), 6) for value in (a, b))
  return lhs == rhs


def remove_bucket(equation):
  """Drop every bracket pair whose removal does not change the equation's value.

  Args:
    equation: arithmetic expression as a string that ``eval`` can evaluate.

  Returns:
    The expression with redundant ``(...)`` pairs removed and all spaces stripped.

  Raises:
    Whatever ``eval(equation)`` raises for an invalid expression, and ``IndexError``
    when a ``)`` has no matching ``(``. Callers are expected to handle these.
  """
  open_positions, pairs = [], []
  for i, c in enumerate(equation):
    if c == '(':
      open_positions.append(i)
    elif c == ')':
      pairs.append((open_positions.pop(), i))
  # NOTE(review): eval() executes arbitrary Python - only feed it trusted dataset text.
  eval_equation = eval(equation)
  for l, r in pairs:
    # Each bracket is replaced by a single space instead of being deleted, so the string
    # keeps its length and the indices recorded for the other pairs stay valid.
    new_equation = '%s %s %s' % (
      equation[:l], equation[l + 1:r], equation[r + 1:]
    )
    try:
      if is_equal(eval(new_equation.replace(' ', '')), eval_equation):
        equation = new_equation
    except Exception:
      # The candidate without this pair is not evaluable (syntax error, division by
      # zero, ...): keep the brackets. KeyboardInterrupt/SystemExit are no longer
      # swallowed, unlike with the previous bare `except:`.
      continue
  return equation.replace(' ', '')
  
class data_generator(DataGenerator):
  """Batch generator over (question, equation, answer) samples.

  Each sample is encoded with the module-level ``tokenizer`` as a (question, equation)
  pair truncated to the module-level ``maxlen``; the answer is not used here.
  """

  def __iter__(self, random=False):
    """Yield ``([token_ids, segment_ids], None)`` batches of padded id arrays.

    The target slot is ``None``: no separate labels are supplied by this generator.
    A batch is emitted once it holds ``self.batch_size`` samples or the data ends.
    """
    token_batch, segment_batch = [], []
    for is_end, (question, equation, answer) in self.sample(random):
      ids, segments = tokenizer.encode(question, equation, maxlen=maxlen)
      token_batch.append(ids)
      segment_batch.append(segments)
      batch_is_full = len(token_batch) == self.batch_size
      if not (batch_is_full or is_end):
        continue
      padded_tokens = sequence_padding(token_batch)
      padded_segments = sequence_padding(segment_batch)
      yield [padded_tokens, padded_segments], None
      token_batch, segment_batch = [], []


class CrossEntropy(Loss):
  """Loss layer computing a masked cross-entropy over next-token predictions."""

  def compute_loss(self, inputs, mask=None):
    """Return the mean cross-entropy over the positions selected by the segment ids.

    ``inputs`` is unpacked as (token ids, segment ids, predictions). Predictions at
    position t are scored against the token at position t + 1.
    """
    token_ids, segment_ids, predictions = inputs
    targets = token_ids[:, 1:]  # target token ids
    weights = segment_ids[:, 1:]  # segment ids mark exactly the part to be predicted
    shifted = predictions[:, :-1]  # predicted sequence, offset by one position
    per_token = K.sparse_categorical_crossentropy(targets, shifted)
    return K.sum(per_token * weights) / K.sum(weights)

class AutoSolve(AutoRegressiveDecoder):
  """Equation decoder that uses BERT + UniLM (the module-level ``model``)."""

  @AutoRegressiveDecoder.wraps(default_rtype='probas')
  def predict(self, inputs, output_ids, states):
    """Return the model output at the last position for the sequence decoded so far.

    The ids generated so far are appended to the prompt ids, and their segment ids
    are all ones.
    """
    prompt_ids, prompt_segments = inputs
    full_ids = np.concatenate([prompt_ids, output_ids], 1)
    generated_segments = np.ones_like(output_ids)
    full_segments = np.concatenate([prompt_segments, generated_segments], 1)
    predictions = model.predict([full_ids, full_segments])
    return predictions[:, -1]

  def generate(self, text, topk=1):
    """Encode ``text``, decode with beam search and return the text without spaces."""
    token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
    # beam search with beam width `topk`
    best_ids = self.beam_search([token_ids, segment_ids], topk)
    decoded = tokenizer.decode(best_ids)
    return decoded.replace(' ', '')

class Evaluator(keras.callbacks.Callback):
  """Keras callback: score the model on ``valid_data`` after every epoch and save the
  weights whenever the accuracy is at least as good as the best seen so far.

  Relies on the module-level ``valid_data``, ``model`` and ``autosolve``.
  """

  def __init__(self):
    # Run the base-class initialiser too, so the callback is set up the way Keras
    # expects before best_acc is added.
    super(Evaluator, self).__init__()
    self.best_acc = 0.

  def on_epoch_end(self, epoch, logs=None):
    """Evaluate on valid_data, checkpoint on a new (or equal) best, print the metrics."""
    metrics = self.evaluate(valid_data)  # evaluate the model
    if metrics['acc'] >= self.best_acc:
      self.best_acc = metrics['acc']
      model.save_weights('best_model.weights')  # save the model
    metrics['best_acc'] = self.best_acc
    print('valid_data:', metrics)

  def evaluate(self, data, topk=1):
    """Return ``{'acc': fraction}`` of samples whose predicted equation evaluates to the answer.

    Args:
      data: iterable of (question, equation, answer) tuples.
      topk: beam width passed to ``autosolve.generate``.

    A prediction that cannot be evaluated counts as wrong. Empty ``data`` gives
    an accuracy of 0.0 instead of raising ZeroDivisionError.
    """
    total, right = 0.0, 0.0
    for question, equation, answer in tqdm(data):
      total += 1
      pred_equation = autosolve.generate(question, topk)
      try:
        # NOTE(review): eval() runs model output / dataset text as Python code.
        right += int(is_equal(eval(pred_equation), eval(answer)))
      except Exception:
        # Malformed or non-evaluable prediction: count it as wrong. Unlike a bare
        # `except:`, this lets KeyboardInterrupt stop a long evaluation.
        continue
    if not total:
      return {'acc': 0.0}
    return {'acc': right / total}

# prepare data
def load_data(filename):
  """Read a JSON-lines file and return the samples whose equation matches the answer.

  Each line must be a JSON object with the keys 'original_text', 'equation' and 'ans'.
  The three strings are normalised (mixed fractions, percentages, ':' as division, a
  leading 'x=') so that equation and answer can be evaluated as Python expressions.

  Args:
    filename: path of the file to read.

  Returns:
    A list of ``(question, equation, answer)`` tuples, where ``equation`` has had its
    redundant brackets removed. Samples whose equation or answer cannot be evaluated,
    or whose values differ, are skipped.
  """
  D = []
  # Context manager so the file handle is closed even if a line fails to parse.
  with open(filename) as f:
    for line in f:
      record = json.loads(line)
      question, equation, answer = (
        record['original_text'], record['equation'], record['ans']
      )
      # Mixed fractions: "3(1/2)" -> "(3+1/2)"
      question = re.sub(r'(\d+)\((\d+/\d+)\)', r'(\1+\2)', question)
      equation = re.sub(r'(\d+)\((\d+/\d+)\)', r'(\1+\2)', equation)
      answer = re.sub(r'(\d+)\((\d+/\d+)\)', r'(\1+\2)', answer)
      # Any remaining "<digits>(" gets an explicit "+": "3(" -> "3+("
      equation = re.sub(r'(\d+)\(', r'\1+(', equation)
      answer = re.sub(r'(\d+)\(', r'\1+(', answer)
      # Drop the brackets around plain fractions in the question text
      question = re.sub(r'\((\d+/\d+)\)', r'\1', question)
      # Percentages: "80%" -> "(80/100)"
      equation = re.sub(r'([\.\d]+)%', r'(\1/100)', equation)
      answer = re.sub(r'([\.\d]+)%', r'(\1/100)', answer)
      # Colon becomes division; any percent sign still left becomes "/100"
      equation = equation.replace(':', '/').replace('%', '/100')
      answer = answer.replace(':', '/').replace('%', '/100')
      if equation[:2] == 'x=':
        equation = equation[2:]
      try:
        # NOTE(review): eval() executes dataset text as Python - trusted data only.
        if is_equal(eval(equation), eval(answer)):
          D.append((question, remove_bucket(equation), answer))
      except Exception:
        # Not evaluable (syntax error, division by zero, ...): skip the sample.
        continue
  return D

## Train

In [None]:
# load data
# load_data keeps only the samples whose equation evaluates to the stated answer.
train_data = load_data('ape210k/data/train.ape.json')
valid_data = load_data('ape210k/data/valid.ape.json')
test_data = load_data('ape210k/data/test.ape.json')

# Evaluator.on_epoch_end reads the module-level valid_data loaded above.
evaluator = Evaluator()
train_generator = data_generator(train_data, batch_size)


In [None]:
# Train from scratch
# NOTE(review): the "Load and Predict" cell reads token_dict / keep_tokens from
# 'ape210k/bestmodel/token_dict_keep_tokens.json', but nothing visible in this notebook
# writes that file - confirm where it is produced.
token_dict, keep_tokens = load_vocab(
    dict_path=dict_path,
    simplified=True,
    startswith=['[PAD]', '[UNK]', '[CLS]', '[SEP]'],
)
tokenizer = Tokenizer(token_dict,do_lower_case=True) # tokenizer

# Build BERT
model = build_transformer_model(
  config_path,
  checkpoint_path,
  application='unilm',
  keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens, trimming the original vocabulary
)

# The loss layer receives [token ids, segment ids, predictions], matching the unpacking
# in CrossEntropy.compute_loss. (The argument 2 is presumably the index of the input the
# layer passes through as its output - TODO confirm against bert4keras.layers.Loss.)
output = CrossEntropy(2)(model.inputs + model.outputs)

# No loss is passed to compile(); the data generator also yields None as the target.
model = Model(model.inputs, output)
model.compile(optimizer=Adam(2e-5))
model.summary()

# Decoder used by Evaluator.evaluate; generation stops at the tokenizer's end token.
autosolve = AutoSolve(start_id=None, end_id=tokenizer._token_end_id, maxlen=64)

# Train
model.fit(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=epochs,
    callbacks=[evaluator]
)

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where

Epoch 1/1


100%|██████████| 4999/4999 [12:30<00:00,  6.66it/s]


valid_data: {'acc': 0.47989597919583915, 'best_acc': 0.47989597919583915}


<keras.callbacks.callbacks.History at 0x7f1e40067358>

# Load and Predict

In [None]:
# load data
train_data = load_data('ape210k/data/train.ape.json')
valid_data = load_data('ape210k/data/valid.ape.json')
test_data = load_data('ape210k/data/test.ape.json')

# evaluator = Evaluator()

# Load the vocabulary: token_dict and keep_tokens are stored together in one JSON file.
# The context manager closes the file handle once it has been parsed.
with open('ape210k/bestmodel/token_dict_keep_tokens.json') as vocab_file:
  token_dict, keep_tokens = json.load(vocab_file)

tokenizer = Tokenizer(token_dict,do_lower_case=True)

# Build BERT
model = build_transformer_model(
  config_path,
  checkpoint_path,
  application='unilm',
  keep_tokens=keep_tokens,
)

# The loss layer receives [token ids, segment ids, predictions], matching the unpacking
# in CrossEntropy.compute_loss.
output = CrossEntropy(2)(model.inputs + model.outputs)

model = Model(model.inputs, output)
model.compile(optimizer=Adam(5e-5))
model.summary()

# Decoder used by predict(); generation stops at the tokenizer's end token.
autosolve = AutoSolve(start_id=None, end_id=tokenizer._token_end_id, maxlen=64)

# Load the trained weights
print('加载参数')
model.load_weights(weights_path)
print('加载完毕')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.


  'be expecting any data to be passed to {0}.'.format(name))


Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, None)         0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Embedding-Token (Embedding)     multiple             10433280    Input-Token[0][0]                
                                                                 MLM-Norm[0][0]                   
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, None, 768)    1536        Input-Segment[0][0]        

In [None]:
def predict(data, dataname,topk=1, print_num=5):
  """Score ``autosolve`` on ``data``, printing a few correct and a few wrong examples.

  Args:
    data: iterable of (question, equation, answer) tuples.
    dataname: label used in the final accuracy line.
    topk: beam width passed to ``autosolve.generate``.
    print_num: maximum number of correct examples and, separately, of wrong examples
      to print.

  Prints the examples and a final 'For <dataname>, acc:<value>' line and returns None.
  A prediction that cannot be evaluated counts as wrong and is never printed. Empty
  ``data`` reports an accuracy of 0.000 instead of raising ZeroDivisionError.
  """
  total, right = 0.0, 0.0
  # Number of examples already printed, keyed by whether the prediction was right.
  printed = {True: 0, False: 0}
  for question, equation, answer in tqdm(data):
    total += 1
    pred_equation = autosolve.generate(question, topk)
    try:
      # NOTE(review): eval() runs model output / dataset text as Python code.
      # The prediction is evaluated once and reused for both comparison and display.
      pred_value = eval(pred_equation)
      is_right = is_equal(pred_value, eval(answer))
    except Exception:
      # Malformed or non-evaluable prediction: counted as wrong. Unlike a bare
      # `except:`, this lets KeyboardInterrupt stop a long run.
      continue
    right += int(is_right)
    if printed[is_right] < print_num:
      printed[is_right] += 1
      print(' ')
      print('> {}'.format(question))
      print('= equation:{} \t answer:{}'.format(equation, answer))
      # Output label kept exactly as before so new output matches the recorded runs.
      print('< euqation:{} \t answer:{}'.format(pred_equation, pred_value))
      print(' ')
  acc = right / total if total else 0.0
  print('For {}, acc:{:.3f}'.format(dataname, acc))


In [None]:
# Score the full training split. The recorded progress bar shows about 200k samples at
# roughly 6 it/s (close to 8 hours); the saved output stops at 98% with no accuracy line.
predict(train_data, 'train data')

  0%|          | 0/200392 [00:00<?, ?it/s]

 
> 五年级同学参加义务捐书活动，五1班捐了500本，五2班捐的本数是五1班80%，五3班捐的本数是五2班120%，五1班和五3班比谁捐书多？(请用两种方法比较一下)．
= equation:1 	 answer:1
< euqation:1 	 answer:1
 


  0%|          | 2/200392 [00:00<7:42:50,  7.22it/s]

 
> 小王要将150千克含药量20%的农药稀释成含药量5%的药水．需要加水多少千克？
= equation:150*20/100/(5/100)-150 	 answer:450
< euqation:150*20/100/(5/100)-150 	 answer:450.0
 


  0%|          | 3/200392 [00:00<10:10:22,  5.47it/s]

 
> 一个圆形花坛的半径是4米，现在要扩建花坛，将半径增加1米，这时花坛的占地面积增加了多少米**2．
= equation:3.14*(4+1)**2-3.14*4**2 	 answer:28.26
< euqation:3.14*(4+1)**2-3.14*4**2 	 answer:28.259999999999998
 


  0%|          | 4/200392 [00:00<10:32:13,  5.28it/s]

 
> 甲乙两数的差和商都是6．那么甲乙两数的和是多少？(请分别使用方程和列式方法解题)
= equation:6/(6-1)*6+6/(6-1) 	 answer:8.4
< euqation:6/(6-1)*6+6/(6-1) 	 answer:8.399999999999999
 


  0%|          | 5/200392 [00:00<10:55:49,  5.09it/s]

 
> 一件西服原价3200元，六月份先降价1/8，又加价1/8，这件西服的现价是多少元？原价高还是现价高？
= equation:3200*(1-1/8)*(1+1/8) 	 answer:3150
< euqation:3200*(1-1/8)*(1+1/8) 	 answer:3150.0
 


  0%|          | 8/200392 [00:01<10:33:13,  5.27it/s]

 
> 六1班原来男生占总数的2/5，又转来5名男生，现在男生占总数的5/11，女生有多少人？
= equation:5/(5/11/(1-5/11)-2/5/(1-2/5)) 	 answer:30
< euqation:5/(5/11/(1-5/11)-2/5/(1-2/5)) 	 answer:30.000000000000007
 
 
> 一个超市购进5吨大米，5天卖出2000千克，还剩多少千克？
= equation:5*1000-2000 	 answer:3000
< euqation:5*1000-2000 	 answer:3000
 
 
> 学校有75个篮球，35个排球，把这些球平均分给5个班，每个班分得几个球？
= equation:(75+35)/5 	 answer:22
< euqation:(75+35)/5 	 answer:22.0
 


  0%|          | 10/200392 [00:01<8:20:35,  6.67it/s]

 
> 小红每分钟打110个字，她从10点开始打字，10点二十五分结束，共打了多少个字？
= equation:110*25 	 answer:2750
< euqation:110*25 	 answer:2750
 
 
> 蔬菜市场运回茄子1200千克．运回的西红柿是茄子的1/3．西红柿有多少千克？
= equation:1200*1/3 	 answer:400
< euqation:1200*1/3 	 answer:400.0
 


 98%|█████████▊| 196443/200392 [7:57:31<10:28,  6.29it/s]

In [None]:
# Score the validation split (recorded run: acc 0.717).
predict(valid_data, 'valid_data')

  0%|          | 0/4999 [00:00<?, ?it/s]




  0%|          | 1/4999 [00:02<3:33:51,  2.57s/it]

 
> 学校把135本练习本平均分给3个班，每班多少本？
= equation:135/3 	 answer:45
< euqation:135/3 	 answer:45.0
 


  0%|          | 3/4999 [00:02<1:52:41,  1.35s/it]

 
> 为了帮助四川地震灾民，工厂赶制一批救灾帐篷，第一车间完成了这批帐篷的1/5，第二车间完成了这批帐篷的1/4，还剩下这批帐篷的几分之几没完成？
= equation:1-1/5-1/4 	 answer:(11/20)
< euqation:1-1/5-1/4 	 answer:0.55
 
 
> 有50个数的平均数是83，如果去掉其中两个数，这两个数的和是118，那么剩下的书的平均数是多少？
= equation:(50*83-118)/(50-2) 	 answer:84
< euqation:(83*50-118)/(50-2) 	 answer:84.0
 


  0%|          | 6/4999 [00:03<59:39,  1.39it/s]  

 
> 种植一种观赏树木，死亡棵数与成活棵数的比是1：15，这种观赏树木的成活率是多少．
= equation:15/(15+1)*100/100 	 answer:(93.75/100)
< euqation:15/(1+15)*100/100 	 answer:0.9375
 
 
> 我国首艘航母辽宁舰的弦号是16，这个数共有多少个因数．
= equation:5 	 answer:5
< euqation:16/4 	 answer:4.0
 
 
> 一个圆锥的底面直径是4厘米，高3厘米，把它从顶点往下一刀切开，成为形状相同的两半，表面积增加多少平方厘米？
= equation:4*3/2*2 	 answer:12
< euqation:4*3/2*2 	 answer:12.0
 


  0%|          | 9/4999 [00:03<34:36,  2.40it/s]

 
> 两个相同的数相乘，积是3600，这个数是多少．
= equation:60 	 answer:60
< euqation:3600/1 	 answer:3600.0
 
 
> 甲乙两球从同一地点沿周长为980米圆周匀速滚动，隔35分钟遇一次，若甲的速度是乙的3倍，求甲球滚动一周的时间．
= equation:980/(3*980/(35*2)) 	 answer:(23+1/3)
< euqation:980/(980/35*3-980/35) 	 answer:17.5
 


  0%|          | 12/4999 [00:03<22:47,  3.65it/s]

 
> 一块0.15公顷的稻田，按行距2分米，穴距15厘米，一共可插秧多少株？
= equation:15*10000/(2*0.15*10) 	 answer:50000
< euqation:0.15*10000*2*15/100/(0.15*10000) 	 answer:0.3
 


  0%|          | 17/4999 [00:05<22:52,  3.63it/s]

 
> 有6个棱长分别是4厘米、5厘米、6厘米的相同的长方体，把它们的某些面染上红色，使得6个长方体中染有红色的面恰好分别是1个面、2个面、3个面、4个面、5个面和6个面．染色后把所有长方体分割成棱长为1厘米的小正方体，分割完毕后，恰有一面是红色的小正方体最多有多少个？
= equation:6*5/1*1+(5*4*2+5*2*2)/1*1*3+5*2*5+3*2+3*2*2+3*4*2+2*4*2 	 answer:318
< euqation:(5-2)*4/1*1+(5-2)*4/1*2+(4-2)*5/1*2 	 answer:56.0
 


100%|██████████| 4999/4999 [11:05<00:00,  7.52it/s]

For valid_data, acc:0.717





In [None]:
# Score the test split (recorded run: acc 0.716).
predict(test_data, 'test_data')

  0%|          | 2/4998 [00:00<18:51,  4.42it/s]

 
> 2.75-(1+5/6)+(3+1/4)-(2+1/6)．
= equation:2.75-(1+5/6)+3+1/4-(2+1/6) 	 answer:2
< euqation:2.75-(1+5/6)+3+1/4-(2+1/6) 	 answer:1.9999999999999996
 
 
> 王艳家买了一台洗衣机和一台电冰箱，一共花了6000元，电冰箱的价钱是洗衣机的3/5，求洗衣机的价钱．
= equation:6000/(1+3/5) 	 answer:3750
< euqation:6000/(1+3/5) 	 answer:3750.0
 


  0%|          | 5/4998 [00:00<12:47,  6.51it/s]

 
> 有5筐苹果的重量相等，如果从每筐中取出10kg，那么剩下的苹果相当于原来3筐的重量，原来每筐苹果重多少千克？
= equation:10*5/(5-3) 	 answer:25
< euqation:5*10/(5-3) 	 answer:25.0
 
 
> 王阿姨每分钟打60字，她15分钟能打多少字．
= equation:60*15 	 answer:900
< euqation:60*15 	 answer:900
 
 
> 甲数是42，乙数是甲数的3/7，乙数是多少．
= equation:42*3/7 	 answer:18
< euqation:42*3/7 	 answer:18.0
 


  0%|          | 9/4998 [00:01<13:45,  6.04it/s]

 
> 有李树5棵，每棵产李子60.8千克，桃树8棵，每棵产桃子47.5千克，收获哪种水果比较重？比另一种重多少千克？
= equation:47.5*8-60.8*5 	 answer:76
< euqation:60.8*5-47.5*8 	 answer:-76.0
 


  0%|          | 11/4998 [00:01<12:24,  6.70it/s]

 
> 甲、乙两地相距120千米，客车和货车同时从甲地出发驶向乙地，客车到达乙地后立即沿原路返回，在途中的丙地与货车相遇．之后，客车和货车继续前进，各自到达甲地和乙地后又马上折回，结果两车又恰好在丙地相遇．已知两车在出发后的2小时首次相遇，那么客车的速度是每小时多少千米？
= equation:(120+120/3)/2 	 answer:80
< euqation:120*2/2 	 answer:120.0
 


  0%|          | 17/4998 [00:02<07:54, 10.50it/s]

 
> 一个长方体的长和宽都是4米，高是5米，如果底面积扩大5倍，要使体积不变，高应该是多少厘米．
= equation:5/5*100 	 answer:100
< euqation:5*4*100/10 	 answer:200.0
 


  0%|          | 19/4998 [00:02<11:00,  7.54it/s]

 
> 设上题答数是a=90．某项修桥工程，甲队单独做a天完成，乙队单独做270天完成，现在两队合做，中间甲队共休息了14天，乙队共休息了40天(但两队不会同一天休息)．那么从开始到完工共用了多少天．
= equation:(1-1/270*14-1/90*40)/(1/90+1/270)+14+40 	 answer:88
< euqation:135/1.5+(135-14)/(1/5) 	 answer:695.0
 


  0%|          | 20/4998 [00:02<12:04,  6.87it/s]

 
> 甲、乙两车从相距180千米的A地去B地，甲车比乙车晚1.5小时出发，结果两车同时到达．甲、乙两车的速度比是4：3．甲车每小时多少千米？
= equation:180*(1-3/4)/1.5/(3/4) 	 answer:40
< euqation:180*3/(4+3-4) 	 answer:180.0
 


100%|██████████| 4998/4998 [10:58<00:00,  7.59it/s]

For test_data, acc:0.716



