In [1]:
# FB15k Job Default Config

TASK='fb15k'
NUM_VOCAB=16396  # NUM_VOCAB and NUM_RELATIONS must be consistent with the vocab.txt file
NUM_RELATIONS=1345

# training hyper-parameters
BATCH_SIZE=512
LEARNING_RATE=5e-4
EPOCH=400
SOFT_LABEL=0.8
SKIP_STEPS=1000
MAX_SEQ_LEN=3
HIDDEN_DROPOUT_PROB=0.1
ATTENTION_PROBS_DROPOUT_PROB=0.1

# file paths for training and evaluation
DATA="./data"
OUTPUT="./output_"+ TASK
# Dataset files live under <DATA>/<TASK>/. The separator has to be inserted
# explicitly: plain `DATA + TASK` would yield "./datafb15k/...".
TASK_DATA_DIR = DATA.rstrip("/") + "/" + TASK
TRAIN_FILE = TASK_DATA_DIR + "/train.coke.txt"
VALID_FILE = TASK_DATA_DIR + "/valid.coke.txt"
TEST_FILE = TASK_DATA_DIR + "/test.coke.txt"
VOCAB_PATH = TASK_DATA_DIR + "/vocab.txt"
TRUE_TRIPLE_PATH = TASK_DATA_DIR + "/all.txt"
CHECKPOINTS= OUTPUT + "/models"
INIT_CHECKPOINTS= CHECKPOINTS
LOG_FILE=OUTPUT+"/train.log"
LOG_EVAL_FILE=OUTPUT+"/test.log"

# transformer net config, the following are default configs for all tasks
HIDDEN_SIZE=256
NUM_HIDDEN_LAYERS=12
NUM_ATTENTION_HEADS=4
MAX_POSITION_EMBEDDINGS=40

In [2]:
# Run-time arguments shared by the cells below, kept as a plain dict
# (read via args["..."]). Values come from the config-cell constants
# wherever such a constant exists.
args = {
    # dataset description
    'dataset': TASK,
    'vocab_size' : NUM_VOCAB,
    'num_relations': NUM_RELATIONS,

    # execution switches
    'use_cuda': False,
    'do_train': True,
    'do_predict': False,
    'use_ema': False,
    'use_fast_executor': False,
    'num_iteration_per_drop_scope': 1,

    # input files
    'train_file': TRAIN_FILE,
    'true_triple_path': TRUE_TRIPLE_PATH,
    'vocab_path': VOCAB_PATH,
    'sen_candli_file': None,
    'sen_trivial_file': None,
    'predict_file': None,
    "in_tokens": False,

    # training hyper-parameters
    'max_seq_len':MAX_SEQ_LEN,
    'checkpoints':CHECKPOINTS,
    'soft_label': SOFT_LABEL,
    'batch_size': BATCH_SIZE,
    'epoch': EPOCH,
    'learning_rate': LEARNING_RATE,
    # Single source of truth for skip_steps: a second literal
    # "skip_steps": 1000 entry further down used to silently override this.
    'skip_steps': SKIP_STEPS,
    'hidden_dropout_prob': HIDDEN_DROPOUT_PROB,
    'attention_probs_dropout_prob':ATTENTION_PROBS_DROPOUT_PROB,

    # transformer net sizes
    'hidden_size': HIDDEN_SIZE,
    'num_hidden_layers': NUM_HIDDEN_LAYERS,
    'num_attention_heads':NUM_ATTENTION_HEADS,
    'max_position_embeddings':MAX_POSITION_EMBEDDINGS,

    # model details without a config-cell constant
    "hidden_act": "gelu",
    "initializer_range": 0.02,
    "intermediate_size": 512,
    "init_checkpoint":  None,
    "init_pretraining_params":  None,
    "weight_sharing": True,

    # optimisation
    "lr_scheduler": "linear_warmup_decay",
    "weight_decay": 0.01,
    "warmup_proportion": 0.1,
    "ema_decay": 0.9999,
    "use_fp16": False,
    "loss_scaling": 1.0,

    # logging
    "verbose": False,
}

In [4]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import collections
import multiprocessing
import os
import time
import logging
import json
import random

import numpy as np
import paddle

if paddle.__version__.startswith('2.'):
    paddle.enable_static() # into static mode

import paddle.fluid as fluid

from reader.coke_reader import KBCDataReader
from reader.coke_reader import PathqueryDataReader
from model.coke import CoKEModel
from optimization import optimization
#from evaluation import kbc_evaluation
from evaluation import kbc_batch_evaluation
from evaluation import compute_kbc_metrics
from evaluation import pathquery_batch_evaluation
from evaluation import compute_pathquery_metrics
from utils.args import ArgumentGroup, print_arguments
from utils.init import init_pretraining_params, init_checkpoint

  and should_run_async(code)
06/14/2022 11:58:04 - INFO - reader.coke_reader -   10


In [12]:
!pip install paddlepaddle==2.3.0 -i https://mirror.baidu.com/pypi/simple

Looking in indexes: https://mirror.baidu.com/pypi/simple
Collecting paddlepaddle==2.3.0
  Downloading https://mirror.baidu.com/pypi/packages/01/37/040347acdd4683bbe45a914bf2321f261f378a902822e2cf6cd3b7265cce/paddlepaddle-2.3.0-cp38-cp38-win_amd64.whl (64.2 MB)
Collecting paddle-bfloat==0.1.2
  Downloading https://mirror.baidu.com/pypi/packages/9b/b9/764f50d1c7dd242e61f378aea838aa67d64013c399ff7ccd6a11284de082/paddle_bfloat-0.1.2-cp38-cp38-win_amd64.whl (40 kB)
Collecting astor
  Downloading https://mirror.baidu.com/pypi/packages/c3/88/97eef84f48fa04fbd6750e62dcceafba6c63c81b7ac1420856c8dcc0a3f9/astor-0.8.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: paddle-bfloat, astor, paddlepaddle
Successfully installed astor-0.8.1 paddle-bfloat-0.1.2 paddlepaddle-2.3.0


In [6]:
def create_model(pyreader_name, coke_config):
    """Build the CoKE network fed by a py_reader and return its outputs.

    Reads run-time settings (max_seq_len, soft_label, weight_sharing,
    use_fp16, loss_scaling) from the notebook-level ``args`` object.

    Args:
        pyreader_name: name given to the created py_reader.
        coke_config: network configuration passed to ``CoKEModel`` as ``config``.

    Returns:
        Tuple ``(pyreader, loss, fc_out, num_seqs)`` where ``loss`` and
        ``fc_out`` come from ``CoKEModel.get_pretraining_output`` and
        ``num_seqs`` is the sum of a constant-1 tensor built batch-size-like
        from ``mask_labels``.
    """
    def _arg(key):
        # `args` is a plain dict in this notebook, so attribute access
        # (args.max_seq_len) raises AttributeError; attribute-style objects
        # such as an argparse namespace are still accepted.
        return args[key] if isinstance(args, dict) else getattr(args, key)

    max_seq_len = _arg("max_seq_len")
    use_fp16 = _arg("use_fp16")

    # Five inputs per sample: three [batch, max_seq_len, 1] tensors followed
    # by two [batch, 1] tensors, unpacked below in this same order.
    pyreader = fluid.layers.py_reader(
        capacity=50,
        shapes=[[-1, max_seq_len, 1],
                [-1, max_seq_len, 1],
                [-1, max_seq_len, 1], [-1, 1], [-1, 1]],
        dtypes=[
            'int64', 'int64', 'float32', 'int64', 'int64'],
        lod_levels=[0, 0, 0, 0, 0],
        name=pyreader_name,
        use_double_buffer=True)
    (src_ids, pos_ids, input_mask, mask_labels, mask_positions) = fluid.layers.read_file(pyreader)

    coke = CoKEModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        input_mask=input_mask,
        config=coke_config,
        soft_label=_arg("soft_label"),
        weight_sharing=_arg("weight_sharing"),
        use_fp16=use_fp16)

    loss, fc_out = coke.get_pretraining_output(mask_label=mask_labels, mask_pos=mask_positions)
    # Scale the loss only for fp16 runs with a loss-scaling factor above 1.
    if use_fp16 and _arg("loss_scaling") > 1.0:
        loss = loss * _arg("loss_scaling")

    batch_ones = fluid.layers.fill_constant_batch_size_like(
        input=mask_labels, dtype='int64', shape=[1], value=1)
    num_seqs = fluid.layers.reduce_sum(input=batch_ones)

    return pyreader, loss, fc_out, num_seqs


In [5]:
def init_coke_net_config(args, print_config = True):
    """Extract the network-related entries of ``args`` into a new dict.

    Args:
        args: mapping that contains every key listed in ``net_keys`` below;
            a missing key raises ``KeyError``.
        print_config: when exactly ``True``, print each ``key: value`` pair
            of the resulting config, one per line.

    Returns:
        A new dict holding the selected keys, in the order of ``net_keys``.
    """
    net_keys = (
        "hidden_size",
        "num_hidden_layers",
        "num_attention_heads",
        "vocab_size",
        "num_relations",
        "max_position_embeddings",
        "hidden_act",
        "hidden_dropout_prob",
        "attention_probs_dropout_prob",
        "initializer_range",
        "intermediate_size",
    )
    net_config = {key: args[key] for key in net_keys}

    # Identity check on purpose: only the literal True enables printing.
    if print_config is True:
        for name in net_config:
            print(f"{name}: {net_config[name]}")

    return net_config

In [3]:
from reader.coke_reader import KBCDataReader

def get_data_reader(args, data_file, epoch, is_training, shuffle, vocab_size, dev_count=1):
    """Create a ``KBCDataReader`` for ``data_file``.

    Args:
        args: mapping providing "vocab_path", "max_seq_len" and "batch_size".
        data_file: forwarded to the reader as ``data_path``.
        epoch: forwarded to the reader as ``epoch``.
        is_training: forwarded to the reader as ``is_training``.
        shuffle: forwarded to the reader as ``shuffle``.
        vocab_size: forwarded to the reader as ``vocab_size``.
        dev_count: forwarded to the reader as ``dev_count``; defaults to 1,
            the value that was previously hard-coded. Accepting it here lets
            callers pass ``dev_count=...`` without a TypeError.
            (Presumably the number of devices — confirm against the reader.)

    Returns:
        The constructed ``KBCDataReader`` instance.
    """
    return KBCDataReader(
        vocab_path=args["vocab_path"],
        data_path=data_file,
        max_seq_len=args["max_seq_len"],
        batch_size=args["batch_size"],
        is_training=is_training,
        shuffle=shuffle,
        epoch=epoch,
        dev_count=dev_count,
        vocab_size=vocab_size,
    )

06/14/2022 11:29:30 - INFO - bin.reader.coke_reader -   10


In [4]:
coke_config = init_coke_net_config(args, print_config=True)

hidden_size: 256
num_hidden_layers: 12
num_attention_heads: 4
vocab_size: 16396
num_relations: 1345
max_position_embeddings: 40
hidden_act: gelu
hidden_dropout_prob: 0.1
attention_probs_dropout_prob: 0.1
initializer_range: 0.02
intermediate_size: 512


In [None]:
if args['do_train']:
    train_data_reader = get_data_reader(args, args["train_file"], is_training=True,
                                          epoch=args["epoch"], shuffle=True, dev_count=1,
                                          vocab_size=args["vocab_size"])
    