In [1]:
# FB15k Job Default Config
# Every constant below is module-level configuration; most of them are copied
# into the `args` dict in the next cell.

TASK='fb15k'
NUM_VOCAB=16396  # NUM_VOCAB and NUM_RELATIONS must be consistent with the vocab.txt file
NUM_RELATIONS=1345

# training hyper-parameters
BATCH_SIZE=512
LEARNING_RATE=5e-4
EPOCH=400
SOFT_LABEL=0.8
SKIP_STEPS=1000  # main() logs training progress every SKIP_STEPS steps
MAX_SEQ_LEN=3  # presumably (head, relation, tail) for triple data -- TODO confirm
HIDDEN_DROPOUT_PROB=0.1
ATTENTION_PROBS_DROPOUT_PROB=0.1

# file paths for training and evaluation
# Paths are built by plain string concatenation, so DATA must keep its trailing "/".
DATA="./data/"
OUTPUT="./output_"+ TASK
TRAIN_FILE= DATA + TASK + "/train.coke.txt"
VALID_FILE=DATA + TASK + "/valid.coke.txt"
TEST_FILE=DATA + TASK + "/test.coke.txt"
VOCAB_PATH=DATA + TASK + "/vocab.txt"
TRUE_TRIPLE_PATH=DATA + TASK + "/all.txt"
CHECKPOINTS= OUTPUT + "/models"
# NOTE(review): INIT_CHECKPOINTS, VALID_FILE, TEST_FILE, LOG_FILE and LOG_EVAL_FILE
# are not referenced by the `args` dict in the visible cells -- confirm whether
# they are still needed.
INIT_CHECKPOINTS= CHECKPOINTS
LOG_FILE=OUTPUT+"/train.log"
LOG_EVAL_FILE=OUTPUT+"/test.log"

# transformer net config, the following are default configs for all tasks
HIDDEN_SIZE=256
NUM_HIDDEN_LAYERS=12
NUM_ATTENTION_HEADS=4
MAX_POSITION_EMBEDDINGS=40

In [2]:
# Run configuration consumed by main() and its helpers (create_model reads it
# as a module-level global). Values come from the constants in the config cell
# wherever one exists, so tuning a constant there is enough to change the run.
args = {
    # task / vocabulary
    'dataset': TASK,
    'vocab_size' : NUM_VOCAB,
    'num_relations': NUM_RELATIONS,

    # run mode and executor options
    'use_cuda': False,
    'do_train': True,
    'do_predict': False,
    'use_ema': False,
    'use_fast_executor': False,
    'num_iteration_per_drop_scope': 1,

    # input files
    'train_file': TRAIN_FILE,
    'true_triple_path': TRUE_TRIPLE_PATH,
    'vocab_path': VOCAB_PATH,
    'sen_candli_file': None,
    'sen_trivial_file': None,
    'predict_file': None,
    "in_tokens": False,

    # training hyper-parameters
    'max_seq_len':MAX_SEQ_LEN,
    'checkpoints':CHECKPOINTS,
    'soft_label': SOFT_LABEL,
    'batch_size': BATCH_SIZE,
    'epoch': EPOCH,
    'learning_rate': LEARNING_RATE,
    # Single source of truth for the logging interval: a second literal
    # "skip_steps" entry further down used to override this one silently.
    'skip_steps': SKIP_STEPS,
    'hidden_dropout_prob': HIDDEN_DROPOUT_PROB,
    'attention_probs_dropout_prob':ATTENTION_PROBS_DROPOUT_PROB,

    # transformer network shape
    'hidden_size': HIDDEN_SIZE,
    'num_hidden_layers': NUM_HIDDEN_LAYERS,
    'num_attention_heads':NUM_ATTENTION_HEADS,
    'max_position_embeddings':MAX_POSITION_EMBEDDINGS,

    "hidden_act": "gelu",
    "initializer_range": 0.02,
    "intermediate_size": 512,
    "init_checkpoint":  None,
    "init_pretraining_params":  None,
    "weight_sharing": True,

    # optimizer / schedule
    "lr_scheduler": "linear_warmup_decay",
    "weight_decay": 0.01,
    "warmup_proportion": 0.1,
    "ema_decay": 0.9999,
    "use_fp16": False,
    "loss_scaling": 1.0,

    "verbose": False,
}

In [3]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import argparse
import collections
import multiprocessing
import os
import time
import logging
import json
import random

import numpy as np
import paddle

if paddle.__version__.startswith('2.'):
    paddle.enable_static() # into static mode

import paddle.fluid as fluid

from reader.coke_reader import KBCDataReader
from reader.coke_reader import PathqueryDataReader
from model.coke import CoKEModel
from optimization import optimization
#from evaluation import kbc_evaluation
from evaluation import kbc_batch_evaluation
from evaluation import compute_kbc_metrics
from evaluation import pathquery_batch_evaluation
from evaluation import compute_pathquery_metrics
from utils.args import ArgumentGroup, print_arguments
from utils.init import init_pretraining_params, init_checkpoint

06/21/2022 15:36:35 - INFO - reader.coke_reader -   10


In [12]:
# !pip install paddlepaddle==2.3.0 -i https://mirror.baidu.com/pypi/simple

Looking in indexes: https://mirror.baidu.com/pypi/simple
Collecting paddlepaddle==2.3.0
  Downloading https://mirror.baidu.com/pypi/packages/01/37/040347acdd4683bbe45a914bf2321f261f378a902822e2cf6cd3b7265cce/paddlepaddle-2.3.0-cp38-cp38-win_amd64.whl (64.2 MB)
Collecting paddle-bfloat==0.1.2
  Downloading https://mirror.baidu.com/pypi/packages/9b/b9/764f50d1c7dd242e61f378aea838aa67d64013c399ff7ccd6a11284de082/paddle_bfloat-0.1.2-cp38-cp38-win_amd64.whl (40 kB)
Collecting astor
  Downloading https://mirror.baidu.com/pypi/packages/c3/88/97eef84f48fa04fbd6750e62dcceafba6c63c81b7ac1420856c8dcc0a3f9/astor-0.8.1-py2.py3-none-any.whl (27 kB)
Installing collected packages: paddle-bfloat, astor, paddlepaddle
Successfully installed astor-0.8.1 paddle-bfloat-0.1.2 paddlepaddle-2.3.0


In [4]:
# Configure root logging: timestamped "time - level - logger name - message"
# records at INFO level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO)
# basicConfig() does nothing when the root logger already has handlers, so the
# level is also set explicitly on the root logger.
logging.getLogger().setLevel(logging.INFO)
# Logger used by every function defined in this notebook.
logger = logging.getLogger(__name__)

  and should_run_async(code)


In [5]:
def create_model(pyreader_name, coke_config):
    """Declare the input reader and the CoKE network inside the current program.

    Reads `max_seq_len`, `soft_label`, `weight_sharing`, `use_fp16` and
    `loss_scaling` from the module-level `args` dict.

    Args:
        pyreader_name: name given to the `py_reader` feeding this graph.
        coke_config: network configuration passed through to `CoKEModel`.

    Returns:
        Tuple `(pyreader, loss, fc_out, num_seqs)`: the reader, the two outputs
        of `CoKEModel.get_pretraining_output` (loss optionally multiplied by
        the loss scale), and the reduce-sum of a batch-sized tensor of ones.
    """
    # Five input slots: three sequence-shaped tensors followed by two
    # per-example columns.
    slot_shapes = [[-1, args["max_seq_len"], 1] for _ in range(3)]
    slot_shapes += [[-1, 1], [-1, 1]]
    slot_dtypes = ['int64', 'int64', 'float32', 'int64', 'int64']

    reader = fluid.layers.py_reader(
        capacity=50,
        shapes=slot_shapes,
        dtypes=slot_dtypes,
        lod_levels=[0] * len(slot_dtypes),
        name=pyreader_name,
        use_double_buffer=True)
    inputs = fluid.layers.read_file(reader)
    src_ids, pos_ids, input_mask, mask_labels, mask_positions = inputs

    network = CoKEModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        input_mask=input_mask,
        config=coke_config,
        soft_label=args["soft_label"],
        weight_sharing=args["weight_sharing"],
        use_fp16=args["use_fp16"])

    loss, fc_out = network.get_pretraining_output(
        mask_label=mask_labels, mask_pos=mask_positions)

    # Loss scaling only applies to fp16 runs with a scale factor above 1.
    scale_loss = args["use_fp16"] and args["loss_scaling"] > 1.0
    if scale_loss:
        loss = loss * args["loss_scaling"]

    # One int64 "1" per row of mask_labels, summed into a single count.
    ones_per_row = fluid.layers.fill_constant_batch_size_like(
        input=mask_labels, dtype='int64', shape=[1], value=1)
    num_seqs = fluid.layers.reduce_sum(input=ones_per_row)

    return reader, loss, fc_out, num_seqs


In [6]:
def kbc_predict(test_exe, test_program, test_pyreader, fetch_list, all_examples, true_triplets_dict, eval_result_file):
    """Run `test_program` over the whole reader and compute KBC metrics.

    Each fetched batch is converted to plain Python float lists and scored by
    `kbc_batch_evaluation`; the two rank lists it returns are accumulated
    across batches and handed to `compute_kbc_metrics` once the reader raises
    `EOFException`.

    Args:
        test_exe: executor used to run the program.
        test_program: program to run.
        test_pyreader: reader to start, drain and reset.
        fetch_list: variables to fetch; only the first fetched value is used.
        all_examples: examples passed through to `kbc_batch_evaluation`.
        true_triplets_dict: lookup passed through to `kbc_batch_evaluation`.
        eval_result_file: path passed through to `compute_kbc_metrics`.

    Returns:
        Whatever `compute_kbc_metrics` returns.
    """
    ranks_acc, f_ranks_acc = [], []
    examples_seen = 0  # offset of the current batch within all_examples
    batch_no = 0

    test_pyreader.start()
    while True:
        try:
            fetched = test_exe.run(fetch_list=fetch_list, program=test_program)
            scores = fetched[0]
            rows_in_batch = scores.shape[0]
            # Flatten every row of the output into a list of Python floats.
            batch_logits = [
                [float(value) for value in scores[row].flat]
                for row in range(rows_in_batch)
            ]
            ranks, f_ranks = kbc_batch_evaluation(
                examples_seen, all_examples, batch_logits, true_triplets_dict)
            ranks_acc.extend(ranks)
            f_ranks_acc.extend(f_ranks)
            if batch_no % 10 == 0:
                logger.info("Processing kbc_predict step: %d exmaples:%d" % (batch_no, examples_seen))
            batch_no += 1
            examples_seen += rows_in_batch
        except fluid.core.EOFException:
            # Reader exhausted: reset it so it can be started again later.
            test_pyreader.reset()
            break

    return compute_kbc_metrics(ranks_acc, f_ranks_acc, eval_result_file)

In [7]:
def predict(test_exe, test_program, test_pyreader, fetch_list, all_examples, args):
    """Evaluate the model and log a one-line performance summary.

    Creates `args["checkpoints"]` if it does not exist, derives the result file
    path `<checkpoints>/eval_result.json`, then dispatches on the dataset name:
    path-query datasets go through `pathquery_predict`, everything else through
    `kbc_predict`.

    NOTE(review): `_load_pathquery_eval_dict` and `pathquery_predict` are not
    defined or imported in the visible cells -- confirm they exist before
    running a path-query dataset.

    Returns:
        The metrics mapping returned by the selected predict function.
    """
    task_name = args["dataset"]
    ckpt_dir = args["checkpoints"]
    if not os.path.exists(ckpt_dir):
        os.makedirs(ckpt_dir)
    eval_result_file = os.path.join(ckpt_dir, "eval_result.json")
    logger.info(">> Evaluation result file: %s" % eval_result_file)

    is_pathquery = task_name.lower() in ["pathquerywn", "pathqueryfb"]

    if not is_pathquery:
        # Knowledge-base completion: rank against the true triples.
        true_triplets_dict = _load_kbc_eval_dict(args["true_triple_path"])
        logger.info(">> Finish loading true triplets dict %s" % time.ctime())
        eval_performance = kbc_predict(
            test_exe, test_program, test_pyreader, fetch_list,
            all_examples, true_triplets_dict, eval_result_file)
        header = "\t".join(["TASK", "MRR", "Hits@1", "Hits@3", "Hits@10"])
        outs = "%s\t%.3f\t%.3f\t%.3f\t%.3f" % (
            task_name,
            eval_performance['fmrr'],
            eval_performance['fhits1'],
            eval_performance['fhits3'],
            eval_performance['fhits10'])
        logger.info("\n----------- Evaluation Performance --------------\n%s\n%s" %
                    (header, outs))
        return eval_performance

    # Path-query evaluation.
    sen_candli_dict, trivial_sen_set = _load_pathquery_eval_dict(
        args["sen_candli_file"], args["sen_trivial_file"])
    logger.debug(">> Load sen_candli_dict size: %d" % len(sen_candli_dict))
    logger.debug(">> Trivial sen set size: %d" % len(trivial_sen_set))
    logger.debug(">> Finish load sen_candli set at:{}".format(time.ctime()))
    eval_performance = pathquery_predict(
        test_exe, test_program, test_pyreader, fetch_list,
        all_examples, sen_candli_dict, trivial_sen_set,
        eval_result_file)

    header = "\t".join(["TASK", "MQ", "Hits@10"])
    outs = "%s\t%.3f\t%.3f" % (task_name, eval_performance['mq'], eval_performance['fhits10'])
    logger.info("\n---------- Evaluation Performance --------------\n%s\n%s" %
                (header, outs))
    return eval_performance


In [8]:
def _load_kbc_eval_dict(true_triple_file):
    """Index the triples of a tab-separated file by relation.

    Every line must hold exactly three tab-separated integers `h`, `r`, `t`
    (enforced with an assert). The result maps each relation `r` to a dict
    with two lookups:

    * `'ts'`: head `h` -> list of tails `t` seen with `(h, r)`
    * `'hs'`: tail `t` -> list of heads `h` seen with `(r, t)`

    Missing relations and entities resolve to empty entries because both
    levels are `defaultdict`s.
    """
    def _empty_relation_entry():
        return {'hs': collections.defaultdict(list),
                'ts': collections.defaultdict(list)}

    # Phase 1: parse the whole file into (h, r, t) integer tuples.
    parsed = []
    with open(true_triple_file, "r") as handle:
        for raw_line in handle:
            fields = raw_line.strip("\r \n").split("\t")
            assert len(fields) == 3
            head, relation, tail = (int(field) for field in fields)
            parsed.append((head, relation, tail))
    logger.debug("Finish loading %d true triples" % len(parsed))

    # Phase 2: build the per-relation head/tail lookups.
    by_relation = collections.defaultdict(_empty_relation_entry)
    for head, relation, tail in parsed:
        entry = by_relation[relation]
        entry['ts'][head].append(tail)
        entry['hs'][tail].append(head)
    return by_relation

In [15]:
def init_coke_net_config(args, print_config=True):
    """Extract the network-related settings from `args` into a new dict.

    Args:
        args: mapping that must contain every key listed in `network_keys`
            below; a missing key raises `KeyError`.
        print_config: when exactly `True`, each copied setting is logged at
            INFO level between two banner lines.

    Returns:
        A new dict holding only the network settings, in the order listed.
    """
    network_keys = (
        "hidden_size",
        "num_hidden_layers",
        "num_attention_heads",
        "vocab_size",
        "num_relations",
        "max_position_embeddings",
        "hidden_act",
        "hidden_dropout_prob",
        "attention_probs_dropout_prob",
        "initializer_range",
        "intermediate_size",
    )
    config = {key: args[key] for key in network_keys}

    if print_config is True:
        logger.info('----------- CoKE Network Configuration -------------')
        for key in config:
            logger.info('%s: %s' % (key, config[key]))
        logger.info('------------------------------------------------')
    return config


In [9]:
def init_predict_checkpoint(args, exe, startup_prog):
    """Validate prediction-only settings and load the checkpoint to evaluate.

    Args:
        args: run configuration; reads `dataset`, `sen_candli_file`,
            `sen_trivial_file`, `init_checkpoint` and `use_fp16`.
        exe: executor passed through to `init_checkpoint`.
        startup_prog: program passed to `init_checkpoint` as `main_program`.

    Raises:
        AssertionError: for a path-query dataset when either
            `sen_candli_file` or `sen_trivial_file` is not set.
        ValueError: when `init_checkpoint` is not set.
    """
    # Compare case-insensitively, the same way predict() and get_data_reader()
    # recognise path-query datasets; a case-sensitive match let lower-case
    # dataset names skip this validation.
    if args["dataset"].lower() in ["pathquerywn", "pathqueryfb"]:
        assert args["sen_candli_file"] is not None and args["sen_trivial_file"] is not None, "during test, pathQuery sen_candli_file and path_trivial_file must be set "
    if not args["init_checkpoint"]:
        raise ValueError("args 'init_checkpoint' should be set if "
                         "only doing prediction!")
    init_checkpoint(
        exe,
        args["init_checkpoint"],
        main_program=startup_prog,
        use_fp16=args["use_fp16"])

In [10]:
def init_train_checkpoint(args, exe, startup_prog):
    """Optionally warm-start training from saved parameters.

    `args["init_checkpoint"]` takes precedence: when it is set, it is loaded
    via `init_checkpoint` (and a notice is logged if
    `args["init_pretraining_params"]` is also set, since that one is then
    ignored). Otherwise `init_pretraining_params` is used when set. When
    neither is set the function does nothing.

    Args:
        args: run configuration; reads `init_checkpoint`,
            `init_pretraining_params` and `use_fp16`.
        exe: executor passed through to the loader.
        startup_prog: program passed to the loader as `main_program`.
    """
    checkpoint_path = args["init_checkpoint"]
    pretrained_path = args["init_pretraining_params"]

    if checkpoint_path:
        if pretrained_path:
            logger.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        init_checkpoint(
            exe,
            checkpoint_path,
            main_program=startup_prog,
            use_fp16=args["use_fp16"],
            print_var_verbose=False)
        return

    if pretrained_path:
        init_pretraining_params(
            exe,
            pretrained_path,
            main_program=startup_prog,
            use_fp16=args["use_fp16"])


In [11]:
from reader.coke_reader import KBCDataReader

def get_data_reader(args, data_file, epoch, is_training, shuffle, dev_count, vocab_size):
    """Instantiate the data reader that matches the configured dataset.

    Path-query datasets (`pathqueryfb` / `pathquerywn`, compared
    case-insensitively) use `PathqueryDataReader`; every other dataset uses
    `KBCDataReader`. `vocab_path`, `max_seq_len` and `batch_size` are taken
    from `args`; the remaining reader arguments are passed through unchanged.

    Returns:
        The constructed reader instance.
    """
    is_pathquery = args["dataset"].lower() in ["pathqueryfb", "pathquerywn"]
    reader_cls = PathqueryDataReader if is_pathquery else KBCDataReader

    return reader_cls(
        vocab_path=args["vocab_path"],
        data_path=data_file,
        max_seq_len=args["max_seq_len"],
        batch_size=args["batch_size"],
        is_training=is_training,
        shuffle=shuffle,
        epoch=epoch,
        dev_count=dev_count,
        vocab_size=vocab_size)

In [4]:
# coke_config = init_coke_net_config(args, print_config=True)

hidden_size: 256
num_hidden_layers: 12
num_attention_heads: 4
vocab_size: 16396
num_relations: 1345
max_position_embeddings: 40
hidden_act: gelu
hidden_dropout_prob: 0.1
attention_probs_dropout_prob: 0.1
initializer_range: 0.02
intermediate_size: 512


In [None]:
# if args['do_train']:
#     train_data_reader = get_data_reader(args, args["train_file"], is_training=True,
#                                           epoch=args["epoch"], shuffle=True, dev_count=1,
#                                           vocab_size=args["vocab_size"])
    

In [12]:
def main(args):
    """Build the CoKE static graph(s), then train and/or evaluate as configured.

    Steps, in order:
      1. Pick the device (`use_cuda`) and create the executor.
      2. If `do_train`: create the training reader, derive the step budget,
         and build the training program with its optimizer.
      3. If `do_predict`: build a separate test program cloned for inference.
      4. Run the startup program and load any initial checkpoint.
      5. If `do_train`: run the training loop, logging every `skip_steps`
         steps and saving the persistable variables at the end.
      6. If `do_predict`: evaluate on `predict_file` via `predict()`.

    Args:
        args: dict of run settings (see the `args` cell of this notebook).

    Raises:
        ValueError: if neither `do_train` nor `do_predict` is true.
        AssertionError: if predicting with more than one device.
    """
    if not (args["do_train"] or args["do_predict"]):
        raise ValueError("For args `do_train` and `do_predict`, at "
                         "least one of them must be True.")
    if args["use_cuda"]:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        # CPU_NUM overrides the number of CPU "devices"; defaults to all cores.
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    # One startup program is shared by the train and test programs.
    startup_prog = fluid.Program()

    # Init programs
    coke_config = init_coke_net_config(args, print_config=True)
    if args["do_train"]:
        train_data_reader = get_data_reader(args, args["train_file"], is_training=True,
                                            epoch=args["epoch"], shuffle=True, dev_count=dev_count,
                                            vocab_size=args["vocab_size"])

        num_train_examples = train_data_reader.total_instance
        # Total optimizer steps = epochs * examples / per-step batch / devices.
        if args["in_tokens"]:
            max_train_steps = args["epoch"] * num_train_examples // (
                    args["batch_size"] // args["max_seq_len"]) // dev_count
        else:
            max_train_steps = args["epoch"] * num_train_examples // (
                args["batch_size"]) // dev_count
        warmup_steps = int(max_train_steps * args["warmup_proportion"])
        logger.info("Device count: %d" % dev_count)
        logger.info("Num train examples: %d" % num_train_examples)
        logger.info("Max train steps: %d" % max_train_steps)
        logger.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        # Create model and set optimization for train
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, _, num_seqs = create_model(
                    pyreader_name='train_reader',
                    coke_config=coke_config)

                scheduled_lr = optimization(
                    loss=loss,
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args["learning_rate"],
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args["weight_decay"],
                    scheduler=args["lr_scheduler"],
                    use_fp16=args["use_fp16"],
                    loss_scaling=args["loss_scaling"])

                if args["use_ema"]:
                    ema = fluid.optimizer.ExponentialMovingAverage(args["ema_decay"])
                    ema.update()

                fluid.memory_optimize(train_program, skip_opt_set=[loss.name, num_seqs.name])

        if args["verbose"]:
            if args["in_tokens"]:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args["batch_size"] // args["max_seq_len"])
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args["batch_size"])
            logger.info("Theoretical memory usage in training:  %.3f - %.3f %s" %
                        (lower_mem, upper_mem, unit))

    if args["do_predict"]:
        # Create model for prediction
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, _, fc_out, num_seqs = create_model(
                    pyreader_name='test_reader',
                    coke_config=coke_config)

                # Reuse the EMA object from the training branch when it exists;
                # dir() here lists this function's local names.
                if args["use_ema"] and 'ema' not in dir():
                    ema = fluid.optimizer.ExponentialMovingAverage(args["ema_decay"])

                fluid.memory_optimize(test_prog, skip_opt_set=[fc_out.name, num_seqs.name])

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    # Init checkpoints
    if args["do_train"]:
        init_train_checkpoint(args, exe, startup_prog)
    elif args["do_predict"]:
        init_predict_checkpoint(args, exe, startup_prog)

    # Run training
    if args["do_train"]:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args["use_fast_executor"]
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args["num_iteration_per_drop_scope"]

        train_exe = fluid.ParallelExecutor(
            use_cuda=args["use_cuda"],
            loss_name=loss.name,
            exec_strategy=exec_strategy,
            main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_reader.data_generator())

        train_pyreader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()
        while steps < max_train_steps:
            try:
                steps += 1
                # Only fetch tensors on logging steps; other steps run with an
                # empty fetch list.
                if steps % args["skip_steps"] == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args["skip_steps"] == 0:
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args["verbose"]:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            np_lr[0]
                            if warmup_steps > 0 else args["learning_rate"])
                        logger.info(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = train_data_reader.get_progress()

                    logger.info("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                                "speed: %f steps/s" %
                                (epoch, current_example, num_train_examples, steps,
                                 np.sum(total_cost) / np.sum(total_num_seqs),
                                 args["skip_steps"] / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                # Step budget reached: save the final parameters.
                if steps == max_train_steps:
                    save_path = os.path.join(args["checkpoints"], "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)
            except fluid.core.EOFException:
                # The reader ran out of data before the step budget was used up.
                logger.warning(">> EOFException")
                # `args` is a plain dict, so the checkpoint directory must be
                # looked up by key; attribute access raises AttributeError and
                # would lose this final checkpoint.
                save_path = os.path.join(args["checkpoints"], "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break
        logger.info(">>Finish training at %s " % time.ctime())

    # Run prediction
    if args["do_predict"]:
        assert dev_count == 1, "During prediction, dev_count expects 1, current is %d" % dev_count
        test_data_reader = get_data_reader(args, args["predict_file"], is_training=False,
                                           epoch=1, shuffle=False, dev_count=dev_count,
                                           vocab_size=args["vocab_size"])
        test_pyreader.decorate_tensor_provider(test_data_reader.data_generator())

        if args["use_ema"]:
            # Evaluate with the EMA-averaged parameters applied.
            with ema.apply(exe):
                eval_performance = predict(exe, test_prog, test_pyreader,
                                           [fc_out.name], test_data_reader.examples, args)
        else:
            eval_performance = predict(exe, test_prog, test_pyreader,
                                       [fc_out.name], test_data_reader.examples, args)

        logger.info(">>Finish predicting at %s " % time.ctime())


In [13]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides deprecation
# warnings from the libraries used below -- consider narrowing it.
warnings.filterwarnings("ignore")

In [None]:
# Launch the run described by `args` (training only: do_train is True and
# do_predict is False in the args cell).
main(args)

06/21/2022 15:40:44 - INFO - __main__ -   ----------- CoKE Network Configuration -------------
06/21/2022 15:40:44 - INFO - __main__ -   hidden_size: 256
06/21/2022 15:40:44 - INFO - __main__ -   num_hidden_layers: 12
06/21/2022 15:40:44 - INFO - __main__ -   num_attention_heads: 4
06/21/2022 15:40:44 - INFO - __main__ -   vocab_size: 16396
06/21/2022 15:40:44 - INFO - __main__ -   num_relations: 1345
06/21/2022 15:40:44 - INFO - __main__ -   max_position_embeddings: 40
06/21/2022 15:40:44 - INFO - __main__ -   hidden_act: gelu
06/21/2022 15:40:44 - INFO - __main__ -   hidden_dropout_prob: 0.1
06/21/2022 15:40:44 - INFO - __main__ -   attention_probs_dropout_prob: 0.1
06/21/2022 15:40:44 - INFO - __main__ -   initializer_range: 0.02
06/21/2022 15:40:44 - INFO - __main__ -   intermediate_size: 512
06/21/2022 15:40:44 - INFO - __main__ -   ------------------------------------------------
06/21/2022 15:40:50 - INFO - __main__ -   Device count: 8
06/21/2022 15:40:50 - INFO - __main__ -   N