In [1]:
import os
import collections

# Data Preprocessing

In [2]:
# Dataset names (presumably sub-directories of `data/` — TODO confirm).
datasets = [
    "fb15k",
    "fb15k237",
    "pathqueryFB",
    "pathqueryWN",
    "wn18",
    "wn18rr",
]

## Preprocessing KBC Datasets

Extract the unique entities and relations across all three data files (training, validation and test). They are used later to build the vocabulary.

In [20]:
def get_unique_entities_relations(train, valid, test):
    """Collect the distinct entities and relations found in the three data files.

    Every non-blank line must be a tab-separated ``(head, relation, tail)``
    triple. Heads and tails are gathered as entities, the middle field as
    a relation.

    Args:
        train: Path to the training triples file.
        valid: Path to the validation triples file.
        test: Path to the test triples file.

    Returns:
        A tuple ``(entity_list, relation_list)``. Despite the names, both
        are ``set`` objects of strings.

    Raises:
        AssertionError: If a non-blank line does not split into exactly
            three tab-separated fields.
    """
    print("[INFO] Extracting entities and relations...")
    entity_list = set()
    relation_list = set()

    for input_file in [train, valid, test]:
        # os.path.basename follows the platform's path separator; splitting
        # on a literal backslash only isolated the file name on Windows.
        print(f"[INFO] Working with {os.path.basename(input_file)} file")

        # tab-separated (head, relation, tail) triples
        with open(input_file, "r") as f:
            # Iterate the handle directly so the file is streamed rather than
            # loaded into memory in one go.
            for line_no, line in enumerate(f, start=1):
                stripped = line.strip()
                if not stripped:
                    # Tolerate blank lines (e.g. an empty line at end of file).
                    continue
                tokens = stripped.split("\t")
                assert len(tokens) == 3, (
                    f"{input_file}:{line_no}: expected 3 tab-separated "
                    f"fields, got {len(tokens)}"
                )
                entity_list.add(tokens[0])
                entity_list.add(tokens[2])
                relation_list.add(tokens[1])

    return entity_list, relation_list

Create the vocabulary from the extracted entities and relations, and write it to disk.

In [4]:
def write_vocab(vocabulary, entity_list, relation_list):
    """Write the vocabulary file, one token per line.

    The file starts with 100 special tokens (``[PAD]``, ``[unused0]`` to
    ``[unused94]``, ``[UNK]``, ``[CLS]``, ``[SEP]``, ``[MASK]``), followed by
    all entities and then all relations.

    Args:
        vocabulary: Path of the vocabulary file to create or overwrite.
        entity_list: Iterable of entity strings.
        relation_list: Iterable of relation strings.

    Sets are written in sorted order so that the token-to-line mapping is the
    same on every run; any other iterable keeps the order it was given in.
    """
    print("[INFO] Creating vocabulary...")

    def _stable_order(tokens):
        # Sets of strings iterate in hash order, which can differ between
        # interpreter runs; sorting makes the resulting indices reproducible.
        if isinstance(tokens, (set, frozenset)):
            return sorted(tokens)
        return list(tokens)

    special_tokens = ["[PAD]"]
    special_tokens += [f"[unused{i}]" for i in range(95)]
    special_tokens += ["[UNK]", "[CLS]", "[SEP]", "[MASK]"]

    entities = _stable_order(entity_list)
    relations = _stable_order(relation_list)

    # The context manager closes the file even if a write fails part-way.
    with open(vocabulary, "w") as fout:
        for token in special_tokens + entities + relations:
            fout.write(token + "\n")

    vocab_size = len(special_tokens) + len(entities) + len(relations)
    print(f"[INFO] Vocabulary size {vocab_size}")

Load the vocabulary as an ordered dictionary.

In [5]:
def load_vocab(vocab_file):
    """Read a vocabulary file into an ordered token-to-index mapping.

    Args:
        vocab_file: Path to a text file holding one token per line.

    Returns:
        A ``collections.OrderedDict`` mapping each whitespace-stripped line
        to its zero-based line number. If a token occurs on several lines,
        the index of its last occurrence is kept.
    """
    vocab = collections.OrderedDict()
    # Use a context manager so the handle is closed; the previous version
    # left the file open.
    with open(vocab_file) as fin:
        for index, line in enumerate(fin):
            vocab[line.strip()] = index
    return vocab

Recreate the relation triples using the (indexed) vocabulary.

For each of the training, validation and test files, read each raw triple.
Replace the elements of the triple with their corresponding index in the vocabulary dictionary.

In [6]:
def write_true_triples(train, valid, test, vocab, output_file):
    """Rewrite every triple from the three data files as vocabulary indices.

    Each non-blank input line is a tab-separated ``(head, relation, tail)``
    triple. Each element is replaced by ``vocab[element]`` and the result is
    written to ``output_file`` as ``head\\trelation\\ttail``, one triple per
    line, in the order train, valid, test.

    Args:
        train: Path to the training triples file.
        valid: Path to the validation triples file.
        test: Path to the test triples file.
        vocab: Mapping from token string to integer index.
        output_file: Path of the file to create or overwrite.

    Raises:
        ValueError: If a non-blank line does not split into exactly three
            tab-separated fields.
        AssertionError: If a head, relation or tail is missing from ``vocab``.
    """
    print("[INFO] Writing down all the triples...")
    true_triples = []
    for input_file in [train, valid, test]:
        with open(input_file, "r") as f:
            # Stream the file line by line instead of calling readlines().
            for line_no, line in enumerate(f, start=1):
                stripped = line.strip('\r \n')
                if not stripped:
                    # Tolerate blank lines (e.g. an empty line at end of file).
                    continue
                h, r, t = stripped.split('\t')
                missing = [token for token in (h, r, t) if token not in vocab]
                assert not missing, (
                    f"{input_file}:{line_no}: tokens missing from vocabulary: {missing}"
                )
                true_triples.append((vocab[h], vocab[r], vocab[t]))

    print(f"[INFO] Number of true triples: {len(true_triples)}")
    # The output is opened only after all inputs were read successfully, and
    # the context manager closes it even if a write fails.
    with open(output_file, "w") as fout:
        for hpos, rpos, tpos in true_triples:
            fout.write(f"{hpos}\t{rpos}\t{tpos}\n")

Generate masked versions of the training, validation and test files: every triple is written twice, once tagged `MASK_HEAD` and once tagged `MASK_TAIL`.

In [21]:
def generate_mask_type(input_file, output_file):
    """Write a masked copy of a triples file.

    Every non-blank line of ``input_file`` is emitted twice to
    ``output_file``: once with a trailing ``\\tMASK_HEAD`` column and once
    with a trailing ``\\tMASK_TAIL`` column.

    Args:
        input_file: Path to the file of triples to read.
        output_file: Path of the file to create or overwrite.
    """
    # os.path.basename follows the platform's path separator; splitting on a
    # literal backslash only isolated the file name on Windows.
    print(f"[INFO] Generating masks for {os.path.basename(input_file)}...")
    # Open the input first so a missing input does not leave behind a
    # truncated output file.
    with open(input_file, "r") as fr, open(output_file, "w") as fw:
        for line in fr:
            triple = line.strip('\r \n')
            if not triple:
                # A blank line would otherwise yield rows holding only the tag.
                continue
            fw.write(triple + "\tMASK_HEAD\n")
            fw.write(triple + "\tMASK_TAIL\n")

Define the paths to the raw training, validation and test datasets.

In [8]:
# Dataset directory, relative to the current working directory.
# Built with os.path.join so the separator is correct on every platform;
# a literal "data\\fb15k" only resolves to a nested directory on Windows.
data = os.path.join("data", "fb15k")

# existing (input) files
old_train = os.path.join(os.getcwd(), data, "train.txt")
old_valid = os.path.join(os.getcwd(), data, "valid.txt")
old_test = os.path.join(os.getcwd(), data, "test.txt")

Define the paths of the output files.

In [9]:
# Path of the vocabulary file to be generated.
vocab_file = os.path.join(os.getcwd(), data, "trial_vocab.txt")

# Path of the file that will hold all the triples.
triples_file = os.path.join(os.getcwd(), data, "trial_all_triples.txt")

# Paths of the masked train / valid / test files.
new_train, new_valid, new_test = (
    os.path.join(os.getcwd(), data, masked_name)
    for masked_name in (
        "trial_train.coke.txt",
        "trial_valid.coke.txt",
        "trial_test.coke.txt",
    )
)

In [10]:
def kbc_data_preprocess(old_train, old_valid, old_test,
                        vocabulary, triples_file,
                        new_train, new_valid, new_test):
    """Run the KBC preprocessing steps for one dataset.

    In order: extract the unique entities and relations from the three raw
    files, write the vocabulary file and load it back, write all triples as
    vocabulary indices, then generate the masked train/valid/test files.

    Args:
        old_train, old_valid, old_test: Paths to the raw input files.
        vocabulary: Path of the vocabulary file to write.
        triples_file: Path of the indexed-triples file to write.
        new_train, new_valid, new_test: Paths of the masked output files.
    """
    print("[INFO] Initiating data preprocessing...")

    raw_splits = (old_train, old_valid, old_test)
    masked_splits = (new_train, new_valid, new_test)

    entities, relations = get_unique_entities_relations(*raw_splits)

    # The vocabulary is written to disk and then read back as a mapping.
    write_vocab(vocabulary, entities, relations)
    token_to_index = load_vocab(vocabulary)

    write_true_triples(*raw_splits, token_to_index, triples_file)

    for raw_path, masked_path in zip(raw_splits, masked_splits):
        generate_mask_type(raw_path, masked_path)

    print("[INFO] Preprocessing successful!!")

In [22]:
# Run the preprocessing on the input and output paths defined in earlier cells.
kbc_data_preprocess(
    old_train=old_train,
    old_valid=old_valid,
    old_test=old_test,
    vocabulary=vocab_file,
    triples_file=triples_file,
    new_train=new_train,
    new_valid=new_valid,
    new_test=new_test,
)

[INFO] Initiating data preprocessing...
[INFO] Extracting entities and relations...
[INFO] Working with train.txt file
[INFO] Working with valid.txt file
[INFO] Working with test.txt file
[INFO] Creating vocabulary...
[INFO] Vocabulary size 16396
[INFO] Writing down all the triples...
[INFO] Number of true triples: 592213
[INFO] Generating masks for train.txt...
[INFO] Generating masks for valid.txt...
[INFO] Generating masks for test.txt...
[INFO] Preprocessing successful!!


## Preprocessing Path Query Datasets

In [47]:
# def pathquery_get_unique_entities_relations(train, valid, test):
#     entity_list = {}
#     relation_list = {}
    
#     for input_file in [train, valid, test]:
#         with open(input_file, "r") as f:
#             for line in f.readlines():
#                 tokens = line.strip().split("\t")
#                 assert len(tokens) == 3
#                 entity_list[tokens[0]] = len(entity_list)
#                 entity_list[tokens[2]] = len(entity_list)
#                 relations = tokens[1].split(",")
#                 for relation in relations:
#                     relation_list[relation] = len(relation_list)
    
#     return entity_list, relation_list

In [49]:
# def filter_base_data(old_train, old_valid, old_test,
#                      train_base, valid_base, test_base):
#     def fil_base(input_file, output_file):
#         fout = open(output_file, "w")
#         base_n = 0
#         with open(input_file, "r") as f:
#             for line in f.readlines():
#                 tokens = line.strip().split("\t")
#                 assert len(tokens) == 3
#                 relations = tokens[1].split(",")
#                 if len(relations) == 1:
#                     fout.write(line)
#                     base_n += 1
#         fout.close()
#         return base_n

#     train_base_n = fil_base(old_train, train_base)
#     valid_base_n = fil_base(old_valid, valid_base)
#     test_base_n = fil_base(old_test, test_base)

In [50]:
# def generate_onlytail_mask_type(input_file, output_file):
#     with open(output_file, "w") as fw:
#         with open(input_file, "r") as fr:
#             for line in fr.readlines():
#                 fw.write(line.strip('\r \n') + "\tMASK_TAIL\n")

In [None]:
# def pathquery_data_preprocess(old_train, old_valid, old_test,
#                               vocab_path, sen_candli_file, trivial_sen_file,
#                               new_train, new_valid, new_test,
#                               train_base, valid_base, test_base):
    
#     print("Extracting unique entities and relations...")
#     entity_list, relation_list = pathquery_get_unique_entities_relations(old_train, old_valid, old_test)
    
#     print("Updating vocabulary...")
#     write_vocab(vocab_path, entity_list, relation_list)
    
#     filter_base_data(old_train, old_valid, old_test,
#                      train_base, valid_base, test_base)
    
#     generate_mask_type(old_train, new_train)
#     generate_onlytail_mask_type(old_valid, new_valid)
#     generate_onlytail_mask_type(old_test, new_test)
    
#     vocab = load_vocab(vocab_path)
    
# #     generate_eval_files(vocab_path, old_test, 
# #                         train_base, valid_base, test_base, 
# #                         sen_candli_file, trivial_sen_file)

In [45]:
# # pathquery_datasets = ["pathqueryFB", "pathqueryWN"]
# data = "data\\pathqueryFB"

In [46]:
# # existing (input) files
# old_train = os.path.join(os.getcwd(), data, "train")
# old_valid = os.path.join(os.getcwd(), data, "valid")
# old_test = os.path.join(os.getcwd(), data, "test")

# new_train = os.path.join(os.getcwd(), data, "train.coke.txt")
# new_valid = os.path.join(os.getcwd(), data, "valid.coke.txt")
# new_test = os.path.join(os.getcwd(), data, "test.coke.txt")

# vocab_file = os.path.join(os.getcwd(), data, "vocab.txt")
# sen_candli_file = os.path.join(os.getcwd(), data, "sen_candli.txt")
# trivial_sen_file = os.path.join(os.getcwd(), data, "trivial_sen.txt")

# train_base = os.path.join(os.getcwd(), data, "train.base.txt")
# valid_base = os.path.join(os.getcwd(), data, "valid.base.txt")
# test_base = os.path.join(os.getcwd(), data, "test.base.txt")

In [None]:
#  pathquery_data_preprocess(old_train, old_valid, old_test,
#                               vocab_file, sen_candli_file, trivial_sen_file,
#                               new_train, new_valid, new_test,
#                               train_base, valid_base, test_base)