In [3]:
import numpy as np
import pandas as pd
import json
import re
import time
import datetime
import random
import os
import pickle
import warnings
warnings.filterwarnings("ignore")

### Step 1. Load temporal triplet
### Step 2. Load multilingual entity alignments
### Step 3. Output raw data

In [6]:
temporal_data = []
with open('EventKG/relations_entities_temporal.nq') as f:
    lines = f.readlines()
    temporal_data = lines

entity_file = []
with open('EventKG/entities.nq') as f:
    lines = f.readlines()
    entity_file = lines

In [7]:
def extract(string):
    pattern = re.compile(r'"(.*)"@(.*)<')
    match = re.findall(pattern, string)
    return match

temporal_event = dict()
for line in temporal_data:
    line = line.strip()
    line = line.split(' ')
    relation = line[0]
    # print(line)
    # break
    if relation not in temporal_event:
        # [subject, object, begin time, end time, relation, data source]
        temporal_event[relation] = [None, None, None, None, None, None]
    if line[1] == '<http://www.w3.org/1999/02/22-rdf-syntax-ns#subject>':
        temporal_event[relation][0] = line[2]
    elif line[1] == '<http://www.w3.org/1999/02/22-rdf-syntax-ns#object>':
        temporal_event[relation][1] = line[2]
    elif line[1] == '<http://semanticweb.cs.vu.nl/2009/11/sem/hasBeginTimeStamp>':
        temporal_event[relation][2] = line[2].split("^^")[0][1:-1]
    elif line[1] == '<http://semanticweb.cs.vu.nl/2009/11/sem/hasEndTimeStamp>':
        temporal_event[relation][3] = line[2].split("^^")[0][1:-1]
    elif line[1] == '<http://semanticweb.cs.vu.nl/2009/11/sem/roleType>':
        temporal_event[relation][4] = line[2]
        temporal_event[relation][5] = line[3]

temporal_triplet = pd.DataFrame.from_dict(temporal_event, orient='index',
                columns=['subject', 'object', 'beginTime', 'endTime', 'relation', 'dataSource'])
temporal_triplet = temporal_triplet.dropna()
print("raw temporal triplet:", temporal_triplet.shape)


languages = set()
entity_data = dict()
for line in entity_file:
    entity_ = line.strip().split(' ')
    entity_id = entity_[0]
    if entity_id not in entity_data:
        # entity_id -> [ID, description, language]
        entity_data[entity_id] = [entity_id, None, []]

    match = extract(line)
    if match:
        entity_dscp, entity_lang = match[0][0], match[0][1][:-1]
        entity_data[entity_id][1] = entity_dscp
        entity_data[entity_id][2].append(entity_lang)
        languages.add(entity_lang)
for entity_ in entity_data:
    entity_data[entity_][2] = ' '.join(entity_data[entity_][2])
entity_data = pd.DataFrame.from_dict(entity_data, orient='index', columns=['entityID', 'description', 'language'])

raw temporal triplet: (3592450, 6)


In [4]:
# preprocess the data:

def dump_raw_data(data, data_name):
    if not os.path.exists(data_name):
        os.mkdir(data_name)
    with open(os.path.join(data_name, 'raw_data.pickle'), 'wb') as handle:
        pickle.dump(data, handle)
    
def load_raw_data(data_name):
    with open(os.path.join(data_name, 'raw_data.pickle'), 'rb') as handle:
        data = pickle.load(handle)
    return data

def dump_split_data(data, data_name):
    if not os.path.exists(data_name):
        os.mkdir(data_name)
    with open(os.path.join(data_name, 'split_data.pickle'), 'wb') as handle:
        pickle.dump(data, handle)
    
def load_split_data(data_name):
    with open(os.path.join(data_name, 'split_data.pickle'), 'rb') as handle:
        data = pickle.load(handle)
    return data

def dump_task_data(data, data_name):
    if not os.path.exists(data_name):
        os.mkdir(data_name)
    with open(os.path.join(data_name, 'task_data.pickle'), 'wb') as handle:
        pickle.dump(data, handle)
    
def load_task_data(data_name):
    with open(os.path.join(data_name, 'task_data.pickle'), 'rb') as handle:
        data = pickle.load(handle)
    return data

def filter_triplet(data_name, start_time, end_time, frequency = 20):
    # filter the triplet based on the time interval and frequency
    # data_name: the name of the data source
    # start_time: the start time of the time interval
    # end_time: the end time of the time interval
    # frequency: the frequency of triplet
    # return: the filtered triplet
    print("start preparing %s data" % data_name)
    filtered_triplet = temporal_triplet[(temporal_triplet['dataSource'].str.contains(data_name)) & 
                temporal_triplet['subject'].str.contains("entity") & temporal_triplet['object'].str.contains("entity")]
    filtered_triplet = filtered_triplet[(filtered_triplet['beginTime'] > start_time) & (filtered_triplet['endTime'] < end_time)]
    print("raw temporal triplet:", len(filtered_triplet))
    filtered_triplet = filtered_triplet.drop_duplicates()
    entity_frequency = dict()
    for entity_ in ['subject', 'object']:
        for index, row in filtered_triplet.groupby(entity_).size().items():
            if index not in entity_frequency:
                entity_frequency[index] = 0
            entity_frequency[index] += row
    frequent_entity = [key for key, value in entity_frequency.items() if value >= frequency]
    filtered_triplet = filtered_triplet[filtered_triplet['subject'].isin(frequent_entity) 
                    & filtered_triplet['object'].isin(frequent_entity)]
    print("filtered temporal triplet:", filtered_triplet.shape)
    print("frequent entity:", len(frequent_entity))
    return filtered_triplet, frequent_entity

def language_partition(data):
    # partition the data based on the languages
    # data: the data to be partitioned
    # entity_data: the entity info
    # languages: the languages to be partitioned
    # return: the partitioned data
    data_partition = dict()
    for lang_ in languages:
        entity_lang_set = set(entity_data[entity_data['language'].str.contains(lang_)].entityID)
        temporal_triplet_lang = data[data['subject'].isin(entity_lang_set) & data['object'].isin(entity_lang_set)]
        relations_lang = set(temporal_triplet_lang.relation.unique())
        entities_lang = set(temporal_triplet_lang.subject.unique()) | set(temporal_triplet_lang.object.unique())
        print("languages: ", lang_) 
        print("number of triplets: ", len(temporal_triplet_lang))
        print("number of relations: ", len(relations_lang))
        print("number of entities: ", len(entities_lang))
        data_partition[lang_] = temporal_triplet_lang
    return data_partition

def prepare_data(data_name, start_time = "1980", end_time = "2022", frequency = 10):
    data, _ = filter_triplet(data_name, start_time, end_time, frequency)
    data_partition = language_partition(data)
    return data_partition

val_time = "2005-01-01"
test_time = "2010-01-01"

def string_to_datetime(string):
    try:
        element = datetime.datetime.strptime(string, "%Y-%m-%d")
        return datetime.datetime.timestamp(element)
    except ValueError:
        return None

def datetime_to_index(timestamp, min_time, max_time, num_time_interval):
    time_interval = (max_time - min_time) / (num_time_interval)
    return int((timestamp - min_time) / (time_interval + 1e-6))

def convert_time(data, min_time, max_time, time_steps, val_split = 28, test_split = 32, frac = 0.5):
    # convert the time to the time steps
    # data: the data to be converted
    # min_time: the minimum time of the data
    # max_time: the maximum time of the data
    # val_split: the time to split the data into validation set
    # test_split: the time to split the data into test set
    # time_steps: the time steps of the data
    # frac: the fraction of the data to be used for validation set
    # return: the converted data
    data = data.dropna()
    data['beginTime'] = data['beginTime'].apply(datetime_to_index, args = (min_time, max_time, time_steps))
    data['endTime'] = data['endTime'].apply(datetime_to_index, args = (min_time, max_time, time_steps))
    convert_data = {"raw": [], "train": [], "val": [], "int": [], "ext": []}
    for index, row in data.iterrows():
        for time_ in range(row['beginTime'], row['endTime'] + 1):
            convert_data['raw'].append((row['subject'], row['relation'], row['object'], time_))
            if time_ < val_split:
                if random.random() < frac:
                    convert_data['train'].append((row['subject'], row['relation'], row['object'], time_))
                else:
                    convert_data['int'].append((row['subject'], row['relation'], row['object'], time_))
            elif time_ < test_split:
                convert_data['val'].append((row['subject'], row['relation'], row['object'], time_))
            else:
                convert_data['ext'].append((row['subject'], row['relation'], row['object'], time_))
    relation_set = set([item[1] for item in convert_data['train']]) | set([item[1] for item in convert_data['int']])
    entity_set = set([item[0] for item in convert_data['train']]) | set([item[0] for item in convert_data['int']]) | set([item[2] for item in convert_data['train']]) | set([item[2] for item in convert_data['int']])

    for key in convert_data:
        convert_data[key] = pd.DataFrame(convert_data[key], columns = ['subject', 'relation', 'object', 'time']).drop_duplicates()
        convert_data[key] = convert_data[key][convert_data[key].relation.isin(relation_set)]
        convert_data[key] = convert_data[key][convert_data[key].subject.isin(entity_set) & convert_data[key].object.isin(entity_set)]
    
    print("number of entities: ", len(entity_set))
    print("number of relations: ", len(relation_set))
    print("number of train triplets: ", len(convert_data['train']))
    print("number of int triplets: ", len(convert_data['int']))
    print("number of val triplets: ", len(convert_data['val']))
    print("number of ext triplets: ", len(convert_data['ext']))
    print("total time step: ", time_steps)
    print("number of train time step: ", len(convert_data['train'].time.unique()))
    print("number of ext time step: ", len(convert_data['ext'].time.unique()))
    print("number of val time step: ", len(convert_data['val'].time.unique()))
    return convert_data
    
def split_data(input_data, time_steps):
    data = input_data.copy()
    split_data = dict()
    val_split = string_to_datetime(val_time)
    test_split = string_to_datetime(test_time)
    for lang_ in data:
        data[lang_]['beginTime'] = data[lang_]['beginTime'].apply(string_to_datetime)
        data[lang_]['endTime'] = data[lang_]['endTime'].apply(string_to_datetime)
    min_time, max_time = data['en']['beginTime'].min(), data['en']['endTime'].max()
    print(min_time, max_time)
    for lang_ in data:
        print("=========preparing {} data============".format(lang_))
        split_data[lang_] = convert_time(data[lang_], min_time, max_time, time_steps)
    return split_data

In [9]:
wiki_data = prepare_data('wiki', "1980", "2022", frequency = 10)
# yago_data = prepare_data('yago', "1980", "2022", frequency = 5)

start preparing wiki data
raw temporal triplet: 2185579
filtered temporal triplet: (288133, 6)
frequent entity: 48418
languages:  pl
number of triplets:  121671
number of relations:  91
number of entities:  17794
languages:  bg
number of triplets:  15934
number of relations:  68
number of entities:  3887
languages:  da
number of triplets:  101741
number of relations:  86
number of entities:  16692
languages:  hr
number of triplets:  12880
number of relations:  70
number of entities:  3496
languages:  nl
number of triplets:  255931
number of relations:  106
number of entities:  32776
languages:  de
number of triplets:  221043
number of relations:  104
number of entities:  29093
languages:  ro
number of triplets:  38241
number of relations:  77
number of entities:  7587
languages:  it
number of triplets:  192246
number of relations:  94
number of entities:  25124
languages:  no
number of triplets:  27229
number of relations:  72
number of entities:  6196
languages:  ru
number of triplets

In [11]:
dump_raw_data(wiki_data, 'wiki')
#dump_raw_data(yago_data, 'yago')

In [4]:
wiki_data = load_raw_data('wiki')
#yago_data = load_raw_data('yago')

In [5]:
wiki_data_split = split_data(wiki_data, 40)
dump_split_data(wiki_data_split, 'wiki')

315532800.0 1640908800.0
number of entities:  16706
number of relations:  83
number of train triplets:  139685
number of int triplets:  139490
number of val triplets:  63521
number of ext triplets:  35345
total time step:  40
number of train time step:  28
number of ext time step:  8
number of val time step:  4
number of entities:  3508
number of relations:  61
number of train triplets:  19740
number of int triplets:  19754
number of val triplets:  9556
number of ext triplets:  6358
total time step:  40
number of train time step:  28
number of ext time step:  8
number of val time step:  4
number of entities:  15710
number of relations:  78
number of train triplets:  120913
number of int triplets:  120683
number of val triplets:  50725
number of ext triplets:  26010
total time step:  40
number of train time step:  28
number of ext time step:  8
number of val time step:  4
number of entities:  3161
number of relations:  63
number of train triplets:  16873
number of int triplets:  16838
n

### Step 4: Prepare experimental datasets:
* source.csv: temporal triplets in source language;
* target_train.csv: temporal triplets in target language for training;
* target_val.csv: temporal triplets in target language for validation;
* target_int.csv: temporal triplets in target language for interpolation;
* target_ext.csv: temporal triplets in target language for extropolation;
* alignment_train.csv: alignment pairs for training;
* alignment_test.csv: alignment pairs for testing;

In [5]:
source_language = ['en', 'fr']
target_language = ['bg', 'da', 'de', 'es', 'hr', 'it', 'nl', 'no', 'pl', 'pt', 'ro', 'ru', 'sl']


In [6]:
wiki_data_split = load_split_data('wiki')

In [5]:
def get_map(data):
    relations = list(set(data['train'].relation.unique()) | set(data['int'].relation.unique()))
    relation2id = {item: index for index, item in enumerate(relations)}
    entities = list(set(data['train'].subject.unique()) | set(data['int'].subject.unique()) | set(data['train'].object.unique()) | set(data['int'].object.unique()))
    entity2id = {item: index for index, item in enumerate(entities)}
    return relation2id, entity2id

def reindex(data, relation2id, entity2id):
    data = data[data.relation.isin(set(relation2id.keys()))]
    data['relation'] = data['relation'].apply(lambda x: relation2id[x])
    data['object'] = data['object'].apply(lambda x: entity2id[x])
    data['subject'] = data['subject'].apply(lambda x: entity2id[x])
    return data

def experiment_data(source_lang, target_lang, data_split,data_name = 'wiki', alignment_ratio = 0.1):
    source_data, target_data = data_split[source_lang].copy(), data_split[target_lang].copy()
    relation2id, source_entity2id = get_map(source_data)
    _, target_entity2id = get_map(target_data)
    for key in source_data:
        source_data[key] = reindex(source_data[key], relation2id, source_entity2id)
    for key in target_data:
        target_data[key] = reindex(target_data[key], relation2id, target_entity2id)
    output_path = os.path.join(data_name, '{}-{}'.format(source_lang, target_lang))
    if not os.path.exists(output_path):
        os.makedirs(output_path)
    source_data = pd.concat([source_data['train'], source_data['int']])
    source_data.to_csv(os.path.join(output_path, 'source.csv'), index = False)
    target_data['train'].to_csv(os.path.join(output_path, 'target_train.csv'), index = False)
    target_data['int'].to_csv(os.path.join(output_path, 'target_int.csv'), index = False)
    target_data['val'].to_csv(os.path.join(output_path, 'target_val.csv'), index = False)
    target_data['ext'].to_csv(os.path.join(output_path, 'target_ext.csv'), index = False)

    alignment = source_entity2id.keys() & target_entity2id.keys()
    alignment_data = pd.DataFrame([[source_entity2id[item], target_entity2id[item]] for item in alignment], columns = ['source', 'target'])
    alignment_data_train = alignment_data.sample(frac = alignment_ratio)
    alignment_data_test = alignment_data.drop(alignment_data_train.index)
    alignment_data_train.to_csv(os.path.join(output_path, 'alignment_train.csv'), index = False) 
    alignment_data_test.to_csv(os.path.join(output_path, 'alignment_test.csv'), index = False)
    
    print("=========preparing {}-{} data============".format(source_lang, target_lang))
    print("number of source entity: ", len(source_entity2id))
    print("number of target entity: ", len(target_entity2id))
    print("number of alignment: ", len(alignment))

    return "{}-{}".format(source_lang, target_lang), source_data, target_data, alignment_data_train

def prepare_our_data(source_language, target_language, data_split, data_name = 'wiki', alignment_ratio = 0.1):
    task_data = dict()
    for target_lang_ in target_language:
        for source_lang_ in source_language:
            task, source_data, target_data, alignment_data_train = experiment_data(source_lang_, target_lang_, wiki_data_split, data_name, alignment_ratio)
            task_data[task] = {'source_data': source_data, 'target_data': target_data, 'alignment_data_train': alignment_data_train}
    return task_data

def prepare_OpenKE_data(task_data, data_name, output_path = "../baseline/OpenKE/benchmarks/"):
    for task in task_data:
        if task != 'en-it':
            continue
        source_data = task_data[task]['source_data']
        target_data = task_data[task]['target_data']
        alignment_data_train = task_data[task]['alignment_data_train']
        output_dir = os.path.join(output_path, data_name, task)
        
        entity_base = len(set(source_data.subject.unique()) | set(source_data.object.unique()))
        print(entity_base)
        relation_base = len(set(source_data.relation.unique()))

        train_data = [[item['subject'], item['object'], item['relation']] for _, item in source_data.iterrows()] + \
                        [[item['subject'] + entity_base, item['object'] + entity_base, item['relation']] for _, item in target_data['train'].iterrows()] + \
                            [[item['source'], item['target'] + entity_base, relation_base] for _, item in alignment_data_train.iterrows()]
        val_data = [[item['subject'] + entity_base, item['object'] + entity_base, item['relation']] for _, item in target_data['val'].iterrows()]
        test_data = [[item['subject'] + entity_base, item['object'] + entity_base, item['relation']] for _, item in target_data['ext'].iterrows()]
        test_int_data = [[item['subject'] + entity_base, item['object'] + entity_base, item['relation']] for _, item in target_data['int'].iterrows()]
        
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for data, name in zip([train_data, val_data, test_data, test_int_data], ['train2id.txt', 'valid2id.txt', 'test2id.txt', 'test_int2id.txt']):
            with open(os.path.join(output_dir, name), 'w') as f:
                f.write('{}\n'.format(len(data)))
                for item in data:
                    f.write('{} {} {}\n'.format(item[0], item[1], item[2]))
                f.close()
        
        
        number_of_relation = len(set(source_data.relation.unique())) + 1
        number_of_entity = len(get_map(target_data)[1]) + len(set(source_data.subject.unique()) | set(source_data.object.unique()))
        print(len(train_data[:][0]), len(val_data), len(test_data), len(test_int_data))
        with open(os.path.join(output_dir, 'entity2id.txt'), 'w') as f:
            f.write('{}\n'.format(number_of_entity))
            for i in range(number_of_entity):
                f.write('{}\t{}\n'.format(i,i))
            f.close()
        with open(os.path.join(output_dir, 'relation2id.txt'), 'w') as f:
            f.write('{}\n'.format(number_of_relation))
            for i in range(number_of_relation):
                f.write('{}\t{}\n'.format(i,i))
            f.close()

        

In [11]:
task_data = prepare_our_data(source_language, target_language, wiki_data_split, data_name = 'wiki', alignment_ratio = 0.1)

number of source entity:  34416
number of target entity:  3508
number of alignment:  3508
number of source entity:  32546
number of target entity:  3508
number of alignment:  3504
number of source entity:  34416
number of target entity:  15710
number of alignment:  15690
number of source entity:  32546
number of target entity:  15710
number of alignment:  15447
number of source entity:  34416
number of target entity:  27657
number of alignment:  27621
number of source entity:  32546
number of target entity:  27657
number of alignment:  27289
number of source entity:  34416
number of target entity:  31808
number of alignment:  31771
number of source entity:  32546
number of target entity:  31808
number of alignment:  31068
number of source entity:  34416
number of target entity:  3161
number of alignment:  3161
number of source entity:  32546
number of target entity:  3161
number of alignment:  3161
number of source entity:  34416
number of target entity:  23734
number of alignment:  23

In [12]:
dump_task_data(task_data, 'wiki')

In [6]:
task_data = load_task_data('wiki')
prepare_OpenKE_data(task_data, data_name = 'wiki')

34416
3 100263 51482 212826


In [None]:
f

# Prepare data for KEnS, AlignKGC, SS-AGA:

In [8]:
import os
import pandas as pd
import numpy as np

def data_transfer(path):
    out_path = "/home/ec2-user/quic-efs/user/ruijiew/TKGC/baseline/ss-aga-kgc/dataset"
    column_list = ["subject", "relation", "object"]
    
    for task in os.listdir(path):
        if not os.path.isdir(os.path.join(path, task)):
            continue
        source, target = task.split('-')[0], task.split('-')[1]
        
        if not os.path.exists(os.path.join(out_path, task)):
            os.mkdir(os.path.join(out_path, task))
        
        source_data = pd.read_csv(os.path.join(path, task, 'source.csv'))
        target_train_data = pd.read_csv(os.path.join(path, task, 'target_train.csv'))
        target_val_data = pd.read_csv(os.path.join(path, task, 'target_val.csv'))
        target_test_data = pd.read_csv(os.path.join(path, task, 'target_ext.csv'))
        alignment_data = pd.read_csv(os.path.join(path, task, 'alignment_train.csv'))
        try:
            emb_data = np.load(os.path.join(path, task, 'ent_embedding.npy'))
            np.save(os.path.join(out_path, task, 'entity_embeddings.npy'), emb_data)
            
            rel_data = np.load(os.path.join(path, task, 'rel_embedding.npy'))
            with open(os.path.join(out_path, task, 'relations.txt'), 'w') as f:
                for i in range(rel_data.shape[0]):
                    f.write('{}\n'.format(i))
                f.close()
        except:
            pass
        if not os.path.exists(os.path.join(out_path, task, 'kg')):
            os.mkdir(os.path.join(out_path, task, 'kg'))
        source_data[column_list].to_csv(os.path.join(out_path, task, 'kg', "{}-train.tsv".format(source)), sep = '\t', header = False, index = False)
        source_data[column_list].to_csv(os.path.join(out_path, task, 'kg', "{}-val.tsv".format(source)), sep = '\t', header = False, index = False)
        source_data[column_list].to_csv(os.path.join(out_path, task, 'kg', "{}-test.tsv".format(source)), sep = '\t', header = False, index = False)
        target_train_data[column_list].to_csv(os.path.join(out_path, task, 'kg', "{}-train.tsv".format(target)), sep = '\t', header = False, index = False)
        target_val_data[column_list].to_csv(os.path.join(out_path, task, 'kg', "{}-val.tsv".format(target)), sep = '\t', header = False, index = False)
        target_test_data[column_list].to_csv(os.path.join(out_path, task, 'kg', "{}-test.tsv".format(target)), sep = '\t', header = False, index = False)
        
        if not os.path.exists(os.path.join(out_path, task, 'seed_alignlinks')):
            os.mkdir(os.path.join(out_path, task, 'seed_alignlinks'))
        alignment_data.to_csv(os.path.join(out_path, task, 'seed_alignlinks', "{}-{}.tsv".format(source, target)), sep = '\t', header = False, index = False)
        
        if not os.path.exists(os.path.join(out_path, task, 'entity')):
            os.mkdir(os.path.join(out_path, task, 'entity'))
       
        with open(os.path.join(out_path, task, 'entity', "{}.tsv".format(source)), 'w') as f:
            source_ent_num = len(set(source_data.subject.unique()) | set(source_data.object.unique()))
            for i in range(source_ent_num):
                f.write('{}\n'.format(i))
            f.close()
            
        with open(os.path.join(out_path, task, 'entity', "{}.tsv".format(target)), 'w') as f:
            target_ent_num = emb_data.shape[0] - source_ent_num
            for i in range(target_ent_num):
                f.write('{}\n'.format(i))
            f.close()
        
data_transfer("./wiki")
        