diff --git a/README.md b/README.md index c95550e..cba1fa9 100644 --- a/README.md +++ b/README.md @@ -86,22 +86,22 @@ $ python train.py -h $ python train.py -mn TransE # Train using different KGE methods. -$ python train.py -mn [TransE|TransD|TransH|TransG|TransM|TransR|Complex|Complexn3|RotatE| - distmult|KG2E|KG2E_EL|NTN|Rescal|SLM|SME|SME_BL|HoLE|ConvE|ConvKB|Proje_pointwise] +$ python train.py -mn [TransE|TransD|TransH|TransG|TransM|TransR|Complex|Complexn3|CP|RotatE|Analogy| + DistMult|KG2E|KG2E_EL|NTN|Rescal|SLM|SME|SME_BL|HoLE|ConvE|ConvKB|Proje_pointwise] # For KGE using projection-based loss function, use more processes for batch generation. $ python train.py -mn [ConvE|ConvKB|Proje_pointwise] -npg [the number of processes, 4 or 6] # Train TransE model using different benchmark datasets. $ python train.py -mn TransE -ds [fb15k|wn18|wn18_rr|yago3_10|fb15k_237| - ks|nations|umls|dl50a] + ks|nations|umls|dl50a|nell_955] ``` Pykg2vec aims to include most of the state-of-the-art KGE methods. You can check [Implemented Algorithms](https://pykg2vec.readthedocs.io/en/latest/algos.html) for more details. Some models are still under development [Conv2D|TuckER]. To ensure the correctness of included KGE methods we also use the hyperparameter settings from original papers to see if the result is consistent. ```bash # train KGE method with the hyperparameters used in original papers. (FB15k supported only) -$ python train.py -mn [TransE|TransD|TransH|TransG|TransM|TransR|Complex|Complexn3|RotatE| +$ python train.py -mn [TransE|TransD|TransH|TransG|TransM|TransR|Complex|Complexn3|CP|RotatE|Analogy| distmult|KG2E|KG2E_EL|NTN|Rescal|SLM|SME|SME_BL|HoLE|ConvE|ConvKB|Proje_pointwise] -exp true -ds fb15k ``` diff --git a/pykg2vec/_version.py b/pykg2vec/_version.py index bf41adb..a3ebe30 100644 --- a/pykg2vec/_version.py +++ b/pykg2vec/_version.py @@ -1 +1 @@ -__version__ = "0.0.50" +__version__ = "0.0.51" diff --git a/pykg2vec/config/config.py b/pykg2vec/config/config.py index 3910ba9..0761d86 100644 --- a/pykg2vec/config/config.py +++ b/pykg2vec/config/config.py @@ -40,10 +40,12 @@ def __init__(self): self.model_path = "pykg2vec.core" self.config_path = "pykg2vec.config.config" - self.modelMap = {"complex": "Complex.Complex", + self.modelMap = {"analogy": "ANALOGY.ANALOGY", + "complex": "Complex.Complex", "complexn3": "Complex.ComplexN3", "conve": "ConvE.ConvE", "convkb": "ConvKB.ConvKB", + "cp": "CP.CP", "hole": "HoLE.HoLE", "distmult": "DistMult.DistMult", "kg2e": "KG2E.KG2E", @@ -52,6 +54,8 @@ def __init__(self): "proje_pointwise": "ProjE_pointwise.ProjE_pointwise", "rescal": "Rescal.Rescal", "rotate": "RotatE.RotatE", + "simple": "SimplE.SimplE", + "simple_ignr": "SimplE.SimplE_ignr", "slm": "SLM.SLM", "sme": "SME.SME", "sme_bl": "SME.SME_BL", @@ -63,10 +67,12 @@ def __init__(self): "transr": "TransR.TransR", "tucker": "TuckER.TuckER"} - self.configMap = {"complex": "ComplexConfig", + self.configMap = {"analogy": "ANALOGYConfig", + "complex": "ComplexConfig", "complexn3": "ComplexConfig", "conve": "ConvEConfig", "convkb": "ConvKBConfig", + "cp": "CPConfig", "hole": "HoLEConfig", "distmult": "DistMultConfig", "kg2e": "KG2EConfig", @@ -75,6 +81,8 @@ def __init__(self): "proje_pointwise": "ProjE_pointwiseConfig", "rescal": "RescalConfig", "rotate": "RotatEConfig", + "simple": "SimplEConfig", + "simple_ignr": "SimplEConfig", "slm": "SLMConfig", "sme": "SMEConfig", "sme_bl": "SMEConfig", @@ -1318,6 +1326,7 @@ def __init__(self, args=None): BasicConfig.__init__(self, args) + class 
TuckERConfig(BasicConfig):
     """This class defines the configuration for the TuckER Algorithm.
@@ -1480,4 +1489,164 @@ def __init__(self, args=None):
             'neg_rate': self.neg_rate,
         }
 
-        BasicConfig.__init__(self, args)
\ No newline at end of file
+        BasicConfig.__init__(self, args)
+
+
+class CPConfig(BasicConfig):
+    """This class defines the configuration for the Canonical Tensor Decomposition Algorithm.
+
+    CPConfig inherits the BasicConfig and defines the local arguments used in the
+    algorithm.
+
+    Attributes:
+      hyperparameters (dict): Defines the dictionary of hyperparameters to be used by bayesian optimizer for tuning.
+
+    Args:
+      lmbda (float): Weight applied to the regularization in the loss function.
+      learning_rate (float): Defines the learning rate for the optimization.
+      L1_flag (bool): If True, perform L1 regularization on the model parameters.
+      hidden_size (int): Defines the size of the latent dimension for entities and relations.
+      batch_size (int): Defines the batch size for training the algorithm.
+      epochs (int): Defines the total number of epochs for training the algorithm.
+      margin (float): Defines the margin used between the positive and negative triple loss.
+      data (str): Defines the knowledge base dataset to be used for training the algorithm.
+      optimizer (str): Defines the optimization algorithm such as adam, sgd, adagrad, etc.
+      sampling (str): Defines the sampling (bern or uniform) for corrupting the triples.
+
+    """
+
+    def __init__(self, args=None):
+        self.lmbda = args.lmbda
+        self.learning_rate = args.learning_rate
+        self.hidden_size = args.hidden_size
+        self.batch_size = args.batch_training
+        self.epochs = args.epochs
+        self.data = args.dataset_name
+        self.optimizer = args.optimizer
+        self.sampling = args.sampling
+        self.neg_rate = args.negrate
+
+        if args.exp is True:
+            paper_params = HyperparamterLoader().load_hyperparameter(args.dataset_name, 'cp')
+            for key, value in paper_params.items():
+                self.__dict__[key] = value  # copy all the settings from the paper.
+
+        self.hyperparameters = {
+            'lmbda': self.lmbda,
+            'learning_rate': self.learning_rate,
+            'hidden_size': self.hidden_size,
+            'batch_size': self.batch_size,
+            'epochs': self.epochs,
+            'data': self.data,
+            'optimizer': self.optimizer,
+            'sampling': self.sampling,
+            'neg_rate': self.neg_rate,
+        }
+
+        BasicConfig.__init__(self, args)
+
+
+class ANALOGYConfig(BasicConfig):
+    """This class defines the configuration for the ANALOGY Algorithm.
+
+    ANALOGYConfig inherits the BasicConfig and defines the local arguments used in the
+    algorithm.
+
+    Attributes:
+      hyperparameters (dict): Defines the dictionary of hyperparameters to be used by bayesian optimizer for tuning.
+
+    Args:
+      lmbda (float): Weight applied to the regularization in the loss function.
+      learning_rate (float): Defines the learning rate for the optimization.
+      L1_flag (bool): If True, perform L1 regularization on the model parameters.
+      hidden_size (int): Defines the size of the latent dimension for entities and relations.
+      batch_size (int): Defines the batch size for training the algorithm.
+      epochs (int): Defines the total number of epochs for training the algorithm.
+      margin (float): Defines the margin used between the positive and negative triple loss.
+      data (str): Defines the knowledge base dataset to be used for training the algorithm.
+      optimizer (str): Defines the optimization algorithm such as adam, sgd, adagrad, etc.
+      sampling (str): Defines the sampling (bern or uniform) for corrupting the triples.
+    """
+
+    def __init__(self, args=None):
+        self.lmbda = args.lmbda
+        self.learning_rate = args.learning_rate
+        self.hidden_size = args.hidden_size
+        self.batch_size = args.batch_training
+        self.epochs = args.epochs
+        self.data = args.dataset_name
+        self.optimizer = args.optimizer
+        self.sampling = args.sampling
+        self.neg_rate = args.negrate
+
+        if args.exp is True:
+            paper_params = HyperparamterLoader().load_hyperparameter(args.dataset_name, 'analogy')
+            for key, value in paper_params.items():
+                self.__dict__[key] = value  # copy all the settings from the paper.
+
+        self.hyperparameters = {
+            'lmbda': self.lmbda,
+            'learning_rate': self.learning_rate,
+            'hidden_size': self.hidden_size,
+            'batch_size': self.batch_size,
+            'epochs': self.epochs,
+            'data': self.data,
+            'optimizer': self.optimizer,
+            'sampling': self.sampling,
+            'neg_rate': self.neg_rate,
+        }
+
+        BasicConfig.__init__(self, args)
+
+
+class SimplEConfig(BasicConfig):
+    """This class defines the configuration for the SimplE Algorithm.
+
+    SimplEConfig inherits the BasicConfig and defines the local arguments used in the
+    algorithm.
+
+    Attributes:
+      hyperparameters (dict): Defines the dictionary of hyperparameters to be used by bayesian optimizer for tuning.
+
+    Args:
+      lmbda (float): Weight applied to the regularization in the loss function.
+      learning_rate (float): Defines the learning rate for the optimization.
+      L1_flag (bool): If True, perform L1 regularization on the model parameters.
+      hidden_size (int): Defines the size of the latent dimension for entities and relations.
+      batch_size (int): Defines the batch size for training the algorithm.
+      epochs (int): Defines the total number of epochs for training the algorithm.
+      margin (float): Defines the margin used between the positive and negative triple loss.
+      data (str): Defines the knowledge base dataset to be used for training the algorithm.
+      optimizer (str): Defines the optimization algorithm such as adam, sgd, adagrad, etc.
+      sampling (str): Defines the sampling (bern or uniform) for corrupting the triples.
+
+    """
+
+    def __init__(self, args=None):
+        self.lmbda = args.lmbda
+        self.learning_rate = args.learning_rate
+        self.hidden_size = args.hidden_size
+        self.batch_size = args.batch_training
+        self.epochs = args.epochs
+        self.data = args.dataset_name
+        self.optimizer = args.optimizer
+        self.sampling = args.sampling
+        self.neg_rate = args.negrate
+
+        if args.exp is True:
+            paper_params = HyperparamterLoader().load_hyperparameter(args.dataset_name, 'simple')
+            for key, value in paper_params.items():
+                self.__dict__[key] = value  # copy all the settings from the paper.
+
+        self.hyperparameters = {
+            'lmbda': self.lmbda,
+            'learning_rate': self.learning_rate,
+            'hidden_size': self.hidden_size,
+            'batch_size': self.batch_size,
+            'epochs': self.epochs,
+            'data': self.data,
+            'optimizer': self.optimizer,
+            'sampling': self.sampling,
+            'neg_rate': self.neg_rate,
+        }
+
+        BasicConfig.__init__(self, args)
diff --git a/pykg2vec/config/hyperparams.py b/pykg2vec/config/hyperparams.py
index a54726f..fd932f5 100644
--- a/pykg2vec/config/hyperparams.py
+++ b/pykg2vec/config/hyperparams.py
@@ -31,8 +31,11 @@ def __init__(self):
       'complex' : {'learning_rate': 0.05,'hidden_size':200,'batch_size':5000,'epochs':1000,'optimizer':'adagrad','sampling':"uniform",'neg_rate':1,'lmbda':0.0001},
       'distmult': {'learning_rate': 0.1,'hidden_size':100,'batch_size':50000,'epochs':1000,'optimizer':'adagrad','sampling':"uniform",'neg_rate':1,'lmbda':0.0001},
       'proje_po': {'learning_rate': 0.01,'hidden_dropout': 0.5, 'hidden_size':200,'batch_size':200,' epochs':100, 'optimizer':'adam','lmbda':0.00001},
-      'conve' : {'learning_rate': 0.003,'optimizer':'adam', 'label_smoothing':0.1, 'batch_size':128, 'hidden_size':200, 'hidden_size_1':20, 'input_dropout':0.2, 'feature_map_dropout':0.2, 'hidden_dropout':0.3,'neg_rate':0, 'epochs':100},
-      'convkb' : {'lmbda': 0.001,'filter_sizes':[1,2],'num_filters':50,'learning_rate': 0.0001,'optimizer':'adam','hidden_size': 100,'batch_size': 128,'epochs':200,'neg_rate':1}
+      'conve' : {'learning_rate': 0.003,'optimizer':'adam', 'label_smoothing':0.1, 'batch_size':128, 'hidden_size':200, 'hidden_size_1':20, 'input_dropout':0.2, 'feature_map_dropout':0.2, 'hidden_dropout':0.3,'neg_rate':0},
+      'convkb' : {'lmbda': 0.001,'filter_sizes':[1,2],'num_filters':50,'learning_rate': 0.0001,'optimizer':'adam','hidden_size': 100,'batch_size': 128,'epochs':200,'neg_rate':1},
+      'cp': {'learning_rate': 0.01, 'hidden_size': 50, 'batch_size': 128, 'epochs': 50, 'optimizer': 'adagrad', 'sampling': "uniform", 'neg_rate': 1, 'lmbda': 0.0001},
+      'analogy': {'learning_rate': 0.1, 'hidden_size': 200, 'batch_size': 128, 'epochs': 500, 'optimizer': 'adagrad', 'sampling': "uniform", 'neg_rate': 1, 'lmbda': 0.0001},
+      'simple': {'learning_rate': 0.05, 'hidden_size': 100, 'batch_size': 128, 'epochs': 1000, 'optimizer': 'adagrad', 'sampling': "uniform", 'neg_rate': 1, 'lmbda': 0.1}
       }
     }
 
@@ -698,4 +701,109 @@ def __init__(self):
         self.training_threshold = [1.0, 2.0, 3.0]
         self.ncluster = [3, 4, 5, 6, 7]
         self.CRP_factor = [0.01, 0.05, 0.1]
-        self.weight_norm = [True, False]
\ No newline at end of file
+        self.weight_norm = [True, False]
+
+
+class CPParams:
+    """This class defines the hyperparameters and their ranges for tuning the Canonical Tensor Decomposition algorithm.
+
+    CPParams defines all the possible values to be tuned for the algorithm. Users may
+    change these values directly to perform the Bayesian optimization of the hyper-parameters.
+
+    Args:
+      lmbda (list): List of floating point values.
+      feature_map_dropout (list): List of floating point values.
+      input_dropout (list): List of floating point values.
+      hidden_dropout (list): List of floating point values.
+      use_bias (list): List of boolean values.
+      label_smoothing (list): List of floating point values.
+      lr_decay (list): List of floating point values.
+      learning_rate (list): List of floating point values.
+      L1_flag (list): List of boolean values.
+      hidden_size (list): List of integer values.
+      batch_size (list): List of integer values.
+      epochs (list): List of integer values.
+      margin (list): List of floating point values.
+      optimizer (list): List of strings defining the optimization algorithm to be used.
+      sampling (list): List of strings defining the sampling to be used for generating negative examples.
+
+    """
+
+    def __init__(self):
+        self.search_space = {
+            'learning_rate': hp.loguniform('learning_rate', np.log(0.00001), np.log(0.1)),
+            'hidden_size': scope.int(hp.qloguniform('hidden_size', np.log(8), np.log(256), 1)),
+            'batch_size': scope.int(hp.qloguniform('batch_size', np.log(8), np.log(4096), 1)),
+            'lmbda': hp.loguniform('lmbda', np.log(0.00001), np.log(0.001)),
+            'optimizer': hp.choice('optimizer', ["adam", "sgd", 'rms']),
+            'epochs': hp.choice('epochs', [10])  # always choose 10 training epochs.
+        }
+
+class ANALOGYParams:
+    """This class defines the hyperparameters and their ranges for tuning the ANALOGY algorithm.
+
+    ANALOGYParams defines all the possible values to be tuned for the algorithm. Users may
+    change these values directly to perform the Bayesian optimization of the hyper-parameters.
+
+    Args:
+      lmbda (list): List of floating point values.
+      feature_map_dropout (list): List of floating point values.
+      input_dropout (list): List of floating point values.
+      hidden_dropout (list): List of floating point values.
+      use_bias (list): List of boolean values.
+      label_smoothing (list): List of floating point values.
+      lr_decay (list): List of floating point values.
+      learning_rate (list): List of floating point values.
+      L1_flag (list): List of boolean values.
+      hidden_size (list): List of integer values.
+      batch_size (list): List of integer values.
+      epochs (list): List of integer values.
+      margin (list): List of floating point values.
+      optimizer (list): List of strings defining the optimization algorithm to be used.
+      sampling (list): List of strings defining the sampling to be used for generating negative examples.
+    """
+
+    def __init__(self):
+        self.search_space = {
+            'learning_rate': hp.loguniform('learning_rate', np.log(0.00001), np.log(0.1)),
+            'hidden_size': scope.int(hp.qloguniform('hidden_size', np.log(8), np.log(256), 1)),
+            'batch_size': scope.int(hp.qloguniform('batch_size', np.log(8), np.log(4096), 1)),
+            'lmbda': hp.loguniform('lmbda', np.log(0.00001), np.log(0.001)),
+            'optimizer': hp.choice('optimizer', ["adam", "sgd", 'rms']),
+            'epochs': hp.choice('epochs', [10])  # always choose 10 training epochs.
+        }
+
+class SimplEParams:
+    """This class defines the hyperparameters and their ranges for tuning the SimplE algorithm.
+
+    SimplEParams defines all the possible values to be tuned for the algorithm. Users may
+    change these values directly to perform the Bayesian optimization of the hyper-parameters.
+
+    Args:
+      lmbda (list): List of floating point values.
+      feature_map_dropout (list): List of floating point values.
+      input_dropout (list): List of floating point values.
+      hidden_dropout (list): List of floating point values.
+      use_bias (list): List of boolean values.
+      label_smoothing (list): List of floating point values.
+      lr_decay (list): List of floating point values.
+      learning_rate (list): List of floating point values.
+      L1_flag (list): List of boolean values.
+      hidden_size (list): List of integer values.
+      batch_size (list): List of integer values.
+      epochs (list): List of integer values.
+      margin (list): List of floating point values.
+      optimizer (list): List of strings defining the optimization algorithm to be used.
+      sampling (list): List of strings defining the sampling to be used for generating negative examples.
+
+    """
+
+    def __init__(self):
+        self.search_space = {
+            'learning_rate': hp.loguniform('learning_rate', np.log(0.00001), np.log(0.1)),
+            'hidden_size': scope.int(hp.qloguniform('hidden_size', np.log(8), np.log(256), 1)),
+            'batch_size': scope.int(hp.qloguniform('batch_size', np.log(8), np.log(4096), 1)),
+            'lmbda': hp.loguniform('lmbda', np.log(0.00001), np.log(0.001)),
+            'optimizer': hp.choice('optimizer', ["adam", "sgd", 'rms']),
+            'epochs': hp.choice('epochs', [10])  # always choose 10 training epochs.
+        }
diff --git a/pykg2vec/core/ANALOGY.py b/pykg2vec/core/ANALOGY.py
new file mode 100644
index 0000000..c0c2b92
--- /dev/null
+++ b/pykg2vec/core/ANALOGY.py
@@ -0,0 +1,101 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from pykg2vec.core.KGMeta import ModelMeta
+from pykg2vec.utils.generator import TrainingStrategy
+
+
+class ANALOGY(ModelMeta):
+
+    def __init__(self, config):
+        super(ANALOGY, self).__init__()
+        self.config = config
+        self.model_name = 'ANALOGY'
+        self.training_strategy = TrainingStrategy.POINTWISE_BASED
+
+    def def_parameters(self):
+        """Defines the model parameters.
+
+        Attributes:
+            num_total_ent (int): Total number of entities.
+            num_total_rel (int): Total number of relations.
+            k (Tensor): Size of the latent dimension for entities and relations.
+            ent_embeddings (Tensor Variable): Lookup variable containing embedding of the entities.
+            rel_embeddings (Tensor Variable): Lookup variable containing embedding of the relations.
+            ent_embeddings_real (Tensor Variable): Lookup variable containing real values of the entities.
+            ent_embeddings_img (Tensor Variable): Lookup variable containing imaginary values of the entities.
+            rel_embeddings_real (Tensor Variable): Lookup variable containing real values of the relations.
+            rel_embeddings_img (Tensor Variable): Lookup variable containing imaginary values of the relations.
+            parameter_list (list): List of Tensor parameters.
+        """
+        num_total_ent = self.config.kg_meta.tot_entity
+        num_total_rel = self.config.kg_meta.tot_relation
+        k = self.config.hidden_size
+
+        emb_initializer = tf.initializers.glorot_normal()
+        self.ent_embeddings = tf.Variable(emb_initializer(shape=(num_total_ent, k)), name="ent_embedding")
+        self.rel_embeddings = tf.Variable(emb_initializer(shape=(num_total_rel, k)), name="rel_embedding")
+        self.ent_embeddings_real = tf.Variable(emb_initializer(shape=(num_total_ent, k // 2)), name="emb_e_real")
+        self.ent_embeddings_img = tf.Variable(emb_initializer(shape=(num_total_ent, k // 2)), name="emb_e_img")
+        self.rel_embeddings_real = tf.Variable(emb_initializer(shape=(num_total_rel, k // 2)), name="emb_rel_real")
+        self.rel_embeddings_img = tf.Variable(emb_initializer(shape=(num_total_rel, k // 2)), name="emb_rel_img")
+        self.parameter_list = [self.ent_embeddings, self.rel_embeddings, self.ent_embeddings_real, self.ent_embeddings_img, self.rel_embeddings_real, self.rel_embeddings_img]
+
+    def embed(self, h, r, t):
+        """Function to get the embedding value.
+
+        Args:
+            h (Tensor): Head entity ids.
+            r (Tensor): Relation ids of the triple.
+            t (Tensor): Tail entity ids of the triple.
+
+        Returns:
+            Tensors: Returns head, relation and tail embedding Tensors.
+        """
+        h_emb = tf.nn.embedding_lookup(self.ent_embeddings, h)
+        r_emb = tf.nn.embedding_lookup(self.rel_embeddings, r)
+        t_emb = tf.nn.embedding_lookup(self.ent_embeddings, t)
+
+        return h_emb, r_emb, t_emb
+
+    def embed2(self, h, r, t):
+        """Function to get the embedding value.
+
+        Args:
+            h (Tensor): Head entity ids.
+            r (Tensor): Relation ids of the triple.
+            t (Tensor): Tail entity ids of the triple.
+
+        Returns:
+            Tensors: Returns real and imaginary values of head, relation and tail embedding.
+        """
+        h_emb_real = tf.nn.embedding_lookup(self.ent_embeddings_real, h)
+        h_emb_img = tf.nn.embedding_lookup(self.ent_embeddings_img, h)
+
+        r_emb_real = tf.nn.embedding_lookup(self.rel_embeddings_real, r)
+        r_emb_img = tf.nn.embedding_lookup(self.rel_embeddings_img, r)
+
+        t_emb_real = tf.nn.embedding_lookup(self.ent_embeddings_real, t)
+        t_emb_img = tf.nn.embedding_lookup(self.ent_embeddings_img, t)
+
+        return h_emb_real, h_emb_img, r_emb_real, r_emb_img, t_emb_real, t_emb_img
+
+    def forward(self, h, r, t):
+        h_e, r_e, t_e = self.embed(h, r, t)
+        h_e_real, h_e_img, r_e_real, r_e_img, t_e_real, t_e_img = self.embed2(h, r, t)
+
+        complex_loss = -tf.reduce_sum(h_e_real * t_e_real * r_e_real + h_e_img * t_e_img * r_e_real + h_e_real * t_e_img * r_e_img - h_e_img * t_e_real * r_e_img, -1)
+        distmult_loss = -tf.reduce_sum(h_e * r_e * t_e, -1)
+        return complex_loss + distmult_loss
+
+    def get_reg(self, h, r, t):
+        h_e, r_e, t_e = self.embed(h, r, t)
+        h_e_real, h_e_img, r_e_real, r_e_img, t_e_real, t_e_img = self.embed2(h, r, t)
+
+        regul_term = tf.reduce_mean(tf.reduce_sum(h_e_real**2, -1) + tf.reduce_sum(h_e_img**2, -1) + tf.reduce_sum(r_e_real**2, -1) +
+                                    tf.reduce_sum(r_e_img**2, -1) + tf.reduce_sum(t_e_real**2, -1) + tf.reduce_sum(t_e_img**2, -1) +
+                                    tf.reduce_sum(h_e**2, -1) + tf.reduce_sum(r_e**2, -1) + tf.reduce_sum(t_e**2, -1))
+        return self.config.lmbda*regul_term
diff --git a/pykg2vec/core/CP.py b/pykg2vec/core/CP.py
new file mode 100644
index 0000000..da0b647
--- /dev/null
+++ b/pykg2vec/core/CP.py
@@ -0,0 +1,71 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import tensorflow as tf
+
+from pykg2vec.core.KGMeta import ModelMeta
+from pykg2vec.utils.generator import TrainingStrategy
+
+
+class CP(ModelMeta):
+
+    def __init__(self, config):
+        super(CP, self).__init__()
+        self.config = config
+        self.model_name = 'CP'
+        self.training_strategy = TrainingStrategy.POINTWISE_BASED
+
+    def def_parameters(self):
+        """Defines the model parameters.
+
+        Attributes:
+            num_total_ent (int): Total number of entities.
+            num_total_rel (int): Total number of relations.
+            k (Tensor): Size of the latent dimension for entities and relations.
+            sub_embeddings (Tensor Variable): Lookup variable containing embedding of the head entities.
+            rel_embeddings (Tensor Variable): Lookup variable containing embedding of the relations.
+            obj_embeddings (Tensor Variable): Lookup variable containing embedding of the tail entities.
+            parameter_list (list): List of Tensor parameters.
+        """
+        num_total_ent = self.config.kg_meta.tot_entity
+        num_total_rel = self.config.kg_meta.tot_relation
+        k = self.config.hidden_size
+
+        emb_initializer = tf.initializers.glorot_normal()
+        self.sub_embeddings = tf.Variable(emb_initializer(shape=(num_total_ent, k)), name="sub_embedding")
+        self.rel_embeddings = tf.Variable(emb_initializer(shape=(num_total_rel, k)), name="rel_embedding")
+        self.obj_embeddings = tf.Variable(emb_initializer(shape=(num_total_ent, k)), name="obj_embedding")
+        self.parameter_list = [self.sub_embeddings, self.rel_embeddings, self.obj_embeddings]
+
+
+    def embed(self, h, r, t):
+        """Function to get the embedding value.
+
+        Args:
+            h (Tensor): Head entity ids.
+            r (Tensor): Relation ids of the triple.
+            t (Tensor): Tail entity ids of the triple.
+
+        Returns:
+            Tensors: Returns head, relation and tail embedding Tensors.
+        """
+        emb_h = tf.nn.embedding_lookup(self.sub_embeddings, h)
+        emb_r = tf.nn.embedding_lookup(self.rel_embeddings, r)
+        emb_t = tf.nn.embedding_lookup(self.obj_embeddings, t)
+        return emb_h, emb_r, emb_t
+
+    def forward(self, h, r, t):
+        h_e, r_e, t_e = self.embed(h, r, t)
+        return -tf.reduce_sum(h_e * r_e * t_e, -1)
+
+    def get_reg(self, h, r, t, type='N3'):
+        h_e, r_e, t_e = self.embed(h, r, t)
+        if type.lower() == 'f2':
+            regul_term = tf.reduce_mean(tf.reduce_sum(h_e**2, -1) + tf.reduce_sum(r_e**2, -1) + tf.reduce_sum(t_e**2, -1))
+        elif type.lower() == 'n3':  # N3 penalizes the cube of the absolute embedding values.
+            regul_term = tf.reduce_mean(tf.reduce_sum(tf.abs(h_e)**3, -1) + tf.reduce_sum(tf.abs(r_e)**3, -1) + tf.reduce_sum(tf.abs(t_e)**3, -1))
+        else:
+            raise NotImplementedError('Unknown regularizer type: %s' % type)
+
+        return self.config.lmbda * regul_term
diff --git a/pykg2vec/core/DistMult.py b/pykg2vec/core/DistMult.py
index e994aa9..a4ce13a 100644
--- a/pykg2vec/core/DistMult.py
+++ b/pykg2vec/core/DistMult.py
@@ -86,7 +86,7 @@ def forward(self, h, r, t):
         h_e, r_e, t_e = self.embed(h, r, t)
         return -tf.reduce_sum(h_e*r_e*t_e, -1)
 
-    def get_regul(self, h, r, t):
+    def get_reg(self, h, r, t):
         h_e, r_e, t_e = self.embed(h, r, t)
         regul_term = tf.reduce_mean(tf.reduce_sum(h_e**2, -1) + tf.reduce_sum(r_e**2, -1) + tf.reduce_sum(t_e**2,-1))
         return self.config.lmbda*regul_term
\ No newline at end of file
diff --git a/pykg2vec/core/KGMeta.py b/pykg2vec/core/KGMeta.py
index 800ec3a..5adc33a 100644
--- a/pykg2vec/core/KGMeta.py
+++ b/pykg2vec/core/KGMeta.py
@@ -8,7 +8,6 @@
 from abc import ABCMeta, abstractmethod
 import tensorflow as tf
 
-from pykg2vec.utils.generator import TrainingStrategy
 
 class ModelMeta(tf.keras.Model):
     """ Meta Class for knowledge graph embedding algorithms"""
diff --git a/pykg2vec/core/SimplE.py b/pykg2vec/core/SimplE.py
new file mode 100644
index 0000000..f6456bf
--- /dev/null
+++ b/pykg2vec/core/SimplE.py
@@ -0,0 +1,112 @@
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+import tensorflow as tf
+
+from pykg2vec.core.KGMeta import ModelMeta
+from pykg2vec.utils.generator import TrainingStrategy
+
+
+class SimplE(ModelMeta):
+
+    def __init__(self, config):
+        super(SimplE, self).__init__()
+        self.config = config
+        self.model_name = 'SimplE_avg'
+        self.training_strategy = TrainingStrategy.POINTWISE_BASED
+
+    def def_parameters(self):
+        """Defines the model parameters.
+
+        Attributes:
+            num_total_ent (int): Total number of entities.
+            num_total_rel (int): Total number of relations.
+            k (Tensor): Size of the latent dimension for entities and relations.
+            ent_head_embeddings (Tensor Variable): Lookup variable containing embedding of the head entities.
+            ent_tail_embeddings (Tensor Variable): Lookup variable containing embedding of the tail entities.
+            rel_embeddings (Tensor Variable): Lookup variable containing embedding of the relations.
+            rel_inv_embeddings (Tensor Variable): Lookup variable containing embedding of the inverse relations.
+            parameter_list (list): List of Tensor parameters.
+        """
+        num_total_ent = self.config.kg_meta.tot_entity
+        num_total_rel = self.config.kg_meta.tot_relation
+        k = self.config.hidden_size
+
+        emb_initializer = tf.initializers.glorot_normal()
+        self.ent_head_embeddings = tf.Variable(emb_initializer(shape=(num_total_ent, k)), name="ent_head_embedding")
+        self.ent_tail_embeddings = tf.Variable(emb_initializer(shape=(num_total_ent, k)), name="ent_tail_embedding")
+        self.rel_embeddings = tf.Variable(emb_initializer(shape=(num_total_rel, k)), name="rel_embedding")
+        self.rel_inv_embeddings = tf.Variable(emb_initializer(shape=(num_total_rel, k)), name="rel_inv_embedding")
+        self.parameter_list = [self.ent_head_embeddings, self.ent_tail_embeddings, self.rel_embeddings, self.rel_inv_embeddings]
+
+
+    def embed(self, h, r, t):
+        """Function to get the embedding value.
+
+        Args:
+            h (Tensor): Head entity ids.
+            r (Tensor): Relation ids of the triple.
+            t (Tensor): Tail entity ids of the triple.
+
+        Returns:
+            Tensors: Returns head, relation and tail embedding Tensors.
+        """
+        emb_h1 = tf.nn.embedding_lookup(self.ent_head_embeddings, h)
+        emb_h2 = tf.nn.embedding_lookup(self.ent_head_embeddings, t)
+        emb_r1 = tf.nn.embedding_lookup(self.rel_embeddings, r)
+        emb_r2 = tf.nn.embedding_lookup(self.rel_inv_embeddings, r)
+        emb_t1 = tf.nn.embedding_lookup(self.ent_tail_embeddings, t)
+        emb_t2 = tf.nn.embedding_lookup(self.ent_tail_embeddings, h)
+        return emb_h1, emb_h2, emb_r1, emb_r2, emb_t1, emb_t2
+
+    def forward(self, h, r, t):
+        h1_e, h2_e, r1_e, r2_e, t1_e, t2_e = self.embed(h, r, t)
+
+        init = (tf.reduce_sum(h1_e*r1_e*t1_e, 1) + tf.reduce_sum(h2_e*r2_e*t2_e, 1)) / 2.0  # average of the two directed scores
+        return -tf.clip_by_value(init, -20, 20)
+
+    def get_reg(self, h, r, t):
+        num_batch = math.ceil(self.config.kg_meta.tot_train_triples / self.config.batch_size)
+        regul_term = (tf.nn.l2_loss(self.ent_head_embeddings) + tf.nn.l2_loss(self.ent_tail_embeddings) +
+                      tf.nn.l2_loss(self.rel_embeddings) + tf.nn.l2_loss(self.rel_inv_embeddings)) / num_batch**2
+        return self.config.lmbda * regul_term
+
+
+class SimplE_ignr(SimplE):
+
+    def __init__(self, config):
+        super(SimplE, self).__init__()
+        self.config = config
+        self.model_name = 'SimplE_ignr'
+        self.training_strategy = TrainingStrategy.POINTWISE_BASED
+
+    def embed(self, h, r, t):
+        """Function to get the embedding value.
+
+        Args:
+            h (Tensor): Head entity ids.
+            r (Tensor): Relation ids of the triple.
+            t (Tensor): Tail entity ids of the triple.
+
+        Returns:
+            Tensors: Returns head, relation and tail embedding Tensors.
+ """ + + emb_h = tf.concat([tf.gather(self.ent_head_embeddings, h), tf.gather(self.ent_head_embeddings, t)], 1) + emb_r = tf.concat([tf.gather(self.rel_embeddings, r), tf.gather(self.rel_inv_embeddings, r)], 1) + emb_t = tf.concat([tf.gather(self.ent_tail_embeddings, t), tf.gather(self.ent_tail_embeddings, h)], 1) + + return emb_h, emb_r, emb_t + + def forward(self, h, r, t): + h_e, r_e, t_e = self.embed(h, r, t) + + init = tf.reduce_sum(h_e*r_e*t_e, 1) + return -tf.clip_by_value(init, -20, 20) + + def get_reg(self, h, r, t): + return 2.0 * super().get_reg(h, r, t) + + diff --git a/pykg2vec/core/TransR.py b/pykg2vec/core/TransR.py index 492b062..d0031a2 100644 --- a/pykg2vec/core/TransR.py +++ b/pykg2vec/core/TransR.py @@ -101,7 +101,7 @@ def embed(self, h, r, t): transform_h_e = tf.matmul(h_e, matrix) transform_t_e = tf.matmul(t_e, matrix) - # [b, d, 1] = [b, 1, k] * [b, k, d] + # [b, 1, d] = [b, 1, k] * [b, k, d] h_e = tf.squeeze(transform_h_e, axis=1) t_e = tf.squeeze(transform_t_e, axis=1) diff --git a/pykg2vec/test/conftest.py b/pykg2vec/test/conftest.py new file mode 100644 index 0000000..54448cb --- /dev/null +++ b/pykg2vec/test/conftest.py @@ -0,0 +1,7 @@ +import pytest +import tensorflow as tf + +@pytest.fixture(scope="session", autouse=True) +def switch_on_eager_execution(request): + """Setup eager execution within the pytest runtime for better visibility to Coverage.py""" + tf.config.experimental_run_functions_eagerly(True) \ No newline at end of file diff --git a/pykg2vec/test/resource/dataset.tar.gz b/pykg2vec/test/resource/dataset.tar.gz new file mode 100644 index 0000000..15a255f Binary files /dev/null and b/pykg2vec/test/resource/dataset.tar.gz differ diff --git a/pykg2vec/test/resource/dataset.tgz b/pykg2vec/test/resource/dataset.tgz new file mode 100644 index 0000000..15a255f Binary files /dev/null and b/pykg2vec/test/resource/dataset.tgz differ diff --git a/pykg2vec/test/resource/dataset.zip b/pykg2vec/test/resource/dataset.zip new file mode 100644 index 0000000..f078da3 Binary files /dev/null and b/pykg2vec/test/resource/dataset.zip differ diff --git a/pykg2vec/test/test_inference.py b/pykg2vec/test/test_inference.py index 6dcad38..bda9a54 100644 --- a/pykg2vec/test/test_inference.py +++ b/pykg2vec/test/test_inference.py @@ -57,10 +57,12 @@ def testing_function_with_args(name, l1_flag, distance_measure=None, bilinear=No trainer.exit_interactive_mode() @pytest.mark.parametrize("model_name", [ + 'analogy', 'complex', 'complexn3', # 'conve', # 'convkb', + 'cp', 'distmult', 'hole', 'kg2e', @@ -69,6 +71,8 @@ def testing_function_with_args(name, l1_flag, distance_measure=None, bilinear=No # 'proje_pointwise', 'rotate', 'rescal', + 'simple', + 'simple_ignr', 'slm', 'sme', 'transd', diff --git a/pykg2vec/test/test_kg.py b/pykg2vec/test/test_kg.py index 546b475..1737406 100644 --- a/pykg2vec/test/test_kg.py +++ b/pykg2vec/test/test_kg.py @@ -1,5 +1,6 @@ import os, pytest -from pykg2vec.utils.kgcontroller import KnowledgeGraph +from pathlib import Path +from pykg2vec.utils.kgcontroller import KnowledgeGraph, KnownDataset @pytest.mark.parametrize("dataset_name", ["freebase15k", "wordnet18", "wordnet18_rr", "yago3_10"]) def test_benchmarks(dataset_name): @@ -39,7 +40,7 @@ def test_fb15k_meta(): def test_userdefined_dataset(): - custom_dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "resource/custom_dataset") + custom_dataset_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resource', 'custom_dataset') knowledge_graph = 
KnowledgeGraph(dataset="userdefineddataset", custom_dataset_path=custom_dataset_path)
     knowledge_graph.prepare_data()
     knowledge_graph.dump()
@@ -62,3 +63,21 @@ def test_userdefined_dataset():
     assert knowledge_graph.kg_meta.tot_valid_triples == 1
     assert knowledge_graph.kg_meta.tot_entity == 6
     assert knowledge_graph.kg_meta.tot_relation == 3
+
+@pytest.mark.parametrize('file_name, new_ext', [
+    ('dataset.tar.gz', 'tgz'),
+    ('dataset.tgz', 'tgz'),
+    ('dataset.zip', 'zip'),
+])
+def test_extract_compressed_dataset(file_name, new_ext):
+    url = Path(os.path.join(os.path.dirname(os.path.abspath(__file__)), 'resource', file_name)).absolute().as_uri()
+    dataset_name = 'test_dataset_%s' % file_name.replace('.', '_')
+    dataset = KnownDataset(dataset_name, url, 'userdefineddataset-')
+    dataset_dir = os.path.join(dataset.dataset_home_path, dataset_name)
+    dataset_files = os.listdir(dataset_dir)
+
+    assert len(dataset_files) == 4
+    assert dataset_name + '.' + new_ext in dataset_files
+    assert 'userdefineddataset-train.txt' in dataset_files
+    assert 'userdefineddataset-test.txt' in dataset_files
+    assert 'userdefineddataset-valid.txt' in dataset_files
diff --git a/pykg2vec/test/test_model.py b/pykg2vec/test/test_model.py
index 11ec264..7eb30ae 100644
--- a/pykg2vec/test/test_model.py
+++ b/pykg2vec/test/test_model.py
@@ -5,10 +5,33 @@
 """
 import pytest
-
-from pykg2vec.config.config import *
+from pykg2vec.config.config import KGEArgParser, Importer
 from pykg2vec.utils.trainer import Trainer
 from pykg2vec.utils.kgcontroller import KnowledgeGraph
+
+
+@pytest.mark.parametrize("model_name", [
+    'analogy',
+    'complex',
+    'complexn3',
+    'cp',
+    'distmult',
+    'hole',
+    'proje_pointwise',
+    'rescal',
+    'rotate',
+    'simple',
+    'simple_ignr',
+    'slm',
+    'transe',
+    'transh',
+    'transr',
+    'transd',
+    'transm',
+])
+def test_KGE_methods(model_name):
+    """Function to test a set of KGE algorithms."""
+    testing_function(model_name)
 
 
 @pytest.mark.skip(reason="This is a functional method.")
@@ -16,7 +39,7 @@ def testing_function(name, distance_measure=None, bilinear=None, display=False,
     """Function to test the models with arguments."""
     # getting the customized configurations from the command-line arguments.
     args = KGEArgParser().get_args(['-exp', 'True'])
-    
+
     # Preparing data and cache the data for later usage
     knowledge_graph = KnowledgeGraph(dataset=args.dataset_name)
     knowledge_graph.prepare_data()
@@ -24,7 +47,7 @@ def testing_function(name, distance_measure=None, bilinear=None, display=False,
     # Extracting the corresponding model config and definition from Importer().
     config_def, model_def = Importer().import_model_config(name)
     config = config_def(args)
-    
+
     config.epochs = 1
     config.test_step = 1
     config.test_num = 10
@@ -47,11 +70,6 @@ def testing_function(name, distance_measure=None, bilinear=None, display=False,
     trainer.build_model()
     trainer.train_model()
 
-@pytest.mark.parametrize("model_name", ['complex', 'distmult', 'proje_pointwise', 'rescal', 'rotate', 'slm', 'transe', 'transh', 'transr', 'transd', 'transm', 'hole'])
-def test_KGE_methods(model_name):
-    """Function to test a set of KGE algorithsm."""
-    testing_function(model_name)
-
 def test_NTN():
     testing_function('ntn', ent_hidden_size=10, rel_hidden_size=10) # for avoiding OOM.
@@ -63,7 +81,7 @@ def test_ConvKB(): def test_KG2E_EL_args(): """Function to test KG2E Algorithm with arguments.""" - testing_function('kg2e', distance_measure="expected_likelihood") + testing_function('kg2e_el', distance_measure="expected_likelihood") def test_KG2E_KL_args(): """Function to test KG2E Algorithm with arguments.""" @@ -75,7 +93,7 @@ def test_SMEL_args(): def test_SMEB_args(): """Function to test SME Algorithm with arguments.""" - testing_function('sme', bilinear=True) + testing_function('sme_bl', bilinear=True) def test_transE_display(): """Function to test transE display.""" diff --git a/pykg2vec/test/test_trainer.py b/pykg2vec/test/test_trainer.py index e693a1a..c310ee0 100644 --- a/pykg2vec/test/test_trainer.py +++ b/pykg2vec/test/test_trainer.py @@ -4,16 +4,11 @@ This module is for testing unit functions of training """ import pytest -import tensorflow as tf from pykg2vec.config.config import KGEArgParser, Importer from pykg2vec.utils.trainer import Trainer, Monitor from pykg2vec.utils.kgcontroller import KnowledgeGraph -@pytest.fixture(scope="session", autouse=True) -def run_tf_function_eagerly(request): - tf.config.experimental_run_functions_eagerly(True) - @pytest.mark.skip(reason="This is a functional method.") def get_model(result_path_dir, configured_epochs, patience, config_key): args = KGEArgParser().get_args([]) diff --git a/pykg2vec/test/test_tune_model.py b/pykg2vec/test/test_tune_model.py index acc8ee1..f7a5b73 100644 --- a/pykg2vec/test/test_tune_model.py +++ b/pykg2vec/test/test_tune_model.py @@ -31,20 +31,27 @@ def tunning_function(name): @pytest.mark.parametrize('model_name', [ + 'analogy', + 'complex', + 'complexn3', + 'cp', + 'distmult', + 'hole', + 'kg2e', + 'kg2e_el', + 'ntn', + 'rescal', + 'rotate', + 'simple', + 'simple_ignr', + 'slm', + 'sme', + 'sme_bl', 'transe', 'transh', 'transm', - 'rescal', - 'sme', 'transd', 'transr', - 'ntn', - 'slm', - 'hole', - 'rotate', - 'kg2e', - 'complex', - 'distmult' ]) def test_tuning(model_name): """Function to test the tuning function.""" diff --git a/pykg2vec/utils/bayesian_optimizer.py b/pykg2vec/utils/bayesian_optimizer.py index 4605e0f..30ae717 100644 --- a/pykg2vec/utils/bayesian_optimizer.py +++ b/pykg2vec/utils/bayesian_optimizer.py @@ -19,17 +19,50 @@ config_path = "pykg2vec.config.config" hyper_param_path = "pykg2vec.config.hyperparams" -modelMap = {"complex": "Complex", +moduleMap = {"analogy": "ANALOGY", + "complex": "Complex", + "complexn3": "Complex", + "conve": "ConvE", + "cp": "CP", + "hole": "HoLE", + "distmult": "DistMult", + "kg2e": "KG2E", + "kg2e_el": "KG2E", + "ntn": "NTN", + "proje_pointwise": "ProjE_pointwise", + "rescal": "Rescal", + "rotate": "RotatE", + "simple": "SimplE", + "simple_ignr": "SimplE", + "slm": "SLM", + "sme": "SME", + "sme_bl": "SME", + "transd": "TransD", + "transe": "TransE", + "transg": "TransG", + "transh": "TransH", + "transm": "TransM", + "transr": "TransR", + "tucker": "TuckER"} + +modelMap = {"analogy": "ANALOGY", + "complex": "Complex", + "complexn3": "ComplexN3", "conve": "ConvE", + "cp": "CP", "hole": "HoLE", "distmult": "DistMult", "kg2e": "KG2E", + "kg2e_el": "KG2E_EL", "ntn": "NTN", "proje_pointwise": "ProjE_pointwise", "rescal": "Rescal", "rotate": "RotatE", + "simple": "SimplE", + "simple_ignr": "SimplE_ignr", "slm": "SLM", "sme": "SME", + "sme_bl": "SME_BL", "transd": "TransD", "transe": "TransE", "transg": "TransG", @@ -38,18 +71,24 @@ "transr": "TransR", "tucker": "TuckER"} - -configMap = {"complex": "ComplexConfig", +configMap = {"analogy": 
"ANALOGYConfig", + "complex": "ComplexConfig", + "complexn3": "ComplexConfig", "conve": "ConvEConfig", + "cp": "CPConfig", "hole": "HoLEConfig", "distmult": "DistMultConfig", "kg2e": "KG2EConfig", + "kg2e_el": "KG2EConfig", "ntn": "NTNConfig", "proje_pointwise": "ProjE_pointwiseConfig", "rescal": "RescalConfig", "rotate": "RotatEConfig", + "simple": "SimplEConfig", + "simple_ignr": "SimplEConfig", "slm": "SLMConfig", "sme": "SMEConfig", + "sme_bl": "SMEConfig", "transd": "TransDConfig", "transe": "TransEConfig", "transg": "TransGConfig", @@ -58,18 +97,24 @@ "transr": "TransRConfig", "tucker": "TuckERConfig"} - -hypMap = {"complex": "ComplexParams", +hypMap = {"analogy": "ANALOGYParams", + "complex": "ComplexParams", + "complexn3": "ComplexParams", "conve": "ConvEParams", + "cp": "CPParams", "hole": "HoLEParams", "distmult": "DistMultParams", "kg2e": "KG2EParams", + "kg2e_el": "KG2EParams", "ntn": "NTNParams", "proje_pointwise": "ProjE_pointwiseParams", "rescal": "RescalParams", "rotate": "RotatEParams", + "simple": "SimplEParams", + "simple_ignr": "SimplEParams", "slm": "SLMParams", "sme": "SMEParams", + "sme_bl": "SMEParams", "transd": "TransDParams", "transe": "TransEParams", "transg": "TransGParams", @@ -111,7 +156,7 @@ def __init__(self, args=None): self.knowledge_graph = KnowledgeGraph(dataset=args.dataset_name, custom_dataset_path=args.dataset_path) hyper_params = None try: - self.model_obj = getattr(importlib.import_module(model_path + ".%s" % modelMap[model_name]), + self.model_obj = getattr(importlib.import_module(model_path + ".%s" % moduleMap[model_name]), modelMap[model_name]) self.config_obj = getattr(importlib.import_module(config_path), configMap[model_name]) hyper_params = getattr(importlib.import_module(hyper_param_path), hypMap[model_name])() diff --git a/pykg2vec/utils/generator.py b/pykg2vec/utils/generator.py index 9df5b3f..d09db62 100644 --- a/pykg2vec/utils/generator.py +++ b/pykg2vec/utils/generator.py @@ -325,6 +325,6 @@ def start_one_epoch(self, num_batch): class TrainingStrategy(Enum): - PROJECTION_BASED = "projection_based" - PAIRWISE_BASED = "pairwise_based" - POINTWISE_BASED = "pointwise_based" + PROJECTION_BASED = "projection_based" # matching models with neural network + PAIRWISE_BASED = "pairwise_based" # translational distance models + POINTWISE_BASED = "pointwise_based" # semantic matching models diff --git a/pykg2vec/utils/kgcontroller.py b/pykg2vec/utils/kgcontroller.py index ef5ecad..9831aa3 100644 --- a/pykg2vec/utils/kgcontroller.py +++ b/pykg2vec/utils/kgcontroller.py @@ -3,9 +3,7 @@ """ This module is for controlling knowledge graph """ - - -import shutil, tarfile, pickle, time +import shutil, tarfile, pickle, time, os, zipfile import urllib.request from pathlib import Path from collections import defaultdict @@ -91,7 +89,7 @@ def __init__(self, tot_entity=None, self.tot_entity = tot_entity -def extract(tar_path, extract_path='.'): +def extract_tar(tar_path, extract_path='.'): """This function extracts the tar file. Most of the knowledge graph dataset are donwloaded in a compressed @@ -108,7 +106,23 @@ def extract(tar_path, extract_path='.'): for item in tar: tar.extract(item, extract_path) if item.name.find(".tgz") != -1 or item.name.find(".tar") != -1: - extract(item.name, "./" + item.name[:item.name.rfind('/')]) + extract_tar(item.name, "./" + item.name[:item.name.rfind('/')]) + +def extract_zip(zip_path, extract_path='.'): + """This function extracts the zip file. 
+
+    Most of the knowledge graph datasets are downloaded in a compressed
+    zip format. This function is used to extract them.
+
+    Args:
+        zip_path (str): Location of the zip file.
+        extract_path (str): Path where the files will be decompressed.
+
+    Todo:
+        * Move this module to utils!
+    """
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_ref.extractall(extract_path)
 
 
 class KnownDataset:
@@ -152,13 +166,14 @@ def __init__(self, name, url, prefix):
         self.dataset_home_path = self.dataset_home_path.resolve()
         self.root_path = self.dataset_home_path / self.name
         self.tar = self.root_path / ('%s.tgz' % self.name)
+        self.zip = self.root_path / ('%s.zip' % self.name)
 
         if not self.root_path.exists():
             self.download()
             self.extract()
 
         path_eq_root = ['YAGO3_10', 'WN18RR', 'FB15K_237', 'Kinship',
-                        'Nations', 'UMLS']
+                        'Nations', 'UMLS', 'NELL_995']
         if self.name == 'WN18':
             self.dataset_path = self.root_path / 'wordnet-mlj12'
         elif self.name in path_eq_root:
@@ -194,17 +209,29 @@ def download(self):
         self._logger.info("Downloading the dataset %s" % self.name)
 
         self.root_path.mkdir()
-        with urllib.request.urlopen(self.url) as response, open(str(self.tar), 'wb') as out_file:
-            shutil.copyfileobj(response, out_file)
+        if self.url.endswith('.tar.gz') or self.url.endswith('.tgz'):
+            with urllib.request.urlopen(self.url) as response, open(str(self.tar), 'wb') as out_file:
+                shutil.copyfileobj(response, out_file)
+        elif self.url.endswith('.zip'):
+            with urllib.request.urlopen(self.url) as response, open(str(self.zip), 'wb') as out_file:
+                shutil.copyfileobj(response, out_file)
+        else:
+            raise NotImplementedError("Unknown compression format")
 
     def extract(self):
-        ''' Extract the downloaded tar under the folder with the given dataset name'''
-        self._logger.info("Extracting the downloaded dataset from %s to %s" % (self.tar, self.root_path))
+        ''' Extract the downloaded file under the folder with the given dataset name'''
 
         try:
-            extract(str(self.tar), str(self.root_path))
+            if os.path.exists(self.tar):
+                self._logger.info("Extracting the downloaded dataset from %s to %s" % (self.tar, self.root_path))
+                extract_tar(str(self.tar), str(self.root_path))
+                return
+            if os.path.exists(self.zip):
+                self._logger.info("Extracting the downloaded dataset from %s to %s" % (self.zip, self.root_path))
+                extract_zip(str(self.zip), str(self.root_path))
+                return
         except Exception as e:
-            self._logger.info("Could not extract the tgz file!")
+            self._logger.info("Could not extract the target file!")
             self._logger.info("%s %s" % (type(e), e.args))
 
     def read_metadata(self):
@@ -403,6 +430,26 @@ def __init__(self):
         KnownDataset.__init__(self, name, url, prefix)
 
 
+class NELL_995(KnownDataset):
+    """This data structure defines the necessary information for downloading the NELL-995 dataset.
+
+    NELL-995 module inherits the KnownDataset class for processing
+    the knowledge graph dataset.
+
+    Attributes:
+        name (str): Name of the dataset.
+        url (str): The full url where the dataset resides.
+        prefix (str): The prefix of the dataset given the website.
+
+    """
+    def __init__(self):
+        name = "NELL_995"
+        url = "https://github.com/louisccc/KGppler/raw/master/datasets/NELL_995.zip"
+        prefix = ''
+
+        KnownDataset.__init__(self, name, url, prefix)
+
+
 class UserDefinedDataset(object):
     """The class consists of modules to handle the user defined datasets.
@@ -533,6 +580,8 @@ def __init__(self, dataset='Freebase15k', custom_dataset_path=None):
             self.dataset = Nations()
         elif dataset.lower() == 'umls':
             self.dataset = UMLS()
+        elif dataset.lower() == 'nell_995':
+            self.dataset = NELL_995()
         else:
             # if the dataset does not match with existing one, check if it exists in user's local space.
             # if it still can't find corresponding folder, raise exception in UserDefinedDataset.__init__()
diff --git a/pykg2vec/utils/trainer.py b/pykg2vec/utils/trainer.py
index b13696e..b9d026c 100644
--- a/pykg2vec/utils/trainer.py
+++ b/pykg2vec/utils/trainer.py
@@ -204,7 +204,7 @@ def train_step_pointwise(self, h, r, t, y):
 
             loss = tf.reduce_mean(tf.nn.softplus(y*preds))
 
-            if hasattr(self.model, 'get_reg'): # for complex & complex-N3 & DistMult
+            if hasattr(self.model, 'get_reg'): # for complex & complex-N3 & DistMult & CP & ANALOGY
                 loss += self.model.get_reg(h, r, t)
 
         gradients = tape.gradient(loss, self.model.trainable_variables)
diff --git a/pykg2vec/utils/visualization.py b/pykg2vec/utils/visualization.py
index 7244823..d51c2ee 100644
--- a/pykg2vec/utils/visualization.py
+++ b/pykg2vec/utils/visualization.py
@@ -52,10 +52,10 @@ def __init__(self, model=None, vis_opts=None):
 
         self.model = model
 
-        self.algo_list = ['Complex', 'ConvE','HoLE', 'DistMult', 'DistMult2', 'KG2E_EL','KG2E_KL',
-                          'KGMeta', 'NTN', 'ProjE_pointwise', 'Rescal',
-                          'RotatE', 'SLM', 'SME_Bilinear','SME_Linear', 'TransD', 'TransE', 'TransH',
-                          'TransM', 'TransR', 'TuckER']
+        self.algo_list = ['ANALOGY', 'Complex', 'ComplexN3', 'ConvE', 'CP', 'DistMult', 'DistMult2', 'HoLE',
+                          'KG2E_EL', 'KG2E_KL', 'KGMeta', 'NTN', 'ProjE_pointwise', 'Rescal', 'RotatE', 'SimplE_avg',
+                          'SimplE_ignr', 'SLM', 'SME_Bilinear', 'SME_Linear', 'TransD', 'TransE', 'TransH', 'TransM',
+                          'TransR', 'TuckER']
 
         self.h_name = []
         self.r_name = []
@@ -170,19 +170,21 @@ def plot_train_result(self):
             file_no = len([c for c in files_lwcase if a.lower() in c if 'training' in c])
             if file_no < 1:
                 continue
-            with open(str(path / (a + '_Training_results_' + str(file_no - 1) + '.csv')), 'r') as fh:
-                df_2 = pd.read_csv(fh)
-                if df.empty:
-                    df['Epochs'] = df_2['Epochs']
-                    df['Loss'] = df_2['Loss']
-                    df['Algorithm'] = [a] * len(df_2)
-                else:
-                    df_3 = pd.DataFrame()
-                    df_3['Epochs'] = df_2['Epochs']
-                    df_3['Loss'] = df_2['Loss']
-                    df_3['Algorithm'] = [a] * len(df_2)
-                    frames = [df, df_3]
-                    df = pd.concat(frames)
+            file_path = str(path / (a + '_Training_results_' + str(file_no - 1) + '.csv'))
+            if os.path.exists(file_path):
+                with open(file_path, 'r') as fh:
+                    df_2 = pd.read_csv(fh)
+                    if df.empty:
+                        df['Epochs'] = df_2['Epochs']
+                        df['Loss'] = df_2['Loss']
+                        df['Algorithm'] = [a] * len(df_2)
+                    else:
+                        df_3 = pd.DataFrame()
+                        df_3['Epochs'] = df_2['Epochs']
+                        df_3['Loss'] = df_2['Loss']
+                        df_3['Algorithm'] = [a] * len(df_2)
+                        frames = [df, df_3]
+                        df = pd.concat(frames)
         plt.figure()
         ax = seaborn.lineplot(x="Epochs", y="Loss", hue="Algorithm", markers=True, dashes=False, data=df)
         files = os.listdir(str(result))
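The three models added by this patch (CP, SimplE, ANALOGY) all follow the same pointwise pattern: `forward()` returns a negated multiplicative score, so smaller values mean more plausible triples, and `get_reg()` supplies the regularization penalty, which `train_step_pointwise` in `trainer.py` picks up generically via `hasattr(self.model, 'get_reg')`. A minimal standalone sketch of the three score functions on random toy tensors (variable names and shapes are illustrative only, not part of pykg2vec's API):

```python
import tensorflow as tf

batch, k = 2, 8  # two toy triples; k must be even so ANALOGY can split real/imaginary halves
h_e = tf.random.normal((batch, k))
r_e = tf.random.normal((batch, k))
t_e = tf.random.normal((batch, k))

# CP (core/CP.py): negated trilinear product of subject, relation and object embeddings.
cp_score = -tf.reduce_sum(h_e * r_e * t_e, -1)

# SimplE_avg (core/SimplE.py): mean of the forward score <h, r, t> and the
# inverse-relation score <t, r_inv, h>, clipped to [-20, 20] before negation.
r_inv_e = tf.random.normal((batch, k))
simple_score = -tf.clip_by_value(
    (tf.reduce_sum(h_e * r_e * t_e, 1) + tf.reduce_sum(t_e * r_inv_e * h_e, 1)) / 2.0,
    -20, 20)

# ANALOGY (core/ANALOGY.py): a DistMult-style term on the full-size embeddings plus a
# ComplEx-style term on separate k//2-dimensional real/imaginary embeddings.
h_re, h_im = tf.random.normal((batch, k // 2)), tf.random.normal((batch, k // 2))
r_re, r_im = tf.random.normal((batch, k // 2)), tf.random.normal((batch, k // 2))
t_re, t_im = tf.random.normal((batch, k // 2)), tf.random.normal((batch, k // 2))
complex_term = tf.reduce_sum(
    h_re * t_re * r_re + h_im * t_im * r_re + h_re * t_im * r_im - h_im * t_re * r_im, -1)
analogy_score = -tf.reduce_sum(h_e * r_e * t_e, -1) - complex_term

print(cp_score.numpy(), simple_score.numpy(), analogy_score.numpy())
```

Keeping the scoring in `forward()` and the penalty in a separate `get_reg()` is what lets the trainer add model-specific regularizers (L2 for SimplE and ANALOGY, F2/N3 for CP) without any per-model branching in the training loop.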