In [5]:
import os
import tensorflow as tf
import numpy as np
import math
import random

In [8]:
# Change the kernel's working directory to the project's python/ folder and list
# its contents. NOTE(review): the later `eukg` imports presumably resolve from
# this working directory -- confirm before removing this cell.
%cd ~/code/umls-embeddings/python
%ls

/shared/hltdir1/disk1/home/max/code/umls-embeddings/python
[0m[01;34mbin[0m/  [01;34mdata[0m/  [01;34meukg[0m/  __init__.py  [01;34m__pycache__[0m/  results.txt  Untitled.ipynb


In [9]:
from eukg.tf_util import Trainer, ModelSaver

from eukg.emb import EmbeddingModel
from eukg.gan import Generator, train_gan, Discriminator, DisGen
from eukg import Config
from eukg.data import data_util, DataGenerator, TfDataGenerator
from eukg.emb import AceModel
from eukg.tf_util import checkpoint_utils


In [18]:
class TestConfig:
  """Empty attribute container; every setting is attached to the instance after construction."""

  def __init__(self):
    pass


config = TestConfig()

# Populate the instance in a single pass. Writing into the instance dict is
# equivalent to assigning each attribute individually on this plain class.
vars(config).update(
  # model / run identity
  val_proportion=0.1,
  embedding_size=50,
  energy_norm_ord=1,
  seed=1337,
  mode='disgen',
  model='transd-distmult',
  run_name='transd-dm-disgen-ace-7',
  ace_model=True,
  no_semantic_network=True,
  train_bert=False,
  # optimisation
  learning_rate=1e-5,
  batch_size=16,
  val_batch_size=16,
  num_epochs=100,
  # filesystem locations
  data_dir='/users/max/data/artifacts/umls-embeddings',
  secondary_data_dir='/users/max/data/artifacts/umls-embeddings-compressed',
  model_dir='/users/max/data/models/umls-embeddings',
  summaries_dir='/shared/hltdir4/disk1/max/logs',
  # evaluation / checkpointing
  eval_mode='save',
  eval_dir='/users/max/data/artifacts/umls-embeddings',
  load=True,
  # input pipeline and runtime
  num_workers=6,
  buffer_size=1,
  gpu_memory_growth=True,
  num_generator_samples=30,
  lm_encoder_size=768,
)


In [20]:
# Seed every random number generator up front. The TensorFlow graph-level seed
# only affects ops created after it is set, so it is set here -- before the data
# generator (or anything else that may build graph ops) is constructed.
seed = config.seed
random.seed(seed)
np.random.seed(seed)
tf.set_random_seed(seed)

# init model dir
# The base directory is remembered on the config object the first time this
# cell runs. Re-running the cell then derives the run directory from the base
# again instead of nesting <model>/<run_name> inside the already-derived path.
if not hasattr(config, '_base_model_dir'):
  config._base_model_dir = config.model_dir
all_models_dir = config._base_model_dir
config.model_dir = os.path.join(all_models_dir, config.model, config.run_name)
os.makedirs(config.model_dir, exist_ok=True)

# init summaries dir (same re-run protection as for the model dir)
if not hasattr(config, '_base_summaries_dir'):
  config._base_summaries_dir = config.summaries_dir
config.summaries_dir = os.path.join(config._base_summaries_dir, config.run_name)
os.makedirs(config.summaries_dir, exist_ok=True)

# load data
cui2id, data, train_idx, val_idx = data_util.load_metathesaurus_data(config.data_dir, config.val_proportion)
# Number of whole batches that fit in each split (any partial batch is not counted).
config.val_progress_update_interval = len(val_idx) // config.val_batch_size
config.batches_per_epoch = len(train_idx) // config.batch_size
# Semantic-network data is only loaded when it has not been disabled in the config.
if config.no_semantic_network:
  type2cuis = None
else:
  type2cuis = data_util.load_semantic_network_data(config.data_dir, data)

data_generator = TfDataGenerator.TfDataGenerator(
  data,
  train_idx,
  val_idx,
  config.data_dir,
  config.secondary_data_dir,
  config.num_generator_samples,
  config.batch_size,
  config.num_epochs,
  config.lm_encoder_size,
  config.num_workers,
  config.buffer_size
)

# Optionally let TensorFlow grow its GPU memory allocation on demand rather than
# reserving it all when the session starts.
if config.gpu_memory_growth:
  gpu_config = tf.ConfigProto()
  gpu_config.gpu_options.allow_growth = True
else:
  gpu_config = None

session = tf.Session(config=gpu_config)

In [22]:
dg = data_generator
dg.create_iterator()

# Notebook-level aliases for the generator's embedding tensors.
# NOTE(review): the n-prefixed tensors are presumably the negative samples -- confirm in TfDataGenerator.
subjs_emb, rels_emb, objs_emb = dg.subjs_emb, dg.rels_emb, dg.objs_emb
nsubjs_embs, nobjs_embs = dg.nsubjs_embs, dg.nobjs_embs

# Aliases for the corresponding length tensors.
subjs_lengths, rels_lengths, objs_lengths = dg.subjs_lengths, dg.rels_lengths, dg.objs_lengths
nsubjs_lengths, nobjs_lengths = dg.nsubjs_lengths, dg.nobjs_lengths

# Dynamic shape of nsubjs_embs; its first three dimensions are pulled out one
# by one under the names used in this notebook.
neg_shape = tf.shape(nsubjs_embs)
bsize = neg_shape[0]
nsamples = neg_shape[1]
seq_len = neg_shape[2]
total_neg_size = bsize * nsamples

W1228 21:00:53.533342 140252633818944 deprecation_wrapper.py:119] From /shared/hltdir1/disk1/home/max/code/umls-embeddings/python/eukg/data/TfDataGenerator.py:72: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W1228 21:00:53.899137 140252633818944 deprecation_wrapper.py:119] From /shared/hltdir1/disk1/home/max/code/umls-embeddings/python/eukg/data/TfDataGenerator.py:230: The name tf.sparse_tensor_to_dense is deprecated. Please use tf.sparse.to_dense instead.

W1228 21:00:54.007996 140252633818944 deprecation.py:323] From /users/max/miniconda3/envs/tf-1.14/lib/python3.7/site-packages/tensorflow/python/ops/array_ops.py:1354: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


(?, ?, ?, 768)
(?, ?)
(?, ?)
(?, ?, ?, 768)
(?, ?)
(?, ?, ?, 768)
(?, ?)


W1228 21:00:54.227475 140252633818944 deprecation.py:323] From /shared/hltdir1/disk1/home/max/code/umls-embeddings/python/eukg/data/TfDataGenerator.py:393: DatasetV1.make_initializable_iterator (from tensorflow.python.data.ops.dataset_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use `for ... in dataset:` to iterate over a dataset. If using `tf.estimator`, return the `Dataset` object directly from your input function. As a last resort, you can use `tf.compat.v1.data.make_initializable_iterator(dataset)`.


In [23]:
# Invoke the data generator's load_train with the open session.
# NOTE(review): presumably this initialises the iterator on the training split -- confirm in TfDataGenerator.
dg.load_train(session)

In [24]:
# Evaluate the five length tensors in one session.run call; `f` holds the
# results in the same order as the fetch list.
f = session.run([
  dg.subjs_lengths,
  dg.rels_lengths,
  dg.objs_lengths,
  dg.nsubjs_lengths,
  dg.nobjs_lengths,
])

In [31]:
# Unpack the fetched length arrays (same order as the fetch list) and look at
# a single entry of the batch.
sl, rl, ol, nsl, nol = f
b_idx = 2
# Pair up the two n-prefixed length rows for the chosen entry, element by element.
neg_length_pairs = list(zip(nsl[b_idx], nol[b_idx]))
ex = (sl[b_idx], rl[b_idx], ol[b_idx], neg_length_pairs)
ex

(3,
 5,
 10,
 [(5, 10),
  (31, 10),
  (11, 10),
  (8, 10),
  (9, 10),
  (4, 10),
  (10, 10),
  (5, 10),
  (9, 10),
  (15, 10),
  (9, 10),
  (7, 10),
  (8, 10),
  (3, 10),
  (16, 10),
  (3, 25),
  (3, 31),
  (3, 31),
  (3, 12),
  (3, 15),
  (3, 6),
  (3, 7),
  (3, 8),
  (3, 11),
  (3, 16),
  (3, 17),
  (3, 14),
  (3, 7),
  (3, 13),
  (3, 6)])

In [32]:
# Close the TensorFlow session created in the setup cell.
session.close()