## Set Up 

In [1]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

In [2]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [3]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [4]:
num_ns = 4

In [5]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=SEED,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [6]:
#path_to_file = '/Users/niyone/Desktop/word2vec/word2vec_marea/sample10000abstracts.tsv'
#path_to_file = '/Users/niyone/Desktop/wn2vc_marea/data/pubmed_wn2vec/100000_wn2vec.tsv' #100k wn2vec 
#path_to_file = '/Users/niyone/Desktop/wn2vc_marea/data/pubmed_wn2vec/wn2vec2.0.tsv' #2.0 wn2vec 

#path_to_file = '/Users/niyone/Desktop/wn2vc_marea/data/pubmed_cr/100000pubmed_cr.tsv' #100k marea 
path_to_file = '/Users/niyone/Desktop/wn2vc_marea/data/pubmed_cr/pubmed_cr2.0.tsv' #2.0 marea 


In [7]:
"""""
Use this when the data has 2 columns
"""""
# lines = []
# with open(path_to_file) as f:
#     for line in f:
#         columns = line.split('\t')
#         if len(columns) != 2:
#             raise ValueError(f'Malformed marea line: {line}')
#         abstract = columns[1]  #columns[0]: pmid,  columns[1] year, columns[2] abstract text
#         lines.append(abstract)
# for line in lines[:20]:
#   print(line)

'""\nUse this when the data has 2 columns\n'

In [8]:
"""""
Use this when the data has 3 columns
"""""
lines = []
with open(path_to_file) as f:
    for line in f:
        columns = line.split('\t')
        if len(columns) != 3:
            raise ValueError(f'Malformed marea line: {line}')
        abstract = columns[2]  #columns[0]: pmid,  columns[1] year, columns[2] abstract text
        lines.append(abstract)
for line in lines[:20]:
  print(line)

well differentiate meshd018208 like morphology ncbigene4193 ncbigene4193 ncbigene1649 ncbigene1649 co amplification well differentiate meshd008080 wdl one common soft tissue meshd012509 adult predilection middle age male arises deep seat location retroperitoneum mediastinum spermatic cord occurrence young individual hypopharyngeal region exceedingly rare event meshd018208 ml like change seldom occur case wdl make diagnosis wdl challenge amplification ncbigene1649 ncbigene1649 gene subset case wdl show associate unique morphology herein present case 36 year old gentleman present difficulty breathing swallow 3 month duration ct scan neck reveal lesion along posterior wall hypopharynx measure 3 5 cm histopathologic examination reveal meshd009369 compose lobule oval spindle cell prominent myxoid stroma delicate ncbitaxon9031 ncbitaxon9031 wire vasculature vicinity lobule compose variably size adipocytes separate thick fibrous septum contains atypical hyperchromatic spindle cell immunohisto

In [9]:
text_ds = tf.data.TextLineDataset(path_to_file).filter(lambda x: tf.cast(tf.strings.length(x), bool))

2022-07-15 17:53:47.150721: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')


# Define the vocabulary size and the number of words in a sequence.
vocab_size = 4096
sequence_length = 10

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [11]:
vectorize_layer.adapt(text_ds.batch(1024))

In [12]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'patient', 'meshd009369', 'cell', '0', 'study', '1', 'treatment', 'use', 'case', 'expression', '2', 'high', 'year', '3', 'survival', 'analysis', 'group', 'result']


In [13]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [14]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

350871


In [15]:
for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[   1  120   56  461    1  424 1241 3396 3396 3278] => ['[UNK]', '2021', 'well', 'differentiate', '[UNK]', 'like', 'morphology', 'ncbigene4193', 'ncbigene4193', 'ncbigene1649']
[   1  927 1393 1131   46  217  322  508  569   58] => ['[UNK]', '2022', 'radiomics', 'nomogram', 'base', 'multiple', 'sequence', 'magnetic', 'resonance', 'image']
[   1  927    1 3751  243   30    1 1441    1 2708] => ['[UNK]', '2022', '[UNK]', 'immunotherapeutic', 'strategy', 'meshd006528', '[UNK]', 'vaccine', '[UNK]', 'adaptive']
[   1  927 2001 1186    1 1448    1    1    1    1] => ['[UNK]', '2022', 'meshd015464', 'double', '[UNK]', 'chromosome', '[UNK]', '[UNK]', '[UNK]', '[UNK]']
[   1  927    1 1563  261   41    1 2033  308    8] => ['[UNK]', '2022', '[UNK]', 'diet', 'combine', 'low', '[UNK]', 'dietary', 'pattern', 'treatment']


In [16]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)[:,:,0]
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

100%|█████████████████████████████████| 350871/350871 [03:57<00:00, 1476.02it/s]




targets.shape: (1828549,)
contexts.shape: (1828549, 5)
labels.shape: (1828549, 5)


In [17]:
# BATCH_SIZE = 400
# BUFFER_SIZE = 3000
BATCH_SIZE = 1024
BUFFER_SIZE = 10000

dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [18]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 5), dtype=tf.int64, name=None))>


In [19]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [20]:
def custom_loss(x_logit, y_true):
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

In [21]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [22]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [23]:
word2vec.fit(dataset, epochs=30, callbacks=[tensorboard_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f819f40c670>

In [24]:
#docs_infra: no_execute
%tensorboard --logdir logs

### Embedding lookup and analysis 

In [36]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [37]:
out_v = io.open('pubmed_cr2.0_vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('pubmed_cr2.0_metadata.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  out_v.write('\t'.join([str(x) for x in vec]) + "\n")
  out_m.write(word + "\n")
out_v.close()
out_m.close()

In [38]:
try:
  from google.colab import files
  files.download('pubmed_cr2.0_vectors.tsv')
  files.download('pubmed_cr2.0_metadata.tsv')
except Exception:
  pass