In [4]:
#loading data
filename = 'q3_q5combined_08-12-21.txt'
file = open(filename, 'rt')
text = file.read()
file.close()

In [2]:
import io
import re
import string
import tqdm

import numpy as np

import tensorflow as tf
from tensorflow.keras import layers

In [2]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

In [3]:
with open('q3_q5combined_08-12-21.txt') as f: 
  lines = f.read().splitlines()
for line in lines[:20]:
  print(line)


1. BMC Med Inform Decis Mak. 2021 Dec 4;21(1):339. doi: 10.1186/s12911-021-01707-3.

Key components and critical factors for developing a telehealth business
framework: a qualitative study.

Velayati F(1), Ayatollahi H(2), Hemmat M(3), Dehghan R(4).

Author information: 
(1)Department of Health Information Management, School of Health Management and
Information Sciences, Iran University of Medical Sciences, Tehran, Iran.
(2)Health Management and Economics Research Center, Heath Management Research
Institute, Iran University of Medical Sciences, Tehran, Iran.
ayatollahi.h@iums.ac.ir.
(3)Department of Health Information Technology, Saveh University of Medical
Sciences, Saveh, Iran.
(4)Department of Health Entrepreneurship, Virtual University of Medical Sciences,
Tehran, Iran.

BACKGROUND: Telehealth technology and related products can help to solve the


In [4]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [5]:
text_ds = tf.data.TextLineDataset('q3_q5combined_08-12-21.txt').filter(lambda x: tf.cast(tf.strings.length(x), bool))

Metal device set to: Apple M1 Pro

systemMemory: 16.00 GB
maxCacheSize: 5.33 GB



2021-12-24 14:30:44.999497: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-12-24 14:30:45.000344: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [7]:
tf.config.experimental.set_visible_devices([], 'GPU')

RuntimeError: Visible devices cannot be modified after being initialized

In [8]:
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')

In [10]:
# Define the vocabulary size and number of words in a sequence.
vocab_size = 2625363
sequence_length = 10

In [11]:
# Use the TextVectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length length to pad all samples to same length.
vectorize_layer = layers.TextVectorization(
    standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [12]:
vectorize_layer.adapt(text_ds.batch(1024))

2021-12-24 14:31:52.502484: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2021-12-24 14:31:52.557321: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [13]:
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', 'of', 'the', 'and', 'to', 'in', 'for', 'a', 'university', 'health', 'with', 'doi', 'data', 'medical', 'information', 'on', 'were', 'is', 'was']


In [14]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [15]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

40743


In [16]:
for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[  128   537    96   100  1509  1453    22  1249 33555    12] => ['1', 'bmc', 'med', 'inform', 'decis', 'mak', '2021', 'dec', '4211339', 'doi']
[192 879   4 287 126   7 494   8 700 806] => ['key', 'components', 'and', 'critical', 'factors', 'for', 'developing', 'a', 'telehealth', 'business']
[212   8 463  37   0   0   0   0   0   0] => ['framework', 'a', 'qualitative', 'study', '', '', '', '', '', '']
[17631   961 31130  1409 13016   896 29034  2595     0     0] => ['velayati', 'f1', 'ayatollahi', 'h2', 'hemmat', 'm3', 'dehghan', 'r4', '', '']
[31 15  0  0  0  0  0  0  0  0] => ['author', 'information', '', '', '', '', '', '', '', '']


In [17]:
# Set the number of negative samples per positive context.
num_ns = 2

In [18]:
#generating training data:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for vocab_size tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=SEED,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [None]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=5,
    num_ns=2,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)[:,:,0]
labels = np.array(labels)


In [36]:
print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")



targets.shape: (572640,)
contexts.shape: (572640, 3)
labels.shape: (572640, 3)


In [27]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

NameError: name 'targets' is not defined

In [26]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

NameError: name 'dataset' is not defined

In [23]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [24]:
embedding_dim = 128
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])

In [19]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")


In [43]:
word2vec.fit(dataset, epochs=10, callbacks=[tensorboard_callback])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x15f9c3790>

In [20]:
%tensorboard --logdir logs

In [4]:
import functools
import warnings

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

import tensorflow.compat.v2 as tf
import tensorflow_probability as tfp

from tensorflow_probability import bijectors as tfb
from tensorflow_probability import distributions as tfd

tf.enable_v2_behavior()

plt.style.use("ggplot")
warnings.filterwarnings('ignore')



ModuleNotFoundError: No module named 'tensorflow_probability'