In [1]:
import time
import tqdm
import numpy as np

import tensorflow as tf
import keras
from tensorflow.keras import layers

from PixelCorpora import PixelCorpus, PixelCorpusRW

2022-11-09 07:13:35.301297: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [209]:
strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

ds_ids = None
train = "slurm_job/theos_recom/No2_neg/"
fdr = 0.1
window = 5
int_per = 0.5
quan = 0
pix_per = 0.5
ind_name = None
stride = 4

corpus = PixelCorpusRW(ds_ids = ds_ids, ds_dir = train, ind_name = ind_name, fdr_thresh = fdr,
                     pix_per = pix_per, int_per = int_per, window = window, quan = quan, stride = stride)

ions2idx = corpus.get_ions2ids()
vocab_size = len(ions2idx) + 2

num_ns = 10
SEED = 42
window_size = 5
AUTOTUNE = tf.data.AUTOTUNE
embedding_dim = 20
BATCH_SIZE = 1024
BUFFER_SIZE = 100000

INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:CPU:0',)
Number of devices: 1


In [None]:
batches = chunks(corpus, batchsize=15)
for batch in tqdm.tqdm(batches):
    targets, contexts, labels = generate_training_data(batch, 5, 10, vocab_size, 42)


In [67]:
chunked = chunks(corpus)

In [66]:
from itertools import chain, islice

def chunks(iterable, batchsize=10):
    iterator = iter(iterable)
    for first in iterator:
        yield chain([first], islice(iterator, batchsize - 1))

In [156]:
batches = chunks(corpus, batchsize=10000) # batches consists of generators, generating 15 sequences
#for gen in batches:
#    generate_training_data(gen, window_size, num_ns, vocab_size, 42)

In [None]:
batches = chunks(corpus, batchsize=15)
t0 = time.time()
generator_list = [gen for gen in batches]
t1 = time.time()
total = t1-t0
print(total)

In [None]:
gen0 = generator_list[1]
generate_training_data(gen0, 5, 10, vocab_size, 42)

In [None]:
[gen for gen in list(batches)]

In [148]:
for gen in batches:
    print(gen)

In [None]:
chunked = chunks(corpus, batchsize=15)
for gen in tqdm.tqdm(chunked):
    targets, contexts, labels = generate_training_data(gen, 5, 10, vocab_size, 42)
                

In [210]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for `vocab_size` tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # Iterate over all sequences (sentences) in the dataset.
    for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(
                tf.constant([context_word], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling")

            # Build context and label vectors (for one target word)
            negative_sampling_candidates = tf.expand_dims(
            negative_sampling_candidates, 1)

            context = tf.concat([context_class, negative_sampling_candidates], 0)
            label = tf.constant([1] + [0]*num_ns, dtype="int64")

            # Append each element from the training example to global lists.
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

In [211]:
class DataGenerator(keras.utils.Sequence):
    'Generates data for Keras'
    def __init__(self, corpus, window_size, num_ns, vocab_size, seed, batch_size=32, labels = None, shuffle=False ):
        'Initialization'
        self.batch_size = batch_size
        self.labels = labels
        self.shuffle = shuffle
        
        self.corpus = corpus
        self.batches = self.init_batches(self.corpus, self.batch_size)
        self.window_size = window_size
        self.num_ns = num_ns
        self.vocab_size = vocab_size
        self.seed = seed
        self.on_epoch_end()
    
    def init_batches(self, corpus, batchsize):
        batched_corpus = chunks(corpus, batchsize)
        batches = [gen for gen in batched_corpus]
        return batches

    def __len__(self):
        'Denotes the number of batches per epoch'
        return int(np.floor(len(generator_list)))

    def __getitem__(self, index):
        'Generate one batch of data'
        # Generate indexes of the batch
        batch_idx = self.indexes[index]
        batch = self.batches[batch_idx] 
        targets, contexts, labels = self.__data_generation(batch)

        return targets, contexts, labels
    
    def on_epoch_end(self):
        'Updates indexes after each epoch'
        self.indexes = np.arange(len(self.corpus))
        if self.shuffle == True:
            np.random.shuffle(self.indexes)
    
    def __data_generation(self, batched_gen):
        # Generate data
        targets, contexts, labels = generate_training_data(batch, window_size, num_ns, vocab_size, seed = 42)
        
            



In [None]:
DataGenerator(corpus, window_size, num_ns, vocab_size, seed = 42, batch_size = 10000).__getitem__(4)

In [212]:
training_generator = DataGenerator(generator_list, window_size, num_ns, vocab_size, seed = 42, batch_size = 10000)

model = Word2Vec(vocab_size, embedding_dim)


In [213]:
t0 = time.time()
with strategy.scope():
    model.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'], 
                run_eagerly=True)
    model.fit_generator(generator=training_generator,
                        use_multiprocessing=True,
                        workers = 6, 
                        epochs = 20)
t1 = time.time()
total = t1 - t0

  model.fit_generator(generator=training_generator,
0it [00:00, ?it/s]


TypeError: cannot unpack non-iterable NoneType object

In [130]:
targets, contexts, labels = generate_training_data(corpus,
                            window_size= window_size,
                            num_ns=num_ns,
                            vocab_size=vocab_size,
                            seed = SEED)
 
targets = np.array(targets)
contexts = np.array(contexts)[:,:,0]
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

23482it [01:14, 316.87it/s]


KeyboardInterrupt: 

In [None]:
text_ds = tf.data.TextLineDataset('corpus_text_shuffled.txt').filter(lambda x: tf.cast(tf.strings.length(x), bool))


In [None]:

vectorize_layer = layers.TextVectorization(
    max_tokens = vocab_size, 
    output_mode='int', 
    standardize = None,
    split = 'whitespace',
    output_sequence_length= 12221
    )

In [None]:
vectorize_layer.adapt(text_ds.batch(1024))

In [None]:
inverse_vocab = vectorize_layer.get_vocabulary()


In [None]:
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()


In [None]:
sequences = list(text_vector_ds.as_numpy_iterator())


In [None]:
sequences[0]

In [None]:
print(len(sequences)) #39040 for output_sequence_length = None

In [None]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size= window_size,
    num_ns= num_ns,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)[:,:,0]
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

In [None]:
labels[0]

In [None]:
BATCH_SIZE = 1024
BUFFER_SIZE = 100000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

In [None]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

In [32]:
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = layers.Embedding(vocab_size,
                                          embedding_dim,
                                          input_length=1,
                                          name="w2v_embedding")
        self.context_embedding = layers.Embedding(vocab_size,
                                           embedding_dim,
                                           input_length=num_ns+1)

    def call(self, pair):
        target, context = pair
        # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
        # context: (batch, context)
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        # target: (batch,)
        word_emb = self.target_embedding(target)
        # word_emb: (batch, embed)
        context_emb = self.context_embedding(context)
        # context_emb: (batch, context, embed)
        dots = tf.einsum('be,bce->bc', word_emb, context_emb) 
        # dots: (batch, context)
        return dots

In [None]:
def custom_loss(x_logit, y_true):
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)


In [None]:
t0 = time.time()
model = Word2Vec(vocab_size, embedding_dim)
model.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'], 
                run_eagerly=True)
model.fit(dataset, epochs=20)
t1 = time.time()
total = t1 - t0

In [None]:
print('training time: ', np.round(total/60, 2), ' mins')

In [None]:
t0_g = time.time()
model = Word2Vec(vocab_size, embedding_dim)
model.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'], 
                run_eagerly=True)
with strategy.scope():
    model.fit(dataset, epochs=20)
t1_g = time.time()
total_g = t1_g - t0_g

In [None]:
total_g

In [None]:
weights_g = model.get_layer('w2v_embedding').get_weights()[0]
vocab_g = vectorize_layer.get_vocabulary()

In [None]:
out_v_g = io.open('vectors_model_testing.tsv', 'w', encoding='utf-8')
out_m_g = io.open('metadata_model_testing.tsv', 'w', encoding='utf-8')

for index, word in enumerate(vocab):
    if index == 0:
        continue  # skip 0, it's padding.
    vec = weights[index]
    out_v_g.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m_g.write(word + "\n")
out_v_g.close()
out_m_g.close()
