In [None]:
# dependencies

import os
import time 
import tqdm
import io

import numpy as np, pandas as pd
import pickle
import logging
import random

#import tensorflow as tf
#from tensorflow.keras import layers

from numpy import percentile, nan as np_nan

import scipy
from sklearn.metrics.pairwise import pairwise_kernels
from gensim.utils import RepeatCorpusNTimes

from PixelCorpora import PixelCorpus, PixelCorpusRW

In [None]:
# Shared settings read by the corpus-building cells of this notebook.
ds_ids = None
ds_dir = 'slurm_job/mouse_brain/example_data/'  # directory whose files are loaded with pd.read_pickle
# Optional restriction to a pickled list of ion names (currently disabled):
#with open('ind_names.pickle', 'rb') as remaining_ions:
#ind_name = pickle.load(remaining_ions)

ind_name = None  # None -> keep every ion column of each dataset
fdr = 0.1
pix_per = 0.01  # fraction handed to DataFrame.sample(frac=...) when sampling pixels
i = 0.5  # NOTE(review): a later top-level loop reuses `i` as its index and overwrites this value
window = 5
q = 99.
quan = 00.  # percentile handed to numpy.percentile for the intensity threshold (0.0 here)
int_per = 0.5  # factor applied to that percentile
no_samples = 5  # number of random walks drawn per sampled pixel window

logger = logging.getLogger(__name__)

In [None]:
# Hyper-parameters for the TensorFlow skip-gram pipeline further down.
# `tf` is imported here because this is the first cell that touches it; without
# the import the cell raises NameError on a fresh kernel.
import tensorflow as tf

num_ns = 10  # negative samples drawn per positive (target, context) pair
SEED = 42  # seed forwarded to the negative sampler
window_size = 5  # skip-gram window passed to generate_training_data
AUTOTUNE = tf.data.AUTOTUNE
embedding_dim = 20

In [None]:
def random_coloc_walk(coloc_matrix, n=5):
    """Sample a random walk over ions, weighted by their colocalization.

    Args:
        coloc_matrix: square 2-D numpy array; entry ``[i, j]`` is used as the
            weight of stepping from node ``j`` to node ``i``. The array is
            copied, so the caller's matrix is left untouched.
        n: maximum number of steps to take.

    Returns:
        List of visited node indices, excluding the uniformly drawn start
        node. It holds ``n`` entries unless the walk reaches a node whose
        outgoing weights do not sum to a positive value, in which case the
        walk ends early. An empty matrix yields an empty list.
    """
    n_nodes = coloc_matrix.shape[0]
    if n_nodes == 0:
        return []

    weights = coloc_matrix.copy()
    np.fill_diagonal(weights, 0)  # a node must not step onto itself

    nodes = range(n_nodes)
    walk = [np.random.choice(nodes)]  # uniform start node
    for _ in range(n):
        step_weights = weights[:, walk[-1]]
        # random.choices raises ValueError when the weights total zero, which
        # happens for a 1x1 matrix or an ion that colocalizes with nothing.
        if not step_weights.sum() > 0:
            break
        walk.append(random.choices(nodes, weights=step_weights)[0])
    return walk[1:]
#random_coloc_walk(pairwise_kernels(ion_array, metric='cosine'))

In [None]:
# Corpus-building test: for the first file in `ds_dir`, sample pixels, take the
# square window around each one and draw random colocalization walks in it.
ion_ids = {}  # ion name -> integer id (ids start at 1)
coloc_walks = []  # one list of ion ids per sampled random walk
for f in os.listdir(ds_dir)[:1]:
    try:
        ds_df = pd.read_pickle(os.path.join(ds_dir, f))
    except IsADirectoryError:
        continue
    all_ions = ds_df.drop(columns=['y', 'x']).columns.tolist()
    logger.info("ds_df pixel size %i ", len(ds_df[['x', 'y']].drop_duplicates().index))

    if ind_name is not None:
        # keep only the requested ions that are present in this dataset
        ion_names = list(set(ind_name).intersection(all_ions))
        pop_ions = list(set(all_ions).difference(set(ion_names)))
        ds_df = ds_df.drop(columns=pop_ions)
    else:
        ion_names = all_ions

    if not ion_names:
        continue  # nothing to model in this dataset

    for ion in ion_names:
        ion_ids[ion] = ion_ids.get(ion, len(list(ion_ids.values())) + 1)

    # General intensity threshold; only the commented-out filter below reads it.
    int_thresh = percentile(ds_df, quan) * int_per
    filt_df = ds_df.rename(columns=ion_ids)
    filt_df = filt_df.loc[~(filt_df.drop(columns=['y', 'x']) == 0).all(axis=1)]  # drop all-zero rows
    #filt_df[ion_names] = ds_df[ion_names][ds_df[ion_names] > int_thresh]
    #filt_df = filt_df.astype(pd.SparseDtype("float", np_nan))

    # sample pixels
    if pix_per != 1.0:
        sampled_coord_df = filt_df.dropna(how='all').drop_duplicates().sample(frac=pix_per)
    else:
        sampled_coord_df = filt_df.dropna(how='all').drop_duplicates()

    # iloc[::2] keeps every second sampled pixel (stride of 2)
    for _, c_row in sampled_coord_df.iloc[::2].iterrows():
        x = c_row['x']  # center x coordinate
        y = c_row['y']  # center y coordinate
        # `window` (settings cell) is the half-width of the square neighbourhood
        window_rows = filt_df[(filt_df['x'].between(x - window, x + window, inclusive='both'))
                              & (filt_df['y'].between(y - window, y + window, inclusive='both'))]

        ion_rows = window_rows.drop(columns=['y', 'x'])
        ion_rows = ion_rows.loc[:, (ion_rows != 0).any(axis=0)]  # drop all-zero columns
        coloc_matrix = pairwise_kernels(ion_rows.T, metric='cosine')  # transpose: one row per ion
        # a throw-away loop name keeps the `i` setting from being overwritten
        for _walk_no in range(no_samples):
            ions_idx = random_coloc_walk(coloc_matrix, n=10)
            # map matrix positions back to ion ids and keep the walk
            coloc_walks.append(ion_rows.columns[ions_idx].tolist())
            

In [None]:
# Display the ion-intensity window left over from the last iteration of the cell above.
ion_rows

In [None]:
# for testing corpus building
def vanilla_gen(shuff=1, w=None):
    """Yield one "sentence" of ion ids per sampled pixel window.

    Reads the notebook-level settings ``ds_dir``, ``ind_name``, ``pix_per`` and
    (when ``w`` is not given) ``window``. Only the first entry returned by
    ``os.listdir(ds_dir)`` is processed, and no intensity threshold is applied.

    Args:
        shuff: when equal to 1, each sentence is shuffled with
            ``random.shuffle`` before it is yielded.
        w: half-width of the square window around a sampled pixel. ``None``
            (default) falls back to the notebook-level ``window`` setting.

    Yields:
        list of ion ids; every id is repeated once per non-NaN entry of its
        column inside the window.
    """
    if w is None:
        w = window
    ion_ids = {}  # ion name -> integer id (ids start at 1)
    for f in os.listdir(ds_dir)[:1]:
        try:
            ds_df = pd.read_pickle(os.path.join(ds_dir, f))
        except IsADirectoryError:
            continue
        all_ions = ds_df.drop(columns=['y', 'x']).columns.tolist()
        logger.info("ds_df pixel size %i ", len(ds_df[['x', 'y']].drop_duplicates().index))

        if ind_name is not None:
            # keep only the requested ions that are present in this dataset
            ion_names = list(set(ind_name).intersection(all_ions))
            pop_ions = list(set(all_ions).difference(set(ion_names)))
            ds_df = ds_df.drop(columns=pop_ions)
        else:
            ion_names = all_ions

        if not ion_names:
            continue  # nothing to yield for this dataset

        for ion in ion_names:
            ion_ids[ion] = ion_ids.get(ion, len(list(ion_ids.values())) + 1)

        filt_df = ds_df.rename(columns=ion_ids)
        filt_df = filt_df.loc[~(filt_df.drop(columns=['y', 'x']) == 0).all(axis=1)]  # drop all-zero rows

        # sample pixels
        candidates = filt_df.dropna(how='all').drop_duplicates()
        if pix_per != 1.0:
            sampled_coord_df = candidates.sample(frac=pix_per)
        else:
            sampled_coord_df = candidates

        # iloc[::2] keeps every second sampled pixel (stride of 2)
        for _, c_row in sampled_coord_df.iloc[::2].iterrows():
            x = c_row['x']  # center x coordinate
            y = c_row['y']  # center y coordinate
            window_rows = filt_df[(filt_df['x'].between(x - w, x + w, inclusive='both'))
                                  & (filt_df['y'].between(y - w, y + w, inclusive='both'))]

            # NOTE(review): count() counts non-NaN cells; with the NaN-producing
            # threshold filter absent, zero intensities are presumably counted
            # as well -- confirm this is intended.
            ind_counts = dict(window_rows.drop(columns=['y', 'x']).count())
            exp_inds = []
            for ind, n_hits in ind_counts.items():
                exp_inds.extend([ind] * int(n_hits))
            if shuff == 1:
                random.shuffle(exp_inds)  # shuffle ions in window
            yield exp_inds

In [None]:
class PixelCorpus(object):
    """Iterable corpus of ion "sentences" built from pixel neighbourhoods.

    Every file in ``ds_dir`` is loaded with ``pd.read_pickle`` and must be a
    DataFrame with ``x`` and ``y`` columns plus one intensity column per ion.
    Iterating yields, for each sampled pixel, the ions found in the square
    window around it; an ion appears once per window pixel in which its
    intensity exceeds the threshold.

    NOTE: ``pd.read_pickle`` unpickles arbitrary objects -- only point
    ``ds_dir`` at trusted files.

    Args:
        fdr_thresh: stored as ``self.fdr``; not read by ``__iter__``.
        pix_per: fraction of candidate pixels sampled per dataset
            (``1.0`` keeps all of them).
        int_per: factor applied to the ``quan`` percentile to obtain the
            intensity threshold.
        window: half-width of the square window, in pixel coordinates.
        quan: percentile (0-100) passed to ``numpy.percentile``.
        ind_name: optional collection of ion names each dataset is restricted
            to; ``None`` keeps every ion column.
        ds_dir: directory containing the pickled DataFrames (required for
            iteration).
        ds_ids: list of METASPACE dataset ids; stored as ``self.ds_ids`` and
            not read by ``__iter__``.
        stride: keep every ``stride``-th sampled pixel.
        shuffle: when equal to 1, shuffle each sentence before yielding it.
    """

    def __init__(self, fdr_thresh=0.1, pix_per=0.5, int_per=0.5, window=5, quan=99., ind_name=None, ds_dir=None, ds_ids=None, stride = 1, shuffle = 1):
        self.fdr = fdr_thresh
        self.ds_dir = ds_dir
        self.p = pix_per
        self.i = int_per
        self.w = window
        self.q = quan

        self.ind_name = ind_name
        self.ds_ids = ds_ids
        self.stride = stride
        self.shuffle = shuffle

    def __iter__(self):
        """Yield one list of ion names per sampled pixel window.

        Raises:
            ValueError: if the corpus was created without ``ds_dir``.
        """
        if self.ds_dir is None:
            # os.listdir(None) would silently list the working directory
            raise ValueError("PixelCorpus needs ds_dir: the directory holding the pickled dataset frames")

        for f in os.listdir(self.ds_dir):
            try:
                ds_df = pd.read_pickle(os.path.join(self.ds_dir, f))
            except IsADirectoryError:
                continue  # sub-directories are ignored
            all_ions = ds_df.drop(columns=['y', 'x']).columns.tolist()

            if self.ind_name is not None:
                # keep only the requested ions that are present in this dataset
                ion_names = list(set(self.ind_name).intersection(all_ions))
                pop_ions = list(set(all_ions).difference(set(ion_names)))
                ds_df = ds_df.drop(columns=pop_ions)
            else:
                ion_names = all_ions

            if not ion_names:
                continue  # skip datasets without any usable ion

            # One global intensity threshold for the dataset (an ion-specific
            # one would also be possible).
            # NOTE(review): the percentile is taken over the whole frame, so
            # the x/y coordinate columns take part in it -- confirm intended.
            int_thresh = percentile(ds_df, self.q) * self.i
            filt_df = ds_df
            # intensities at or below the threshold become NaN
            filt_df[ion_names] = ds_df[ion_names][ds_df[ion_names] > int_thresh]
            # "int" would have to change if exact intensities ever matter
            filt_df = filt_df.astype(pd.SparseDtype("int", np_nan))

            # sample pixels
            candidates = filt_df.dropna(how='all').drop_duplicates()
            if self.p != 1.0:
                sampled_coord_df = candidates.sample(frac=self.p)
            else:
                sampled_coord_df = candidates

            for _, c_row in sampled_coord_df.iloc[::self.stride].iterrows():
                x = c_row['x']
                y = c_row['y']

                # rows of all pixels inside the square window around (x, y)
                window_rows = filt_df[(filt_df['x'].between(x - self.w, x + self.w, inclusive='both'))
                                      & (filt_df['y'].between(y - self.w, y + self.w, inclusive='both'))]

                # repeat each ion once per window pixel holding a non-NaN value
                ind_counts = dict(window_rows[ion_names].count())
                exp_inds = []
                for ind, n_hits in ind_counts.items():
                    exp_inds.extend([ind] * int(n_hits))
                if self.shuffle == 1:
                    random.shuffle(exp_inds)  # shuffle ions in window
                yield exp_inds

In [None]:
# Corpus for the gensim-style training cells: 10 % of the pixels, a window of
# +/-1 pixel, no restriction on ion names.
corpus = PixelCorpus(
    fdr_thresh=0.1,
    pix_per=0.1,
    int_per=0,
    window=1,
    quan=50,
    ind_name=None,
    ds_dir='slurm_job/mouse_brain/example_data/',
    ds_ids=None,
    stride=1,
    shuffle=1,
)

In [None]:
from word2vec_pix import Word2Vec

In [None]:
# Settings for the word2vec_pix model. These rebind several notebook-level
# names (ds_ids, ind_name, window, fdr, int_per, pix_per, quan).
ds_ids = None
train = 'slurm_job/mouse_brain/example_data/'
ind_name = None
size = 100
threads = 6
min_count = 5
window = 1
sample = 1e-3
skipgram = 0
hs = 0
negative = 5
cbow_mean = 1
epochs = 1
fdr = 0.1
int_per = 0
pix_per = 0.01
quan = 50

model = Word2Vec(
    corpus=corpus,
    ds_ids=ds_ids,
    train=train,
    dict_size=None,
    ind_name=ind_name,
    size=size,
    min_count=min_count,
    workers=threads,
    window=window,
    sample=sample,
    sg=skipgram,
    hs=hs,
    negative=negative,
    cbow_mean=1,
    iter=epochs,
    fdr=fdr,
    int_per=int_per,
    pix_per=pix_per,
    quan=quan,
)

In [None]:
def train_cbow_pair(model, word, input_word_indices, l1, alpha, learn_vectors=True, learn_hidden=True, compute_loss=False):
    """Run one CBOW update for ``word`` given the combined context vector ``l1``.

    The function is self-contained: it uses ``numpy`` and ``scipy.special.expit``
    directly instead of notebook-level aliases, and builds the negative-sampling
    label vector itself.

    Args:
        model: object exposing ``hs``, ``negative``, ``cbow_mean``,
            ``syn0_lockf`` and ``wv.syn0``; additionally ``syn1`` when ``hs`` is
            set, ``syn1neg``/``cum_table``/``random`` when ``negative`` is set,
            and ``running_training_loss`` when ``compute_loss`` is True.
        word: vocabulary entry of the centre word; ``index`` is read for
            negative sampling, ``point`` and ``code`` for hierarchical softmax.
        input_word_indices: row indices into ``model.wv.syn0`` of the context
            words.
        l1: combined (summed or averaged) context vector.
        alpha: learning rate.
        learn_vectors: update the input vectors ``model.wv.syn0``.
        learn_hidden: update the output weights (``syn1`` / ``syn1neg``).
        compute_loss: accumulate the loss in ``model.running_training_loss``.

    Returns:
        numpy array shaped like ``l1`` holding the error propagated back to the
        input layer.
    """
    from scipy.special import expit  # logistic sigmoid

    neu1e = np.zeros(l1.shape)

    if model.hs:
        l2a = model.syn1[word.point]  # 2d matrix, codelen x layer1_size
        prod_term = np.dot(l1, l2a.T)
        fa = expit(prod_term)  # propagate hidden -> output
        ga = (1. - word.code - fa) * alpha  # error gradients times learning rate
        if learn_hidden:
            model.syn1[word.point] += np.outer(ga, l1)  # learn hidden -> output
        neu1e += np.dot(ga, l2a)  # save error

        # loss component corresponding to hierarchical softmax
        if compute_loss:
            sgn = (-1.0) ** word.code  # maps code 0 -> 1 and code 1 -> -1
            model.running_training_loss += np.sum(-np.log(expit(-sgn * prod_term)))

    if model.negative:
        # this word (label 1) + `negative` random other words (label 0)
        word_indices = [word.index]
        while len(word_indices) < model.negative + 1:
            w = model.cum_table.searchsorted(model.random.randint(model.cum_table[-1]))
            if w != word.index:
                word_indices.append(w)
        neg_labels = np.zeros(model.negative + 1)
        neg_labels[0] = 1.

        l2b = model.syn1neg[word_indices]  # 2d matrix, k+1 x layer1_size
        prod_term = np.dot(l1, l2b.T)
        fb = expit(prod_term)  # propagate hidden -> output
        gb = (neg_labels - fb) * alpha  # error gradients times learning rate
        if learn_hidden:
            model.syn1neg[word_indices] += np.outer(gb, l1)  # learn hidden -> output
        neu1e += np.dot(gb, l2b)  # save error

        # loss component corresponding to negative sampling
        if compute_loss:
            model.running_training_loss -= np.sum(np.log(expit(-1 * prod_term[1:])))  # sampled words
            model.running_training_loss -= np.log(expit(prod_term[0]))  # the output word

    if learn_vectors:
        # learn input -> hidden, for every context word separately
        if not model.cbow_mean and input_word_indices:
            neu1e /= len(input_word_indices)
        # diagnostics go to the debug log instead of being printed per word
        logging.getLogger(__name__).debug(
            "train_cbow_pair: input_word_indices=%s neu1e=%s", input_word_indices, neu1e)
        for i in input_word_indices:
            model.wv.syn0[i] += neu1e * model.syn0_lockf[i]
    return neu1e

In [None]:
# NOTE(review): `random.shuffle` is the stdlib function object, so comparing it
# with 0 is always False and the print below never runs. Presumably a shuffle
# flag such as `corpus.shuffle` was meant -- confirm or delete this cell.
if random.shuffle == 0:
    print('som')

In [None]:
from scipy.special import expit

# Helpers that train_cbow_pair looks up as module-level names. The aliases must
# exist before neg_labels is built, and `log` is needed when compute_loss=True.
zeros = np.zeros
dot = np.dot
outer = np.outer
log = np.log

alpha = 0.025
compute_loss = False

# label vector for negative sampling: 1 for the true word, 0 for each sampled one
neg_labels = zeros(model.negative + 1)
neg_labels[0] = 1.

In [None]:
# shuffle
# Manual CBOW pass over the corpus: every sentence is down-sampled, then each
# remaining word is trained against all other words of the same sentence.
result = 0
word_id_lists = corpus
for word_id_list in word_id_lists:
    word_vocabs = []
    for w in word_id_list:
        key = str(w)
        if key not in model.wv.vocab:
            continue
        entry = model.wv.vocab[key]
        # frequent-word down-sampling: keep the word only if it beats a random draw
        if entry.sample_int > model.random.rand() * 2**32:
            word_vocabs.append(entry)

    for pos, word in enumerate(word_vocabs):
        word2_indices = [
            other.index
            for other_pos, other in enumerate(word_vocabs)
            if not (other is None or other_pos == pos)
        ]
        l1 = np.sum(model.wv.syn0[word2_indices], axis=0)  # 1 x vector_size
        if word2_indices and model.cbow_mean:
            l1 /= len(word2_indices)
        train_cbow_pair(model, word, word2_indices, l1, alpha, compute_loss=compute_loss)
    result += len(word_vocabs)

In [None]:
# no shuffle
# NOTE(review): this cell is statement-for-statement identical to the
# "# shuffle" cell above and iterates the same `corpus` object, which was
# created with shuffle = 1. Unless `corpus` is rebuilt with shuffle=0 before
# running it, sentences are presumably still shuffled -- confirm.
result = 0
word_id_lists = corpus
for word_id_list in word_id_lists:
    # keep in-vocabulary words that survive frequent-word down-sampling
    word_vocabs = [model.wv.vocab[str(w)] for w in word_id_list if str(w) in model.wv.vocab and
                   model.wv.vocab[str(w)].sample_int > model.random.rand() * 2**32]

    for pos, word in enumerate(word_vocabs):
        # context = every other word of the sentence
        word2_indices = [word2.index for pos2, word2 in enumerate(word_vocabs) if (word2 is not None and pos2 != pos)]
        l1 = np.sum(model.wv.syn0[word2_indices], axis=0)  # 1 x vector_size #replaced attribute syn0 by vectors
        if word2_indices and model.cbow_mean:
            l1 /= len(word2_indices)
        train_cbow_pair(model, word, word2_indices, l1, alpha, compute_loss=compute_loss)
    result += len(word_vocabs)

In [None]:
#ions2idx = corpus.get_ions2ids()


#vocab_size = len(ions2idx) + 1

In [None]:
def generate_training_data(sequences,
                           vocab_size,
                           window_size,
                           num_ns,
                           seed):
    """Turn token-id sequences into skip-gram examples with negative samples.

    Args:
        sequences: iterable of sequences of integer token ids.
        vocab_size: number of token ids; used for the sampling table and as the
            upper bound of the negative sampler.
        window_size: skip-gram window handed to ``skipgrams``.
        num_ns: number of negative samples drawn per positive pair.
        seed: seed passed to the negative sampler.

    Returns:
        Tuple ``(targets, contexts, labels)`` of equally long lists. Each
        context is a tensor of shape ``(num_ns + 1, 1)`` whose first row is the
        positive context id; each label is ``[1, 0, ..., 0]`` of length
        ``num_ns + 1``.
    """
    make_pairs = tf.keras.preprocessing.sequence.skipgrams
    draw_negatives = tf.random.log_uniform_candidate_sampler

    targets, contexts, labels = [], [], []

    # sampling table for `vocab_size` tokens
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    for sequence in tqdm.tqdm(sequences):
        # positive skip-gram pairs of this sequence
        positive_pairs, _ = make_pairs(
            sequence,
            vocabulary_size=vocab_size,
            sampling_table=sampling_table,
            window_size=window_size,
            negative_samples=0)

        for target_word, context_word in positive_pairs:
            positive = tf.expand_dims(tf.constant([context_word], dtype="int64"), 1)
            negatives, _, _ = draw_negatives(
                true_classes=positive,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling")
            negatives = tf.expand_dims(negatives, 1)

            # positive id first, followed by the sampled negatives
            targets.append(target_word)
            contexts.append(tf.concat([positive, negatives], 0))
            labels.append(tf.constant([1] + [0]*num_ns, dtype="int64"))

    return targets, contexts, labels

In [None]:
# Build (target, context, label) training examples from `corpus`.
# NOTE(review): `vocab_size` is not assigned in any earlier cell (its definition
# in the preceding cell is commented out), so this call fails on a fresh kernel.
targets, contexts, labels = generate_training_data(corpus,
                            window_size= window_size,
                            num_ns=num_ns,
                            vocab_size=vocab_size,
                            seed=SEED)

In [None]:
# Largest value found in the generated context tensors.
np.max(contexts)

In [None]:
# Stack the per-example lists into arrays; the trailing singleton axis of each
# context is dropped. Not idempotent: a second run fails on the 2-D `contexts`.
targets = np.array(targets)
contexts = np.array(contexts)[:, :, 0]
labels = np.array(labels)

print('\n')
for array_name, array in (("targets", targets), ("contexts", contexts), ("labels", labels)):
    print(f"{array_name}.shape: {array.shape}")

In [None]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000  # size of the shuffle buffer
# each element is ((target, context), label)
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# drop_remainder=True discards the last, incomplete batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
# Spot-check one label vector (raises IndexError if fewer than 1001 examples exist).
labels[1000]

In [None]:
class Word2Vec(tf.keras.Model):
    """Skip-gram model scoring a target token against a set of context tokens.

    NOTE: this definition rebinds the name ``Word2Vec`` that an earlier cell
    imported from ``word2vec_pix``; ``tf`` and ``layers`` must be imported
    before the class is defined / instantiated.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the target and context embedding tables.

        Args:
            vocab_size: number of rows of both embedding tables.
            embedding_dim: length of each embedding vector.
        """
        super(Word2Vec, self).__init__()
        self.target_embedding = layers.Embedding(vocab_size,
                                          embedding_dim,
                                          input_length=1,
                                          name="w2v_embedding")
        # NOTE(review): input_length reads the notebook-level `num_ns`, not a
        # constructor argument -- it must match the data the model is fed.
        self.context_embedding = layers.Embedding(vocab_size,
                                           embedding_dim,
                                           input_length=num_ns+1)

    def call(self, pair):
        """Return the dot product of the target with every context embedding.

        Args:
            pair: tuple ``(target, context)`` of token-id tensors.

        Returns:
            Tensor of shape (batch, context) holding one logit per context id.
        """
        target, context = pair
        # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
        # context: (batch, context)
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        # target: (batch,)
        word_emb = self.target_embedding(target)
        # word_emb: (batch, embed)
        context_emb = self.context_embedding(context)
        # context_emb: (batch, context, embed)
        dots = tf.einsum('be,bce->bc', word_emb, context_emb) 
        # dots: (batch, context)
        return dots

In [None]:
# Number of entries in the ion -> index mapping.
# NOTE(review): `ions2idx` is first assigned in a later cell, so this fails on a fresh kernel.
len(ions2idx)

In [None]:
from tensorflow.keras import layers

# Train the Keras Word2Vec model and time the whole construct/compile/fit step.
# NOTE: `model` previously referred to the word2vec_pix model and is rebound here.
t0 = time.time()
model = Word2Vec(vocab_size, 20)
model.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'], 
                run_eagerly=True)
model.fit(dataset, epochs=20)
t1 = time.time()
total = t1 - t0  # elapsed wall-clock seconds

In [None]:
# Weight matrix of the target embedding layer (one row per token index).
weights = model.get_layer('w2v_embedding').get_weights()[0]
vocab = ions2idx

In [None]:
# Print every embedding vector followed by its token.
# NOTE(review): enumerate() numbers the entries of `vocab` 0, 1, 2, ... in
# iteration order. That only lines up with the embedding rows if `ions2idx`
# lists its tokens in index order starting at 0 -- TODO confirm against
# corpus.get_ions2ids().
for index, word in enumerate(vocab):
    if index == 0:
        continue  # skip 0, it's padding.
    vec = weights[index]
    print('\t'.join([str(x) for x in vec]) + "\n")
    print(word + "\n")

In [None]:
weights = model.get_layer('w2v_embedding').get_weights()[0]
vocab = ions2idx

# Write one tab-separated vector per line and the matching token per line.
# The `with` block closes both files even if the loop raises.
# NOTE(review): the enumerate() index is used as embedding row; this assumes
# `ions2idx` iterates its tokens in index order starting at 0 -- TODO confirm.
with io.open('vectors_rw.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata_rw.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")


In [None]:
# Now with theos sets
ds_dir = 'slurm_job/theos_recom/No1'
ind_name = None

# fdr, pix_per, int_per, window and quan carry whatever value the most recently
# executed cell assigned to them.
corpus = PixelCorpusRW(ds_ids = ds_ids, ds_dir = ds_dir, ind_name = ind_name, fdr_thresh = fdr,
                     pix_per = pix_per, int_per = int_per, window = window, quan = quan, no_samples=5)

repeater = RepeatCorpusNTimes(corpus, 1)  # not referenced by any later cell shown here

ions2idx = corpus.get_ions2ids()
# NOTE(review): the reason for the "+ 2" is not visible in this notebook;
# presumably it leaves room for index 0 and for 1-based ids -- TODO confirm.
vocab_size = len(ions2idx) + 2

In [None]:
# Display the vocabulary size computed in the previous cell.
vocab_size

In [None]:
# Print every sentence the corpus yields (one full pass over the corpus).
for sentence in corpus:
    print(sentence)

In [None]:
# Build (target, context, label) training examples from the PixelCorpusRW corpus.
targets, contexts, labels = generate_training_data(corpus,
                            window_size= window_size,
                            num_ns=num_ns,
                            vocab_size=vocab_size,
                            seed=SEED)

In [None]:
# Stack the per-example lists into arrays and report their shapes.
# Not idempotent: a second run fails because `contexts` is then already 2-D.
targets = np.array(targets)
contexts = np.array(contexts)[:,:,0]  # drop the trailing singleton axis
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

In [None]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000  # size of the shuffle buffer
# each element is ((target, context), label)
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# drop_remainder=True discards the last, incomplete batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)

In [None]:
# Train a second Keras Word2Vec model on the theos corpus and time the step.
t0 = time.time()
model_th = Word2Vec(vocab_size, 20)
model_th.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'], 
                run_eagerly=True)
model_th.fit(dataset, epochs=20)
t1 = time.time()
total = t1 - t0  # elapsed wall-clock seconds

In [None]:
weights = model_th.get_layer('w2v_embedding').get_weights()[0]
vocab = ions2idx

# Write one tab-separated vector per line and the matching token per line.
# The `with` block closes both files even if the loop raises.
# NOTE(review): the enumerate() index is used as embedding row; this assumes
# `ions2idx` iterates its tokens in index order starting at 0 -- TODO confirm.
with io.open('vectors_th_No1_rw.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata_th_No1_rw.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")


In [None]:
# for testing corpus building

# Collect one list of ion names per sampled pixel window. `corpus_list` is the
# name the vocabulary and text-export cells further down read.
corpus_list = []
for f in os.listdir(ds_dir):
    try:
        ds_df = pd.read_pickle(os.path.join(ds_dir, f))
    except IsADirectoryError:
        continue
    all_ions = ds_df.drop(columns=['y', 'x']).columns.tolist()
    logger.info("ds_df pixel size %i ", len(ds_df[['x', 'y']].drop_duplicates().index))

    if ind_name is not None:
        # keep only the requested ions that are present in this dataset
        ion_names = list(set(ind_name).intersection(all_ions))
        pop_ions = list(set(all_ions).difference(set(ion_names)))
        ds_df = ds_df.drop(columns=pop_ions)
    else:
        ion_names = all_ions

    if not ion_names:
        continue  # skip datasets without any usable ion

    # One global intensity threshold for the dataset (an ion-specific one
    # would also be possible).
    int_thresh = percentile(ds_df, quan) * int_per
    filt_df = ds_df
    # intensities at or below the threshold become NaN
    filt_df[ion_names] = ds_df[ion_names][ds_df[ion_names] > int_thresh]
    filt_df = filt_df.astype(pd.SparseDtype("float", np_nan))

    # sample pixels; `pix_per` is the sampling fraction from the settings
    if pix_per != 1.0:
        sampled_coord_df = filt_df.dropna(how='all').drop_duplicates().sample(frac=pix_per)
    else:
        sampled_coord_df = filt_df.dropna(how='all').drop_duplicates()

    for _, c_row in sampled_coord_df.iterrows():
        x = c_row['x']
        y = c_row['y']

        # rows of all pixels inside the square window (half-width `window`)
        window_rows = filt_df[(filt_df['x'].between(x - window, x + window, inclusive='both'))
                              & (filt_df['y'].between(y - window, y + window, inclusive='both'))]

        # repeat each ion once per window pixel holding a non-NaN value
        ind_counts = dict(window_rows[ion_names].count())
        exp_inds = []
        for ind, n_hits in ind_counts.items():
            exp_inds.extend([ind] * int(n_hits))
        corpus_list.append(exp_inds)


In [None]:
# Display the ion list built for the last window of the cell above.
exp_inds

In [None]:
import tensorflow as tf
import tqdm
import io

In [None]:
from itertools import chain
big_list = list(chain.from_iterable(corpus_list))  # all windows flattened into one token list

# Sort the distinct tokens so every run assigns the same ids; the iteration
# order of a set of strings changes with Python's hash randomization.
token_set = sorted(set(big_list), key=str)

vocab, index = {}, 1 # start indexing from 1
vocab['<pad>'] = 0 # add a padding token
for token in token_set:
    if token not in vocab:
        vocab[token] = index
        index += 1

vocab_size = len(vocab)


In [None]:
# Show the complete token -> id mapping.
print(vocab)

In [None]:
# Reverse lookup: id -> token.
inverse_vocab = dict(zip(vocab.values(), vocab.keys()))
print(inverse_vocab)

In [None]:
# Encode the flattened token list as one sequence of vocabulary ids.
example_tokens = big_list

example_sequence = [vocab[word] for word in example_tokens]

In [None]:
# Positive skip-gram pairs for the example sequence.
# NOTE: this rebinds the notebook-level `window_size` (5 in the hyper-parameter cell).
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
print(len(positive_skip_grams))


In [None]:
# Show the first five pairs as ids and as tokens.
for target, context in positive_skip_grams[:5]:
    print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")


In [None]:
# Set the number of negative samples per positive context.
num_ns = 4
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [None]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]



context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size]
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])


In [None]:
# Add a dimension so you can use concatenation (in the next step).
# Not idempotent: every re-run adds another axis to the same variable.
negative_sampling_candidates = tf.expand_dims(negative_sampling_candidates, 1)

# Concatenate a positive context word with negative sampled words.
context = tf.concat([context_class, negative_sampling_candidates], 0)

# Label the first context word as `1` (positive) followed by `num_ns` `0`s (negative).
label = tf.constant([1] + [0]*num_ns, dtype="int64")

# Squeeze away size-1 axes: the target becomes a scalar, context and label
# become vectors of length `num_ns + 1`.
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label = tf.squeeze(label)


In [None]:
# Show the single training example both as ids and as tokens.
print(f"target_index    : {target}")
print(f"target_word     : {inverse_vocab[target_word]}")
print(f"context_indices : {context}")
print(f"context_words   : {[inverse_vocab[c.numpy()] for c in context]}")
print(f"label           : {label}")

In [None]:
# Raw tensors of the same training example.
print("target  :", target)
print("context :", context)
print("label   :", label)

In [None]:
# assumes log_uniform distribution (Zipf)
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

In [None]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    """Turn token-id sequences into skip-gram examples with negative samples.

    NOTE: this redefines the function of the same name from an earlier cell
    with a different positional parameter order (``window_size`` now comes
    second); calls that pass everything but ``sequences`` by keyword are
    unaffected.

    Args:
        sequences: iterable of sequences of integer token ids.
        window_size: skip-gram window handed to ``skipgrams``.
        num_ns: number of negative samples drawn per positive pair.
        vocab_size: number of token ids; used for the sampling table and as
            the upper bound of the negative sampler.
        seed: seed passed to the negative sampler.

    Returns:
        Tuple ``(targets, contexts, labels)`` of equally long lists. Each
        context is a tensor of shape ``(num_ns + 1, 1)`` whose first row is the
        positive context id; each label is ``[1, 0, ..., 0]`` of length
        ``num_ns + 1``.
    """
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for `vocab_size` tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # Iterate over all sequences (sentences) in the dataset.
    for sequence in tqdm.tqdm(sequences):

        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

        # Iterate over each positive skip-gram pair to produce training examples
        # with a positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            # shape (1, 1): one true class for a batch of one
            context_class = tf.expand_dims(
                tf.constant([context_word], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class,
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size,
                seed=seed,
                name="negative_sampling")

            # Build context and label vectors (for one target word)
            negative_sampling_candidates = tf.expand_dims(
            negative_sampling_candidates, 1)

            context = tf.concat([context_class, negative_sampling_candidates], 0)
            label = tf.constant([1] + [0]*num_ns, dtype="int64")

            # Append each element from the training example to global lists.
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

In [None]:
# in case we have a new corpus
# Write one line per non-empty window; every ion is followed by a space.
# The loop variable is not called `window`, so the numeric `window` setting
# keeps its value.
with open('corpus_text_shuffled.txt', 'w', encoding='utf-8') as corpus_file:
    for ion_window in corpus_list:
        if ion_window:         # write only, when not empty
            corpus_file.write(''.join(ion + ' ' for ion in ion_window))
            corpus_file.write('\n')

In [None]:
# Stream the corpus file line by line; lines of length 0 cast to False and are dropped.
text_ds = tf.data.TextLineDataset('corpus_text_shuffled.txt').filter(lambda x: tf.cast(tf.strings.length(x), bool))


In [None]:
from tensorflow.keras import layers

# Map each whitespace-separated token to an integer id without any text
# normalization; every line is padded or truncated to 10 ids.
vectorize_layer = layers.TextVectorization(
    max_tokens = vocab_size, 
    output_mode='int', 
    standardize = None,
    split = 'whitespace',
    output_sequence_length=10
    )

In [None]:
# Build the layer's vocabulary from the corpus text.
vectorize_layer.adapt(text_ds.batch(1024))

In [None]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

In [None]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()


In [None]:
# Materialize all vectorized lines as numpy arrays in memory.
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

In [None]:
# Show the first five sequences as ids and as tokens.
for seq in sequences[:5]:
    print(f"{seq} => {[inverse_vocab[i] for i in seq]}")


In [None]:
# Build training examples from the vectorized text corpus.
# NOTE(review): window_size=4 and num_ns=10 are literals here and differ from
# the notebook-level `window_size` (2) and `num_ns` (4) set above; the Word2Vec
# class reads the notebook-level `num_ns` -- confirm the mismatch is intended.
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=4,
    num_ns=10,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)[:,:,0]  # drop the trailing singleton axis
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")


In [None]:
BATCH_SIZE = 1024
BUFFER_SIZE = 100000  # size of the shuffle buffer
# each element is ((target, context), label)
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
# drop_remainder=True discards the last, incomplete batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)


In [None]:
# Cache the batched dataset and prefetch upcoming batches.
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

In [None]:
class Word2Vec(tf.keras.Model):
    """Skip-gram model: dot product of a target embedding with context embeddings.

    The context table's ``input_length`` is taken from the notebook-level
    ``num_ns`` at construction time.
    """

    def __init__(self, vocab_size, embedding_dim):
        """Create the two embedding tables, each of shape (vocab_size, embedding_dim)."""
        super().__init__()
        self.target_embedding = layers.Embedding(
            vocab_size, embedding_dim, input_length=1, name="w2v_embedding")
        self.context_embedding = layers.Embedding(
            vocab_size, embedding_dim, input_length=num_ns + 1)

    def call(self, pair):
        """Return logits of shape (batch, context) for a ``(target, context)`` pair."""
        target, context = pair
        # Older TF versions deliver target as (batch, 1); flatten it to (batch,).
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        # (batch, embed) x (batch, context, embed) -> (batch, context)
        return tf.einsum(
            'be,bce->bc',
            self.target_embedding(target),
            self.context_embedding(context),
        )

In [None]:
def custom_loss(x_logit, y_true):
      """Element-wise sigmoid cross-entropy between logits and labels.

      Args:
          x_logit: tensor of unnormalized scores.
          y_true: tensor of labels.

      NOTE(review): no ``compile`` call in the cells shown here uses this
      function. If it is ever passed as ``loss=``, Keras calls losses as
      ``fn(y_true, y_pred)``, i.e. with the arguments in the opposite order
      of this signature -- confirm before use.
      """
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)


In [None]:
# Build and compile the model for the text corpus; the loss treats the
# (batch, context) logits as a categorical distribution over the contexts.
embedding_dim = 20
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'], 
                run_eagerly=True)


In [None]:
# Load the TensorBoard notebook extension (IPython magic).
%load_ext tensorboard

In [None]:
# Callback that writes training logs to the "logs" directory.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [None]:
# Train for 20 epochs, logging through the TensorBoard callback.
word2vec.fit(dataset, epochs=20, callbacks=[tensorboard_callback])

In [None]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

# Write one tab-separated vector per line and the matching token per line.
# The `with` block closes both files even if the loop raises.
with io.open('vectors_norm_shuffled.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('metadata_norm_shuffled.tsv', 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        out_v.write('\t'.join([str(x) for x in vec]) + "\n")
        out_m.write(word + "\n")


In [None]:
# Offer the exported files for download when running inside Google Colab.
# Only a missing `google.colab` package is ignored; a failing download is
# reported instead of being swallowed silently.
try:
    from google.colab import files
except ImportError:
    files = None  # not running in Colab: nothing to download

if files is not None:
    files.download('vectors_norm_shuffled.tsv')
    files.download('metadata_norm_shuffled.tsv')
