In [32]:
# load the libraries
import io
import re
import string
import tqdm
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
import os
import pandas as pd

In [33]:
# set the working directory
# The project root can be overridden through the GENE_PROJECT_DIR environment
# variable so the notebook is not tied to a single machine; the original
# location remains the default.
PROJECT_DIR = os.environ.get("GENE_PROJECT_DIR", r"D:\Gene_Project")
os.chdir(PROJECT_DIR)

In [34]:
# example sentence, lower-cased and split on whitespace into tokens
sentence = "The wide road shimmered in the hot sun"
tokens = sentence.lower().split()  # str.split already returns a list
print(len(tokens))

8


In [35]:
# vocabulary index: token -> integer id
# id 0 is reserved for the padding token, real tokens are numbered from 1
# in order of first appearance
vocab = {'<pad>': 0}
index = 1
for token in tokens:
  if token in vocab:
    continue  # repeated token keeps the id it was given first
  vocab[token] = index
  index += 1
vocab_size = len(vocab)
print(vocab)

{'<pad>': 0, 'the': 1, 'wide': 2, 'road': 3, 'shimmered': 4, 'in': 5, 'hot': 6, 'sun': 7}


In [7]:
# inverse vocabulary: integer id -> token
inverse_vocab = dict(zip(vocab.values(), vocab.keys()))
print(inverse_vocab)

{0: '<pad>', 1: 'the', 2: 'wide', 3: 'road', 4: 'shimmered', 5: 'in', 6: 'hot', 7: 'sun'}


In [8]:
# encode the example sentence as a list of vocabulary ids
example_sequence = [vocab[tok] for tok in tokens]
print(example_sequence)

[1, 2, 3, 4, 5, 1, 6, 7]


In [9]:
# generate skip-grams, i.e. positive occurrences
# negative_samples=0 requests no negative pairs; the second return value
# is discarded
window_size = 2
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
    example_sequence,
    vocabulary_size=vocab_size,
    window_size=window_size,
    negative_samples=0,
)
print(len(positive_skip_grams))

26


In [10]:
# print every positive pair, first as ids and then as the matching tokens
for target, context in positive_skip_grams:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(3, 5): (road, in)
(6, 1): (hot, the)
(1, 5): (the, in)
(6, 5): (hot, in)
(5, 3): (in, road)
(1, 7): (the, sun)
(1, 4): (the, shimmered)
(3, 4): (road, shimmered)
(4, 1): (shimmered, the)
(1, 3): (the, road)
(4, 3): (shimmered, road)
(2, 1): (wide, the)
(5, 6): (in, hot)
(3, 2): (road, wide)
(4, 5): (shimmered, in)
(1, 2): (the, wide)
(5, 1): (in, the)
(3, 1): (road, the)
(7, 1): (sun, the)
(1, 6): (the, hot)
(2, 4): (wide, shimmered)
(6, 7): (hot, sun)
(5, 4): (in, shimmered)
(4, 2): (shimmered, wide)
(2, 3): (wide, road)
(7, 6): (sun, hot)


In [11]:
# seed handed to the negative sampler so its draws are reproducible
SEED = 42
# alias for tf.data's AUTOTUNE constant (not used in the cells shown here)
AUTOTUNE = tf.data.AUTOTUNE

In [12]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context.
num_ns = 4

# Wrap the single context id as a (1, 1) int64 tensor for the sampler.
# NOTE(review): nothing here filters the positive context id out of the
# sampled candidates - confirm whether an accidental hit matters.
context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # sampled indices lie in [0, vocab_size); the upper bound is exclusive
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
# Show the sampled ids, then the tokens they stand for.
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([2 1 4 3], shape=(4,), dtype=int64)
['wide', 'the', 'shimmered', 'road']


In [13]:
# Turn the sampled ids into a column of shape (num_ns, 1) so they can be
# concatenated with the (1, 1) positive context on the next step.
# reshape (rather than expand_dims) keeps this cell idempotent: running it
# again leaves the shape at (num_ns, 1) instead of adding another axis.
negative_sampling_candidates = tf.reshape(negative_sampling_candidates, (-1, 1))

In [14]:
# Stack the positive context word on top of the sampled negative words.
context = tf.concat([context_class, negative_sampling_candidates], axis=0)

# Labels line up with `context`: a single 1 for the positive word, then
# num_ns zeros for the negatives.
label = tf.constant([1, *([0] * num_ns)], dtype="int64")

In [15]:
# Show the stacked context tensor (positive id first, then the negatives).
context

<tf.Tensor: shape=(5, 1), dtype=int64, numpy=
array([[5],
       [2],
       [1],
       [4],
       [3]], dtype=int64)>

In [16]:
# Drop the size-1 dimensions: target becomes a scalar (shape ()), while
# context and label end up with shape (num_ns+1,).
target = tf.squeeze(target_word)
context = tf.squeeze(context)
label = tf.squeeze(label)

In [17]:
 # Show the squeezed target id.
 target

<tf.Tensor: shape=(), dtype=int32, numpy=3>

In [18]:
# Show the squeezed context ids.
context

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([5, 2, 1, 4, 3], dtype=int64)>

In [19]:
# Show the labels that line up with the context ids.
label

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0], dtype=int64)>

In [20]:
# Show the target id next to its context ids in a single output.
target, context

(<tf.Tensor: shape=(), dtype=int32, numpy=3>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([5, 2, 1, 4, 3], dtype=int64)>)

In [23]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  """Build skip-gram training examples with negative sampling.

  Args:
    sequences: Iterable of int-encoded sentences (sequences of token ids).
    window_size: Skip-gram window size forwarded to `skipgrams`.
    num_ns: Number of negative context ids sampled per positive pair.
    vocab_size: Vocabulary size; used to build the sampling table and as
      the exclusive upper bound for sampled negative ids.
    seed: Seed forwarded to the negative sampler.

  Returns:
    A tuple `(targets, contexts, labels)` of equally long lists holding one
    entry per positive skip-gram: the target id, an int64 tensor of shape
    (num_ns + 1, 1) with the positive context id followed by the sampled
    negatives, and an int64 label tensor `[1, 0, ..., 0]` of length
    num_ns + 1.
  """
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for vocab_size tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # The label vector is identical for every example (one positive followed
  # by num_ns negatives), so it is built once instead of once per pair.
  label = tf.constant([1] + [0]*num_ns, dtype="int64")

  # Iterate over all sequences (sentences) in dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          # Use the caller's seed; the previous code read the notebook-level
          # SEED global and silently ignored this parameter.
          seed=seed,
          name="negative_sampling")

      # Build the context vector (for one target word): positive context id
      # first, then the sampled negatives as a column.
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [21]:
# read the unsupervised data
# the relative path is resolved against the working directory set by the
# os.chdir call earlier in the notebook
unsupervised_gene_data = pd.read_csv("all_unsupervised_genes.csv")

In [22]:
# preview the first rows of the unsupervised data
unsupervised_gene_data.head()

Unnamed: 0,sequence
0,"1.A.72,MerR,GH23"
1,"3.A.1,CE4"
2,"3.A.23,9.A.5,MCPsignal,2.A.21,2.A.22,TetR_N,3...."
3,"CE4,8.A.5,3.A.1,3.A.1,3.A.1"
4,"GT51,Peripla_BP_2,3.A.1,3.A.1,9.B.169"


In [24]:
# (rows, columns) of the unsupervised table
unsupervised_gene_data.shape

(771293, 1)

In [36]:
# read in the new supervised data as well

In [37]:
# load the tab-separated supervised table; the relative path is resolved
# against the current working directory
new_supervised = pd.read_csv("pul_seq_low_high_substr_year.tsv", sep = "\t")

In [38]:
# preview the first rows of the supervised data
new_supervised.head()

Unnamed: 0,PULid,sig_gene_seq,low_level_substr,high_level_substr,Pub_year
0,PUL0001,"GH1,8.A.49,CE2,GH130,GH130,3.A.1,3.A.1,SBP_bac...",beta-mannan,beta-mannan,2019
1,PUL0002,GH16,lichenan,beta-glucan,1996
2,PUL0003,"GH30_8,GH43_16|CBM6",xylan,xylan,2016
3,PUL0004,"4.A.1,GH1","glucose,cellobiose,maltose",multiple_substrates,2016
4,PUL0005,"GH94,GH3","beta-glucan,sophorose,laminaribiose",multiple_substrates,2016


In [39]:
# get the genes from the supervised data

In [44]:
# raw values of the "sig_gene_seq" column; the split cell below treats each
# value as a comma-separated gene string
all_seq_supervised = new_supervised["sig_gene_seq"].values

In [53]:
# split the genes in supervised data
# Each sequence is a comma-separated gene string; the result is one flat list
# of genes across all sequences. Missing sequences (NaN/None) are skipped,
# because str() would otherwise turn them into a bogus "nan" gene token.
all_genes_supervised = [
    gene
    for seq in all_seq_supervised
    if pd.notna(seq)
    for gene in str(seq).split(",")
]

In [None]:
# unsupervised

In [54]:
# raw values of the "sequence" column; the split cell below treats each
# value as a comma-separated gene string
all_seq_unsupervised = unsupervised_gene_data["sequence"].values

In [55]:
# split the genes in unsupervised data
# Each sequence is a comma-separated gene string; the result is one flat list
# of genes across all sequences. Missing sequences (NaN/None) are skipped,
# because str() would otherwise turn them into a bogus "nan" gene token.
all_genes_unsupervised = [
    gene
    for seq in all_seq_unsupervised
    if pd.notna(seq)
    for gene in str(seq).split(",")
]

In [57]:
# get Counter for the unsupervised genes

In [None]:
from collections import Counter