In [1]:
import pandas as pd
from tqdm import tqdm
import gensim
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 
# os.chdir(r"D://Proposal_Defense//Simulations")
from Utils.Script_utils import get_data_splits, first_LSTM_training, get_data_splits_old_algo
from joblib import Parallel, delayed
import sys
from scipy.stats import norm

In [2]:
# Seed value shared by every pseudo-random generator seeded in this cell.
# (A different seed value may be used at each stage if desired.)
seed_value = 42

# 1. Set the `PYTHONHASHSEED` environment variable to a fixed value.
#    Python reads this variable at interpreter start-up, so assigning it here
#    does not change string hashing in the already-running kernel; it is only
#    seen by processes started later that inherit this environment.
import os
os.environ['PYTHONHASHSEED'] = str(seed_value)

# 2. Seed Python's built-in pseudo-random generator.
import random
random.seed(seed_value)

# 3. Seed NumPy's global pseudo-random generator.
import numpy as np
np.random.seed(seed_value)

# 4. Hide all CUDA devices so TensorFlow runs on the CPU only. The variable is
#    set before the `import tensorflow` statement so that the CPU-only setup does
#    not depend on TensorFlow initialising its devices lazily.
#    NOTE(review): if an earlier import already pulled in TensorFlow, this still
#    relies on the devices not having been initialised yet.
os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
import tensorflow as tf

if tf.test.gpu_device_name():
    print('GPU found')
else:
    print("No GPU found")
# tf.config.set_visible_devices([], 'GPU')

# Rebinds `tqdm` to the notebook-widget flavour.
from tqdm.notebook import tqdm

# 5. Seed TensorFlow's global pseudo-random generator.
tf.random.set_seed(seed_value)
# Alternative through the TF1 compatibility module:
# tf.compat.v1.set_random_seed(seed_value)

# 6. (disabled) Configure a new global `tensorflow` session
# from keras import backend as K
# session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
# sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
# K.set_session(sess)

No GPU found


In [3]:
import matplotlib.pyplot as plt

import pickle

In [4]:
# Load a previously trained gensim Word2Vec model from the file "word2vec_sg".
# NOTE(review): the variable is called `model_cbow` while the file name ends in
# "sg" — confirm which training architecture this model actually uses.
model_cbow = gensim.models.word2vec.Word2Vec.load(r"word2vec_sg")

In [5]:
# Learning rate used when the model's Adam optimizer is created.
lr = 1e-3
# from sklearn.model_selection import train_test_split


# Load the sample table from disk.
data = pd.read_csv(r"doc2vec_dbow.csv")

# Keep only the pectin and xylan substrates, then renumber the rows from 0.
to_keep = ["pectin", "xylan"]
data = data.loc[data["high_level_substr"].isin(to_keep)].reset_index(drop=True)

# Turn every gene sequence into a space-separated token string ("|" and ","
# both become a single space) and store the result as an (n_samples, 1) array.
features = np.array(
    [gene_seq.replace("|", " ").replace(",", " ") for gene_seq in data["sig_gene_seq"].values]
).reshape(-1, 1)

In [6]:
# Number of retained samples per substrate class.
data["high_level_substr"].value_counts()

pectin    39
xylan     35
Name: high_level_substr, dtype: int64

In [7]:
# Preview the first rows of the filtered table.
data.head()

Unnamed: 0,sig_gene_seq,high_level_substr,0,1,2,3,4,5,6,7,...,40,41,42,43,44,45,46,47,48,49
0,"null,null,HTH_DeoR,HTH_AraC|HTH_AraC,8.A.59,PL...",pectin,0.098224,-0.016721,0.340086,-0.103025,-0.108523,-0.186498,-0.142543,0.572389,...,0.169946,-0.143168,0.421863,0.088525,0.044629,0.218964,0.507033,-0.179407,-0.099657,0.128027
1,"CE10,1.B.35",pectin,-0.114699,0.105488,0.111531,-0.11643,-0.133199,0.25912,0.133353,0.196669,...,-0.042859,0.02752,0.208792,-0.271193,0.064108,0.049114,-0.236957,0.039852,0.002365,0.016162
2,"3.A.1,3.A.1,3.A.1,3.A.1,PL2_2",pectin,-0.278164,0.085645,0.272704,-0.02893,0.022241,0.520536,0.059369,0.058101,...,-0.059951,-0.108473,0.103364,-0.210818,0.063985,-0.065916,-0.171995,-0.13206,-0.036154,-0.029065
3,"MarR,null,null,AraC_binding,null,null,PfkB,nul...",pectin,-0.195133,-0.111228,0.434263,-0.097529,-0.028071,0.497665,0.207075,0.449561,...,0.175302,0.190835,0.10237,-0.651666,-0.101355,-0.007362,-0.107131,-0.218027,0.537375,0.115762
4,"PL10_1,CE8,PL11",pectin,-0.289301,0.035218,0.544027,0.116612,-0.201045,0.170767,-0.295381,0.142336,...,-0.043412,0.048884,0.392662,-0.174932,0.16309,0.25203,0.078837,-0.145878,0.05471,-0.264735


In [8]:
# Number of data realizations; one model is trained per realization index in range(reps).
reps = 50

In [9]:
## Generate `reps` realizations of the train, valid and test splits
# catch = Parallel(n_jobs=15, verbose = 10, backend = "loky")(delayed(get_data_splits_old_algo)(data,  features,  i) for i in range(reps)) 

In [10]:
# One-off step (disabled): write the generated splits to disk.
# with open('true_data_doc2vec_lstm_var_0.0001.pkl', 'wb') as f:
#     pickle.dump(catch, f)


# Read the cached data realizations back into `catch`.
# NOTE: unpickling can execute arbitrary code — only load files you produced yourself.
with open('true_data_doc2vec_lstm_var_0.0001.pkl', 'rb') as splits_file:
    catch = pickle.load(splits_file)

In [11]:
# Dropout rate passed to the LSTM layer built in `attention_lstm_model`.
rate = 0.15

In [12]:
def attention_lstm_model(training):
    """Build and compile the attention-pooled LSTM binary classifier.

    The model takes raw strings, converts them to integer token ids with a
    ``TextVectorization`` layer, looks the ids up in a frozen embedding matrix
    built from ``model_cbow``, runs an LSTM over the sequence, and pools the
    LSTM outputs with a softmax attention over time steps before a final
    ``Dense(1)`` layer produces one logit per sample.

    Reads the notebook-level globals ``model_cbow`` (vocabulary and embedding
    vectors), ``rate`` (LSTM dropout rate) and ``lr`` (Adam learning rate).

    Parameters
    ----------
    training : bool
        Forwarded as the ``training`` argument when the LSTM layer is called,
        so the value is fixed when the graph is built.
        NOTE(review): presumably this pins dropout off (``False``) or on
        (``True``) regardless of whether the model is fitting or predicting —
        confirm for the installed Keras version.

    Returns
    -------
    tf.keras.Model
        Compiled model (binary cross-entropy on logits, Adam, binary accuracy).
    """
    # Two all-zero rows are placed in front of the Word2Vec vectors: one for the
    # padding token and one for the unknown token. This keeps the embedding rows
    # aligned with the ids produced by TextVectorization, which reserves the
    # first two ids in 'int' mode (see the TextVectorization documentation).
    embedding_dim = model_cbow.wv.vectors.shape[1]
    padding_vector = np.zeros((1, embedding_dim))
    unknown_vector = np.zeros((1, embedding_dim))
    weight_vectors = np.vstack((padding_vector, unknown_vector, model_cbow.wv.vectors))

    # Frozen embedding initialised from the stacked matrix.
    # NOTE(review): mask_zero=False means padded positions are not masked
    # downstream — confirm this is intended for the attention softmax.
    embedding_layer = tf.keras.layers.Embedding(
        len(weight_vectors),
        weight_vectors.shape[1],
        weights=[weight_vectors],
        mask_zero=False,
        trainable=False,
    )

    # No text standardisation: tokens must match the Word2Vec vocabulary as-is.
    vectorize_layer = tf.keras.layers.TextVectorization(
        output_mode='int',
        vocabulary=model_cbow.wv.index_to_key,
        standardize=None,
    )

    input_layer = tf.keras.layers.Input(shape=(1,), dtype=tf.string)
    vectorize = vectorize_layer(input_layer)
    emb_output = embedding_layer(vectorize)

    lstm_layer = tf.keras.layers.LSTM(25, return_sequences=True, dropout=rate)
    lstm_output = lstm_layer(emb_output, training=training)

    # Attention scores: one linear score per time step, normalised with softmax.
    x_a = tf.keras.layers.Dense(
        1,
        kernel_initializer='glorot_uniform',
        activation='linear',
        name="word-level_context",
        kernel_regularizer=tf.keras.regularizers.L2(),
    )(lstm_output)
    x_a = tf.keras.layers.Flatten()(x_a)
    att_out = tf.keras.layers.Activation('softmax')(x_a)

    # Broadcast the attention weights across the LSTM feature axis so they can
    # be multiplied element-wise with the LSTM outputs.
    x_a2 = tf.keras.layers.RepeatVector(lstm_output.get_shape()[-1])(att_out)
    x_a2 = tf.keras.layers.Permute([2, 1])(x_a2)
    out = tf.keras.layers.Multiply()([lstm_output, x_a2])

    # Sum over the time axis: the attention-weighted average of LSTM outputs.
    out = tf.keras.layers.Lambda(lambda x: tf.math.reduce_sum(x, axis=1),
                                 name='expectation_over_words')(out)

    pred_head = tf.keras.layers.Dense(1, kernel_regularizer=tf.keras.regularizers.L2())
    pred_output = pred_head(out)

    model = tf.keras.models.Model(input_layer, pred_output)

    # The network outputs logits (the loss uses from_logits=True), so the class
    # boundary p = 0.5 corresponds to a logit of 0. The metric's default
    # threshold of 0.5 would be applied to the raw logits and misreport accuracy.
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.0)],
    )

    return model

In [13]:
def first_LSTM_training(idx):
    """Train one attention-LSTM on data realization ``idx`` and collect its outputs.

    Reads the notebook-level ``catch`` list; within ``catch[idx]`` positions
    0, 1 and 2 are used as the train, validation and test inputs and positions
    3 and 4 as the train and validation targets. The fitted model is saved
    under ``Real_World_LSTM_Models_Low_Dropout//Model_<idx>``.

    Parameters
    ----------
    idx : int
        Index of the data realization in ``catch``; also used to offset the
        random seed and to name the saved model.

    Returns
    -------
    tuple
        ``(X_train_logits, X_valid_logits, X_test_logits, train_lstm_extract,
        valid_lstm_extract, test_lstm_extract)``: the first three come from
        ``predict`` on the fitted model, the last three from calling a
        sub-model that ends at the model's third-from-last layer.
    """
    # joblib worker processes do not re-run the notebook's seeding cell, so the
    # generators are seeded here. Offsetting by idx keeps realizations distinct
    # while giving each one the same starting random state on every run.
    run_seed = seed_value + idx
    random.seed(run_seed)
    np.random.seed(run_seed)
    tf.random.set_seed(run_seed)

    split = catch[idx]
    X_train, X_valid, X_test = split[0], split[1], split[2]
    y_train, y_valid = split[3], split[4]

    first_lstm = attention_lstm_model(False)

    # Stop once the validation loss has not improved for 30 epochs and roll the
    # weights back to the best epoch seen.
    early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=30,
                                                      restore_best_weights=True)
    first_lstm.fit(X_train, y_train, epochs=2000, verbose=0,
                   callbacks=[early_stopping],
                   validation_data=(X_valid, y_valid))

    X_train_logits = first_lstm.predict(X_train, verbose=0)
    X_valid_logits = first_lstm.predict(X_valid, verbose=0)
    X_test_logits = first_lstm.predict(X_test, verbose=0)

    # Sub-model exposing an intermediate representation of the fitted network.
    # NOTE(review): layers[-3] is a positional lookup; with the current
    # attention_lstm_model it presumably resolves to the Multiply layer (the
    # per-time-step weighted LSTM outputs, before the sum) — confirm this is
    # the intended layer.
    lstm_extract = tf.keras.models.Model(first_lstm.input, first_lstm.layers[-3].output)

    train_lstm_extract = lstm_extract(X_train)
    valid_lstm_extract = lstm_extract(X_valid)
    test_lstm_extract = lstm_extract(X_test)

    first_lstm.save("Real_World_LSTM_Models_Low_Dropout" + "//" + "Model_" + str(idx))

    return (X_train_logits, X_valid_logits, X_test_logits,
            train_lstm_extract, valid_lstm_extract, test_lstm_extract)

In [14]:
# Train one model per data realization, distributed over 15 loky workers.
first_lstm = Parallel(n_jobs=15, verbose=10, backend="loky")(
    delayed(first_LSTM_training)(split_idx) for split_idx in range(reps)
)

[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done   2 tasks      | elapsed:   16.3s
[Parallel(n_jobs=15)]: Done  11 tasks      | elapsed:   20.2s
[Parallel(n_jobs=15)]: Done  20 tasks      | elapsed:   32.9s
[Parallel(n_jobs=15)]: Done  27 out of  50 | elapsed:   37.4s remaining:   31.9s
[Parallel(n_jobs=15)]: Done  33 out of  50 | elapsed:   45.9s remaining:   23.7s
[Parallel(n_jobs=15)]: Done  39 out of  50 | elapsed:   48.2s remaining:   13.6s
[Parallel(n_jobs=15)]: Done  45 out of  50 | elapsed:   55.4s remaining:    6.2s
[Parallel(n_jobs=15)]: Done  50 out of  50 | elapsed:   58.3s finished


In [15]:
# Persist the list of per-realization outputs produced by the training run.
with open('first_lstm_with_doc2vec_lstm_var_0.0001_real_world_low_dropout.pkl', 'wb') as results_file:
    pickle.dump(first_lstm, results_file)


# To reuse previously saved results instead of retraining:
# with open('first_lstm_with_lstm.pkl', 'rb') as f:
#     first_lstm = pickle.load(f)  

In [16]:
# ens_model = tf.keras.models.load_model("Real_World_LSTM_Models" + "//" + "Model_" + str(0))

In [17]:
# ens_model.summary()

In [18]:
# base_model = attention_lstm_model(True)

In [19]:
# base_model.set_weights(ens_model.get_weights())

In [20]:
# base_model.predict(catch[0][0])