In [2]:
import os

In [3]:
# Work from the project root so the relative data paths used below resolve.
# Set the GENE_PROJECT_DIR environment variable to run on another machine;
# the original location is kept as the fallback.
os.chdir(os.environ.get("GENE_PROJECT_DIR", r"D:\Gene_Project"))

In [4]:
import pandas as pd

In [5]:
# os.listdir returns names in arbitrary order; sort them so the files are
# always processed in the same order from run to run.
all_files = sorted(os.listdir(r"unsupervised_CGC\output"))

In [6]:
# Directory prefix used by read_the_data to locate each file.
# In a raw string "\\" stays as two backslash characters, so this value ends in two backslashes.
unsupervised_file_path = r"unsupervised_CGC\output\\"

In [7]:
def read_the_data(file, directory = None):
    """Load one output file into a DataFrame.

    Parameters
    ----------
    file : str
        Name of the file, relative to ``directory``.
    directory : str, optional
        Folder that contains ``file``. When omitted, the module-level
        ``unsupervised_file_path`` is used, which is the original behaviour.

    Returns
    -------
    pandas.DataFrame
        The file parsed with a single space as delimiter and no header row,
        so the columns are numbered from 0.
    """
    if directory is None:
        # Fall back to the notebook-level default so existing callers keep working.
        directory = unsupervised_file_path
    # os.path.join adds a separator only when the directory does not already end in one.
    return pd.read_csv(os.path.join(directory, file), sep = " ", header = None)

In [8]:
from joblib import Parallel, delayed

In [9]:
# Run read_the_data over every entry of all_files with 6 joblib workers.
all_files_df = Parallel(n_jobs = 6, verbose = 3)(
    delayed(read_the_data)(file_name) for file_name in all_files
)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    1.4s
[Parallel(n_jobs=6)]: Done 908 tasks      | elapsed:    3.2s
[Parallel(n_jobs=6)]: Done 4716 tasks      | elapsed:    7.5s
[Parallel(n_jobs=6)]: Done 11884 tasks      | elapsed:   16.2s
[Parallel(n_jobs=6)]: Done 19519 tasks      | elapsed:   27.4s
[Parallel(n_jobs=6)]: Done 19582 out of 19582 | elapsed:   27.4s finished


In [10]:
len(all_files_df)

19582

In [11]:
# Stack the per-file frames into one frame with a fresh running index.
all_unsupervised = pd.concat(objs = all_files_df, ignore_index = True)

In [12]:
# Name the single column; this assignment raises if the frame has more than one column.
all_unsupervised.columns = ["sequence"]

In [13]:
all_unsupervised.head()

Unnamed: 0,sequence
0,"1.A.72,MerR,GH23"
1,"3.A.1,CE4"
2,"3.A.23,9.A.5,MCPsignal,2.A.21,2.A.22,TetR_N,3...."
3,"CE4,8.A.5,3.A.1,3.A.1,3.A.1"
4,"GT51,Peripla_BP_2,3.A.1,3.A.1,9.B.169"


In [15]:
all_unsupervised.shape

(771293, 1)

In [16]:
all_unsupervised["sequence"].value_counts()

GT19,1.B.33                                                   2749
1.B.33,GT19                                                   2415
9.B.146,2.A.103,GT28                                          2152
GT28,2.A.103,9.B.146                                          2111
2.A.1,GT2                                                     2082
                                                              ... 
2.A.51,GT1                                                       1
2.A.7,CE4|GT2                                                    1
3.A.1,PAS|SpoIIE|GAF,3.A.1,2.A.4,NUDIX,9.B.106,3.A.16,GT83       1
4.B.1,GH18,Sigma70_r4_2|Sigma70_r2,MarR,2.A.1,2.A.40             1
GT25,Rhodanese,3.A.3,9.B.105,2.A.1,9.B.27,3.A.3,2.A.6            1
Name: sequence, Length: 240622, dtype: int64

In [17]:
# Keep only the first occurrence of every distinct sequence string.
all_unsupervised = all_unsupervised.drop_duplicates(keep = "first")

In [18]:
all_unsupervised.shape

(240622, 1)

In [None]:
# Persist the de-duplicated sequences (one "sequence" column, no index column).
all_unsupervised.to_csv("all_unsupervised_genes.csv", index = False)

In [None]:
# Break every sequence string into its comma-separated entries.
all_unsupervised_split = list(map(lambda seq: seq.split(","), all_unsupervised["sequence"]))

In [None]:
# Number of comma-separated entries in each sequence.
len_gene_seqs = list(map(len, all_unsupervised_split))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Distribution of sequence lengths, measured in comma-separated entries.
# Labels are added so the figure can be read on its own.
plt.hist(len_gene_seqs)
plt.xlabel("Entries per sequence (split on ',')")
plt.ylabel("Number of sequences")
plt.title("Sequence length distribution")
plt.show()

In [None]:
# Flatten the per-sequence lists into one long list of entries.
all_genes = [
    gene
    for split_sequence in all_unsupervised_split
    for gene in split_sequence
]

In [None]:
from collections import Counter

In [None]:
# Occurrence count of every distinct entry produced by the comma-only split.
unsupervised_genes = Counter(all_genes)

In [None]:
len(Counter(all_genes))

In [None]:
# what if we also split on the |

In [None]:
# Same split as before, but "|" is first turned into "," so both act as separators.
all_unsupervised_split_1 = list(
    map(lambda seq: seq.replace("|", ",").split(","), all_unsupervised["sequence"])
)

In [None]:
# Flatten the "|"-and-","-split sequences into one long list of entries.
all_genes_1 = [
    gene
    for split_sequence in all_unsupervised_split_1
    for gene in split_sequence
]

In [None]:
# Occurrence count of every distinct entry once "|" is also treated as a separator.
unsupervised_genes_1 = Counter(all_genes_1)

In [None]:
# Number of distinct entries; unsupervised_genes_1 is already a Counter, so no copy is needed.
len(unsupervised_genes_1)

In [None]:
# Entries that occur at least 10 times, in the Counter's own order.
vocab_genes = [gene for gene, count in unsupervised_genes_1.items() if count >= 10]

In [None]:
# vocab_genes

In [None]:
# quite a significant difference

In [None]:
# Turn both "|" and "," into spaces so every entry becomes its own whitespace-separated token.
# Sequences that differed only in "|" versus "," collapse to the same string here, so
# de-duplicate again (dict.fromkeys keeps first-seen order); otherwise identical strings
# could end up on both sides of the train/validation split.
# NOTE: this rebinds all_unsupervised from a DataFrame to a list of strings, so the cell
# cannot be re-run without first re-running the cells that build the DataFrame.
all_unsupervised = list(dict.fromkeys(
    seq.replace("|", ",").replace(",", " ") for seq in all_unsupervised["sequence"]
))

In [None]:
# Token count of every normalised sequence (tokens are whitespace-separated).
len_all_unsupervised_split = list(map(lambda seq: len(seq.split()), all_unsupervised))

In [None]:
# Distribution of sequence lengths, measured in whitespace-separated tokens.
# Labels are added so the figure can be read on its own.
plt.hist(len_all_unsupervised_split)
plt.xlabel("Tokens per sequence")
plt.ylabel("Number of sequences")
plt.title("Token count distribution")
plt.show()

In [None]:
# split the sequences into train and test
# generate some random indexes
import random

In [None]:
import numpy as np


In [None]:
# Draw 80% of the positions for training. A dedicated, seeded generator makes the
# split reproducible across kernel restarts without touching the global random state.
train_idxs = random.Random(42).sample(range(len(all_unsupervised)), int(len(all_unsupervised) * 0.8))

In [None]:
# Select the training strings by position.
train_sequences = np.asarray(all_unsupervised)[train_idxs]

In [None]:
# Every position not drawn for training goes to validation. sorted() yields an
# explicit ascending order instead of relying on set iteration order.
val_idxs = sorted(set(range(len(all_unsupervised))).difference(train_idxs))

In [None]:
# Select the validation strings by position.
val_sequences = np.asarray(all_unsupervised)[val_idxs]

In [None]:
len(train_sequences), len(val_sequences)

In [None]:
# maybe can cut it off around 30

In [None]:
import tensorflow as tf

In [None]:
# String -> integer-id layer: no text standardisation, n-grams up to length 3,
# at most 1400 vocabulary entries, 20 ids per output row.
text_vec_layer = tf.keras.layers.TextVectorization(
    max_tokens = 1400,
    standardize = None,
    ngrams = 3,
    output_mode = "int",
    output_sequence_length = 20,
    pad_to_max_tokens = True,
)

In [None]:
# Build the vocabulary from the training strings only.
text_vec_layer.adapt(train_sequences, batch_size = 100_000)

In [None]:
# Model input: one scalar string per example.
input_layer = tf.keras.Input(shape = (), dtype = "string")

In [None]:
# Apply the text-vectorization layer to the symbolic string input.
text_vec_output = text_vec_layer(input_layer)

In [None]:
# Embedding table: 1400 ids -> 256-dimensional vectors, with id 0 treated as a mask value.
emb_layer = tf.keras.layers.Embedding(input_dim = 1400, output_dim = 256, mask_zero = True)

In [None]:
# Embed the integer ids produced by the text-vectorization layer.
emb_output = emb_layer(text_vec_output)

In [None]:
# Encoder LSTM with 128 units; return_state makes the call yield its states as well as its output.
lstm_layer = tf.keras.layers.LSTM(units = 128, return_state = True)

In [None]:
# Run the encoder LSTM on the embeddings; it returns three values because return_state is set.
lstm_output, lstm_hidden, lstm_carry = lstm_layer(emb_output)

In [None]:
# Encoder states bundled as a list; passed to the decoder LSTM as its initial_state.
encoder_state = [lstm_hidden, lstm_carry]

In [None]:
# Repeat the encoder output 20 times to form the decoder's input sequence.
repeat_vec_output = tf.keras.layers.RepeatVector(n = 20)(lstm_output)

In [None]:
# Decoder LSTM (128 units, one output per step), started from the encoder's states.
decoder_lstm_output = tf.keras.layers.LSTM(units = 128, return_sequences = True)(
    repeat_vec_output, initial_state = encoder_state
)

In [None]:
# Per-step softmax over 1400 classes, applied to every decoder output step.
time_distributed_output = tf.keras.layers.TimeDistributed(
    tf.keras.layers.Dense(units = 1400, activation = "softmax")
)(decoder_lstm_output)

In [None]:
# Full autoencoder: raw string in, per-step class probabilities out.
model = tf.keras.Model(inputs = input_layer, outputs = time_distributed_output)

In [None]:
model.summary()

In [None]:
tf.keras.utils.plot_model(model, show_shapes = True)

In [None]:
# Integer targets with softmax outputs -> sparse categorical cross-entropy.
# metrics is passed as a list, the documented form; newer Keras releases reject a bare metric object.
model.compile(
    loss = "sparse_categorical_crossentropy",
    optimizer = tf.keras.optimizers.Adam(),
    metrics = [tf.keras.metrics.SparseCategoricalAccuracy()],
)

In [None]:
# Targets for the autoencoder: the integer ids the vectorization layer yields for the training strings.
train_ints = text_vec_layer(train_sequences)

In [None]:
# Validation targets: the integer ids the vectorization layer yields for the validation strings.
val_ints = text_vec_layer(val_sequences)

In [None]:
import numpy as np

In [None]:
# Train to reconstruct the integer ids from the raw strings; stop once val_loss has not
# improved for 5 epochs and roll back to the best weights.
model.fit(
    x = train_sequences,
    y = train_ints,
    batch_size = 256,
    epochs = 500,
    shuffle = True,
    validation_data = (val_sequences, val_ints),
    verbose = 1,
    callbacks = tf.keras.callbacks.EarlyStopping(
        monitor = "val_loss",
        patience = 5,
        mode = "min",
        restore_best_weights = True,
    ),
    validation_batch_size = 1000,
)

In [None]:
# Write the trained autoencoder to the "autoencoder" path under the current working directory.
model.save("autoencoder")

In [None]:
model.layers[3].output[0]

In [None]:
# Feature extractor: raw string in, encoder LSTM output out.
# lstm_output is the same tensor as model.layers[3].output[0]; referencing it by name
# keeps this working if layers are added or reordered.
model_extract = tf.keras.models.Model(input_layer, lstm_output)

In [None]:
# read the data in

In [None]:
# set the directory to where the data is
import os

# Set the GENE_PROJECT_DIR environment variable to run on another machine;
# the original location is kept as the fallback.
os.chdir(os.environ.get("GENE_PROJECT_DIR", r"D:\Gene_Project"))

In [None]:
# pandas, plus display settings: never truncate cell text, show up to 500 rows.
import pandas as pd
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_rows", 500)

In [None]:
# load the data - new data that was provided
# Rows with missing values are dropped and the rest shuffled. The shuffle is seeded so the
# row order is identical on every run; features cached to data_extract.npy in an earlier
# session therefore still line up with these rows and their labels.
data = pd.read_csv(r"pul_seq_low_high_substr_year_corrected.tsv", sep = "\t").dropna().sample(frac = 1.0, random_state = 42)

In [None]:
data.head()

In [None]:
# Labels of the five most frequent high_level_substr values.
to_keep = data["high_level_substr"].value_counts().iloc[:5].index

In [None]:
# Keep the frequent labels as they are; everything else is collapsed into "others".
classes = [label if label in to_keep else "others" for label in data["high_level_substr"]]

In [None]:
# Overwrite the label column with the collapsed classes (list assigned in row order).
data["high_level_substr"] = classes

In [None]:
data["high_level_substr"].value_counts()

In [None]:
# Turn both "|" and "," into spaces so every entry becomes a whitespace-separated token.
all_supervised = list(
    map(lambda seq: seq.replace("|", ",").replace(",", " "), data["sig_gene_seq"])
)

In [None]:
len(all_supervised)

In [None]:
data.shape

In [None]:
# model_extract = tf.keras.models.load_model(r"autoencoder")

In [None]:
# Push all supervised strings through the extractor in a single call.
data_extract = model_extract(np.asarray(all_supervised))

In [None]:
# Cache the extracted features on disk so they can be reloaded without the model.
np.save("data_extract.npy", data_extract)

In [None]:
import numpy as np

In [None]:
# Reload the cached features.
# NOTE(review): rows are assumed to be in the same order as `data` — confirm the file
# was produced from the same row order before fitting on it.
data_extract = np.load("data_extract.npy")

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on 6 cores; random_state is fixed so repeated runs give the same forest
# and therefore comparable scores.
rf = RandomForestClassifier(n_jobs = 6, random_state = 42)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Single-point grid: only n_estimators = 100 is evaluated.
param_grid = {"n_estimators": [100]}

In [None]:
# 5-fold cross-validated grid search over param_grid, scored by accuracy, on 6 workers.
search = GridSearchCV(
    estimator = rf,
    param_grid = param_grid,
    n_jobs = 6,
    verbose = 3,
    cv = 5,
    scoring = "accuracy",
)

In [None]:
data["high_level_substr"].value_counts()

In [None]:
# Fit on the extracted features against the collapsed class labels (matched by row position).
search.fit(data_extract, data["high_level_substr"])

In [None]:
search.best_score_