In [193]:
%load_ext autoreload
%autoreload 2
%cd /group/transreg/sathi/DeepDifE 

import pickle
import importlib
import esparto
import optuna
import numpy as np
import pandas as pd
from evoaug_tf import evoaug, augment
from src.diff_expression_model import get_model, get_siamese_model, post_hoc_conjoining, get_auroc
from skopt.utils import use_named_args

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/storage/nas6/group/biocomp/projects/transreg/sathi/DeepDifE


## Prepare the data

For now we start from this data pickle, as I am not aware of how Helder performed the differential expression (DE) analysis and generated the labels.

In [8]:
# Load the pre-built dataset; the following cells use the loaded object as a pandas DataFrame.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only load this file if its source is trusted.
# NOTE(review): this is a hard-coded absolute path; consider a configurable data directory.
ppath = "/group/transreg/heopd/dlpipe/results/ath/aba/dlresults/predetermined_dataset/dataset_solid_chrome.pkl"
with open(ppath, 'rb') as f:
    data = pickle.load(f)

In [12]:
data.columns

Index(['Category', 'GeneFamily', 'seqs', 'ohs', 'rcohs', 'ohsDuo',
       'in_original_balanced', 'set', 'npshap-single', 'npshap-posthoc'],
      dtype='object')

In [69]:
data["set"].value_counts()

train    1900
valid     257
test      241
Name: set, dtype: int64

To demonstrate how to split the dataset into training, validation and test sets, we keep only the following columns:

In [17]:
# Keep only the gene identifier, label, gene family and raw sequence, under
# clearer column names. The frame is built in a single chain that returns a new
# object: no `inplace=True` on a column subset of `data`, and re-running this
# cell always yields the same `dataset`.
dataset = (
    data.reset_index()[["geneID", "Category", "GeneFamily", "seqs"]]
    .rename(columns={"geneID": "GeneID", "Category": "Label", "seqs": "Sequence"})
)

In [18]:
dataset

Unnamed: 0,GeneID,Label,GeneFamily,Sequence
0,AT4G27120,0,HOM04D000881,TAGAGAAGACAAGCGGTTATTTCGTAATTTCCCAGCGACTTTGAAA...
1,AT4G19600,0,HOM04D000740,GTCAAGTAGTGAAATCAAGGTGTGAAGTAAGCTGAGGACAGATAAT...
2,AT3G60880,0,HOM04D003119,AGTTGATATTGAATGAAATCTTCATGTTTTTTGATAAATGATTATA...
3,AT5G06960,0,HOM04D000319,CACTTGTCAGATTCTTCTTACCAAATCCATCAACAAATAAGCAAAT...
4,AT1G14890,0,HOM04D000273,TTGATATAACAGATTCAACACTAAAAATGAGTAAAATCTAAAAAAG...
...,...,...,...,...
2393,AT5G64230,1,HOM04D003278,AAGAAAGAAAAACCGTACATAAACACCCATCTGGTATACCATCGTC...
2394,AT5G64780,1,HOM04D002552,TTTTAGAAAGAAGAAGAAGGATTATTGCCTTATTGGTGAAGGGAAG...
2395,AT4G30470,1,HOM04D000082,TATGTACAGTCTCTACATTTTTTCAAATACATTTTTTTCTTTTTCA...
2396,AT3G51895,1,HOM04D000270,TGGTAAATAATTAAATATATAAGAACATTATTCTAAAGCGTTGAAT...


### One-hot-encode & reverse-complement

In [184]:
from src.prepare_dataset import one_hot_encode_series, reverse_complement_series, reverse_complement_sequence
# Add a column holding the one-hot encoding of each raw sequence.
# Judging from the table displayed below, the channel order appears to be A, C, G, T.
dataset["One_hot_encoded"] = one_hot_encode_series(dataset["Sequence"])

In [39]:
# Derive the reverse-complement encoding from the one-hot column (not from the raw strings).
dataset["RC_one_hot_encoded"] = reverse_complement_series(dataset["One_hot_encoded"])

In [106]:
dataset

Unnamed: 0,GeneID,Label,GeneFamily,Sequence,One_hot_encoded,RC_one_hot_encoded
0,AT4G27120,0,HOM04D000881,TAGAGAAGACAAGCGGTTATTTCGTAATTTCCCAGCGACTTTGAAA...,"[[0, 0, 0, 1], [1, 0, 0, 0], [0, 0, 1, 0], [1,...","[[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0,..."
1,AT4G19600,0,HOM04D000740,GTCAAGTAGTGAAATCAAGGTGTGAAGTAAGCTGAGGACAGATAAT...,"[[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1,...","[[0, 0, 1, 0], [0, 0, 1, 0], [1, 0, 0, 0], [0,..."
2,AT3G60880,0,HOM04D003119,AGTTGATATTGAATGAAATCTTCATGTTTTTTGATAAATGATTATA...,"[[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0,...","[[0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [0,..."
3,AT5G06960,0,HOM04D000319,CACTTGTCAGATTCTTCTTACCAAATCCATCAACAAATAAGCAAAT...,"[[0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0,...","[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1,..."
4,AT1G14890,0,HOM04D000273,TTGATATAACAGATTCAACACTAAAAATGAGTAAAATCTAAAAAAG...,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 1, 0], [1,...","[[0, 0, 0, 1], [1, 0, 0, 0], [0, 0, 0, 1], [1,..."
...,...,...,...,...,...,...
2393,AT5G64230,1,HOM04D003278,AAGAAAGAAAAACCGTACATAAACACCCATCTGGTATACCATCGTC...,"[[1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [1,...","[[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0,..."
2394,AT5G64780,1,HOM04D002552,TTTTAGAAAGAAGAAGAAGGATTATTGCCTTATTGGTGAAGGGAAG...,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [0,...","[[0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [0,..."
2395,AT4G30470,1,HOM04D000082,TATGTACAGTCTCTACATTTTTTCAAATACATTTTTTTCTTTTTCA...,"[[0, 0, 0, 1], [1, 0, 0, 0], [0, 0, 0, 1], [0,...","[[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0,..."
2396,AT3G51895,1,HOM04D000270,TGGTAAATAATTAAATATATAAGAACATTATTCTAAAGCGTTGAAT...,"[[0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 1, 0], [0,...","[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 1, 0], [1,..."


### Train-test split

In [85]:
from src.prepare_dataset import grouped_shuffle_split
# Separate a held-out part from the training data, grouping by gene family.
# NOTE(review): presumably 0.2 is the held-out fraction and each gene family ends up in a single split — confirm in src.prepare_dataset.
# NOTE(review): no random seed is passed here; confirm that the split is reproducible.
train_df, validation_test_df = grouped_shuffle_split(dataset, dataset["GeneFamily"], 0.2)

In [86]:
# Split the held-out part again (0.5), grouped by gene family, to obtain the validation and test sets.
validation_df, test_df  = grouped_shuffle_split(validation_test_df, validation_test_df["GeneFamily"], 0.5)

In [87]:
# Report how many genes ended up in each split (one line per split).
print(
    f"Length of training set: {len(train_df)}",
    f"Length of validation set: {len(validation_df)}",
    f"Length of test set: {len(test_df)}",
    sep="\n",
)

Length of training set: 1900
Length of validation set: 257
Length of test set: 241


In [162]:
def get_input_and_labels(df):
    """Build the model inputs and labels for one data split.

    The forward encodings (``One_hot_encoded``) are stacked first, followed by
    the reverse-complement encodings (``RC_one_hot_encoded``), so the returned
    input array has twice as many rows as ``df``. The ``Label`` column is
    repeated in the same order so that labels stay aligned with the inputs.

    Args:
        df: table with ``One_hot_encoded``, ``RC_one_hot_encoded`` and
            ``Label`` columns.

    Returns:
        tuple: ``(x, y)`` where ``x`` is the float32 array of stacked encodings
        and ``y`` is the 1-D array of labels repeated twice.
    """
    forward = np.stack(df["One_hot_encoded"])
    reverse = np.stack(df["RC_one_hot_encoded"])
    inputs = np.concatenate((forward, reverse), axis=0).astype('float32')

    labels = np.asarray(df["Label"])
    targets = np.concatenate((labels, labels))
    return inputs, targets

In [163]:
# Build the training and validation arrays; each holds the forward sequences
# followed by their reverse complements, with the labels repeated to match.
x_train, y_train = get_input_and_labels(train_df)
x_validation, y_validation = get_input_and_labels(validation_df)

## Prepare model

As the model uses EvoAug augmentation, a list of the augmentation operations that may be applied to the sequences needs to be provided.

In [164]:
# Augmentation operations handed to get_model via `augment_list`.
# NOTE(review): the test-set section states that training sequences had length 620 and pads
# the test inputs accordingly — presumably original length + insert_max; confirm against evoaug_tf.
augment_list = [
    augment.RandomRC(rc_prob=0.5),
    augment.RandomInsertionBatch(insert_min=0, insert_max=20),
    augment.RandomDeletion(delete_min=0, delete_max=30),
    augment.RandomTranslocationBatch(shift_min=0, shift_max=20),
    augment.RandomMutation(mutate_frac=0.05),
    augment.RandomNoise()
]

Get the shape of the input data

In [165]:
# Shape of a single encoded sequence, read from the first training example.
input_shape = train_df["One_hot_encoded"].iloc[0].shape

Initialize the model

In [166]:
# Build the network with EvoAug enabled, using the augmentations defined in `augment_list`.
model = get_model(
    input_shape=input_shape,
    perform_evoaug=True,
    augment_list=augment_list,
    learning_rate=0.001,
)

## Train the model

Define the callback

In [167]:
# Training callbacks, both driven by the validation loss.
import tensorflow as tf

# Stop after 20 epochs without a lower val_loss and restore the best weights seen.
early_stopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    mode='min',
    patience=20,
    restore_best_weights=True,
    verbose=1,
)

# Multiply the learning rate by 0.1 after 5 epochs without a lower val_loss, never going below 1e-7.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    mode='min',
    factor=0.1,
    patience=5,
    min_lr=1e-7,
    verbose=1,
)

callbacks = [early_stopping_callback, reduce_lr]

In [171]:
# Fit for at most 100 epochs in batches of 100, validating on the validation arrays
# after every epoch; the callbacks may lower the learning rate or stop training early.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_validation, y_validation),
    batch_size=100,
    epochs=100,
    callbacks=callbacks,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 36: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 43: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 48: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 53: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 54

## Test set

We perform post-hoc conjoining.

In [181]:
# Build the siamese model from the inner `model.model` attribute, not from the object returned by get_model.
# NOTE(review): presumably `model` is the EvoAug wrapper and `model.model` the underlying network — confirm in src.diff_expression_model.
siamese_model = get_siamese_model(model.model)

Prepare the test set data

In [203]:
# Stack the per-gene encodings into one array per strand. The strands are kept in
# separate arrays because post_hoc_conjoining receives them as two arguments.
# NOTE(review): unlike the training arrays, these are not cast to float32 — confirm the dtype the model expects.
x_test = np.stack(test_df["One_hot_encoded"])
x_test_rc = np.stack(test_df["RC_one_hot_encoded"])

In [209]:
# Ground-truth labels, one per test gene (not repeated, unlike the training labels).
y_test = test_df["Label"].to_numpy()

Because the model was trained with EvoAug, all sequences it was trained on had length 620. We therefore use the EvoAug padding function to pad the test sequences to the same length.

In [204]:
# Pad both strands with the model's `_pad_end` helper so they match the padded training length.
# NOTE(review): `_pad_end` is an underscore-prefixed (private) method — confirm it is safe to rely on.
# NOTE(review): this overwrites x_test / x_test_rc; re-running only this cell may pad the arrays a second time.
x_test = model._pad_end(x_test)
x_test_rc = model._pad_end(x_test_rc)

In [206]:
# Run the siamese model on the forward and reverse-complement test inputs.
# The second returned value (`predictions`) is what the AUROC is computed from below.
predictions_categories, predictions = post_hoc_conjoining(siamese_model, x_test, x_test_rc)



In [211]:
# AUROC of the conjoined predictions against the test labels.
get_auroc(y_test, predictions)

0.8308385308385308

In [212]:
# NOTE(review): `a` is not referenced anywhere else in the visible notebook; this looks like a leftover scratch cell.
a=1