In [1]:
%load_ext autoreload
%autoreload 2
%cd -q ..

import pickle
import importlib
import esparto
import optuna
import numpy as np
import pandas as pd
from evoaug_tf import evoaug, augment
from src.diff_expression_model import get_model, get_siamese_model, post_hoc_conjoining, get_auroc
from skopt.utils import use_named_args

2025-05-22 10:57:12.139252: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-22 10:57:12.268541: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-22 10:57:12.272894: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /mnt/modules/easybuild/software/Gho

## Prepare the data

For now we start from this data pickle as I'm not aware how Helder did DE analysis and generated the labels

In [2]:
ppath = "data/dataset_solid_chrome.csv"
data = pd.read_csv(ppath)

In [3]:
data["set"].value_counts()

train    1900
valid     257
test      241
Name: set, dtype: int64

To show how to subdivide the dataset into train-test split we only take the following columns

In [4]:
dataset = data.reset_index()
dataset = dataset[["geneID", "Category", "GeneFamily", "seqs"]]
dataset.rename(columns={"geneID":"GeneID", "Category":"Label", "seqs": "Sequence"}, inplace=True)

In [5]:
dataset

Unnamed: 0,GeneID,Label,GeneFamily,Sequence
0,AT4G27120,0,HOM04D000881,TAGAGAAGACAAGCGGTTATTTCGTAATTTCCCAGCGACTTTGAAA...
1,AT4G19600,0,HOM04D000740,GTCAAGTAGTGAAATCAAGGTGTGAAGTAAGCTGAGGACAGATAAT...
2,AT3G60880,0,HOM04D003119,AGTTGATATTGAATGAAATCTTCATGTTTTTTGATAAATGATTATA...
3,AT5G06960,0,HOM04D000319,CACTTGTCAGATTCTTCTTACCAAATCCATCAACAAATAAGCAAAT...
4,AT1G14890,0,HOM04D000273,TTGATATAACAGATTCAACACTAAAAATGAGTAAAATCTAAAAAAG...
...,...,...,...,...
2393,AT5G64230,1,HOM04D003278,AAGAAAGAAAAACCGTACATAAACACCCATCTGGTATACCATCGTC...
2394,AT5G64780,1,HOM04D002552,TTTTAGAAAGAAGAAGAAGGATTATTGCCTTATTGGTGAAGGGAAG...
2395,AT4G30470,1,HOM04D000082,TATGTACAGTCTCTACATTTTTTCAAATACATTTTTTTCTTTTTCA...
2396,AT3G51895,1,HOM04D000270,TGGTAAATAATTAAATATATAAGAACATTATTCTAAAGCGTTGAAT...


### One-hot-encode & reverse-complement

In [6]:
from src.prepare_dataset import one_hot_encode_series, reverse_complement_series, reverse_complement_sequence
dataset["One_hot_encoded"] = one_hot_encode_series(dataset["Sequence"])

In [7]:
dataset["RC_one_hot_encoded"] = reverse_complement_series(dataset["One_hot_encoded"])

In [8]:
dataset

Unnamed: 0,GeneID,Label,GeneFamily,Sequence,One_hot_encoded,RC_one_hot_encoded
0,AT4G27120,0,HOM04D000881,TAGAGAAGACAAGCGGTTATTTCGTAATTTCCCAGCGACTTTGAAA...,"[[0, 0, 0, 1], [1, 0, 0, 0], [0, 0, 1, 0], [1,...","[[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0,..."
1,AT4G19600,0,HOM04D000740,GTCAAGTAGTGAAATCAAGGTGTGAAGTAAGCTGAGGACAGATAAT...,"[[0, 0, 1, 0], [0, 0, 0, 1], [0, 1, 0, 0], [1,...","[[0, 0, 1, 0], [0, 0, 1, 0], [1, 0, 0, 0], [0,..."
2,AT3G60880,0,HOM04D003119,AGTTGATATTGAATGAAATCTTCATGTTTTTTGATAAATGATTATA...,"[[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0,...","[[0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [0,..."
3,AT5G06960,0,HOM04D000319,CACTTGTCAGATTCTTCTTACCAAATCCATCAACAAATAAGCAAAT...,"[[0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 0, 0], [0,...","[[1, 0, 0, 0], [0, 1, 0, 0], [0, 0, 1, 0], [1,..."
4,AT1G14890,0,HOM04D000273,TTGATATAACAGATTCAACACTAAAAATGAGTAAAATCTAAAAAAG...,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 1, 0], [1,...","[[0, 0, 0, 1], [1, 0, 0, 0], [0, 0, 0, 1], [1,..."
...,...,...,...,...,...,...
2393,AT5G64230,1,HOM04D003278,AAGAAAGAAAAACCGTACATAAACACCCATCTGGTATACCATCGTC...,"[[1, 0, 0, 0], [1, 0, 0, 0], [0, 0, 1, 0], [1,...","[[0, 1, 0, 0], [0, 0, 1, 0], [0, 0, 0, 1], [0,..."
2394,AT5G64780,1,HOM04D002552,TTTTAGAAAGAAGAAGAAGGATTATTGCCTTATTGGTGAAGGGAAG...,"[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 0, 1], [0,...","[[0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0], [0,..."
2395,AT4G30470,1,HOM04D000082,TATGTACAGTCTCTACATTTTTTCAAATACATTTTTTTCTTTTTCA...,"[[0, 0, 0, 1], [1, 0, 0, 0], [0, 0, 0, 1], [0,...","[[1, 0, 0, 0], [0, 0, 1, 0], [0, 0, 1, 0], [0,..."
2396,AT3G51895,1,HOM04D000270,TGGTAAATAATTAAATATATAAGAACATTATTCTAAAGCGTTGAAT...,"[[0, 0, 0, 1], [0, 0, 1, 0], [0, 0, 1, 0], [0,...","[[0, 0, 0, 1], [0, 0, 0, 1], [0, 0, 1, 0], [1,..."


### Train-test split

In [9]:
from src.prepare_dataset import grouped_shuffle_split
train_df, test_df = grouped_shuffle_split(dataset, dataset["GeneFamily"], 0.2)

In [10]:
print(f"Length of training set: {train_df.shape[0]}")
print(f"Length of test set: {test_df.shape[0]}")

Length of training set: 1900
Length of test set: 498


## Cross validation

We first initialize the object that are static throughout the cross validation

In [11]:
augment_list = [
    augment.RandomRC(rc_prob=0.5),
    augment.RandomInsertionBatch(insert_min=0, insert_max=20),
    augment.RandomDeletion(delete_min=0, delete_max=30),
    augment.RandomTranslocationBatch(shift_min=0, shift_max=20),
    augment.RandomMutation(mutate_frac=0.05),
    augment.RandomNoise()
]

2025-05-22 10:57:24.224762: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /mnt/modules/easybuild/software/Ghostscript/10.01.2-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/GTK3/3.24.37-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/Wayland/1.22.0-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/libepoxy/1.5.10-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/Mesa/23.1.4-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/LLVM/16.0.6-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/libunwind/1.6.2-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/libglvnd/1.6.0-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/libdrm/2.4.115-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/Pango/1.50.14-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/FriBidi/1.0.12-GCCcore-12.3.0/lib:/mnt/modules/easybuild/so

In [12]:
# early stopping callback
import tensorflow as tf

early_stopping_callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
											patience=20,
											verbose=1,
											mode='min',
											restore_best_weights=True)
# reduce learning rate callback
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
												factor=0.1,
												patience=5,
												min_lr=1e-7,
												mode='min',
												verbose=1)
callbacks = [early_stopping_callback, reduce_lr]

In [13]:
input_shape = train_df["One_hot_encoded"].iloc[0].shape

We prepare the data

In [14]:
def get_input_and_labels(df):
	ohe_np = np.stack(df["One_hot_encoded"])
	rc_np = np.stack(df["RC_one_hot_encoded"])

	x = np.append(ohe_np, rc_np, axis=0)
	x = x.astype('float32')
	y = np.append(df["Label"], df["Label"])
	return x, y

In [15]:
X, Y = get_input_and_labels(train_df)

We will now create the groups. Because we are using both forward and reverse complement we have to concat them with itself

In [16]:
groups = pd.concat([train_df["GeneFamily"], train_df["GeneFamily"]], axis = 0) 

Initialize the splitter

In [17]:
def objective(trial, x_train, y_train, x_val, y_val, i):
	learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
	kernel_size = trial.suggest_categorical('kernel_size', [(8,4), (12,4), (16,4), (20,4), (24,4)])
	batch_size_trial = trial.suggest_int("batch_size", 50, 200, 25)
	number_of_convolutions = trial.suggest_int("number_of_convolutions", 2,3,1)

	model = get_model(input_shape=input_shape, 
				   perform_evoaug=True, 
				   augment_list=augment_list, 
				   learning_rate=learning_rate, 
				   kernel_size=kernel_size,
				   number_of_convolutions=number_of_convolutions)

	# We add validation here, because one of the callbacks relies on val_loss metric
	model.fit(x_train,
			y_train,
			epochs=1,	
			batch_size=batch_size_trial,
			validation_data=(x_val, y_val),
			callbacks=callbacks
			)
	score = model.evaluate(x_val, y_val, verbose=0)
	return score[0]

In [18]:
from sklearn.model_selection import GroupKFold

def objective_cv(trial):

	# Get the MNIST dataset.
	group_kfold = GroupKFold(n_splits=5)
	
	scores = []
	for i, (train_index, validation_index) in enumerate(group_kfold.split(X, Y, groups)):
		x_train = X[train_index]
		y_train = Y[train_index]

		x_val = X[validation_index]
		y_val = Y[validation_index]

		loss = objective(trial, x_train, y_train, x_val, y_val, i)
		scores.append(loss)
	return np.mean(scores)

In [19]:
study = optuna.create_study(direction='minimize')
study.optimize(objective_cv, n_trials=2)
best_params = study.best_params
print(best_params)

[I 2025-05-22 10:57:24,631] A new study created in memory with name: no-name-fc20f370-2752-4e51-8dee-2139889ddf01
  batch_size_trial = trial.suggest_int("batch_size", 50, 200, 25)
  number_of_convolutions = trial.suggest_int("number_of_convolutions", 2,3,1)


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


[I 2025-05-22 10:57:45,148] Trial 0 finished with value: 0.6937584996223449 and parameters: {'learning_rate': 3.879388960285182e-05, 'kernel_size': (8, 4), 'batch_size': 100, 'number_of_convolutions': 3}. Best is trial 0 with value: 0.6937584996223449.
  batch_size_trial = trial.suggest_int("batch_size", 50, 200, 25)
  number_of_convolutions = trial.suggest_int("number_of_convolutions", 2,3,1)




[I 2025-05-22 10:58:02,642] Trial 1 finished with value: 0.6929933667182923 and parameters: {'learning_rate': 0.0007979132423974926, 'kernel_size': (20, 4), 'batch_size': 100, 'number_of_convolutions': 3}. Best is trial 1 with value: 0.6929933667182923.


{'learning_rate': 0.0007979132423974926, 'kernel_size': (20, 4), 'batch_size': 100, 'number_of_convolutions': 3}
