# Tutorial to perform TFModisco and create a report #

In [1]:
%load_ext autoreload
%autoreload 2
%cd -q ..

import sys
import os

from Bio import SeqIO
from evoaug_tf import evoaug, augment
from modiscolite import tfmodisco, report
from src.deepExplain_tool import getDeepExplainerBackground, deepExplain, plotResults
from src.diff_expression_model import load_model, get_siamese_model
import numpy as np
import importlib
import pickle
import modiscolite


2025-05-22 10:59:31.810012: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI AVX512_BF16 AVX_VNNI AMX_TILE AMX_INT8 AMX_BF16 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-05-22 10:59:31.961367: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-22 10:59:31.965872: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /mnt/modules/easybuild/software/Gho

Load the single model with the weights provided by Helder and build the siamese model from it.

In [2]:

# Files describing the single model: architecture (JSON) and weights (HDF5).
model_path = "data/model_single.json"
weights_path = "data/weights_single.h5"

# Augmentation objects from evoaug_tf; the list is handed to deepExplain
# later in the notebook through its `augment_list` argument.
augment_list = [
    augment.RandomRC(rc_prob=0.5),
    augment.RandomInsertionBatch(insert_min=0, insert_max=20),
    augment.RandomDeletion(delete_min=0, delete_max=30),
    augment.RandomTranslocationBatch(shift_min=0, shift_max=20),
    augment.RandomMutation(mutate_frac=0.05),
    augment.RandomNoise(),
]

# Load the single model first, then derive the siamese model from it.
model = load_model(model_path, weights_path)
siamese_model = get_siamese_model(model)



Loaded model from disk


2025-05-22 10:59:48.256344: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /mnt/modules/easybuild/software/Ghostscript/10.01.2-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/GTK3/3.24.37-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/Wayland/1.22.0-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/libepoxy/1.5.10-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/Mesa/23.1.4-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/LLVM/16.0.6-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/libunwind/1.6.2-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/libglvnd/1.6.0-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/libdrm/2.4.115-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/Pango/1.50.14-GCCcore-12.3.0/lib:/mnt/modules/easybuild/software/FriBidi/1.0.12-GCCcore-12.3.0/lib:/mnt/modules/easybuild/so

Generate a background based on N random samples. In our case we start from a pickle containing 100 one-hot-encoded sequences with both the forward and reverse strand.

The numpy array passed to `getDeepExplainerBackground` should have the shape `(2, n_samples, sequence_length, 4)`.

In [3]:
bg_path = "data/background_100samples.pkl"

# NOTE: pickle.load can execute arbitrary code; only open pickles you trust.
with open(bg_path, 'rb') as bg_file:
    background_data = pickle.load(bg_file)

# Convert to numpy, drop every axis of length 1, and cast to float64
# so the array is compatible with deepExplain.
np_bg = np.squeeze(np.array(background_data)).astype(np.float64)

background = getDeepExplainerBackground(
    background_samples=np_bg,
    shuffle=False,
    post_hoc_conjoining=True,
)

Retrieve the sequences. To speed up the SeqIO parsing, you can split larger FASTA files into chunks.

In [4]:
# Gene identifiers and raw sequences, collected in matching order.
sequence_list = []
gene_id_list = []

# Read the chunked FASTA files example.fasta_c1 and example.fasta_c2.
for chunk in range(1,3):
	fasta_path = f"data/fasta/example.fasta_c{chunk}"
	# Open the file in a `with` block so the handle is closed once the
	# chunk has been read (previously the handle was never closed).
	with open(fasta_path) as fasta_handle:
		for index, fasta in enumerate(SeqIO.parse(fasta_handle, 'fasta')):
			# Indices 0..100 are kept, i.e. at most 101 records per chunk.
			if index > 100:
				break
			gene_id_list.append(fasta.id)
			sequence_list.append(str(fasta.seq))

One-hot encode the sequences and create the reverse complement

In [5]:
# One-hot row per nucleotide, columns in A, C, G, T order.
base_rows = {
	'A': [1., 0., 0., 0.],
	'C': [0., 1., 0., 0.],
	'G': [0., 0., 1., 0.],
	'T': [0., 0., 0., 1.],
}
# Complementary nucleotide used to build the reverse strand.
complement_of = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
# Any symbol other than A/C/G/T is encoded as an all-zero row.
unknown_row = [0., 0., 0., 0.]

ohe_list = []
ohe_rc_list = []
for sequence in sequence_list:
	forward_rows = [list(base_rows.get(base, unknown_row)) for base in sequence]
	complement_rows = [
		list(base_rows.get(complement_of.get(base), unknown_row))
		for base in sequence
	]
	ohe_list.append(forward_rows)
	# Reversing the complemented rows yields the reverse complement.
	ohe_rc_list.append(complement_rows[::-1])

ohe_np = np.array(ohe_list)
ohe_rc_np = np.array(ohe_rc_list)

# Axis 0: index 0 is the forward encoding, index 1 the reverse complement.
np_deepexplain_samples = np.stack([ohe_np, ohe_rc_np], axis=0)

In [6]:
# Fixed settings for the attribution run, grouped so they can be read at a glance.
explain_kwargs = dict(
    post_hoc_conjoining=True,
    augment_list=augment_list,
    pad_background=True,
    pad_samples=False,
    evo_aug=True,
)

# Explain the stacked forward / reverse-complement samples with the siamese
# model, using the background built earlier in the notebook.
shap_values = deepExplain(
    samples=np_deepexplain_samples,
    loaded_model=siamese_model,
    bg=background,
    **explain_kwargs,
)

keras is no longer supported, please use tf.keras instead.
Your TensorFlow version is newer than 2.4.0 and so graph support has been removed in eager mode and some static graphs may not be supported. See PR #1483 for discussion.
`tf.keras.backend.set_learning_phase` is deprecated and will be removed after 2020-10-11. To update it, simply pass a True/False value to the `training` argument of the `__call__` method of your layer or model.


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In [7]:
# Show the dimensions of the attribution values returned by deepExplain.
np.shape(shap_values)

(202, 2, 620, 4)

In [8]:
# Keep index 0 along axis 1 of the attribution array.
# NOTE(review): presumably the forward strand, mirroring the (forward, reverse
# complement) stacking of np_deepexplain_samples — confirm against deepExplain.
shap_values_forward = np.array(shap_values)[:, 0, :, :]

# Forward-strand one-hot encoding (first entry of the stacked samples).
forward_one_hot = np_deepexplain_samples[0]

pos_patterns, neg_patterns = tfmodisco.TFMoDISco(
    one_hot=forward_one_hot,
    hypothetical_contribs=shap_values_forward,
    min_metacluster_size=2,
    verbose=True,
)
pos_patterns


Using 633 positive seqlets
Extracted 722 negative seqlets


[<modiscolite.core.SeqletSet at 0x7f7890db3580>,
 <modiscolite.core.SeqletSet at 0x7f7890db36d0>,
 <modiscolite.core.SeqletSet at 0x7f78911da9d0>,
 <modiscolite.core.SeqletSet at 0x7f78911dab20>,
 <modiscolite.core.SeqletSet at 0x7f789068dd00>,
 <modiscolite.core.SeqletSet at 0x7f7890d41d30>,
 <modiscolite.core.SeqletSet at 0x7f7890d41e80>,
 <modiscolite.core.SeqletSet at 0x7f789068d760>]

In [9]:
neg_patterns

[<modiscolite.core.SeqletSet at 0x7f7890e864c0>,
 <modiscolite.core.SeqletSet at 0x7f7890db3310>,
 <modiscolite.core.SeqletSet at 0x7f7890db3ee0>,
 <modiscolite.core.SeqletSet at 0x7f7890db31f0>,
 <modiscolite.core.SeqletSet at 0x7f7890db3370>,
 <modiscolite.core.SeqletSet at 0x7f7890db3040>,
 <modiscolite.core.SeqletSet at 0x7f7890db3970>]

In [10]:
# Directory that receives the TF-MoDISco results and the report.
output_dir = "report/"
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(output_dir, exist_ok=True)

In [11]:
# Destination HDF5 file for the positive and negative TF-MoDISco patterns.
h5_result_path = "report/modisco_output.h5"

# NOTE(review): the trailing positional 4 is presumably save_hdf5's window-size
# argument — confirm against the installed modiscolite version.
modiscolite.io.save_hdf5(h5_result_path, pos_patterns, neg_patterns, 4)

Make sure MEME is available on the system

In [15]:
# Re-declare the output directory so this cell can be run on its own.
output_dir = "report/"
# exist_ok=True makes the cell safe to re-run and avoids the race between
# checking for the directory and creating it.
os.makedirs(output_dir, exist_ok=True)

# Build the motif report from the saved HDF5 results and compare the motifs
# against the MEME-format motif database given below.
report.report_motifs(modisco_h5py = h5_result_path,
                    output_dir = output_dir,
                    img_path_suffix = '../' + output_dir,
                    meme_motif_db = "data/jaspar_cisbp_other_source_no_high_similarity_tfbs_annot.meme",
                    is_writing_tomtom_matrix = True
                    )

In [16]:
# Import from IPython.display: importing display from IPython.core.display
# is deprecated since IPython 7.14 and emits a DeprecationWarning.
from IPython.display import HTML, display

# Render the generated HTML report inline in the notebook.
display(HTML(filename='report/motifs.html'))

Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display


pattern,num_seqlets,modisco_cwm_fwd,modisco_cwm_rev,match0,qval0,match0_logo,match1,qval1,match1_logo,match2,qval2,match2_logo
pos_patterns.pattern_0,57,,,MA0552.2_bHLH,0.014887,,MA0965.3_bHLH,0.014887,,MA0966.2_bHLH,0.014887,
pos_patterns.pattern_1,51,,,MX0089_bZIP,0.809561,,M01782_bZIP,0.809561,,MX0051_Unknown,0.809561,
pos_patterns.pattern_2,50,,,MX0446_AHL,0.002918,,M07354_ZF-HD,0.002945,,M08606_Sox,0.003798,
pos_patterns.pattern_3,46,,,M06704_B3,0.001285,,M06861_Dof,0.001297,,M06868_Dof,0.002841,
pos_patterns.pattern_4,44,,,M06704_B3,0.000653,,M06861_Dof,0.000743,,M06868_Dof,0.000778,
pos_patterns.pattern_5,35,,,MX0162_bHLH,0.199247,,MX0188_RAV,0.981446,,M01755_bHLH,0.981446,
pos_patterns.pattern_6,27,,,MX0162_bHLH,1.0,,M07078_MYB,1.0,,,,
pos_patterns.pattern_7,27,,,M06861_Dof,0.108782,,M06868_Dof,0.108782,,M06704_B3,0.108782,
neg_patterns.pattern_0,105,,,M01675_AT_hook,1.0,,,,,,,
neg_patterns.pattern_1,56,,,M07157_MYB_related,0.366589,,M06805_C2H2,0.479799,,MA2378.1_ERF,0.479799,
