In [1]:
from __future__ import print_function
from optparse import OptionParser
import json
import os
import pdb
import sys

import h5py
import numpy as np
import pandas as pd
from scipy.stats import pearsonr, spearmanr
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import explained_variance_score
from sklearn.preprocessing import scale
import tensorflow as tf

from basenji.basenji import dataset #must change from basenji import dataset to from basenji.basenji import dataset
from basenji.basenji import seqnn #must change import statements in each of these files too
from basenji.basenji import trainer
from qnorm import quantile_normalize #must pip install qnorm

# For one hot encoding
import pysam
from basenji.dna_io import dna_1hot
from tqdm import tqdm

2023-11-21 10:31:06.671269: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-11-21 10:31:06.718560: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [28]:
# Input / output locations
params_file = 'params_human.json'
model_file = 'model_human.h5'
targets_file = 'targets_human.txt'
hg38_genome_path = '/home/sbrener/repos/hg38.fa'
sequences_bed_path = '/home/sbrener/repos/sequences.bed'
dataset_filepath = '/media/longterm_hdd/sbrener/basenji_embeddings/embeddings.h5'

# Open the reference genome FASTA; sequences are fetched from it by
# (chrom, start, end) when the BED records are processed.
hg38_fasta = pysam.FastaFile(hg38_genome_path)

# Number of records in the BED file. This sizes the output dataset and the
# progress bar, so it must equal the number of lines the processing loop
# iterates over. It is counted from the file (it was previously hard-coded as
# 38171) so that a different BED file cannot overflow the dataset or leave
# trailing all-zero rows.
with open(sequences_bed_path, 'r') as bed_count_file:
    seqbedlen = sum(1 for _ in bed_count_file)

# Load params: the JSON is expected to hold a 'model' and a 'train' section.
with open(params_file) as params_open:
    params = json.load(params_open)
params_model = params['model']
params_train = params['train']

# Load model: build the network from the model params and restore its weights.
seqnn_model = seqnn.SeqNN(params_model)
seqnn_model.restore(model_file)
# This gets embeddings from one layer before the final layer.
# To get the final layer embeddings, don't run this line.
seqnn_model.build_embed(-1)

# For every BED record: fetch its genomic sequence, one-hot encode it, run it
# through the model and store the resulting embedding as one row of the
# 'embeddings' HDF5 dataset (row i <-> line i of the BED file).

# Per-sequence embedding shape as written to the dataset. Hard-coded;
# presumably (positions, channels) of the embedding layer -- TODO confirm
# against the model summary.
embed_shape = (896, 1536)

with open(sequences_bed_path, 'r') as bed_file, h5py.File(dataset_filepath, 'w') as embed_file:
    # One chunk per sequence, so each loop iteration writes exactly one chunk.
    embed_dset = embed_file.create_dataset(
        'embeddings',
        shape=(seqbedlen,) + embed_shape,
        chunks=(1,) + embed_shape,
        compression='gzip',
        compression_opts=9,
        dtype='float16',
    )

    progress_bar = tqdm(enumerate(bed_file), total=seqbedlen)
    for index, bed_line in progress_bar:
        # BED columns used here: chrom, start, end, and a 4th column that is
        # shown in the progress bar as the sequence type.
        location_fields = bed_line.split('\t')
        chrom, start, end = (
            str(location_fields[0]),
            int(location_fields[1]),
            int(location_fields[2]),
        )
        seq_type = location_fields[3].strip()
        progress_bar.set_description(f"Processing {seq_type} sequences")  # Update the progress bar description

        # Sequence -> one-hot -> model input with a leading batch axis of 1.
        seq = hg38_fasta.fetch(chrom, start, end)
        onehot_seq = dna_1hot(seq)
        model_input = onehot_seq[None, :, :]
        embeddings = np.array(seqnn_model.predict(model_input, verbose=0))

        # The dataset dtype is float16, so values are stored as float16.
        embed_dset[index, :, :] = embeddings

        

Model: "model_15"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 sequence (InputLayer)       [(None, 131072, 4)]          0         []                            
                                                                                                  
 tf.nn.gelu_217 (TFOpLambda  (None, 131072, 4)            0         ['sequence[0][0]']            
 )                                                                                                
                                                                                                  
 conv1d_210 (Conv1D)         (None, 131072, 288)          17280     ['tf.nn.gelu_217[0][0]']      
                                                                                                  
 batch_normalization_210 (B  (None, 131072, 288)          1152      ['conv1d_210[0][0]']   

Processing test sequences: 100%|██████████| 38171/38171 [6:34:08<00:00,  1.61it/s]   
