# Imports and Setup

>* import required modules 
>* set up the environment variables required for logging and the GPU

In [1]:
import os
import logging
# Set the root logger's level to DEBUG.
logger = logging.getLogger()
logger.setLevel('DEBUG')
# config for server
# Order CUDA devices by PCI bus id and expose only the device with id '2'.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '2'
# NOTE(review): machine-specific absolute path prepended to PATH
# (presumably the CUDA v9.0 binaries -- confirm before running elsewhere).
os.environ['PATH'] = r'C:\Users\Rudolf\Documents\v9.0\bin' + os.path.pathsep + os.environ['PATH'] 
import os
# NOTE(review): machine-specific Graphviz install directory appended to PATH.
os.environ["PATH"] += os.pathsep + 'C:/Program Files (x86)/Graphviz2.38/bin/'

In [2]:
import tensorflow as tf
from keras import backend as K
# Device counts handed to the TensorFlow session configuration below.
num_CPU = 8
num_GPU = 1
# NOTE(review): tf.ConfigProto / tf.Session are TensorFlow 1.x APIs.
config = tf.ConfigProto( device_count = {'CPU' : num_CPU, 'GPU' : num_GPU})
session = tf.Session(config=config)
# Register the explicitly configured session with the Keras backend.
K.set_session(session)

Using TensorFlow backend.


In [3]:
import pickle
import shutil
import numpy as np
import gc
import h5py

# Utility Functions

In [4]:
class DebugObject(Exception):
    """Exception that carries an arbitrary object next to its message.

    Attributes:
        content: the object passed as ``obj``.
        message: the message, also forwarded to ``Exception``.
    """

    def __init__(self, obj, message):
        super().__init__(message)
        self.message = message
        self.content = obj

In [5]:
import time
import datetime
def func_metrics_display(funk):
    """Decorator that logs how long each call to ``funk`` took.

    The wrapped function's return value is passed through unchanged. After
    every successful call one INFO record is emitted via the ``logging``
    module:

    * calls longer than one second are reported as
      ``[HH hours ][MM minutes ]SS seconds`` (whole seconds);
    * shorter calls are reported in seconds with four decimals.

    Args:
        funk: the callable to time.

    Returns:
        A wrapper with the same call signature as ``funk``. The wrapper keeps
        ``funk``'s ``__name__``/``__doc__`` (via ``functools.wraps``).
    """
    import functools

    # Callables such as functools.partial have no __name__; fall back to repr
    # so that decorating them does not fail.
    funk_name = getattr(funk, '__name__', repr(funk))

    @functools.wraps(funk)
    def metricized(*args, **kwargs):
        start = time.time()
        obj = funk(*args, **kwargs)
        dt = time.time() - start
        if dt > 1:
            # Split by hand so the parts are separated by spaces and the hour
            # count does not wrap around after 24 hours.
            hours, remainder = divmod(int(dt), 3600)
            minutes, seconds = divmod(remainder, 60)
            parts = []
            if dt >= 3600:
                parts.append('{:02d} hours'.format(hours))
            if dt >= 60:
                parts.append('{:02d} minutes'.format(minutes))
            parts.append('{:02d} seconds'.format(seconds))
            logging.info('The function {} took {}'.format(funk_name, ' '.join(parts)))
        else:
            logging.info('The function {} took {:>10.4f}s'.format(funk_name, dt))
        return obj

    return metricized

In [6]:
@func_metrics_display
def use_bw(bw_file):
    """Open a bigWig file and return a reader function bound to it.

    Args:
        bw_file: path handed to ``pyBigWig.open``.

    Returns:
        ``get_values(chrom, start, stop)``, which returns the values reported
        by the bigWig handle for that range as a numpy array, with NaN
        entries replaced by 0. The handle stays open for as long as the
        returned function is alive.
    """
    import numpy as np
    import pyBigWig

    handle = pyBigWig.open(bw_file)

    def get_values(chrom, start, stop):
        values = np.array(handle.values(chrom, start, stop))
        nan_mask = np.isnan(values)
        values[nan_mask] = 0
        return values

    return get_values

In [7]:
def get_bigwig_celline_file(cell_line):
    """Find the '1x.bw' file in the current directory for ``cell_line``.

    A file matches when its name ends with ``'1x.bw'`` and the part before
    its first ``'.'`` equals ``cell_line`` ignoring case.

    Returns:
        The first matching file name, in ``os.listdir()`` order.

    Raises:
        IndexError: if no file matches.
    """
    matches = []
    for entry in os.listdir():
        if not entry.endswith('1x.bw'):
            continue
        stem = entry[:entry.index('.')]
        if stem.lower() == cell_line.lower():
            matches.append(entry)
    return matches[0]

In [8]:
import scipy.sparse
from scipy.sparse import csc_matrix
import tqdm
@func_metrics_display
def create_bwchrcsc(bw_file, base_filename='duke_unique'):
    """Save one sparse-matrix ``.npz`` file per chromosome of a bigWig file.

    For every chromosome reported by the bigWig file, the full value track
    (position 0 up to the chromosome length) is converted to a
    ``csc_matrix`` and written to ``'<chrom>_<base_filename>.npz'`` in the
    current directory. Chromosomes whose file already exists are skipped.

    Args:
        bw_file: path of the bigWig file to read.
        base_filename: suffix used to build the per-chromosome file names.
    """
    # pyBigWig is only ever imported inside function bodies in this file, so
    # it has to be imported here as well to be in scope.
    import pyBigWig

    bigwig = pyBigWig.open(bw_file)
    try:
        chroms = bigwig.chroms()
        ext = '.npz'
        for chrom in tqdm.tqdm(chroms):
            logging.info('Creating sparse matrix for {}'.format(chrom))
            filename = '{}_'.format(chrom) + base_filename + ext
            if not all_paths_exists(filename):
                csc = csc_matrix(bigwig.values(chrom, 0, chroms[chrom]))
                logging.info('Saving to {}'.format(filename))
                scipy.sparse.save_npz(filename, csc)
            else:
                logging.info('Skipping {}; already exists'.format(chrom))
    finally:
        # Release the bigWig handle even when a chromosome fails to convert.
        bigwig.close()

In [9]:
@func_metrics_display
def gen_chrcsc(chromosome, base_filename):
    """Load the sparse matrix stored in '<chromosome>_<base_filename>.npz'."""
    filename = '{}_'.format(chromosome) + base_filename + '.npz'
    return scipy.sparse.load_npz(filename)

@func_metrics_display
def gen_chr2csc(bw_filename, base_filename):
    """Load every per-chromosome sparse matrix, creating the files if needed.

    Files named ``'<chrom>_<base_filename>.npz'`` are looked up in the current
    directory (case-insensitively). If none exist they are first generated
    from ``bw_filename`` with ``create_bwchrcsc``.

    Args:
        bw_filename: bigWig file used when the ``.npz`` files must be created.
        base_filename: suffix shared by all per-chromosome files.

    Returns:
        dict mapping chromosome name to the matrix loaded by ``gen_chrcsc``.
    """
    import os
    import re

    # base_filename is escaped and the whole name must match, so unrelated
    # files are not picked up. The chromosome is taken from the capture group
    # rather than split('_')[0], which truncated names that contain
    # underscores (e.g. 'chr1_gl000191_random' became 'chr1').
    pattern = re.compile(r'(?P<chrom>\w+)_%s\.npz' % re.escape(base_filename),
                         re.IGNORECASE)

    def cached_chromosomes():
        """Chromosome names for which a matching .npz file currently exists."""
        found = []
        for file in os.listdir('.'):
            match = pattern.fullmatch(file)
            if match:
                found.append(match.group('chrom'))
        return found

    if not cached_chromosomes():
        create_bwchrcsc(bw_filename, base_filename)

    chr2csc = {}
    for chrom in cached_chromosomes():
        logging.info('Loading sparse for {}'.format(chrom))
        chr2csc[chrom] = gen_chrcsc(chrom, base_filename)
    return chr2csc

In [10]:
import h5py
@func_metrics_display
def create_h5py4bw(bw_filename, base_filename):
    """Convert a bigWig file into an HDF5 file with one dataset per chromosome.

    The output file is ``base_filename + '.hdf5'``. If it already exists
    nothing is written. Otherwise each chromosome's full value track is
    stored as an lzf-compressed dataset named after the chromosome.

    Args:
        bw_filename: path of the bigWig file to read.
        base_filename: output path without the ``.hdf5`` extension.

    Returns:
        The HDF5 file name.
    """
    # pyBigWig is only ever imported inside function bodies in this file, so
    # it has to be imported here as well to be in scope.
    import pyBigWig

    ext = '.hdf5'
    filename = base_filename + ext

    # The existence check must happen BEFORE the file is opened for writing:
    # opening with "w" creates (and truncates) the file.
    if all_paths_exists(filename):
        logging.info('Skipping {}; already exists'.format(filename))
        return filename

    bigwig = pyBigWig.open(bw_filename)
    try:
        chroms = bigwig.chroms()
        try:
            with h5py.File(filename, "w") as h5:
                for chrom in tqdm.tqdm(chroms):
                    logging.info('Creating h5py entry for {}'.format(chrom))
                    data = bigwig.values(chrom, 0, chroms[chrom])
                    h5.create_dataset(chrom, data=data, compression='lzf', chunks=(400,))
        except BaseException:
            # Do not leave a partial file behind: a later call would treat it
            # as complete and skip the conversion.
            if os.path.exists(filename):
                os.remove(filename)
            raise
    finally:
        bigwig.close()
    return filename

def gen_h5py4bw(bw_filename, base_filename):
    """Open ``base_filename + '.hdf5'`` read-only, creating it first if absent.

    When the file does not exist it is generated from ``bw_filename`` via
    ``create_h5py4bw``. The caller owns the returned ``h5py.File`` handle.
    """
    filename = base_filename + '.hdf5'
    already_there = os.path.exists(filename)
    if not already_there:
        create_h5py4bw(bw_filename, base_filename)
    handle = h5py.File(filename, 'r')
    return handle

In [11]:
import mmap
def get_num_lines(file_path):
    """Count the lines of a file using a read-only memory map.

    Args:
        file_path: path of the file to inspect.

    Returns:
        Number of lines (a final line without a trailing newline counts);
        0 for an empty file.
    """
    # Binary read-only access: counting lines needs no write permission.
    with open(file_path, 'rb') as fp:
        # mmap refuses to map an empty file, so handle that case up front.
        if os.fstat(fp.fileno()).st_size == 0:
            return 0
        buf = mmap.mmap(fp.fileno(), 0, access=mmap.ACCESS_READ)
        try:
            lines = 0
            while buf.readline():
                lines += 1
            return lines
        finally:
            buf.close()

# Helper Functions

In [12]:
class InsufficientChromosomesException(Exception):
    """Raised when fewer chromosomes are available than required.

    Attributes:
        chrom_avail: value passed as the available chromosome count.
        chrom_req: value passed as the required chromosome count.
    """

    def __init__(self, chrom_avail, chrom_req, msg=None):
        # A falsy msg (None or '') falls back to the default message.
        text = msg if msg else 'Found {} chromosomes. Needed more than {} chromosomes.'.format(chrom_avail, chrom_req)
        super().__init__(text)
        self.chrom_req = chrom_req
        self.chrom_avail = chrom_avail

In [13]:
def is_in(a, domain):
    """Return whether ``a`` lies in the closed interval ``[domain[0], domain[1]]``."""
    at_or_after_start = domain[0] <= a
    return at_or_after_start and a <= domain[1]

def is_in_domain(peak, domain):
    '''
    Report whether the closed interval peak (start, end) overlaps the closed
    interval domain (start, end): true when an endpoint of either interval
    falls inside the other.
    '''
    p_start, p_end = peak
    d_start, d_end = domain
    if is_in(p_start, domain):
        return True
    if is_in(p_end, domain):
        return True
    if is_in(d_start, peak):
        return True
    return is_in(d_end, peak)

def any_in_domain(peaks, domain):
    """Return True if at least one peak in ``peaks`` overlaps ``domain``."""
    return any(is_in_domain(candidate, domain) for candidate in peaks)

def which_in_domain(peaks,domain):
    """Return the first peak in ``peaks`` that overlaps ``domain``, else None."""
    overlapping = (candidate for candidate in peaks if is_in_domain(candidate, domain))
    return next(overlapping, None)

def domain2seq(chromosomes, chromosome, domain):
    """Slice ``chromosomes[chromosome]`` from ``domain[0]`` up to ``domain[1]`` (exclusive)."""
    sequence = chromosomes[chromosome]
    window = slice(domain[0], domain[1])
    return sequence[window]

def where_in_domain(chromosomes, chromosome, peak, domain):
    """Label every position of ``domain`` relative to ``peak`` (O/B/I/E).

    For each position ``domain_start <= pos < domain_end``:

    * ``'B'`` if ``pos == peak_start``
    * ``'I'`` if ``peak_start < pos < peak_end``
    * ``'E'`` if ``pos == peak_end``
    * ``'O'`` otherwise

    Args:
        chromosomes: unused; kept so existing callers keep working. The
            previous implementation sliced the chromosome sequence here but
            never used the result.
        chromosome: chromosome name, echoed back in the result.
        peak: ``(peak_start, peak_end)``.
        domain: ``(domain_start, domain_end)``.

    Returns:
        ``(labels, (chromosome, domain))`` where ``labels`` is a string with
        one character per domain position.
    """
    peak_start, peak_end = peak
    domain_start, domain_end = domain

    def label_for(position):
        # 'B' is checked first so a zero-length peak is labelled 'B', not 'E'.
        if position == peak_start:
            return 'B'
        if peak_start < position < peak_end:
            return 'I'
        if position == peak_end:
            return 'E'
        return 'O'

    labels = ''.join(label_for(position) for position in range(domain_start, domain_end))
    return labels, (chromosome, domain)

In [14]:
@func_metrics_display
def if_not_pickled(pkl_file, generator,log=True,gen_name=None):
    """Return the object cached in ``pkl_file``, building it when necessary.

    If ``pkl_file`` does not exist, or cannot be unpickled (``EOFError`` /
    ``pickle.UnpicklingError``), ``generator()`` is called and its result is
    pickled to ``pkl_file``.

    Args:
        pkl_file: path of the pickle cache.
        generator: zero-argument callable producing the object.
        log: when true, progress is logged at INFO level.
        gen_name: display name for log messages; defaults to
            ``generator.__name__`` or, failing that, ``generator.func.__name__``.

    Returns:
        The loaded or freshly generated object.
    """
    if gen_name:
        name = gen_name
    else:
        try:
            name = generator.__name__
        except AttributeError:
            name = generator.func.__name__

    def say(template, subject):
        # INFO messages are optional; the corruption error below is not.
        if log:
            logging.info(template.format(subject))

    def build_and_store():
        fresh = generator()
        with open(pkl_file, 'wb') as sink:
            pickle.dump(fresh, sink)
        return fresh

    if os.path.exists(pkl_file):
        say('Loading pickled {}', name)
        try:
            with open(pkl_file, 'rb') as source:
                obj = pickle.load(source)
        except (EOFError, pickle.UnpicklingError):
            logging.error('A pickle file was corrupted: {}'.format(pkl_file))
            say('Regenerating corrupted pickle: {}', pkl_file)
            obj = build_and_store()
    else:
        say('No pickle for {} is found. Generating anew.', name)
        obj = build_and_store()

    say('Finished setting up {}', name)
    return obj

In [15]:
@func_metrics_display
def gen_chr2locNbound(label_file, _cellline, log=True):
    """Collect per-chromosome (location, bound-state) pairs for one cell line.

    The label file is whitespace separated. Its header row is
    ``chr start stop <cell line 1> ... <cell line n>`` and every following
    row carries one bound-state value per cell line column.

    Args:
        label_file: path of the label file.
        _cellline: cell line column to extract (compared case-insensitively).
        log: when true, log each time a new chromosome is reached.

    Returns:
        dict mapping chromosome -> list of ``((start, stop), is_bound)``,
        where ``is_bound`` is the raw string from the selected column.
        An empty file yields an empty dict.
    """
    from tqdm import tqdm
    chr2locNbound = {}
    with open(label_file) as labels:
        header = next(labels, None)
        if header is None:
            return chr2locNbound
        # columns names are chr start stop <cell line 1> ... <cell line n>
        _, _, _, *celllines = header.strip().split()
        wanted = _cellline.lower()
        # The header line is already consumed, so it is not part of the total.
        n_data_lines = max(get_num_lines(label_file) - 1, 0)
        prev_chrom = None
        for line in tqdm(labels, total=n_data_lines):
            try:
                # Unpacking is inside the try so blank or truncated lines are
                # skipped like any other ill-formatted entry.
                chromosome, start, stop, *bound_per_cellline = line.split()
                start, stop = int(start), int(stop)
            except ValueError:
                # ill formatted entry
                continue
            if log and prev_chrom != chromosome:
                prev_chrom = chromosome
                logging.info('Working on {}\n'.format(chromosome))
            for cellline, is_bound in zip(celllines, bound_per_cellline):
                if cellline.lower() != wanted:
                    continue
                chr2locNbound.setdefault(chromosome, []).append(((start, stop), is_bound))
    return chr2locNbound

In [16]:
@func_metrics_display
def gen_hg19(hg_genome_fasta):
    """Read a FASTA file into a dict of upper-cased sequences.

    Lines starting with ``'>'`` open a new record whose key is the rest of
    that line, stripped. All other lines are stripped, upper-cased and
    appended to the current record.

    Returns:
        dict mapping record name -> concatenated sequence string.
    """
    pieces = {}
    current = None
    with open(hg_genome_fasta) as fasta:
        for raw in fasta:
            if not raw.startswith('>'):
                pieces[current].append(raw.strip().upper())
                continue
            current = raw[1:].strip()
            pieces[current] = []

    return {name: ''.join(chunks) for name, chunks in pieces.items()}

In [17]:
@func_metrics_display
def gen_chr2filter_locs(filter_file):
    """Read ``chromosome start end`` rows into a per-chromosome interval list.

    Every line of ``filter_file`` must split into exactly three
    whitespace-separated fields; start and end are converted to int.

    Returns:
        dict mapping chromosome -> list of ``(start, end)`` in file order.
    """
    chr2filter_locs = {}
    with open(filter_file) as filter_f:
        for row in filter_f:
            chromosome, start, end = row.strip().split()
            span = (int(start), int(end))
            chr2filter_locs.setdefault(chromosome, []).append(span)
    return chr2filter_locs

In [18]:
@func_metrics_display
def gen_chr2locNpeaks(celllineNtf_peakfile, filter_file=None, chr2filter_locs=None):
    """Parse a ten-column peak file into per-chromosome, start-sorted peaks.

    Each line must split into ``chromosome start stop name score strand
    signal p q peak``. When ``filter_file`` is given, only peaks overlapping
    at least one interval of ``chr2filter_locs[chromosome]`` are kept.

    Args:
        celllineNtf_peakfile: path of the peak file.
        filter_file: truthy to enable filtering (the file itself is not read
            here; the intervals come from ``chr2filter_locs``).
        chr2filter_locs: dict chromosome -> list of ``(start, end)``.

    Returns:
        dict chromosome -> list of ``(start, stop, info)`` sorted by start,
        where ``info`` holds name, score, strand, p-value, q-value and peak.
        Every chromosome seen in the file has an entry, possibly empty.

    Raises:
        ValueError: if ``filter_file`` is given without ``chr2filter_locs``.
    """
    from tqdm import tqdm

    # Validate once, up front, rather than on every line.
    if filter_file and not chr2filter_locs:
        raise ValueError('Need chr2filter_locs if using filter_file')

    chr2locNpeaks = {}
    with open(celllineNtf_peakfile) as peaks:
        for line in tqdm(peaks, total=get_num_lines(celllineNtf_peakfile)):
            chromosome, start, stop, name, score, strand, signal, p, q, peak = line.strip().split()
            start, stop = int(start), int(stop)
            kept = chr2locNpeaks.setdefault(chromosome, [])

            if filter_file:
                # A chromosome without filter intervals has nothing to
                # overlap with, so its peaks are dropped instead of raising.
                filter_locs = chr2filter_locs.get(chromosome, ())
                # is_in_domain also catches a peak that fully encloses a
                # filter interval, which an endpoint-only test would miss.
                if not any(is_in_domain((start, stop), loc) for loc in filter_locs):
                    continue

            kept.append((start,
                         stop,
                         {'name': name,
                          'score': int(score),
                          'strand': strand,
                          'p-value': float(p),
                          'q-value': float(q),
                          'peak': int(peak)}))

    for chromosome, lst in chr2locNpeaks.items():
        chr2locNpeaks[chromosome] = sorted(lst, key=lambda x: x[0])

    return chr2locNpeaks

In [19]:
def seq2sequence(seq, chromosomes):
    """Resolve a ``(chromosome, domain)`` pair to its sequence via ``domain2seq``."""
    chromosome, domain = seq[0], seq[1]
    return domain2seq(chromosomes, chromosome, domain)

@func_metrics_display
def gen_chr2labelsNseq(chromosomes, chr2locNbound, chr2locNpeaks):
    """Build per-chromosome ``(labels, (chromosome, location))`` samples.

    Only chromosomes present in both ``chr2locNbound`` and ``chr2locNpeaks``
    are processed. For each of them:

    * every location marked ``'B'`` is paired with the first overlapping peak
      and labelled position by position with ``where_in_domain``; locations
      without an overlapping peak are logged and skipped;
    * every location marked ``'U'`` is added with ``None`` as its labels.

    Args:
        chromosomes: passed through to ``where_in_domain``.
        chr2locNbound: dict chromosome -> list of ``((start, stop), state)``.
        chr2locNpeaks: dict chromosome -> list whose items start with
            ``(start, stop, ...)``.

    Returns:
        dict chromosome -> list of ``(labels_or_None, (chromosome, location))``.

    Raises:
        ValueError: if a bound location's labels contain none of B, I, E.
    """
    from tqdm import tqdm
    chr2labelsNseq = {}
    shared_chromosomes = set(chr2locNbound.keys()) & set(chr2locNpeaks.keys())
    logging.info('Generating labels and seq for this set of chromosomes {}'.format(str(shared_chromosomes)))
    for chromosome in tqdm(shared_chromosomes):
        logging.info('Generating labels and seq for {}'.format(chromosome))
        bound_locs = [entry[0] for entry in chr2locNbound[chromosome] if entry[1] == 'B']
        peaks_locs = [entry[:2] for entry in chr2locNpeaks[chromosome]]

        count = 0
        for bound_loc in bound_locs:
            # generate peak information
            the_peak = which_in_domain(peaks_locs, bound_loc)
            if not the_peak:
                logging.warning('Cannot find peak {}'.format(bound_loc))
                continue
            labels, seq = where_in_domain(chromosomes,
                                          chromosome,
                                          the_peak,
                                          bound_loc)
            if not ('B' in labels or 'I' in labels or 'E' in labels):
                raise ValueError('BIE not found in labels\nsequence:{}\nlabels{}'.format(seq,labels))
            chr2labelsNseq.setdefault(chromosome, []).append((labels, seq))
            count += 1
        logging.info('Found {}/{} seqs for bound locations'.format(count, len(bound_locs)))

        unbound_locs = [entry[0] for entry in chr2locNbound[chromosome] if entry[1] == 'U']
        count = 0
        for unbound_loc in unbound_locs:
            seq = (chromosome, unbound_loc)
            chr2labelsNseq.setdefault(chromosome, []).append((None, seq))
            count += 1
        # This summary covers the unbound locations, not the bound ones.
        logging.info('Found {}/{} seqs for unbound locations'.format(count, len(unbound_locs)))
    return chr2labelsNseq

In [20]:
def read_structure_file(filename):
    """Read a '>'-delimited file into one comma-joined string per record.

    A line starting with ``'>'`` opens a new record; the following lines
    (stripped) up to the next ``'>'`` line are joined with ``','``. Lines
    before the first ``'>'`` line are discarded.

    Returns:
        list with one string per ``'>'`` line, in file order.
    """
    groups = []
    with open(filename, 'r') as handle:
        for raw in handle:
            if raw.startswith('>'):
                groups.append([])
            elif groups:
                groups[-1].append(raw.strip())
    return [','.join(group) for group in groups]

In [21]:
def map_paths_exist(*paths):
    """Return a list with ``os.path.exists(path)`` for each given path, in order."""
    import os
    return list(map(os.path.exists, paths))

def all_paths_exists(*paths):
    """Return True when every given path exists.

    Called without arguments this returns True (there is no missing path);
    the previous ``reduce``-based version raised ``TypeError`` in that case.
    """
    import os
    return all(os.path.exists(path) for path in paths)

In [22]:
def rmem_manage(rfunc):
    """Decorator that cleans up around a call that uses R through rpy2.

    Before the call Python's garbage collector is run. After the call, also
    when it raises, every object in the R workspace is removed
    (``rm(list = ls(all.names=TRUE))``) and the collector is run again.

    Args:
        rfunc: the callable to wrap.

    Returns:
        A wrapper with the same call signature and metadata as ``rfunc``.
    """
    import functools
    import gc
    from rpy2 import robjects

    @functools.wraps(rfunc)
    def rmem_func(*args, **kwargs):
        gc.collect()
        try:
            return rfunc(*args, **kwargs)
        finally:
            # Runs on failure too, so an exception does not leave the R
            # workspace populated.
            robjects.r('rm(list = ls(all.names=TRUE))')
            gc.collect()

    return rmem_func

# PickleManager
A module for list-like writing of objects to a pickle file.

In [23]:
import pickle
class PickleManager:
    """List-like, append-only store backed by a file of consecutive pickles.

    Objects are written one after another with ``pickle.dump`` and read back
    in order by iterating over the manager. It can also be used as a context
    manager, which closes the write handle on exit.

    Args:
        pkl_file_path: path of the backing file.
        overwrite: when true, the file is truncated the first time this
            manager opens it for writing; later reopenings append, so items
            written earlier by the same manager are not lost.

    NOTE: reading uses ``pickle.load``; only use files from trusted sources.
    """

    def __init__(self, pkl_file_path, overwrite=False):
        self.file_path = pkl_file_path
        self.file_writer_ptr = None
        # Cached item count; None means the file has not been counted yet.
        self._length = None
        self.overwrite = overwrite
        # Set once the overwrite-mode truncation has happened.
        self._truncated = False

    def __enter__(self):
        # Open the write handle eagerly and hand back the manager itself so
        # that "with PickleManager(...) as pm" binds a usable object.
        _ = self.file_dumper
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def close(self):
        """Close the write handle if it is open."""
        if self.file_writer_ptr and not self.file_writer_ptr.closed:
            self.file_writer_ptr.close()

    def __len__(self):
        if self._length is None:
            self._length = sum(1 for _ in self)
        return self._length

    @property
    def file_dumper(self):
        """Write handle for the backing file, (re)opened on demand."""
        if not self.file_writer_ptr or self.file_writer_ptr.closed:
            truncate = self.overwrite and not self._truncated
            self.file_writer_ptr = open(self.file_path, 'wb' if truncate else 'ab')
            if truncate:
                self._truncated = True
                # The file was just emptied, so the count is known.
                self._length = 0
        return self.file_writer_ptr

    def dump(self, obj):
        """Append ``obj`` to the backing file."""
        pickle.dump(obj, self.file_dumper)
        if self._length is not None:
            self._length += 1

    def __iter__(self):
        # Make buffered writes visible to the reader below.
        if self.file_writer_ptr and not self.file_writer_ptr.closed:
            self.file_writer_ptr.flush()
        if not os.path.exists(self.file_path):
            return
        with open(self.file_path, 'rb') as pkl_file:
            while True:
                try:
                    item = pickle.load(pkl_file)
                except EOFError:
                    return
                yield item

    def __iadd__(self, other):
        self.extend(other)
        # In-place operators must return the object, otherwise "pm += x"
        # rebinds pm to None.
        return self

    def extend(self, iterable):
        """Append every element of ``iterable``."""
        for thing in iterable:
            self.append(thing)

    def append(self, element):
        """Append a single element (alias of ``dump``)."""
        self.dump(element)

# DataManager module
A module for collecting I/O information and preparing datasets.

In [24]:
import keras
import numpy as np

class FeaturesSequence(keras.utils.Sequence):
    """Keras ``Sequence`` that builds feature batches on demand.

    Each sample's features are built position by position from the one-hot
    encoded base (A/C/G/T), the value returned by ``dnase`` and the value
    returned by ``duke_unique`` for that position, so ``feature_dimensions``
    has to match that width (4 + 1 + 1).

    Args:
        x_sequence: indexable of base strings, one per sample.
        x_meta: indexable of ``(chromosome, (start, stop))``, one per sample.
        y_set: indexable of targets; if the first target is a string, targets
            are treated as O/B/I/E label strings and one-hot encoded.
        batch_size: number of samples per batch.
        sample_length: number of positions encoded per sample.
        feature_dimensions: width of each per-position feature vector.
        duke_unique: callable ``(chromosome, start, stop) -> values``.
        dnase: callable ``(chromosome, start, stop) -> values``.
    """

    def __init__(self, x_sequence, x_meta, y_set, 
                         batch_size, sample_length, feature_dimensions, 
                         duke_unique, dnase):

        assert len(x_sequence) == len(x_meta) == len(y_set)
        self.x_sequence, self.x_meta, self.y = x_sequence, x_meta, y_set
        self.y_are_labels = isinstance(y_set[0], str)
        if self.y_are_labels:
            # Only announce OBIE encoding when it is actually used.
            logging.info('Found y are labels; using OBIE')
            self.label2onehot = {'O':np.array([1,0,0,0]),'B':np.array([0,1,0,0]),'I':np.array([0.,0.,1.,0.]),'E':np.array([0.,0.,0.,1.]),}
        self.sample_length=sample_length
        self.n_samples = len(self.x_meta)
        self.feature_dimensions=feature_dimensions
        self.batch_size = batch_size
        self.onehot = {'A':np.array([1,0,0,0]),'C':np.array([0,1,0,0]),'G':np.array([0.,0.,1.,0.]),'T':np.array([0.,0.,0.,1.]),}
        self.duke_unique = duke_unique
        self.dnase = dnase
            
    def __len__(self):
        return int(np.ceil(self.n_samples / float(self.batch_size))) 

    def _encode_labels(self, labels):
        """One-hot encode a label string into an array of shape (len(labels), 4)."""
        labels_onehot = np.empty((len(labels), 4))
        for j, label in enumerate(labels):
            labels_onehot[j, :] = self.label2onehot[label]
        return labels_onehot

    def _build_features(self, seq, meta):
        """Build the (sample_length, feature_dimensions) array for one sample."""
        chromosome, (start, stop) = meta
        # this part is potentially very slow #
        uniqueness = np.array(self.duke_unique(chromosome, start, stop))
        openness = np.array(self.dnase(chromosome, start, stop))
        uniqueness[np.isnan(uniqueness)] = 0
        openness[np.isnan(openness)] = 0
        # # # # # # # # # # # # # # # # # # #
        features = np.empty((self.sample_length, self.feature_dimensions))
        for j in range(self.sample_length):
            features[j,:] = (np.concatenate((self.onehot[seq[j]], [openness[j]], [uniqueness[j]])))
        return features

    def __getitem__(self, idx):
        """Return ``(batch_x, batch_y)`` for batch ``idx``.

        Sample indices wrap around modulo the number of samples, so the last
        batch is padded with samples from the start.
        """
        batch_x = []
        batch_y = []
        for i in range(self.batch_size):
            k = (i+idx*self.batch_size)%self.n_samples
            if self.y_are_labels:
                batch_y.append(self._encode_labels(self.y[k]))
            else:
                batch_y.append(self.y[k])
            batch_x.append(self._build_features(self.x_sequence[k], self.x_meta[k]))
        return np.array(batch_x), np.array(batch_y)

In [25]:
import keras
import numpy as np

class TrainingFeaturesSequence(keras.utils.Sequence):
    """Keras ``Sequence`` producing batches mixing positive and negative samples.

    The first ``int(batch_size / 2)`` samples of each batch are positives and
    the remainder negatives. Features are built exactly as in a single-class
    sequence: one-hot base (A/C/G/T) + ``dnase`` value + ``duke_unique``
    value per position.

    Args:
        x_sequence: ``[positive_sequences, negative_sequences]``.
        x_meta: ``[positive_meta, negative_meta]``, each item being
            ``(chromosome, (start, stop))``.
        y_set: indexable whose first element holds the positive targets; if
            the first positive target is a string, targets are O/B/I/E label
            strings and are one-hot encoded. Negatives are labelled all-'O'
            (label mode) or 0 (otherwise); positives are 1 when not in label
            mode.
        batch_size: number of samples per batch.
        sample_length: number of positions encoded per sample.
        feature_dimensions: width of each per-position feature vector.
        duke_unique: callable ``(chromosome, start, stop) -> values``.
        dnase: callable ``(chromosome, start, stop) -> values``.
        sample_weights: stored on the instance; not used when building batches.
    """

    def __init__(self, x_sequence, x_meta, y_set, 
                         batch_size, sample_length, feature_dimensions, 
                         duke_unique, dnase, sample_weights=None):

        assert len(x_sequence) == len(x_meta) == len(y_set)
        self.x_sequence_pos, self.x_meta_pos, self.y_pos = x_sequence[0], x_meta[0], y_set[0]
        self.x_sequence_neg, self.x_meta_neg = x_sequence[1], x_meta[1]
        self.y_are_labels = isinstance(y_set[0][0], str)
        if self.y_are_labels:
            # Only announce OBIE encoding when it is actually used.
            logging.info('Found y are labels; using OBIE')
            self.label2onehot = {'O':np.array([1,0,0,0]),'B':np.array([0,1,0,0]),'I':np.array([0.,0.,1.,0.]),'E':np.array([0.,0.,0.,1.]),}
        self.sample_length=sample_length
        self.n_samples_pos, self.n_samples_neg = len(self.x_meta_pos), len(self.x_meta_neg)
        self.feature_dimensions=feature_dimensions
        self.batch_size = batch_size
        self.onehot = {'A':np.array([1,0,0,0]),'C':np.array([0,1,0,0]),'G':np.array([0.,0.,1.,0.]),'T':np.array([0.,0.,0.,1.]),}
        self.duke_unique = duke_unique
        self.dnase = dnase
        # Kept so the argument is not silently dropped.
        self.sample_weights = sample_weights
            
    def __len__(self):
        return int(np.ceil(self.n_samples_pos / float(self.batch_size))) 

    def _encode_labels(self, labels):
        """One-hot encode a label string into an array of shape (len(labels), 4)."""
        labels_onehot = np.empty((len(labels), 4))
        for j, label in enumerate(labels):
            labels_onehot[j, :] = self.label2onehot[label]
        return labels_onehot

    def _all_outside_labels(self):
        """One-hot labels for a negative sample: 'O' at every position."""
        labels_onehot = np.empty((self.sample_length, 4))
        labels_onehot[:, :] = self.label2onehot['O']
        return labels_onehot

    def _build_features(self, seq, meta):
        """Build the (sample_length, feature_dimensions) array for one sample."""
        chromosome, (start, stop) = meta
        # this part is potentially very slow #
        uniqueness = np.array(self.duke_unique(chromosome, start, stop))
        openness = np.array(self.dnase(chromosome, start, stop))
        uniqueness[np.isnan(uniqueness)] = 0
        openness[np.isnan(openness)] = 0
        # # # # # # # # # # # # # # # # # # #
        features = np.empty((self.sample_length, self.feature_dimensions))
        for j in range(self.sample_length):
            features[j,:] = (np.concatenate((self.onehot[seq[j]], [openness[j]], [uniqueness[j]])))
        return features

    def __getitem__(self, idx):
        """Return ``(batch_x, batch_y)`` for batch ``idx``.

        Positive and negative indices each wrap around modulo their own
        sample count.
        """
        half_batch_size = int(self.batch_size/2)
        batch_x = []
        batch_y = []
        for i in range(self.batch_size):
            if i<half_batch_size:
                # work on positive samples
                k = (i+idx*half_batch_size)%self.n_samples_pos
                if self.y_are_labels:
                    batch_y.append(self._encode_labels(self.y_pos[k]))
                else:
                    batch_y.append(1)
                batch_x.append(self._build_features(self.x_sequence_pos[k], self.x_meta_pos[k]))
            else:
                # work on negative samples
                k = (i+idx*half_batch_size-half_batch_size)%self.n_samples_neg
                if self.y_are_labels:
                    batch_y.append(self._all_outside_labels())
                else:
                    batch_y.append(0)
                batch_x.append(self._build_features(self.x_sequence_neg[k], self.x_meta_neg[k]))
        return np.array(batch_x), np.array(batch_y)

In [26]:
import os
import pickle
import logging
from functools import partial 
import tqdm
import numpy as np

_cache = {}
class DataManager:
    
    def __init__(self,label_file, celllineNtf_peakfile,
                 bigwig_duke_unique_file = './wgEncodeDukeMapabilityUniqueness35bp.bigWig',
                 bigwig_dnase_file = None,
                 filter_file=None,
                 use_pickler=False, memory_avail=8192,
                 output_dir='./',
                 only_label_dataset = True,
                 move_finished_src = None,
                 reduce_negative_samples=True,
                 chr_valid = ['chr11'], chr_test = ['chr1', 'chr8', 'chr21'],
                 check_set_ratio=9, debug={}):
        """Record the configuration for one (transcription factor, cell line) experiment.

        No file is read here; only names and paths are derived.

        Parameters
        ----------
        label_file : str
            Path handed to ``gen_chr2locNbound``.
        celllineNtf_peakfile : str
            Peak file path. Its base name must consist of exactly six
            dot-separated fields:
            ``<exp_type>.<cellline>.<tf_name>.<ignored>.<set_name>.<peak_type>``.
        bigwig_duke_unique_file : str
            bigWig path handed to ``gen_h5py4bw`` for the 'duke_unique' track.
        bigwig_dnase_file : str or None
            bigWig path for the DNase track; when falsy,
            ``'<cellline lower-cased>.1x.bw'`` is used.
        filter_file : str or None
            Path handed to ``gen_chr2filter_locs`` / ``gen_chr2locNpeaks``.
        use_pickler : bool
            When truthy, ``create_datagen_from_samples`` creates a PickleManager.
        memory_avail : int
            Value passed to R's ``memory.limit(size = ...)`` in ``dnashapeR``.
        output_dir : str
            Base directory used by ``load_data``.
        reduce_negative_samples : bool
            When truthy, ``create_datagen_from_samples`` keeps at most as many
            training negatives as there are training positives.
        chr_valid, chr_test : iterable of str
            Chromosomes reserved for the validation / test split.
        only_label_dataset, move_finished_src, check_set_ratio, debug
            Stored on the instance; not read by the methods of this class
            visible here.

        Raises
        ------
        ValueError
            If the peak file's base name does not have six dot-separated fields.
        """
        self.label_file = label_file
        self.celllineNtf_peakfile = celllineNtf_peakfile
        self.bigwig_duke_unique_file = bigwig_duke_unique_file
        self.duke_bw = bigwig_duke_unique_file
        self.use_pickler = use_pickler
        self.memory_avail=memory_avail
        self.output_dir = output_dir
        self.only_label_dataset=only_label_dataset
        self.move_finished_src = move_finished_src
        self.filter_file = filter_file
        self.reduce_negative_samples = reduce_negative_samples
        # Copy the chromosome lists: the defaults are list objects shared by
        # every call, so storing them by reference would let a mutation on one
        # instance leak into all later instances.
        self.chr_valid = list(chr_valid)
        self.chr_test = list(chr_test)
        self.check_set_ratio = check_set_ratio
        # `debug` is deliberately kept by reference so a caller-supplied dict
        # stays observable from outside.
        self.debug = debug

        name_fields = os.path.basename(celllineNtf_peakfile).split('.')
        if len(name_fields) != 6:
            raise ValueError(
                'Expected a peak file named '
                '<exp_type>.<cellline>.<tf_name>.<x>.<set_name>.<peak_type>, got {!r}'.format(
                    os.path.basename(celllineNtf_peakfile)))
        self.exp_type, self._cellline, self.tf_name,_, self.set_name, self.peak_type = name_fields
        self.dnase_bw = bigwig_dnase_file if bigwig_dnase_file else '%s.1x.bw' % self._cellline.lower()
        self.exp_id = '{}_{}'.format(self.tf_name, self._cellline)
        # Output file names, all prefixed with the experiment id.
        self.positive_samples_out, self.negative_samples_out, self.labels_out = [self.exp_id+'_'+x for x in ['positive_samples.txt', 'negative_samples.txt', 'labels.txt']]
        self.chr_positive_out, self.chr_negative_out = [self.exp_id+'_chr_%s.npy' % sign for sign in ['positive', 'negative']]

    def _cacher_(func):
        """Decorator that memoises a method's result in the module-level ``_cache``.

        The entry is stored under ``func.__name__`` only: neither the instance
        nor the call arguments are part of the key. Once a value is stored it is
        returned for every later call, whichever object makes it, until the
        entry is removed from ``_cache``.
        """
        def to_cache_func(self, *args, **kwargs):
            key = func.__name__
            if key not in _cache:
                # first call, or the entry was evicted: compute and remember
                _cache[key] = func(self, *args, **kwargs)
            return _cache[key]
        return to_cache_func
    
    def release_memory(self):
        """Drop every memoised result and run a garbage-collection pass.

        Empties the module-level ``_cache`` used by ``_cacher_``, so each cached
        method recomputes (or reloads) its value on next access.
        """
        import gc
        # Clear the shared dict in place. Writing `_cache = {}` here would only
        # bind a new function-local name and leave the module-level cache, and
        # everything it references, alive.
        _cache.clear()
        gc.collect()
    
    @_cacher_
    def _chr2locNbound(self):
        """Obtain the ``chr2locNbound`` table for this experiment (memoised).

        Calls ``if_not_pickled`` with the path ``<exp_id>_chr2locNbound.pkl`` and
        a ``partial`` of ``gen_chr2locNbound`` bound to the label file and the
        cell line.
        """
        builder = partial(gen_chr2locNbound, self.label_file, self._cellline)
        pickle_path = '{}_chr2locNbound.pkl'.format(self.exp_id)
        return if_not_pickled(pickle_path, builder)
    @property
    def chr2locNbound(self):
        """Attribute-style access to the value returned by ``_chr2locNbound()``."""
        table = self._chr2locNbound()
        return table
    
    @_cacher_
    def _chr2filter_locs(self):
        """Obtain the ``chr2filter_locs`` table for this experiment (memoised).

        Calls ``if_not_pickled`` with the path ``<exp_id>_chr2filter_locs.pkl``
        and a ``partial`` of ``gen_chr2filter_locs`` bound to the filter file.
        """
        builder = partial(gen_chr2filter_locs, self.filter_file)
        pickle_path = '{}_chr2filter_locs.pkl'.format(self.exp_id)
        return if_not_pickled(pickle_path, builder)
    @property
    def chr2filter_locs(self):
        """Attribute-style access to the value returned by ``_chr2filter_locs()``."""
        filter_locs = self._chr2filter_locs()
        return filter_locs
    
    @_cacher_
    def _chr2locNpeaks(self):
        """Obtain the ``chr2locNpeaks`` table for this experiment (memoised).

        With a filter file the pickle is ``<exp_id>_chr2locNpeaks.pkl`` and the
        filter locations are passed to ``gen_chr2locNpeaks``; without one the
        pickle is ``<exp_id>_chr2locNpeaks_full.pkl`` and ``None`` is passed
        instead.
        """
        if self.filter_file:
            pickle_path = '{}_chr2locNpeaks.pkl'.format(self.exp_id)
            filter_locs = self.chr2filter_locs
        else:
            pickle_path = '{}_chr2locNpeaks_full.pkl'.format(self.exp_id)
            filter_locs = None
        builder = partial(gen_chr2locNpeaks,
                          self.celllineNtf_peakfile,
                          self.filter_file,
                          filter_locs)
        return if_not_pickled(pickle_path, builder)
    @property
    def chr2locNpeaks(self):
        """Attribute-style access to the value returned by ``_chr2locNpeaks()``."""
        peaks = self._chr2locNpeaks()
        return peaks
    
    @_cacher_
    def _chromosomes(self):
        """Obtain the hg19 chromosome data (memoised).

        Calls ``if_not_pickled`` with the pickle path ``hg19.pkl`` and a
        ``partial`` of ``gen_hg19`` bound to ``./hg19.genome.fa``. Both paths are
        relative to the current working directory.
        """
        genome_fasta = './hg19.genome.fa'
        builder = partial(gen_hg19, genome_fasta)
        return if_not_pickled('hg19.pkl', builder)
    @property
    def chromosomes(self):
        """Attribute-style access to the value returned by ``_chromosomes()``."""
        genome = self._chromosomes()
        return genome
    
    @_cacher_
    def _chr2labelsNseq(self):
        """Obtain the ``chr2labelsNseq`` table for this experiment (memoised).

        Calls ``if_not_pickled`` with the path ``<exp_id>_chr2labelsNseq.pkl``
        and a ``partial`` of ``gen_chr2labelsNseq`` bound to the chromosomes,
        the bound-location table and the peak table. Those three inputs are
        evaluated eagerly while the ``partial`` is built.

        Afterwards the cached intermediate tables are evicted from ``_cache``.
        """
        chr2labelsNseq_pkl = '{}_chr2labelsNseq.pkl'.format(self.exp_id)
        pgen_chr2labelsNseq = partial(gen_chr2labelsNseq,
                                      self.chromosomes,
                                      self.chr2locNbound,
                                      self.chr2locNpeaks)
        chr2labelsNseq = if_not_pickled(chr2labelsNseq_pkl,
                                        pgen_chr2labelsNseq)
        # Evict the intermediates. pop(..., None) rather than `del`:
        # '_chr2filter_locs' is only ever cached when a filter_file was given,
        # so an unconditional `del` raises KeyError with the default
        # filter_file=None.
        for stale_key in ('_chr2locNpeaks', '_chr2locNbound', '_chr2filter_locs'):
            _cache.pop(stale_key, None)
        return chr2labelsNseq
    @property
    def chr2labelsNseq(self):
        """Attribute-style access to the value returned by ``_chr2labelsNseq()``."""
        labels_and_seqs = self._chr2labelsNseq()
        return labels_and_seqs
    
    def _create_samples(self):
        """Build the positive/negative samples from ``chr2labelsNseq`` and write them to disk.

        Every ``(label, seq)`` entry is turned into a sequence with
        ``seq2sequence``; sequences containing ``'N'`` are dropped. An entry
        whose label is ``None`` is a negative sample, any other entry is a
        positive sample.

        Files written (names are set in ``__init__``):
        ``positive_samples_out``, ``negative_samples_out`` and ``labels_out`` as
        text, one ``>{tf_name}__{i}`` header line followed by one value line per
        record; ``chr_positive_out`` and ``chr_negative_out`` as NumPy object
        arrays holding the ``seq`` entry each sample came from.

        No negative down-sampling happens here; ``create_datagen_from_samples``
        truncates the training negatives when ``reduce_negative_samples`` is set.

        Returns
        -------
        tuple
            ``(positive_samples, negative_samples, labels_positives, pos_chr, neg_chr)``;
            the same five lists are also stored on the instance as
            ``_positive_samples`` ... ``_neg_chr``.

        Raises
        ------
        InsufficientChromosomesException
            If no chromosome is left for training after removing ``chr_valid``
            and ``chr_test``.
        """
        # Function-scope import: this cell's `import tqdm` binds the *module* to
        # the global name `tqdm`, and a module cannot be called.
        from tqdm import tqdm

        logging.info('Creating positive/negative samples')
        positive_samples, negative_samples, labels_positives = [],[],[]
        pos_chr, neg_chr = [], [] # keep track of which chromosome each are from

        chr_training = set(self.chr2labelsNseq.keys()) - set(self.chr_valid) - set(self.chr_test)
        if not len(chr_training):
            raise InsufficientChromosomesException(set(self.chr2labelsNseq.keys()), set(self.chr_valid) | set(self.chr_test))
        for chromosome, labelsNseq in tqdm(self.chr2labelsNseq.items()):
            _positive_samples,_negative_samples, _labels_positives = [],[],[]
            _pos_chr, _neg_chr = [], []
            logging.info('Working on {}'.format(chromosome))
            for label, seq in labelsNseq:
                sequence = seq2sequence(seq, self.chromosomes)
                if 'N' in sequence:
                    continue
                # label is None for a negative sample
                if label is None:
                    _negative_samples.append(sequence)
                    _neg_chr.append(seq)
                else:
                    _positive_samples.append(sequence)
                    _labels_positives.append(label)
                    _pos_chr.append(seq)
            logging.info('Found {} samples for {}; {} positive and {} negative'.format(len(_positive_samples) + len(_negative_samples),
                                                                                       chromosome,
                                                                                       len(_positive_samples),
                                                                                       len(_negative_samples)))
            positive_samples.extend(_positive_samples)
            negative_samples.extend(_negative_samples)
            labels_positives.extend(_labels_positives)
            pos_chr.extend(_pos_chr)
            neg_chr.extend(_neg_chr)

        # write samples to disk
        logging.info('Writing samples to disk')
        def write_to_disk(path, samples):
            with open(path, 'w') as samples_file:
                for i,sample in enumerate(samples):
                    samples_file.write('>{}__{}\n'.format(self.tf_name, i))
                    samples_file.write('{}\n'.format(sample))

        assert len(positive_samples) == len(pos_chr) == len(labels_positives)
        assert len(negative_samples) == len(neg_chr)
        write_to_disk(self.positive_samples_out, positive_samples)
        write_to_disk(self.negative_samples_out, negative_samples)
        write_to_disk(self.labels_out, labels_positives)
        # Builtin `object` instead of the `np.object` alias, which NumPy
        # deprecated in 1.20 and removed in 1.24; the dtype is the same.
        np.save(self.chr_positive_out, np.array(pos_chr, dtype=object))
        np.save(self.chr_negative_out, np.array(neg_chr, dtype=object))
        logging.info('Done writing samples to disk')
        self._positive_samples, self._negative_samples, self._labels_positives = positive_samples, negative_samples, labels_positives
        self._pos_chr, self._neg_chr = pos_chr, neg_chr
        return positive_samples, negative_samples, labels_positives, pos_chr, neg_chr
    
    def _load_samples(self):
        """Read back the sample files written by ``_create_samples``.

        From each of the three text files every line that does not start with
        ``'>'`` is kept, stripped of surrounding whitespace. The two ``.npy``
        files are loaded with NumPy.

        Returns
        -------
        tuple
            ``(positive_samples, negative_samples, labels_positives, pos_chr, neg_chr)``;
            the first three are lists of str, the last two are the loaded
            arrays. The same five values are also stored on the instance as
            ``_positive_samples`` ... ``_neg_chr``.
        """
        logging.info('Loading positive/negative samples from disk')

        def read_records(path):
            # keep the value lines, skip the '>' header lines
            with open(path) as samples_file:
                return [line.strip() for line in samples_file if not line.startswith('>')]

        positive_samples = read_records(self.positive_samples_out)
        negative_samples = read_records(self.negative_samples_out)
        labels_positives = read_records(self.labels_out)
        # _create_samples saves these as object arrays, which np.load refuses to
        # read unless allow_pickle=True (the default became False in NumPy 1.16.3).
        # NOTE: unpickling can execute arbitrary code; only load files this class wrote.
        pos_chr = np.load(self.chr_positive_out, allow_pickle=True)
        neg_chr = np.load(self.chr_negative_out, allow_pickle=True)
        logging.info('Done loading samples from disk')
        self._positive_samples, self._negative_samples, self._labels_positives = positive_samples, negative_samples, labels_positives
        self._pos_chr, self._neg_chr = pos_chr, neg_chr
        return positive_samples, negative_samples, labels_positives, pos_chr, neg_chr
    
    @_cacher_
    def _samples(self):
        """Return the sample tuple, loading it from disk when possible (memoised).

        If ``all_paths_exists`` reports that the five sample files are present,
        they are read with ``_load_samples``; otherwise ``_create_samples``
        builds and writes them.
        """
        required_files = (self.positive_samples_out,
                          self.negative_samples_out,
                          self.labels_out,
                          self.chr_positive_out,
                          self.chr_negative_out)
        if all_paths_exists(*required_files):
            return self._load_samples()
        return self._create_samples()
    @property
    def samples(self):
        """Attribute-style access to the tuple returned by ``_samples()``."""
        sample_tuple = self._samples()
        return sample_tuple
    
    @property
    def positive_samples(self):
        """Positive sample sequences.

        Returns ``self._positive_samples`` when it has been set (by
        ``_create_samples`` or ``_load_samples``); otherwise element 0 of
        ``self.samples``.
        """
        try:
            return self._positive_samples
        except AttributeError:
            # not populated on this instance yet; a bare `except:` would also
            # have hidden KeyboardInterrupt and unrelated errors
            pass
        return self.samples[0]
    @property
    def negative_samples(self):
        """Negative sample sequences.

        Returns ``self._negative_samples`` when it has been set (by
        ``_create_samples`` or ``_load_samples``); otherwise element 1 of
        ``self.samples``.
        """
        try:
            return self._negative_samples
        except AttributeError:
            # not populated on this instance yet; a bare `except:` would also
            # have hidden KeyboardInterrupt and unrelated errors
            pass
        return self.samples[1]
    @property
    def labels_positives(self):
        """Labels of the positive samples.

        Returns ``self._labels_positives`` when it has been set (by
        ``_create_samples`` or ``_load_samples``); otherwise element 2 of
        ``self.samples``.
        """
        try:
            return self._labels_positives
        except AttributeError:
            # not populated on this instance yet; a bare `except:` would also
            # have hidden KeyboardInterrupt and unrelated errors
            pass
        return self.samples[2]
    @property
    def pos_chr(self):
        """Source entries of the positive samples.

        Returns ``self._pos_chr`` when it has been set (by ``_create_samples``
        or ``_load_samples``); otherwise element 3 of ``self.samples``.
        """
        try:
            return self._pos_chr
        except AttributeError:
            # not populated on this instance yet; a bare `except:` would also
            # have hidden KeyboardInterrupt and unrelated errors
            pass
        return self.samples[3]
    @property
    def neg_chr(self):
        """Source entries of the negative samples.

        Returns ``self._neg_chr`` when it has been set (by ``_create_samples``
        or ``_load_samples``); otherwise element 4 of ``self.samples``.
        """
        try:
            return self._neg_chr
        except AttributeError:
            # not populated on this instance yet; a bare `except:` would also
            # have hidden KeyboardInterrupt and unrelated errors
            pass
        return self.samples[4]
    
    @func_metrics_display
    def dnashapeR(self, r_lib_location = "C:/Users/Rudolf/Documents/R/win-library/3.5"):
        """Run DNAshapeR's ``getShape`` on the positive and negative sample files via rpy2.

        For each of the two sample files the R call is skipped when
        ``all_paths_exists`` reports that the five files
        ``<sample file> + {.EP, .HelT, .MGW, .ProT, .Roll}`` are already present.

        Parameters
        ----------
        r_lib_location : str
            R library directory; it is passed to R's ``.libPaths`` and used as
            ``lib_loc`` when importing DNAshapeR. The default is an absolute
            path on the original author's machine.

        Returns nothing. Needs rpy2 and an R installation with DNAshapeR.
        NOTE(review): ``gc`` is not imported in this method; it must already be
        a global of the notebook (an earlier cell does ``import gc``).
        """
        # setup structural information
        # setup R
        import rpy2
        from rpy2.robjects.packages import importr
        import rpy2.robjects as robjects

        # set the available amount of memory
        robjects.r('memory.limit(size = {})'.format(self.memory_avail))

        base = importr('base')
        utils = importr('utils')
        logging.info('Using {}'.format(str(base._libPaths())))

        # if DNAshapeR cannot be found try this:
        robjects.r( ".libPaths('{}')".format(r_lib_location))

        # NOTE(review): `reduce` is not used anywhere in this method.
        from functools import reduce

        if not all_paths_exists(*[self.positive_samples_out+ext for ext in ['.EP', '.HelT', '.MGW', '.ProT', '.Roll']]):
            logging.info('Running DNAshapeR for positive samples')
            base = importr('base')
            utils = importr('utils')
            dna_shape = importr('DNAshapeR', lib_loc=r_lib_location)
            #rpy2 does not know how to release memory
            # NOTE(review): rmem_manage is defined elsewhere; presumably it deals
            # with the R memory problem mentioned above — confirm.
            @rmem_manage
            def process_positive():
                # Build a two-line R script and evaluate it; the sample file is
                # addressed relative to the current working directory.
                r_statements = []
                r_statements.append('library(DNAshapeR)')
                r_statements.append('pred <- getShape("./{}")'.format(self.positive_samples_out))
                r_cmd = '\n'.join(r_statements)
                robjects.r(r_cmd)
            process_positive()
        else:
            logging.info('Skipping DNAshapeR for positive samples; already exists')
        if not all_paths_exists(*[self.negative_samples_out+ext for ext in ['.EP', '.HelT', '.MGW', '.ProT', '.Roll']]):
            logging.info('Running DNAshapeR for negative samples')
            base = importr('base')
            utils = importr('utils')
            dna_shape = importr('DNAshapeR', lib_loc=r_lib_location)
            # Python-side collection before the second R run (the positive
            # branch above has no matching call).
            gc.collect()
            @rmem_manage
            def process_negative():
                # Same R script as for the positive samples, on the negative file.
                r_statements = []
                r_statements.append('library(DNAshapeR)')
                r_statements.append('pred <- getShape("./{}")'.format(self.negative_samples_out))
                r_cmd = '\n'.join(r_statements)
                robjects.r(r_cmd)
            process_negative()
        else:
            logging.info('Skipping DNAshapeR for negative samples; already exists')
    
    @_cacher_
    def _h5py_duke(self):
        """Return ``gen_h5py4bw(self.duke_bw, 'duke_unique')`` (memoised)."""
        duke_store = gen_h5py4bw(self.duke_bw, 'duke_unique')
        return duke_store
    
    @property
    def h5py_duke(self):
        """Attribute-style access to the value returned by ``_h5py_duke()``."""
        duke_store = self._h5py_duke()
        return duke_store
    
    def duke_unique(self, chromosome, start, stop):
        """Return ``self.h5py_duke[chromosome][start:stop]``: the uniqueness track slice for one region."""
        chromosome_track = self.h5py_duke[chromosome]
        region = chromosome_track[start:stop]
        return region
        
    @_cacher_
    def _h5py_dnase(self):
        """Return ``gen_h5py4bw(self.dnase_bw, self._cellline)`` (memoised)."""
        dnase_store = gen_h5py4bw(self.dnase_bw, self._cellline)
        return dnase_store
    
    @property
    def h5py_dnase(self):
        """Attribute-style access to the value returned by ``_h5py_dnase()``."""
        dnase_store = self._h5py_dnase()
        return dnase_store
    
    def dnase(self, chromosome, start, stop):
        """Return ``self.h5py_dnase[chromosome][start:stop]``: the DNase track slice for one region."""
        chromosome_track = self.h5py_dnase[chromosome]
        region = chromosome_track[start:stop]
        return region
    
    def create_datagen_from_samples(self, balance_valid_ratio=9, useDNAshapeR=False):
        '''
        Build (or reload) the train/valid/test split and return batch producers for it.

        When ``<exp_id>_dataset.pkl`` exists in the working directory the split
        is read from it. Otherwise ``self.samples`` is shuffled, partitioned by
        chromosome (``self.chr_valid`` -> valid, ``self.chr_test`` -> test,
        everything else -> train) and the result is written to that pickle.

        Parameters
        ----------
        balance_valid_ratio : int
            When truthy, the validation split keeps at most
            ``balance_valid_ratio`` negatives per positive. Has no effect when
            the split is reloaded from the pickle.
        useDNAshapeR : bool
            Must be falsy.

        Returns
        -------
        tuple of 14 callables, each taking ``batch_size``:
            ``(train_labels_sequencer, train_cls_sequencer,
            valid_x, valid_label, valid_seq, valid_y, valid_labels_sequencer, valid_cls_sequencer,
            test_x, test_label, test_seq, test_y, test_labels_sequencer, test_cls_sequencer)``.
            The ``*_sequencer`` entries build ``TrainingFeaturesSequence`` /
            ``FeaturesSequence`` objects; the others return endless generators
            of batches.

        Raises
        ------
        NotImplementedError
            If ``useDNAshapeR`` is truthy.

        Side effects: sets ``dataset2counts_pos``, ``dataset2counts_neg``,
        ``training_length``, ``valid_length``, ``test_length``,
        ``sample_length`` and ``feature_dimensions`` on the instance.
        '''
        import os
        import pickle
        import numpy as np
        from tqdm import tqdm

        if useDNAshapeR:
            raise NotImplementedError('DNAshapeR is not streamable for large datasets')
        dataset_pkl = '%s_dataset.pkl'%self.exp_id
        if os.path.exists(dataset_pkl):
            logging.info('Reading static data')
            # NOTE: unpickling can execute arbitrary code; only read pickles
            # written by this method. The load order mirrors the dump order below.
            with open(dataset_pkl, 'rb') as pkl:
                train_positive_samples = pickle.load(pkl)
                train_negative_samples = pickle.load(pkl)
                train_labels_positives = pickle.load(pkl)
                train_pos_chr = pickle.load(pkl)
                train_neg_chr = pickle.load(pkl)
                training_cls = pickle.load(pkl)
                training_labels = pickle.load(pkl)
                valid_samples = pickle.load(pkl)
                valid_chr = pickle.load(pkl)
                valid_cls = pickle.load(pkl)
                valid_labels = pickle.load(pkl)
                test_samples = pickle.load(pkl)
                test_chr = pickle.load(pkl)
                test_cls = pickle.load(pkl)
                test_labels = pickle.load(pkl)
                self.dataset2counts_pos = pickle.load(pkl)
                self.dataset2counts_neg = pickle.load(pkl)
                self.training_length = pickle.load(pkl)
                self.valid_length = pickle.load(pkl)
                self.test_length = pickle.load(pkl)
                self.sample_length = pickle.load(pkl)
                self.feature_dimensions = pickle.load(pkl)
                logging.info('Loaded training/valid/test of sizes {}/{}/{}'.format(self.training_length, self.valid_length, self.test_length))
        else:
            logging.info('Getting samples')
            # positive_samples and negative_samples are lists of sequences:str
            # pos_chr and neg_chr are lists of (chromosome:str, (start:int, stop:int))
            # labels_positives is a list of (for crf) labels:str
            positive_samples, negative_samples, labels_positives, pos_chr, neg_chr = self.samples

            logging.info('Shuffling samples')
            assert len(positive_samples) == len(labels_positives) == len(pos_chr)
            assert len(negative_samples) == len(neg_chr)

            class Pair:
                # Wrapper around one (chromosome, domain) entry; presumably used so
                # that numpy stores each entry as a single cell when shuffling.
                __slots__=['chromosome', 'domain']
                def __init__(self, chromosome, domain):
                    self.chromosome = chromosome
                    self.domain = domain
                def __repr__(self):
                    return '('+str(self.chromosome)+', '+str(self.domain)+')'
                def reveal(self):
                    return (self.chromosome, self.domain)

            @func_metrics_display
            def shuffle(*arrays):
                # Apply one common random permutation to several equally long
                # sequences; returns an object array with one row per input.
                try:
                    # builtin `object`: the `np.object` alias was removed in NumPy 1.24
                    npified = np.array(arrays, dtype=object)
                except Exception:
                    logging.error('{}'.format([np.array(a).shape for a in arrays]))
                    raise
                length = npified.shape[1]
                indices_chosen = np.random.choice(length, length, replace=False)
                return npified[:,indices_chosen]

            pos_pairs = [Pair(*c) for c in pos_chr]
            neg_pairs = [Pair(*c) for c in neg_chr]

            positive_samples, labels_positives, pos_pairs = shuffle(positive_samples, labels_positives, pos_pairs)
            negative_samples, neg_pairs = shuffle(negative_samples, neg_pairs)

            pos_chr = [pair.reveal() for pair in pos_pairs]
            neg_chr = [pair.reveal() for pair in neg_pairs]

            # create stats on number of pos/neg for training/valid/test sets
            logging.info('Creating stats on number of pos/neg for training/valid/test sets')

            chr_valid_set = set(self.chr_valid)
            chr_test_set = set(self.chr_test)

            @func_metrics_display
            def count(chrs):
                # Start every split at zero so that an empty split is reported
                # as 0 instead of raising KeyError in the log lines below.
                counts = {'train': 0, 'valid': 0, 'test': 0}
                for chromosome, (start,stop) in chrs:
                    if chromosome in chr_valid_set:
                        counts['valid'] += 1
                    elif chromosome in chr_test_set:
                        counts['test'] += 1
                    else: # in training set
                        counts['train'] += 1
                return counts

            pos_counts = count(pos_chr)
            neg_counts = count(neg_chr)
            logging.info('Counted positive for training/valid/test: {:>12}  {:>12}  {:>12}'.format(*[pos_counts[x]for x in ['train','valid','test']]))
            logging.info('Counted negative for training/valid/test: {:>12}  {:>12}  {:>12}'.format(*[neg_counts[x]for x in ['train','valid','test']]))
            self.dataset2counts_pos = pos_counts
            self.dataset2counts_neg = neg_counts

            logging.info('Splitting samples into training/valid/test')

            train_positive_samples, train_labels_positives, train_pos_chr = [], [], []
            valid_positive_samples, valid_labels_positives, valid_pos_chr = [], [], []
            test_positive_samples, test_labels_positives, test_pos_chr = [], [], []
            train_negative_samples, train_neg_chr = [], []
            valid_negative_samples, valid_neg_chr = [], []
            test_negative_samples, test_neg_chr = [], []

            logging.info('Preparing for positive training/valid/test')
            for sample, label, c in zip(positive_samples, labels_positives, pos_chr):
                chromosome, (start, stop) = c
                if chromosome in chr_valid_set:
                    valid_positive_samples.append(sample)
                    valid_labels_positives.append(label)
                    valid_pos_chr.append(c)
                elif chromosome in chr_test_set:
                    test_positive_samples.append(sample)
                    test_labels_positives.append(label)
                    test_pos_chr.append(c)
                else: # in training set
                    train_positive_samples.append(sample)
                    train_labels_positives.append(label)
                    train_pos_chr.append(c)
            logging.info('Preparing for negative training/valid/test')
            for sample, c in zip(tqdm(negative_samples), neg_chr):
                chromosome, (start, stop) = c
                if chromosome in chr_valid_set:
                    valid_negative_samples.append(sample)
                    valid_neg_chr.append(c)
                elif chromosome in chr_test_set:
                    test_negative_samples.append(sample)
                    test_neg_chr.append(c)
                else: # in training set
                    train_negative_samples.append(sample)
                    train_neg_chr.append(c)

            if self.use_pickler:
                logging.info('Setting up pickler to intercept used data')
                # NOTE(review): the object is not used afterwards; kept because
                # constructing it with overwrite=True may have side effects.
                label_dataset = PickleManager('{}_label_dataset.pkl'.format(self.exp_id), overwrite=True)

            logging.info('Generating feature generators')
            # we want one for each pos/neg and train/valid/test
            # we want same number of pos/neg samples for training
            # The sample length is taken from the first training positive.
            sample_start, sample_stop = train_pos_chr[0][1]
            sample_length = sample_stop-sample_start

            logging.info('Samples are found to have length {}'.format(sample_length))
            self.sample_length = sample_length
            # 4 one-hot base channels + openness + uniqueness
            self.feature_dimensions = 6

            logging.info('Generating static data for training')
            n_pos_train = len(train_positive_samples)
            if self.reduce_negative_samples:
                train_negative_samples=train_negative_samples[:n_pos_train]
                train_neg_chr=train_neg_chr[:n_pos_train]
            n_neg_train = len(train_negative_samples)
            # shuffle the pos/neg for training
            # x, metadata, y, labels (for crf)
            training_samples, training_chr, training_cls, training_labels = shuffle(train_positive_samples + train_negative_samples,
                                                                                    train_pos_chr          + train_neg_chr,
                                                                                    [1]*n_pos_train        + [0]*n_neg_train,
                                                                                    train_labels_positives + ['O'*sample_length]*n_neg_train)

            self.dataset2counts_pos['train'] = n_pos_train
            self.dataset2counts_neg['train'] = n_neg_train
            self.training_length = n_pos_train*2
            logging.info('Counted positive for training/valid/test: {:>12}  {:>12}  {:>12}'.format(*[pos_counts[x]for x in ['train','valid','test']]))
            logging.info('Counted negative for training/valid/test: {:>12}  {:>12}  {:>12}'.format(*[neg_counts[x]for x in ['train','valid','test']]))

            logging.info('Generating static data for validation/test')
            # validation negatives are capped at balance_valid_ratio per positive;
            # the test split keeps all of its negatives
            if balance_valid_ratio:
                n_pos_valid = len(valid_positive_samples)
                valid_negative_samples = valid_negative_samples[:n_pos_valid*balance_valid_ratio]
                valid_neg_chr = valid_neg_chr[:n_pos_valid*balance_valid_ratio]
            self.valid_length = len(valid_positive_samples) + len(valid_negative_samples)
            self.test_length = len(test_positive_samples) + len(test_negative_samples)

            valid_samples, valid_chr, valid_cls, valid_labels = shuffle(valid_positive_samples          + valid_negative_samples,
                                                                        valid_pos_chr                   + valid_neg_chr,
                                                                        [1]*len(valid_positive_samples) + [0]*len(valid_negative_samples),
                                                                        valid_labels_positives          + ['O'*sample_length]*len(valid_negative_samples))

            test_samples, test_chr, test_cls, test_labels = shuffle(test_positive_samples          + test_negative_samples,
                                                                    test_pos_chr                   + test_neg_chr,
                                                                    [1]*len(test_positive_samples) + [0]*len(test_negative_samples),
                                                                    test_labels_positives          + ['O'*sample_length]*len(test_negative_samples))

            logging.info('Saving static data')
            # The dump order must stay in sync with the load order above.
            with open(dataset_pkl, 'wb') as pkl:
                datalst= [train_positive_samples, train_negative_samples, train_labels_positives,
                          train_pos_chr, train_neg_chr,
                          training_cls, training_labels,
                          valid_samples, valid_chr, valid_cls, valid_labels,
                          test_samples, test_chr, test_cls, test_labels,
                          self.dataset2counts_pos, self.dataset2counts_neg,
                          self.training_length, self.valid_length, self.test_length, self.sample_length,
                          self.feature_dimensions]
                for thing in tqdm(datalst):
                    pickle.dump(thing, pkl)

        logging.info('Creating generators')

        onehot = {'A':(1.,0.,0.,0.),
                  'C':(0.,1.,0.,0.),
                  'G':(0.,0.,1.,0.),
                  'T':(0.,0.,0.,1.),}
        def features_gen(samples, chrs):
            def batch_features_generator(batch_size):
                # Endless generator: yields one array per batch, cycling back
                # to the first batch after max_k full batches.
                max_k = len(samples) // batch_size
                k = 0
                while True:
                    batch = []

                    for offset in range(batch_size):
                        index = offset + k*batch_size
                        seq = samples[index]
                        chromosome, (start, stop) = chrs[index]
                        # this part is potentially very slow #
                        uniqueness = np.array(self.duke_unique(chromosome, start, stop))
                        openness = np.array(self.dnase(chromosome, start, stop))
                        uniqueness[np.isnan(uniqueness)] = 0
                        openness[np.isnan(openness)] = 0
                        # # # # # # # # # # # # # # # # # # #
                        # one row per position: one-hot base, openness, uniqueness
                        features = [list(onehot[seq[pos]]) + [openness[pos], uniqueness[pos]]
                                    for pos in range(stop-start)]
                        batch.append(np.array(features))
                    yield np.array(batch)
                    # cycle for infinite generator
                    k+=1
                    if k==max_k:
                        k=0
            return batch_features_generator

        logging.info('Creating feature generators')
        valid_x = features_gen(valid_samples, valid_chr)
        test_x = features_gen(test_samples, test_chr)

        @func_metrics_display
        def values_gen(items, dtype=object):
            # builtin `object` as default: the `np.object` alias was removed in NumPy 1.24
            def batch_generator(batch_size):
                max_k = len(items) // batch_size
                k = 0
                while True:
                    result = [items[i+k*batch_size] for i in range(batch_size)]
                    yield np.array(result, dtype=dtype)
                    # cycle for infinite generator
                    k+=1
                    if k==max_k:
                        k=0
            return batch_generator

        logging.info('Creating features sequencers')
        train_labels_sequencer = lambda batch_size: TrainingFeaturesSequence((train_positive_samples, train_negative_samples),
                                                                             (train_pos_chr, train_neg_chr),
                                                                             (train_labels_positives, None),
                                                                     batch_size,
                                                                     self.sample_length, self.feature_dimensions,
                                                                     self.duke_unique, self.dnase)
        train_cls_sequencer = lambda batch_size: TrainingFeaturesSequence((train_positive_samples, train_negative_samples),
                                                                          (train_pos_chr, train_neg_chr),
                                                                          ([1]*self.dataset2counts_pos['train'] , None),
                                                                     batch_size,
                                                                     self.sample_length, self.feature_dimensions,
                                                                     self.duke_unique, self.dnase)
        valid_labels_sequencer = lambda batch_size: FeaturesSequence(valid_samples, valid_chr, valid_labels,
                                                                     batch_size,
                                                                     self.sample_length, self.feature_dimensions,
                                                                     self.duke_unique, self.dnase)
        valid_cls_sequencer = lambda batch_size: FeaturesSequence(valid_samples, valid_chr, valid_cls,
                                                                     batch_size,
                                                                     self.sample_length, self.feature_dimensions,
                                                                     self.duke_unique, self.dnase)
        test_labels_sequencer = lambda batch_size: FeaturesSequence(test_samples, test_chr, test_labels,
                                                                     batch_size,
                                                                     self.sample_length, self.feature_dimensions,
                                                                     self.duke_unique, self.dnase)
        test_cls_sequencer = lambda batch_size: FeaturesSequence(test_samples, test_chr, test_cls,
                                                                    batch_size,
                                                                    self.sample_length, self.feature_dimensions,
                                                                    self.duke_unique, self.dnase)

        logging.info('Creating value generators')

        valid_label = values_gen(valid_labels)
        test_label = values_gen(test_labels)

        valid_seq = values_gen(valid_chr)
        test_seq = values_gen(test_chr)

        valid_y = values_gen(valid_cls, int)
        test_y = values_gen(test_cls, int)

        return  (
                 train_labels_sequencer, train_cls_sequencer,
                 valid_x, valid_label, valid_seq, valid_y, valid_labels_sequencer, valid_cls_sequencer,
                 test_x, test_label, test_seq, test_y, test_labels_sequencer, test_cls_sequencer)
        
    def load_data(self, *names):
        """Load ``<output_dir>/<exp_id>_label_dataset/<name>.npy`` for every given name.

        Returns a list with one loaded array per name, in the order the names
        were passed.
        """
        dataset_dir = os.path.join(self.output_dir, self.exp_id+'_label_dataset')
        npy_paths = []
        for name in names:
            npy_paths.append(os.path.join(dataset_dir, name+'.npy'))
        return list(map(np.load, npy_paths))

In [27]:
import os
import re
import logging
logger = logging.getLogger()
logger.setLevel('DEBUG')
import numpy as np

### <<<<<<<< Launch file setup >>>>>>>>

In [28]:
# Scan the working directory for label files and collect, for each
# transcription factor (TF), the ChIP-seq narrowPeak files that belong to it.
debug = {}
label_files = [f for f in os.listdir('.') if f.endswith('_.labels.tsv')]
filter_file = './total_regions.blacklistfiltered.merged.bed'
tf_nameNdm = []  # one (tf_name, data manager or None) entry per peak file found
dm = None
for label_file in label_files:
    logging.info('Looking for peak files for file %s' % label_file)
    with open(label_file) as f:
        # first line contains titles; chr, start, stop, <cellline 1>, ... , <cellline n>
        first_line = next(f, '')
    # Slicing (instead of tuple unpacking) tolerates an empty or short header.
    celllines = first_line.split()[3:]
    if not celllines:
        logging.warning('No cell lines found in the header of %s' % label_file)

    _tf_name = os.path.basename(label_file).split('.')[0]
    # Escape the TF name so that it is always matched literally.
    regex = re.compile(r'chipseq\.\w+\.%s\..*\.\_\.narrowpeak' % re.escape(_tf_name), re.IGNORECASE)
    # The pattern does not depend on the individual cell line, so the directory
    # is scanned once; scanning once per cell line only produced duplicates.
    celllineNtf_peakfiles = []
    if celllines:
        celllineNtf_peakfiles = [each_file for each_file in os.listdir('.') if regex.match(each_file)]
    for peakfile in celllineNtf_peakfiles:
        logging.info('Working on peakfile %s'%peakfile)
        try:
#             dm = DataManager(label_file, peakfile, output_dir='datasets', 
#                         move_finished_src='finished_peakfiles',
#                         filter_file=filter_file, 
#                         reduce_negative_samples=True, check_set_ratio=9,
#                         memory_avail=8192, debug=debug)
#             dm.samples
#             generators = dm.create_datagen_from_samples(useDNAshapeR=False)
#             train_x, train_label, train_seq, train_cls, train_labels_sequencer, train_cls_sequencer = generators[:6]
#             valid_x, valid_label, valid_seq, valid_cls, valid_labels_sequencer, valid_cls_sequencer = generators[6:12]
#             test_x, test_label, test_seq, test_cls, test_labels_sequencer, test_cls_sequencer = generators[12:]
            tf_nameNdm.append((_tf_name, None))
        except InsufficientChromosomesException as e:
            logging.warning('Skipping creating data for {}'.format(peakfile))
            logging.warning('Error message: {}'.format(str(e)))

INFO:root:Looking for peak files for file FOXA2._.labels.tsv
INFO:root:Working on peakfile ChIPseq.liver.FOXA2.conservative._.narrowPeak
INFO:root:Looking for peak files for file NANOG._.labels.tsv


# <<<<<<<< RUN MODELS >>>>>>>>

# Callbacks

In [29]:
from keras import backend as K
def recall(y_true, y_pred):
    """Batch-wise recall for Keras.

    Both the labels and the element-wise product ``y_true * y_pred`` are
    clipped to [0, 1] and rounded before being summed, so the value is the
    share of positive labels that were also predicted positive in this batch.
    ``K.epsilon()`` in the denominator avoids a division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (actual_positives + K.epsilon())
    
def precision(y_true, y_pred):
    """Batch-wise precision for Keras.

    Both the predictions and the element-wise product ``y_true * y_pred`` are
    clipped to [0, 1] and rounded before being summed, so the value is the
    share of positive predictions that carry a positive label in this batch.
    ``K.epsilon()`` in the denominator avoids a division by zero.
    """
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged_positives + K.epsilon())

def f1(y_true, y_pred):
    """Batch-wise F1 score: harmonic mean of `precision` and `recall`.

    ``K.epsilon()`` keeps the denominator non-zero when both are zero.
    """
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)
    ratio = (prec * rec) / (prec + rec + K.epsilon())
    return 2 * ratio

In [30]:
import logging
from sklearn.metrics import *
import numpy as np
from keras.callbacks import Callback
from tqdm import tqdm
class Stats_callback(Callback):
    """Keras callback that scores the model on validation batches after selected epochs.

    On every evaluated epoch the callback predicts ``steps`` batches, computes
    auROC, auPRC and recall at 5/10/25/50 % FDR per batch, averages them over
    the batches, accumulates a confusion matrix over all batches, logs the
    results and appends them to the per-epoch history lists.

    Args:
        gen: indexable batch source; ``gen[i]`` must return ``(features, y_true)``.
        steps: number of batches evaluated per epoch (cast to ``int``).
        cls: iterator yielding the class labels of each batch; required when
            ``convert_to_categorical`` is true.
        convert_to_categorical: if true, predictions are per-position tag
            scores that are collapsed into one 0/1 class per sample, and the
            labels are taken from ``cls`` instead of ``gen``.
        vs_display_length: width of the tp/fn and tn/fp text bars; values
            below zero are treated as 0, which disables the bars.
        show_metric_interval: metrics are computed only on epochs that are a
            multiple of this value (and greater than 1).
        warning_threshold: score from which a prediction counts as positive
            for the "very low positive prediction rate" warning; a falsy value
            disables the warning.

    Raises:
        ValueError: if ``convert_to_categorical`` is set without ``cls``.
    """

    def __init__(self, gen, steps, cls=None, convert_to_categorical=False, vs_display_length = 20, show_metric_interval=1, warning_threshold=0.85):
        # Let the Keras base class set up its own attributes first.
        super(Stats_callback, self).__init__()
        self.auROCs = []
        self.auPRCs = []
        self.re5FDRs = []
        self.re10FDRs = []
        self.re25FDRs = []
        self.re50FDRs = []
        self.confusion_matrix = []
        self.valid_gen = gen
        self.valid_steps = int(steps)
        # convert crf label data into categorical
        self.convert_to_categorical = convert_to_categorical
        # if using convert_to_categorical (ie using crf) need to provide the classification along with OBIE from valid_gen
        if convert_to_categorical:
            if not cls:
                raise ValueError('if using convert_to_categorical (ie using crf) need to provide the classification along with OBIE from valid_gen')
            self.cls_gen = cls
        self.vs_display_length = vs_display_length if vs_display_length>=0 else 0

        # display settings
        self.show_metric_interval = show_metric_interval
        self.warning_threshold = warning_threshold

    @property
    def history(self):
        """Per-epoch metric history as a dict of immutable tuples."""
        return {
            'auROCs': tuple(self.auROCs),
            'auPRCs': tuple(self.auPRCs),
            're5FDRs': tuple(self.re5FDRs),
            're10FDRs': tuple(self.re10FDRs),
            're25FDRs': tuple(self.re25FDRs),
            're50FDRs': tuple(self.re50FDRs),
        }

    @staticmethod
    def auROC(labels, predictions):
        """Area under the ROC curve (``sklearn.metrics.roc_auc_score``)."""
        return roc_auc_score(labels, predictions)

    @staticmethod
    def auPRC(labels, predictions):
        """Area under the precision-recall curve."""
        precision, recall = precision_recall_curve(labels, predictions)[:2]
        return auc(recall, precision)

    @staticmethod
    def recall_at_precision(labels, predictions, precision_at):
        """Recall (in percent) at a precision of ``1 - precision_at``.

        ``precision_at`` is used as the tolerated false discovery rate: the
        recall is read at the index where ``searchsorted`` places the target
        precision in the precision curve.
        """
        threshold = 1.0-precision_at
        precision, recall = precision_recall_curve(labels, predictions)[:2]
        return 100 * recall[np.searchsorted(precision - threshold, 0)]

    def on_train_begin(self, logs=None):
        return

    def on_train_end(self, logs=None):
        return

    def on_epoch_begin(self, epoch, logs=None):
        return

    def on_epoch_end(self, epoch, logs=None):
        """Compute, log and record the validation metrics for this epoch."""
        # Skip epochs 0 and 1 and every epoch that is not on the interval.
        if epoch%self.show_metric_interval or epoch<=1:
            logging.info('Showing metric in {} epochs'.format(self.show_metric_interval-(epoch%self.show_metric_interval)) )
            return
        logging.info('Calculating metrics...')
        epoch_auROC = []
        epoch_auPRC = []
        re5, re10, re25, re50 = [], [], [], []
        tns, fps, fns, tps = 0,0,0,0
        for batch_index in tqdm(range(self.valid_steps)):
            features, y_true = self.valid_gen[batch_index]
            y_pred = np.asarray(self.model.predict(features, verbose=1))
            # NOTE(review): predictions are rounded to 0/1 before the ranking
            # metrics below, which discards score resolution -- confirm intended.
            y_pred = y_pred.round().astype(int)

            if self.convert_to_categorical:
                # A sample is positive when any position's arg-max tag is 1, 2 or 3.
                maxed = np.argmax(y_pred, axis=2)
                _y_pred = np.any((maxed >= 1) & (maxed <= 3), axis=1).astype(int)
            else:
                _y_pred = y_pred

            # NOTE(review): cls_gen is consumed sequentially across epochs while
            # valid_gen is indexed from 0 each time -- verify they stay aligned.
            _y = next(self.cls_gen) if self.convert_to_categorical else y_true

            if self.warning_threshold:
                total_pos = np.sum(_y)
                pred_pos = np.count_nonzero(_y_pred>=self.warning_threshold)
                # Guard against batches without any positive label.
                if total_pos > 0 and pred_pos / total_pos < 0.1:
                    logging.warning('Very low positive prediction rate')
                    logging.warning('{:25}{:25}'.format('y head :', 'y prediction head :'))
                    # str() keeps the width spec valid for array-valued rows;
                    # the min() keeps short batches from raising IndexError.
                    for i in range(min(10, len(_y), len(_y_pred))):
                        logging.warning('{:25}{:25}'.format(str(_y[i]), str(_y_pred[i])))
            # ROC
            try:
                epoch_auROC.append(self.auROC(_y, _y_pred))
            except Exception:
                # np.shape also works when the labels arrive as a plain list.
                logging.error('y shape: {}\ny_pred shape: {}'.format(np.shape(_y), np.shape(_y_pred)))
                logging.error('y : {}\ny_pred : {}'.format(_y, _y_pred))
                raise
            # PRC
            epoch_auPRC.append(self.auPRC(_y, _y_pred))
            # Recalls
            result5, result10, result25, result50 = [self.recall_at_precision(_y, _y_pred, fdr) for fdr in [0.05,0.1,0.25,0.5]]
            re5.append(result5)
            re10.append(result10)
            re25.append(result25)
            re50.append(result50)
            # Compute confusion matrix; fixing the labels guarantees a 2x2
            # result even when a batch contains only one class.
            tn, fp, fn, tp = confusion_matrix(_y, np.around(_y_pred), labels=[0, 1]).ravel()
            tns += tn
            fps += fp
            fns += fn
            tps += tp

        # collect into object's history
        self.auROCs.append(np.mean(epoch_auROC))
        self.auPRCs.append(np.mean(epoch_auPRC))
        self.re5FDRs.append(np.mean(re5))
        self.re10FDRs.append(np.mean(re10))
        self.re25FDRs.append(np.mean(re25))
        self.re50FDRs.append(np.mean(re50))

        self.confusion_matrix.append((tns, fps, fns, tps))

        logging.info('Epoch {}:'.format(epoch))
        logging.info('avg auROC: {:8.4f} avg auPRC: {:8.4f}'.format(np.mean(epoch_auROC), np.mean(epoch_auPRC)))
        logging.info('Recall@5%/10%/25%/50%: {:6.4f} {:6.4f} {:6.4f} {:6.4f}'.format(np.mean(re5), np.mean(re10), np.mean(re25), np.mean(re50)))
        logging.info('tp:{:>10} fn:{:>10} tn:{:>10} fp:{:>10}'.format(tps, fns, tns, fps,))

        if self.vs_display_length:
            # Text bars showing the share of true positives / true negatives.
            # An empty class yields an empty bar instead of a division by zero.
            pos_total = tps + fns
            neg_total = tns + fps
            tp_fn = int(tps/pos_total * self.vs_display_length) if pos_total else 0
            compl_tp_fn = self.vs_display_length - tp_fn
            tn_fp = int(tns/neg_total * self.vs_display_length) if neg_total else 0
            compl_tn_fp = self.vs_display_length - tn_fp

            logging.info('tp|{}|fn   tn|{}|fp'.format('#'*tp_fn + '_'*(compl_tp_fn), '#'*tn_fp + '_'*(compl_tn_fp) ))
        return

    def on_batch_begin(self, batch, logs=None):
        return

    def on_batch_end(self, batch, logs=None):
        return



# Helper Functions

In [31]:
def autofill_params(generator, params_dict):
    '''
    Auto fill generator with parameters from params_dict
    returns curried generator

    Only the entries of params_dict whose keys are parameter names of
    `generator` are kept; the returned zero-argument callable invokes
    `generator` with exactly those keyword arguments.
    '''
    import inspect
    accepted_names = inspect.signature(generator).parameters.keys()
    best_params = {k: v for k, v in params_dict.items() if k in accepted_names}
    def recreated():
        # Call the callable that was passed in, not a hard-coded global factory.
        return generator(**best_params)
    return recreated

In [32]:
def get_rnd_subset(size, *data_arrs):
    '''
    Get a random subset of a set of data with matching indices

    The same `size` randomly chosen row indices (drawn without replacement
    along the first dimension) are applied to every array.

    Returns a list with one subset per input array; an empty list when no
    arrays are given.
    Raises ValueError when the arrays differ in their first dimension.
    '''
    if not data_arrs:
        return []
    n_samples = data_arrs[0].shape[0]
    if any(data_arr.shape[0] != n_samples for data_arr in data_arrs[1:]):
        raise ValueError('Not all datasets have the same number of samples (1st dimension size)')
    subset_selection = np.random.choice(n_samples, size, replace=False)
    return [data_arr[subset_selection,...] for data_arr in data_arrs]

In [33]:
def get_rnd_subset_gen(size, data_arrs_gen, idempotent=False):
    '''
    Get a generator for random subset of a set of data with matching indices using generator. For minimizing memory usage.

    The row selection is drawn once, from the first dimension of the first
    array, and the same indices are applied to every array that follows.
    An empty input yields nothing.

    NOTE(review): with idempotent=True the input is wrapped in itertools.tee,
    but reading from the tee still advances the underlying iterator, so the
    caller's generator is consumed either way -- confirm what was intended.
    '''
    import itertools
    if idempotent:
        _, gen = itertools.tee(data_arrs_gen)
    else:
        # iter() lets plain iterables (e.g. lists) be passed as well.
        gen = iter(data_arrs_gen)
    # A bare next() on an empty input would surface as RuntimeError inside a
    # generator (PEP 479); use a sentinel and stop cleanly instead.
    exhausted = object()
    peek = next(gen, exhausted)
    if peek is exhausted:
        return
    subset_selection = np.random.choice(peek.shape[0], size, replace=False)

    yield peek[subset_selection, ...]
    for data_arr in gen:
        yield data_arr[subset_selection, ...]

In [34]:
def chunks(iterable, size):
    '''
    Split an iterable into consecutive tuples of at most `size` items.

    Returns an iterator; the last tuple may be shorter, and iteration stops
    at the first empty tuple.
    '''
    from itertools import islice
    source = iter(iterable)

    def _pieces():
        while True:
            piece = tuple(islice(source, size))
            if piece == ():
                return
            yield piece

    return _pieces()

In [35]:
def load_data_chunks_gen(files_to_load, n_types, size_limits=None):
    '''
    Lazily load ``.npy`` files in groups of `n_types` paths.

    Yields one list of arrays per group. When `size_limits` is given, each
    group is paired with its limit and a truthy limit reduces the group to a
    random subset of that many rows (via `get_rnd_subset_gen`); because the
    pairing uses zip, it stops at the shorter of the two sequences.
    '''
    path_groups = chunks(files_to_load, n_types)
    if size_limits:
        limited_groups = zip(size_limits, path_groups)
    else:
        limited_groups = ((False, group) for group in path_groups)
    for limit, group in limited_groups:
        # map() is lazy: each file is only read when the list is built.
        loaded = map(np.load, group)
        if not limit:
            yield list(loaded)
            continue
        yield list(get_rnd_subset_gen(limit, loaded))

# Hyperopt setup

In [36]:
from models import ReverseComplementLayer
from models import CRF
from models import CRF_ext

In [37]:
# Holds the most recently constructed ClassWrapper so that the module-level
# loss/metric functions below can reach its bound methods.
instanceHolder = {"instance": None}


class ClassWrapper(CRF_ext):
    """CRF_ext subclass that registers each new instance in `instanceHolder`."""

    def __init__(self, *args, **kwargs):
        # Register before delegating, so the holder is set even if the
        # parent initialiser raises.
        instanceHolder["instance"] = self
        super().__init__(*args, **kwargs)
def loss(*args):
    """Forward to ``loss_function`` of the instance stored in `instanceHolder`."""
    return getattr(instanceHolder["instance"], "loss_function")(*args)
def accuracy(*args):
    """Forward to ``accuracy`` of the instance stored in `instanceHolder`."""
    return getattr(instanceHolder["instance"], "accuracy")(*args)
def viterbi_precision(*args):
    """Forward to ``viterbi_precision`` of the instance stored in `instanceHolder`."""
    return getattr(instanceHolder["instance"], "viterbi_precision")(*args)
def f1(*args):
    """Forward to ``viterbi_f1`` of the instance stored in `instanceHolder`.

    Note: this rebinds the name ``f1``, replacing the Keras-backend metric of
    the same name defined in the callbacks section of this notebook.
    """
    return getattr(instanceHolder["instance"], "viterbi_f1")(*args)
def recall(*args):
    """Forward to ``viterbi_recall`` of the instance stored in `instanceHolder`.

    Note: this rebinds the name ``recall``, replacing the Keras-backend metric
    of the same name defined in the callbacks section of this notebook.
    """
    return getattr(instanceHolder["instance"], "viterbi_recall")(*args)
def precision(*args):
    """Forward to ``viterbi_precision`` of the instance stored in `instanceHolder`.

    Note: this rebinds the name ``precision``, replacing the Keras-backend
    metric of the same name defined in the callbacks section of this notebook.
    """
    return getattr(instanceHolder["instance"], "viterbi_precision")(*args)

In [38]:
import keras
from sklearn.metrics import *
from tqdm import tqdm

def create_hyper_objective(model_handle, id_name, output_path, dm,
                           train_sequencer, train_len,
                           valid_sequencer, valid_cls, valid_len,
                           test_sequencer, test_x, test_cls, test_len,
                           trials, 
                           use_generator=True,
                           hyper_obj_pkl='hyper_obj_savepoint.pkl',
                           subtrials_results_log='trial.log',
                           convert_to_categorical=False,
                           patience=20, show_metric_interval=5,
                           best_model_h5 = 'best_model.h5', hyper_obj_log=None):
    """Build the objective function that hyperopt minimises.

    The returned ``hyper_objective(params)`` trains one model with the given
    hyper-parameters, scores it on the test data and returns ``-auPRC`` as the
    loss, so that minimising the loss maximises the auPRC.

    Args:
        model_handle: object whose ``autofill_params(params)`` returns the
            Keras model to train.
        id_name: prefix for checkpoint, TensorBoard and cached-dataset files.
        output_path: directory for checkpoints and TensorBoard logs.
        dm: not used by this function.
        train_sequencer, valid_sequencer: callables taking a batch size and
            returning an indexable batch sequence.
        train_len, test_len: sample counts used to derive the step counts.
        valid_cls: callable taking a batch size and returning the class-label
            iterator handed to `Stats_callback`; only used when
            ``convert_to_categorical`` is true.
        valid_len, test_sequencer: currently unused.
        test_x: callable taking a batch size and returning the generator that
            is passed to ``predict_generator``.
        test_cls: callable taking a batch size and returning an iterable of
            label batches.
        trials: hyperopt trials object; pickled to ``hyper_obj_pkl`` at the
            start of every evaluation once it holds at least one status.
        use_generator: train with ``fit_generator`` when true; otherwise
            materialise the batches (cached in ``<id_name>_train.hdf5``) and
            train with ``fit``.
        hyper_obj_pkl: path of the trials checkpoint.
        subtrials_results_log: file the per-trial scores are appended to.
        convert_to_categorical: collapse per-position tag predictions into one
            0/1 class per sample before scoring.
        patience: early-stopping patience in epochs.
        show_metric_interval: forwarded to `Stats_callback`.
        best_model_h5: path where the best model so far is saved.
        hyper_obj_log: dict receiving ``best_auprc_so_far``, ``best_model``
            and ``best_model_history``; a fresh dict is created when omitted.

    Returns:
        The ``hyper_objective(params)`` callable.
    """
    # A ``{}`` default would be shared (and mutated) across calls.
    if hyper_obj_log is None:
        hyper_obj_log = {}
    hyper_obj_log['best_auprc_so_far'] = 0.0
    def hyper_objective(params):
        from hyperopt import STATUS_OK
        import itertools
        import os
        import pickle
        import re
        if trials.statuses():
            logging.info('Making checkpoint for hyperparameters')
            with open(hyper_obj_pkl, 'wb') as pkl:
                pickle.dump(trials, pkl)
            
        logging.info('Current parameters :\n{}'.format('\n'.join([str(k)+' : '+str(v) for k,v in params.items()])))
        batch_n, epoch_n = params['batch_size'], params['epochs']
        
        train_steps = train_len//batch_n
#         validation_steps = valid_len//batch_n
        # scale down the number of validations but keep distribution
        # NOTE(review): derived from train_len, not valid_len -- confirm intended.
        validation_steps = int((train_len/4)//batch_n)
        test_steps = test_len//batch_n
        
        train_generator = train_sequencer(batch_n)
        validation_generator = valid_sequencer(batch_n)
        
        # set up callbacks
        monitor = params['monitor'] if 'monitor' in params else 'val_loss'
        mode = 'max' if 'monitor' in params else 'auto'
        logging.info("Current monitor mode is set to {}".format(mode))
        earlystopping_cb = keras.callbacks.EarlyStopping(monitor=monitor,
                                          min_delta=0.0001,
                                          patience=patience,
                                          verbose=0, mode=mode)
        stats_cb = Stats_callback(validation_generator, validation_steps,
                                  convert_to_categorical=convert_to_categorical, 
                                  cls=valid_cls(batch_n) if convert_to_categorical else None,
                                  show_metric_interval=show_metric_interval)
        from keras.callbacks import ModelCheckpoint
        checkpoint_path = os.path.join(output_path, '%s-{epoch:04d}-{val_acc:0.2f}.hdf5' % (id_name))
        checkpoint_cb = ModelCheckpoint(checkpoint_path, monitor=monitor, verbose=1, save_best_only=True, mode=mode)
        board_path = os.path.join(output_path, '{}_tensorboard.log'.format(id_name))
        board_cb = keras.callbacks.TensorBoard(board_path, batch_size=batch_n)

        cbs = [earlystopping_cb, stats_cb, checkpoint_cb, board_cb]
        # check whether a model was under training
        epoch_restart = 0
        load_file = ''
        try:
            if os.path.exists('in_progress.lck'):
                logging.info('Continuing previous training session')
                regex = re.compile(r'(\w+)-(\d+)-(\d*\.\d+)\.hdf5', re.IGNORECASE)
                # Checkpoint names look like <id_name>-<epoch>-<val_acc>.hdf5
                candidates = []
                for file in os.listdir(output_path):
                    found = regex.match(file)
                    if found and found.group(1) == id_name:
                        # Compare epochs numerically rather than as strings.
                        candidates.append((int(found.group(2)), file))
                # max() raises ValueError when there is no matching checkpoint.
                latest_epoch, latest_file = max(candidates, key=lambda c: c[0])
                epoch_restart = latest_epoch
                logging.info('Restarting from {}'.format(epoch_restart))
                load_file = os.path.join(output_path, latest_file)
                logging.info('load file from {}'.format(load_file))
        except ValueError:
            logging.info('Error loading previous training. Continuing without loading')
            
        if load_file:
            from keras.models import load_model
            base = load_model(load_file, custom_objects={'ReverseComplementLayer': ReverseComplementLayer, 
                                                         "ClassWrapper": ClassWrapper ,
                                                         "CRF_ext": ClassWrapper, "loss": loss, "accuracy":accuracy,
                                                         "viterbi_precision":viterbi_precision, "f1":f1,
                                                         "recall":recall, "precision":precision})
        else:
            base = model_handle.autofill_params(params)
        try:
            # Marker file: its presence on the next run triggers the resume logic above.
            with open('in_progress.lck', 'w'):
                pass
            
            if use_generator:
                history = base.fit_generator(train_generator,
                                             steps_per_epoch= train_steps,
                                             epochs=epoch_n,
                                             callbacks=cbs,
                                             validation_data=validation_generator,
                                             validation_steps=validation_steps,
    #                                          use_multiprocessing=True, workers=1, # may not work on Windows
                                             verbose=1, initial_epoch=epoch_restart)
            else:
                filename = id_name+'_train.hdf5'
                import h5py
                if not all_paths_exists(filename):
                    x0, y0 = train_generator[0]                    
                    logging.info('Constructing training data from metadata')
                    _x = np.empty((batch_n*train_steps, *x0.shape[1:]))
                    _y = np.empty((batch_n*train_steps, *y0.shape[1:]))
                    for i in tqdm(range(train_steps)):
                        sample = train_generator[i]
                        _x[i*batch_n:(i+1)*batch_n] = sample[0]
                        _y[i*batch_n:(i+1)*batch_n] = sample[1]
                    logging.info('Constructing validation data from metadata')
                    val_x = np.empty((batch_n*validation_steps, *x0.shape[1:]))
                    val_y = np.empty((batch_n*validation_steps, *y0.shape[1:]))
                    for i in tqdm(range(validation_steps)):
                        sample = validation_generator[i]
                        val_x[i*batch_n:(i+1)*batch_n] = sample[0]
                        val_y[i*batch_n:(i+1)*batch_n] = sample[1]
                    logging.info('Writing to disk')
                    with h5py.File(filename, "w") as h5:
                        h5.create_dataset('x', data=_x, compression='lzf',)
                        h5.create_dataset('y', data=_y, compression='lzf',)
                        h5.create_dataset('val_x', data=val_x, compression='lzf',)
                        h5.create_dataset('val_y', data=val_y, compression='lzf',)
                else:
                    logging.info('Loading from disk')
                    with h5py.File(filename, "r") as h5:
                        # ``ds[()]`` reads the whole dataset; ``.value`` no
                        # longer exists in h5py 3.
                        _x = h5['x'][()]
                        _y = h5['y'][()]
                        val_x = h5['val_x'][()]
                        val_y = h5['val_y'][()]
                history = base.fit(_x,_y, batch_n, epochs=epoch_n, callbacks=cbs, validation_data=(val_x, val_y), 
                                   verbose=1, initial_epoch=epoch_restart)
        except ValueError:
            # Print the architecture to help diagnose shape problems, then re-raise.
            base.summary()
            raise
        base.summary()
        
        # Training finished normally: clear the in-progress marker.
        os.remove('in_progress.lck')
        
#         logging.info('Evaluating model on test set of size {}'.format(test_len))
        
#         test_generator = test_sequencer(batch_n)
        
#         score, acc = base.evaluate_generator(test_generator, 
#                                              steps=test_steps, 
#                                              verbose=1)
        
#         logging.info('Got score {} and accuracy {}'.format(score, acc))
        
        logging.info('Getting test y prediction')
        test_pred = base.predict_generator(test_x(batch_n), steps=test_steps, verbose=1)
        def contains_BIE(sparse):
            # 1 when any tag in the sequence is 1, 2 or 3, else 0.
            for c in sparse:
                if c==1 or c==2 or c==3:
                    return 1
            return 0
        if convert_to_categorical:
            logging.info('Converting test y prediction to categorical')
            test_pred = test_pred.round().astype(int)
            test_cls_pred = [ contains_BIE([ np.argmax(a) for a in each]) for each in test_pred ]
#             maxed = [ [ np.argmax(a) for a in each] for each in test_pred]
#             test_cls_pred =[ contains_BIE(m) for m in maxed]
            test_pred = np.array(test_cls_pred)
        else:
            logging.info('Converting test y prediction to list')
            test_pred = list(test_pred[:,0])
        logging.info('Getting test y')
        test_real = [x for batch in itertools.islice(test_cls(batch_n),test_steps) for x in batch ]
        
        # make sure they are the same length; an explicit check (instead of
        # assert + bare except) also works under ``python -O``.
        if len(test_pred) != len(test_real):
            logging.error('Lengths mismatch at testing; prediction {} real {} samples {}'.format(len(test_pred), len(test_real), test_len))
            # Truncate both to the common length without building extra copies.
            common_len = min(len(test_pred), len(test_real))
            test_pred = test_pred[:common_len]
            test_real = test_real[:common_len]
        
        def auROC(labels, predictions):
            return roc_auc_score(labels, predictions)

        def auPRC(labels, predictions):
            precision, recall = precision_recall_curve(labels, predictions)[:2]
            return auc(recall, precision)

        def recall_at_precision(labels, predictions, precision_at):
            # precision_at is used as the tolerated FDR: target precision = 1 - precision_at.
            threshold = 1.0-precision_at
            precision, recall = precision_recall_curve(labels, predictions)[:2]
            return 100 * recall[np.searchsorted(precision - threshold, 0)]
        
        # auROC,  auPRC, recalls
        logging.info('Calculating auROC')
        auroc = auROC(test_real, test_pred)
        logging.info('auROC: {}'.format(auroc))
        logging.info('Calculating auPRC')
        auprc = auPRC(test_real, test_pred)
        logging.info('auPRC: {}'.format(auprc))
        logging.info('Calculating recall @ precisions')
        re5, re10, re25, re50 = [recall_at_precision(test_real, test_pred, fdr) for fdr in [0.05,0.1,0.25,0.5]]
        # confusion matrix 
        confusion = confusion_matrix(test_real, np.round(test_pred))
    
        logging.info('auROC: {} auPRC {} re@5/10/25/50: {:>7.5}/{:>7.5}/{:>7.5}/{:>7.5}'.format(auroc, auprc, re5, re10, re25, re50))
        
        # Keep the best model (by test auPRC) seen across all trials.
        if auprc > hyper_obj_log['best_auprc_so_far']:
            base.save(best_model_h5)
            hyper_obj_log['best_auprc_so_far'] = auprc
            hyper_obj_log['best_model'] = base
            hyper_obj_log['best_model_history'] = history
        logging.info('Writing scores to file')
        with open(subtrials_results_log,'a') as output:
            base.summary(print_fn=lambda x:output.write('{}\n'.format(x)))
            output.write('auROC: {} auPRC {} re@5/10/25/50: {:>7.5}/{:>7.5}/{:>7.5}/{:>7.5}\n'.format(auroc, auprc, re5, re10, re25, re50))
            output.write('Confusion matrix:\n{}'.format(str(confusion)))
        return { 
            # results required by hyperopt
            'loss': -auprc, 'status': STATUS_OK, 
            # other additional results
            'metrics_history':stats_cb.history
               }
    return hyper_objective

# Run Model

In [39]:
def run_model(model_starter, parameter_space,
              model_name, tf_name, 
              filter_file='total_regions.blacklistfiltered.merged.bed',
              training_size_limit=None,
              valid_size_limit=None,
              test_size_limit=None,
              n_trials=32,
              patience=20,
              use_labels_as_y=False,
              use_generator=True,
              output_path='.',
              show_design=False, 
              seq_data_only=False, reduce_negative_samples=False,
              show_metric_interval=5,
              rerun_full=True):
    
    import logging
    id_name = '{}_{}'.format(tf_name, model_name)
    tf_folder = './{tf_name}_label_dataset'.format(tf_name=tf_name) 

    # setup datasets
    logging.info('Setting up data streams')
    import re
    peakfile_regex = re.compile(r'chipseq\.\w+\.%s\..*\.\_\.narrowpeak'%tf_name, re.IGNORECASE)
    labelfile_regex = re.compile(r'%s\.\_\.labels.tsv'%tf_name, re.IGNORECASE)
    for file in os.listdir('.'):
        if peakfile_regex.match(file):
            peakfile = file
        elif labelfile_regex.match(file):
            label_file = file
    try:
        logging.info('Creating data manager for peakfile {} labelfile {}'.format(peakfile, label_file))
        dm = DataManager(label_file, peakfile, output_dir='datasets', 
                        move_finished_src='finished_peakfiles', 
                        filter_file=filter_file,
                        reduce_negative_samples=reduce_negative_samples, # check_set_ratio=9, # use for reduce_negative_samples=True
                        )
        dm.samples # make sure samples are already created
        generators = dm.create_datagen_from_samples(useDNAshapeR=False)
#         train_x, train_label, train_seq, train_cls, train_labels_sequencer, train_cls_sequencer = generators[:6]
        train_labels_sequencer, train_cls_sequencer = generators[:2]
        valid_x, valid_label, valid_seq, valid_cls, valid_labels_sequencer, valid_cls_sequencer = generators[2:8]
        test_x, test_label, test_seq, test_cls, test_labels_sequencer, test_cls_sequencer = generators[8:]
    except InsufficientChromosomesException as e:
        logging.warning('Skipping creating data for {}'.format(peakfile))
        logging.warning('Error message: {}'.format(str(e)))
    
    if seq_data_only:
        raise NotImplementedError
    
    # create model
    
    # select Tree Parzen Estimator 
    # check this for comparison with other methods
    # http://www.cs.ubc.ca/~hutter/papers/13-BayesOpt_EmpiricalFoundation.pdf
    tpe_algorithm = tpe.suggest
    
    # store track progress
    hyper_obj_pkl = '{}_hyper_obj_savepoint.pkl'.format(id_name) 
    hyper_obj_pkl = os.path.join(output_path, hyper_obj_pkl)
    if not os.path.exists(hyper_obj_pkl):
        tpe_progress = Trials()
    else:
        logging.info('Loading available previous hyperparameters settings')
        try:
            with open(hyper_obj_pkl, 'rb') as pkl:
                tpe_progress = pickle.load(pkl)
                print('tpe statuses {}'.format(tpe_progress.statuses()))
                if 'new' == tpe_progress.statuses()[0]:
                    tpe_progress = Trials()
        except ValueError:
            logging.error('Loading previous hyperparameters failed. Creating new set')
            tpe_progress = Trials()
    
    trial_log_file = '{}_trials.log'.format(id_name)
    trial_log_file = os.path.join(output_path, trial_log_file)
    
    model_handle = model_starter((dm.sample_length, dm.feature_dimensions))
    best_model_h5 = os.path.join(output_path, '{}_best_model.h5'.format(id_name)) 
    hyper_obj_log = {}
    logging.info('Setting up hyperopt objective')
    if use_labels_as_y:
        objective = create_hyper_objective(model_handle, id_name, output_path, dm,
                                           train_labels_sequencer, 
                                           dm.dataset2counts_pos['train'] * 2,
                                           valid_labels_sequencer,
                                           valid_cls,
                                           dm.dataset2counts_pos['valid'] + dm.dataset2counts_neg['valid'],
                                           test_labels_sequencer, test_x, test_cls,
                                           dm.dataset2counts_pos['test'] + dm.dataset2counts_neg['test'],
                                           tpe_progress,
                                           use_generator,
                                           hyper_obj_pkl,
                                           trial_log_file,
                                           convert_to_categorical = True,
                                           patience=patience, show_metric_interval=show_metric_interval,
                                           best_model_h5=best_model_h5, hyper_obj_log=hyper_obj_log)
    else:
        objective = create_hyper_objective(model_handle, id_name, output_path, dm,
                                           train_cls_sequencer,
                                           dm.dataset2counts_pos['train'] * 2,
                                           valid_cls_sequencer, 
                                           valid_cls,
                                           dm.dataset2counts_pos['valid'] + dm.dataset2counts_neg['valid'],
                                           test_cls_sequencer, test_x, test_cls,
                                           dm.dataset2counts_pos['test'] + dm.dataset2counts_neg['test'],
                                           tpe_progress,
                                           use_generator,
                                           hyper_obj_pkl,  
                                           trial_log_file,
                                           convert_to_categorical = False,
                                           patience=patience, show_metric_interval=show_metric_interval,
                                           best_model_h5=best_model_h5, hyper_obj_log=hyper_obj_log)        
    # requires networkx1.11 newer versions may not work 
    optimum = fmin(objective, parameter_space, tpe_algorithm, trials=tpe_progress, max_evals=n_trials)
    
    try:
        if show_design:
            from keras.utils.vis_utils import plot_model
            logging.info('drawing model architecture')
            if type(show_design)==str:
                plot_model(model_handle.model, show_design)
            else: 
                plot_model(model_handle.model, '{}_arch.png'.format(model_handle.model_factory.__name__))
    except:
        logging.warning("Not drawing model for {}".format(model_name))
    
    import hyperopt
    best_params = hyperopt.space_eval(parameter_space, optimum)
    best_score = -tpe_progress.best_trial['result']['loss']
    metrics_history = tpe_progress.best_trial['result']['metrics_history']
    
    with open(os.path.join(output_path, 'test_%s.txt'%(id_name)), 'a') as out:
        out.write("Best: {} using {}\n".format(best_score, str(best_params)))
        out.write("Metrics history\n")
        n = len(metrics_history)
        out.write(('{:>20} '* n +'\n').format(*metrics_history.keys()))
        for e in zip(*metrics_history.values()):
            out.write (('{:>20} '* n +'\n').format(*e))
            
    if 'best_model_history' in hyper_obj_log: # doesn't exist if no trials are run
        best_model_history = hyper_obj_log['best_model_history']
    
        import history_plot
        history_plot.plot_history(best_model_history, os.path.join(output_path, id_name))
    
    
    # TODO: generate cb generators for both this and create_hyper_objective
    
    
    #### Rerun best model from tpe trials ####
    if not rerun_full:
        return
    
    
#     recreation = model_handle.autofill_params(best_params)
#     recreation.summary() 
    
#     logging.info('Running full training')
    
#     logging.info('Setting up to load full data')
#     size_limit =  None
#     dataset_gen = load_data_chunks_gen(paths_to_load, file_types, size_limit)

#     # load datset
#     train_x, train_seq, train_cls, train_label = next(dataset_gen)
#     logging.info('Loaded training data')
#     valid_x, valid_seq, valid_cls, valid_label = next(dataset_gen)
#     logging.info('Loaded validation data')
#     test_x, test_seq, test_cls, test_label = next(dataset_gen)
#     logging.info('Loaded test data')
#     if seq_data_only:
#         train_x = train_x[:, :, :4]
#         valid_x = valid_x[:, :, :4]
#         test_x = test_x[:, :, :4]
    
#     earlystopping_cb = keras.callbacks.EarlyStopping(monitor='val_loss',
#                                       min_delta=0.0001,
#                                       patience=patience,
#                                       verbose=0, mode='auto')
#     statistics = Stats_callback(valid_cls, convert_to_categorical=True)

#     from keras.callbacks import ModelCheckpoint
#     checkpoint_path = '%s-{epoch:04d}-{val_acc:0.2f}.hdf5' % (id_name)
#     checkpoint_path = os.path.join(output_path, checkpoint_path)
#     checkpoint_cb = ModelCheckpoint(checkpoint_path, monitor='val_acc', verbose=1, save_best_only=True)
#     board_path = os.path.join(output_path, '{}_tensorboard.log'.format(id_name))
#     board_cb = keras.callbacks.TensorBoard(board_path, batch_size=best_params['batch_size'])
    
#     cbs = [earlystopping_cb, statistics, checkpoint_cb]
#     # TODO: add load from checkpoints
#     if use_labels_as_y:
#         history = recreation.fit(train_x, train_label, 
#                              epochs=best_params['epochs'],
#                              batch_size=best_params['batch_size'],
#                              validation_data=(valid_x, valid_label),
#                              callbacks=cbs)
#         test_metric_out = recreation.evaluate(test_x, test_label)
#     else:
#         history = recreation.fit(train_x, train_cls, 
#                              epochs=best_params['epochs'],
#                              batch_size=best_params['batch_size'],
#                              validation_data=(valid_x, valid_cls),
#                              callbacks=cbs)
#         test_metric_out = recreation.evaluate(test_x, test_cls)
        
#     with open(os.path.join(output_path, 'test_metrics'), 'w') as test_metrics:
#         try:
#             for metric_name, metric_out in zip(recreation.metrics_names, test_metric_out):
#                 test_metrics.write('{:30}:{:>30}\n'.format(metric_name, metric_out))
#         except:
#             test_metrics.write('{:30}:{:>30}\n'.format('Test loss', test_metric_out))
#     # plot history
#     import history_plot
#     history_plot.plot_history(history, os.path.join(output_path, id_name))

#     # TODO: plot other things
    
#     def plot_curves(x, y, use_labels_as_y=False):
#         from sklearn.metrics import roc_curve
#         from sklearn.metrics import auc

#         y_pred = recreation.predict(x)
#         if use_labels_as_y:
#             maxed = [ [ np.argmax(a) for a in each] for each in y_pred]
#             cls_pred =[ 1 if (1 in m or 2 in m or 3 in m) else 0 for m in maxed]
#             _y_pred = np.array(cls_pred)
#         else:
#             _y_pred = y_pred

#         _y = y
    
#         fpr, tpr, thresholds = roc_curve(_y, _y_pred)
#         auc_model = auc(fpr, tpr)
    
#         import matplotlib.pyplot as plt
#         plt.figure(1)
#         plt.plot([0, 1], [0, 1], 'k--')
#         plt.plot(fpr, tpr, label='{} (area = {:.3f})'.format(model_name, auc_model))
#         plt.xlabel('False positive rate')
#         plt.ylabel('True positive rate')
#         plt.title('ROC curve for {}'.format(model_name))
#         plt.legend(loc='best')
#         figure_path = os.path.join(output_path, id_name+'_ROCAUC.png')
#         plt.savefig(figure_path)
#     plot_curves(valid_x, valid_cls, use_labels_as_y=use_labels_as_y)
    
#     # save final model
#     final_model_path = os.path.join(output_path, id_name+"_final.h5")
#     recreation.save(final_model_path)
    
#     with open(os.path.join(output_path, '{}_final_history.pkl'.format(id_name)), 'wb') as pkl:
#         pickle.dump(statistics.history, pkl)
#     with open(os.path.join(output_path, '{}_final_history.txt'.format(id_name)), 'w') as out:
#         out.write("Metrics history\n")
#         out.write('{:>20} {:>20} {:>20} {:>20}\n'.format(*statistics.history.keys()))
#         for e in zip(*statistics.history.values()):
#             out.write ('{:>20} {:>20} {:>20} {:>20}\n'.format(*e))
            
#     return statistics, history

# Setup Models

In [40]:
from models import *
from model_handler import *

#hyperparameter optimizer
from keras.wrappers.scikit_learn import KerasClassifier
import pickle
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

# ROC scores
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback

In [41]:
# Registries of (model factory, model name, hyperopt search space) tuples.
# `labelled_test_models` entries are later run with use_labels_as_y=True;
# `test_models` entries are run with run_model's default targets.
test_models, labelled_test_models = [], []
# `patience` and `n_trials` are forwarded to run_model; n_trials becomes fmin's max_evals.
patience, n_trials = 10, 1

# Search space for the Models.cnn_cnn factory, registered as 'cnn_cnn' in test_models.
_model_choice, _model_name = Models.cnn_cnn, 'cnn_cnn'

search_space = dict(
    # hp.choice entries are sampled by hyperopt; plain values are passed through as constants.
    cnn1_n_filters=hp.choice('cnn1_n_filters', [8, 16]),
    cnn1_kernel_size=hp.choice('cnn1_kernel_size', [8, 16, 32]),
    cnn2_n_filters=hp.choice('cnn2_n_filters', [4, 8]),
    # Left as None rather than searched.
    cnn2_kernel_size=None,
    do_rate=hp.choice('do_rate', [0.1, 0.3, 0.5]),
    dense_size=hp.choice('dense_size', [128, 256, 1024]),
    # One-element choice: always resolves to this optimizer instance.
    optimizer=hp.choice('optimizer', [Adam()]),
    batch_size=256,
    epochs=200,
)
test_models.append((_model_choice, _model_name, search_space))

# Search space for the Models.bilstm_crf factory, registered as 'bilstm_crf' in labelled_test_models.
_model_choice, _model_name = Models.bilstm_crf, 'bilstm_crf'

search_space = dict(
    # hp.choice entries are sampled by hyperopt; plain values are passed through as constants.
    lstm_size=hp.choice('lstm_size', [16, 32, 64]),
    bi_do_rate=hp.choice('bi_do_rate', [0.1, 0.25, 0.5]),
    rec_do_rate=hp.choice('rec_do_rate', [0.1, 0.25, 0.5]),
    # One-element choice: always resolves to this optimizer instance.
    optimizer=hp.choice('optimizer', [Adam(lr=0.0001)]),
    batch_size=512,
    epochs=200,
)

labelled_test_models.append((_model_choice, _model_name, search_space))

# Search space for the Models.cnn_bilstm_crf factory, registered as 'cnn_bilstm_crf'
# in labelled_test_models.
_model_choice, _model_name = Models.cnn_bilstm_crf, 'cnn_bilstm_crf'

search_space = dict(
    # Single-option choices pin a value; only lstm_size and do_rate are really searched.
    cnn_kernel_size=hp.choice('cnn_kernel_size', [26]),
    cnn_n_filters=hp.choice('cnn_n_filters', [32]),
    lstm_size=hp.choice('lstm_size', [16, 32, 64]),
    crf_size=hp.choice('crf_size', [4]),
    do_rate=hp.choice('do_rate', [0.1, 0.25, 0.5]),
    d1_size=hp.choice('d1_size', [4]),
    optimizer=hp.choice('optimizer', [Adam(lr=0.0001)]),
    # Plain constants, not hyperopt expressions.
    batch_size=512,
    epochs=200,
)

labelled_test_models.append((_model_choice, _model_name, search_space))

# Search space for the Models.cnn_cnn_lstm factory, registered as 'cnn_cnn_lstm' in test_models.
_model_choice, _model_name = Models.cnn_cnn_lstm, 'cnn_cnn_lstm'

search_space = dict(
    # hp.choice entries are sampled by hyperopt; plain values are passed through as constants.
    cnn1_n_filters=hp.choice('cnn1_n_filters', [8, 16, 32]),
    cnn1_kernel_size=hp.choice('cnn1_kernel_size', [8, 16, 32]),
    cnn2_n_filters=hp.choice('cnn2_n_filters', [4, 8]),
    do_rate=hp.choice('do_rate', [0.1, 0.3, 0.5]),
    lstm_size=hp.choice('lstm_size', [4, 8, 16, 32]),
    dense_size=hp.choice('dense_size', [128, 256, 1024]),
    # One-element choice: always resolves to this optimizer instance.
    optimizer=hp.choice('optimizer', [Adam()]),
    batch_size=256,
    epochs=200,
)
test_models.append((_model_choice, _model_name, search_space))

# Search space for the Models.cnn_cnn_bilstm factory, registered as 'cnn_cnn_bilstm' in test_models.
_model_choice, _model_name = Models.cnn_cnn_bilstm, 'cnn_cnn_bilstm'

search_space = dict(
    # hp.choice entries are sampled by hyperopt; plain values are passed through as constants.
    cnn1_n_filters=hp.choice('cnn1_n_filters', [8, 16, 32]),
    cnn1_kernel_size=hp.choice('cnn1_kernel_size', [8, 16, 32]),
    cnn2_n_filters=hp.choice('cnn2_n_filters', [4, 8]),
    do_rate=hp.choice('do_rate', [0.1, 0.3, 0.5]),
    lstm_size=hp.choice('lstm_size', [4, 8, 16, 32]),
    dense_size=hp.choice('dense_size', [128, 256, 1024]),
    # One-element choice: always resolves to this optimizer instance.
    optimizer=hp.choice('optimizer', [Adam()]),
    batch_size=256,
    epochs=200,
)
test_models.append((_model_choice, _model_name, search_space))

# Search space for the Models.cnn_cnn_dnn factory, registered as 'cnn_cnn_dnn' in test_models.
_model_choice, _model_name = Models.cnn_cnn_dnn, 'cnn_cnn_dnn'

search_space = dict(
    # hp.choice entries are sampled by hyperopt; plain values are passed through as constants.
    cnn1_n_filters=hp.choice('cnn1_n_filters', [8, 16, 32]),
    cnn1_kernel_size=hp.choice('cnn1_kernel_size', [8, 16, 32]),
    cnn2_n_filters=hp.choice('cnn2_n_filters', [4, 8]),
    do_rate=hp.choice('do_rate', [0.1, 0.3, 0.5]),
    d1_size=hp.choice('d1_size', [64, 128, 256]),
    d2_size=hp.choice('d2_size', [16, 32]),
    d3_size=hp.choice('d3_size', [4, 8]),
    dense_size=hp.choice('dense_size', [8, 16, 32]),
    # One-element choice: always resolves to this optimizer instance.
    optimizer=hp.choice('optimizer', [Adam()]),
    batch_size=256,
    epochs=200,
)
test_models.append((_model_choice, _model_name, search_space))

In [42]:
# Search space for the Models.cnn factory, registered as 'deepbind' in test_models.
_model_choice, _model_name = Models.cnn, 'deepbind'

search_space = dict(
    # NOTE(review): the hyperopt labels ('cnn_n_filters', 'cnn_kernel_size') differ from the
    # dict keys ('cnn1_n_filters', 'cnn1_kernel_size') -- confirm this is intentional.
    cnn1_n_filters=hp.choice('cnn_n_filters', [3]),
    cnn1_kernel_size=hp.choice('cnn_kernel_size', [26]),
    do_rate=hp.choice('do_rate', [0.2, 0.5]),
    dense_size=hp.choice('dense_size', [1024]),
    # One-element choice: always resolves to this optimizer instance.
    optimizer=hp.choice('optimizer', [Adam(lr=0.0001)]),
    # Plain constants, not hyperopt expressions.
    batch_size=512,
    epochs=200,
)

test_models.append((_model_choice, _model_name, search_space))

In [43]:
# Search space for the Models.cnn_cnn factory, registered as 'deepbind_plus' in test_models.
# NOTE(review): unlike the 'cnn_cnn' space, no cnn2_* keys are given here -- presumably the
# model handler fills in defaults; confirm.
_model_choice, _model_name = Models.cnn_cnn, 'deepbind_plus'

search_space = dict(
    # NOTE(review): the hyperopt labels ('cnn_n_filters', 'cnn_kernel_size') differ from the
    # dict keys ('cnn1_n_filters', 'cnn1_kernel_size') -- confirm this is intentional.
    cnn1_n_filters=hp.choice('cnn_n_filters', [3]),
    cnn1_kernel_size=hp.choice('cnn_kernel_size', [26]),
    do_rate=hp.choice('do_rate', [0.2, 0.5]),
    dense_size=hp.choice('dense_size', [1024]),
    # One-element choice: always resolves to this optimizer instance.
    optimizer=hp.choice('optimizer', [Adam(lr=0.0001)]),
    # Plain constants, not hyperopt expressions.
    batch_size=512,
    epochs=200,
)

test_models.append((_model_choice, _model_name, search_space))

In [44]:
# Search space for the Models.bigru_crf factory, registered as 'bigru_crf' in labelled_test_models.
_model_choice, _model_name = Models.bigru_crf, 'bigru_crf'

search_space = dict(
    # Every hp.choice has a single option, so this space pins one configuration.
    lstm_size=hp.choice('lstm_size', [64]),
    bi_do_rate=hp.choice('bi_do_rate', [0.1]),
    rec_do_rate=hp.choice('rec_do_rate', [0.5]),
    optimizer=hp.choice('optimizer', [Adam(lr=0.0001)]),
    # Plain constants, not hyperopt expressions.
    batch_size=512,
    epochs=200,
    # Metric name to monitor during training (the run log shows checkpoints driven by val_f1).
    monitor='val_f1',
)

labelled_test_models.append((_model_choice, _model_name, search_space))

In [45]:
# Search space for the Models.s_bigru_crf factory, registered as 's_bigru_crf'
# in labelled_test_models.
_model_choice, _model_name = Models.s_bigru_crf, 's_bigru_crf'

search_space = dict(
    # Every hp.choice has a single option, so this space pins one configuration.
    lstm_size=hp.choice('lstm_size', [64]),
    bi_do_rate=hp.choice('bi_do_rate', [0.1]),
    rec_do_rate=hp.choice('rec_do_rate', [0.1]),
    optimizer=hp.choice('optimizer', [Adam(lr=0.0001)]),
    # Plain constants, not hyperopt expressions.
    batch_size=1024,
    epochs=200,
    # Metric name to monitor during training.
    monitor='val_f1',
)

labelled_test_models.append((_model_choice, _model_name, search_space))

In [46]:
# Search space for the Models.s_bilstm_crf factory, registered as 's_bilstm_crf'
# in labelled_test_models.
_model_choice, _model_name = Models.s_bilstm_crf, 's_bilstm_crf'

search_space = dict(
    # Every hp.choice has a single option, so this space pins one configuration.
    lstm_size=hp.choice('lstm_size', [64]),
    bi_do_rate=hp.choice('bi_do_rate', [0.1]),
    rec_do_rate=hp.choice('rec_do_rate', [0.5]),
    optimizer=hp.choice('optimizer', [Adam(lr=0.0001)]),
    # Plain constants, not hyperopt expressions.
    batch_size=512,
    epochs=200,
    # Metric name to monitor during training.
    monitor='val_f1',
)

labelled_test_models.append((_model_choice, _model_name, search_space))

In [47]:
# Search space for the Models.s_cnn_bigru_crf factory, registered as 's_cnn_bigru_crf'
# in labelled_test_models.
_model_choice, _model_name = Models.s_cnn_bigru_crf, 's_cnn_bigru_crf'

search_space = dict(
    # Every hp.choice has a single option, so this space pins one configuration.
    cnn_kernel_size=hp.choice('cnn_kernel_size', [26]),
    cnn_n_filters=hp.choice('cnn_n_filters', [32]),
    lstm_size=hp.choice('lstm_size', [64]),
    crf_size=hp.choice('crf_size', [4]),
    do_rate=hp.choice('do_rate', [0.5]),
    d1_size=hp.choice('d1_size', [4]),
    optimizer=hp.choice('optimizer', [Adam(lr=0.0001)]),
    # Plain constants, not hyperopt expressions.
    batch_size=512,
    epochs=200,
    # Metric name to monitor during training.
    monitor='val_f1',
)

labelled_test_models.append((_model_choice, _model_name, search_space))

In [48]:
# Search space for the Models.factor_net factory, registered as 'factor_net' in test_models.
_model_choice, _model_name = Models.factor_net, 'factor_net'

search_space = dict(
    # Every hp.choice has a single option, so this space pins one configuration.
    cnn_kernel_size=hp.choice('cnn_kernel_size', [26]),
    cnn_n_filters=hp.choice('cnn_n_filters', [32]),
    lstm_size=hp.choice('lstm_size', [64]),
    do_rate=hp.choice('do_rate', [0.5]),
    optimizer=hp.choice('optimizer', [Adam(lr=0.0001)]),
    # Plain constants, not hyperopt expressions.
    batch_size=512,
    epochs=200,
)

test_models.append((_model_choice, _model_name, search_space))

In [49]:
# Collect tf names from the current directory: keep entries that end in '_dataset' and
# consist of exactly four '_'-separated parts; the name is the first two parts rejoined.
tf_names = [
    '_'.join(parts[:2])
    for name, parts in ((name, name.split('_')) for name in os.listdir('.'))
    if name.endswith('_dataset') and len(parts) == 4
]
tf_names

[]

In [None]:
# %%memit
# Driver cell: for every (tf_name, dm) pair, run the hyperopt search of each registered model
# and write its artefacts under <tf_name>_outputs/<tf_name>_<model_name>_model_outputs.
#
# NOTE(review): `tf_nameNdm` is not defined in this cell; it must already exist in the kernel
# as an iterable of (tf_name, dm) pairs -- confirm which cell builds it before relying on
# Restart & Run All.
model_statsNhistory = []
import os
cwd = os.path.abspath(os.getcwd())
# Filled in only when a DebugObject escapes run_model, so the offending object can be
# inspected after the cell stops.
err_obj = None
err = None
use_generator = True
reduce_negative_samples = False
# Both registries go through the same run_model call. The only difference is that the labelled
# models are trained against labels (use_labels_as_y=True) while the others keep run_model's
# default for that argument. Labelled models are run first for each tf.
model_groups = (
    (labelled_test_models, {'use_labels_as_y': True}),
    (test_models, {}),
)
try:
    for tf_name, dm in tf_nameNdm:
        logging.info('Working on {}'.format(tf_name))
        tf_dir = '{}_{}'.format(tf_name, 'outputs')

        for model_group, group_kwargs in model_groups:
            for _model_choice, _model_name, search_space in model_group:
                logging.info('Testing {}'.format(_model_name))
                working_dir = '{}_{}_{}'.format(tf_name, _model_name, 'model_outputs')
                output_path = os.path.relpath(os.path.join(cwd, tf_dir, working_dir))
                # make a path for the outputs
                os.makedirs(output_path, exist_ok=True)
                logging.info('Running model for {}'.format(_model_choice))
                # With rerun_full=False run_model returns right after the hyperopt search
                # (its `if not rerun_full: return`), so the appended entries are None.
                model_statsNhistory.append(run_model(Models.use(_model_choice, _model_name),
                                                     search_space,
                                                     _model_name, tf_name,
                                                     training_size_limit=None,
                                                     valid_size_limit=None,
                                                     test_size_limit=None,
                                                     n_trials=n_trials,
                                                     patience=patience, show_metric_interval=50,
                                                     output_path=output_path,
                                                     use_generator=use_generator,
                                                     reduce_negative_samples=reduce_negative_samples,
                                                     show_design=True, seq_data_only=False,
                                                     rerun_full=False,
                                                     **group_kwargs))
                logging.info('Finished running model for {}'.format(_model_choice))
except DebugObject as e:
    # Keep both the wrapped object and the exception itself available in the notebook namespace.
    err_obj = e.content
    err = e
    print('Error object loaded in err_obj')

INFO:root:Working on FOXA2
INFO:root:Testing bigru_crf
INFO:root:Setting up data streams
INFO:root:Creating data manager for peakfile ChIPseq.liver.FOXA2.conservative._.narrowPeak labelfile FOXA2._.labels.tsv
INFO:root:Creating positive/negative samples
INFO:root:Loading pickled gen_hg19
INFO:root:Finished setting up gen_hg19
INFO:root:The function if_not_pickled took 05 seconds
INFO:root:No pickle for gen_chr2locNbound is found. Generating anew.
  0%|                                                                                     | 0/60519749 [00:00<?, ?it/s]INFO:root:Working on chr10

  4%|██▉                                                                | 2697574/60519749 [00:13<04:46, 202105.54it/s]INFO:root:Working on chr11

  9%|█████▉                                                             | 5356472/60519749 [00:26<04:34, 200595.89it/s]INFO:root:Working on chr12

 13%|████████▉                                                          | 8016793/60519749 [00:40<04:24, 198

tpe statuses ['new']


INFO:root:Setting up hyperopt objective
INFO:hyperopt.tpe:tpe_transform took 0.002935 seconds
INFO:hyperopt.tpe:TPE using 0 trials
INFO:root:Making checkpoint for hyperparameters
INFO:root:Current parameters :
lstm_size : 64
rec_do_rate : 0.5
optimizer : <keras.optimizers.Adam object at 0x00000243641E49E8>
bi_do_rate : 0.1
epochs : 200
batch_size : 512
monitor : val_f1
INFO:root:Found y are labels; using OBIE
INFO:root:Found y are labels; using OBIE
INFO:root:Current monitor mode is set to max
INFO:root:Continuing previous training session
INFO:root:Restarting from 5
INFO:root:load file from FOXA2_outputs\FOXA2_bigru_crf_model_outputs\FOXA2_bigru_crf-0005-0.95.hdf5


Epoch 6/200


INFO:root:Showing metric in 45 epochs



Epoch 00006: val_f1 improved from -inf to 0.68024, saving model to FOXA2_outputs\FOXA2_bigru_crf_model_outputs\FOXA2_bigru_crf-0006-0.94.hdf5
Epoch 7/200


INFO:root:Showing metric in 44 epochs



Epoch 00007: val_f1 did not improve from 0.68024
Epoch 8/200


INFO:root:Showing metric in 43 epochs



Epoch 00008: val_f1 did not improve from 0.68024
Epoch 9/200


INFO:root:Showing metric in 42 epochs



Epoch 00009: val_f1 did not improve from 0.68024
Epoch 10/200


INFO:root:Showing metric in 41 epochs



Epoch 00010: val_f1 did not improve from 0.68024
Epoch 11/200


INFO:root:Showing metric in 40 epochs



Epoch 00011: val_f1 did not improve from 0.68024
Epoch 12/200


INFO:root:Showing metric in 39 epochs



Epoch 00012: val_f1 did not improve from 0.68024
Epoch 13/200


INFO:root:Showing metric in 38 epochs



Epoch 00013: val_f1 did not improve from 0.68024
Epoch 14/200


INFO:root:Showing metric in 37 epochs



Epoch 00014: val_f1 did not improve from 0.68024
Epoch 15/200


INFO:root:Showing metric in 36 epochs



Epoch 00015: val_f1 improved from 0.68024 to 0.68174, saving model to FOXA2_outputs\FOXA2_bigru_crf_model_outputs\FOXA2_bigru_crf-0015-0.94.hdf5
Epoch 16/200


INFO:root:Showing metric in 35 epochs



Epoch 00016: val_f1 improved from 0.68174 to 0.68366, saving model to FOXA2_outputs\FOXA2_bigru_crf_model_outputs\FOXA2_bigru_crf-0016-0.94.hdf5
Epoch 17/200


INFO:root:Showing metric in 34 epochs



Epoch 00017: val_f1 improved from 0.68366 to 0.69222, saving model to FOXA2_outputs\FOXA2_bigru_crf_model_outputs\FOXA2_bigru_crf-0017-0.94.hdf5
Epoch 18/200


INFO:root:Showing metric in 33 epochs



Epoch 00018: val_f1 did not improve from 0.69222
Epoch 19/200


INFO:root:Showing metric in 32 epochs



Epoch 00019: val_f1 did not improve from 0.69222
Epoch 20/200


INFO:root:Showing metric in 31 epochs



Epoch 00020: val_f1 did not improve from 0.69222
Epoch 21/200


INFO:root:Showing metric in 30 epochs



Epoch 00021: val_f1 did not improve from 0.69222
Epoch 22/200


INFO:root:Showing metric in 29 epochs



Epoch 00022: val_f1 did not improve from 0.69222
Epoch 23/200


INFO:root:Showing metric in 28 epochs



Epoch 00023: val_f1 did not improve from 0.69222
Epoch 24/200


INFO:root:Showing metric in 27 epochs



Epoch 00024: val_f1 improved from 0.69222 to 0.69524, saving model to FOXA2_outputs\FOXA2_bigru_crf_model_outputs\FOXA2_bigru_crf-0024-0.94.hdf5
Epoch 25/200


INFO:root:Showing metric in 26 epochs



Epoch 00025: val_f1 did not improve from 0.69524
Epoch 26/200


INFO:root:Showing metric in 25 epochs



Epoch 00026: val_f1 did not improve from 0.69524
Epoch 27/200


INFO:root:Showing metric in 24 epochs



Epoch 00027: val_f1 did not improve from 0.69524
Epoch 28/200


INFO:root:Showing metric in 23 epochs



Epoch 00028: val_f1 improved from 0.69524 to 0.69535, saving model to FOXA2_outputs\FOXA2_bigru_crf_model_outputs\FOXA2_bigru_crf-0028-0.94.hdf5
Epoch 29/200


INFO:root:Showing metric in 22 epochs



Epoch 00029: val_f1 did not improve from 0.69535
Epoch 30/200


INFO:root:Showing metric in 21 epochs



Epoch 00030: val_f1 did not improve from 0.69535
Epoch 31/200


INFO:root:Showing metric in 20 epochs



Epoch 00031: val_f1 improved from 0.69535 to 0.70395, saving model to FOXA2_outputs\FOXA2_bigru_crf_model_outputs\FOXA2_bigru_crf-0031-0.94.hdf5
Epoch 32/200


INFO:root:Showing metric in 19 epochs



Epoch 00032: val_f1 did not improve from 0.70395
Epoch 33/200


INFO:root:Showing metric in 18 epochs



Epoch 00033: val_f1 did not improve from 0.70395
Epoch 34/200


INFO:root:Showing metric in 17 epochs



Epoch 00034: val_f1 did not improve from 0.70395
Epoch 35/200


INFO:root:Showing metric in 16 epochs



Epoch 00035: val_f1 did not improve from 0.70395
Epoch 36/200


INFO:root:Showing metric in 15 epochs



Epoch 00036: val_f1 improved from 0.70395 to 0.71087, saving model to FOXA2_outputs\FOXA2_bigru_crf_model_outputs\FOXA2_bigru_crf-0036-0.95.hdf5
Epoch 37/200


INFO:root:Showing metric in 14 epochs



Epoch 00037: val_f1 did not improve from 0.71087
Epoch 38/200


INFO:root:Showing metric in 13 epochs



Epoch 00038: val_f1 improved from 0.71087 to 0.71287, saving model to FOXA2_outputs\FOXA2_bigru_crf_model_outputs\FOXA2_bigru_crf-0038-0.95.hdf5
Epoch 39/200


INFO:root:Showing metric in 12 epochs



Epoch 00039: val_f1 did not improve from 0.71287
Epoch 40/200


INFO:root:Showing metric in 11 epochs



Epoch 00040: val_f1 did not improve from 0.71287
Epoch 41/200


INFO:root:Showing metric in 10 epochs



Epoch 00041: val_f1 did not improve from 0.71287
Epoch 42/200


INFO:root:Showing metric in 9 epochs



Epoch 00042: val_f1 did not improve from 0.71287
Epoch 43/200


INFO:root:Showing metric in 8 epochs



Epoch 00043: val_f1 improved from 0.71287 to 0.71591, saving model to FOXA2_outputs\FOXA2_bigru_crf_model_outputs\FOXA2_bigru_crf-0043-0.95.hdf5
Epoch 44/200


INFO:root:Showing metric in 7 epochs



Epoch 00044: val_f1 did not improve from 0.71591
Epoch 45/200


INFO:root:Showing metric in 6 epochs



Epoch 00045: val_f1 did not improve from 0.71591
Epoch 46/200


INFO:root:Showing metric in 5 epochs



Epoch 00046: val_f1 did not improve from 0.71591
Epoch 47/200


INFO:root:Showing metric in 4 epochs



Epoch 00047: val_f1 did not improve from 0.71591
Epoch 48/200


INFO:root:Showing metric in 3 epochs



Epoch 00048: val_f1 did not improve from 0.71591
Epoch 49/200


INFO:root:Showing metric in 2 epochs



Epoch 00049: val_f1 did not improve from 0.71591
Epoch 50/200


INFO:root:Showing metric in 1 epochs



Epoch 00050: val_f1 did not improve from 0.71591
Epoch 51/200


INFO:root:Calculating metrics...
  0%|                                                                                          | 0/117 [00:00<?, ?it/s]





  1%|▋                                                                                 | 1/117 [00:05<10:01,  5.18s/it]





  2%|█▍                                                                                | 2/117 [00:08<08:32,  4.45s/it]





  3%|██                                                                                | 3/117 [00:12<08:02,  4.24s/it]





  3%|██▊                                                                               | 4/117 [00:16<07:43,  4.11s/it]





  4%|███▌                                                                              | 5/117 [00:20<07:33,  4.05s/it]





  5%|████▏                                                                             | 6/117 [00:24<07:24,  4.00s/it]





  6%|████▉                                                                             | 7/117 [00:27<07:15,  3.96s/it]





  7%|█████▌                                                                            | 8/117 [00:31<07:09,  3.94s/it]





  8%|██████▎                                                                           | 9/117 [00:35<07:02,  3.91s/it]





  9%|██████▉                                                                          | 10/117 [00:38<06:57,  3.90s/it]





  9%|███████▌                                                                         | 11/117 [00:42<06:51,  3.89s/it]





 10%|████████▎                                                                        | 12/117 [00:46<06:47,  3.88s/it]





 11%|█████████                                                                        | 13/117 [00:50<06:42,  3.87s/it]





 12%|█████████▋                                                                       | 14/117 [00:54<06:37,  3.86s/it]





 13%|██████████▍                                                                      | 15/117 [00:57<06:33,  3.85s/it]





 14%|███████████                                                                      | 16/117 [01:01<06:29,  3.85s/it]





 15%|███████████▊                                                                     | 17/117 [01:05<06:25,  3.86s/it]





 15%|████████████▍                                                                    | 18/117 [01:09<06:21,  3.85s/it]





 16%|█████████████▏                                                                   | 19/117 [01:13<06:17,  3.85s/it]





 17%|█████████████▊                                                                   | 20/117 [01:17<06:13,  3.85s/it]





 18%|██████████████▌                                                                  | 21/117 [01:20<06:09,  3.85s/it]





 19%|███████████████▏                                                                 | 22/117 [01:24<06:06,  3.86s/it]





 20%|███████████████▉                                                                 | 23/117 [01:28<06:02,  3.86s/it]





 21%|████████████████▌                                                                | 24/117 [01:32<05:58,  3.86s/it]





 21%|█████████████████▎                                                               | 25/117 [01:36<05:54,  3.86s/it]





 22%|██████████████████                                                               | 26/117 [01:40<05:51,  3.86s/it]





 23%|██████████████████▋                                                              | 27/117 [01:44<05:47,  3.86s/it]





 24%|███████████████████▍                                                             | 28/117 [01:47<05:43,  3.86s/it]





 25%|████████████████████                                                             | 29/117 [01:51<05:39,  3.85s/it]





 26%|████████████████████▊                                                            | 30/117 [01:55<05:35,  3.85s/it]





 26%|█████████████████████▍                                                           | 31/117 [01:59<05:31,  3.85s/it]





 27%|██████████████████████▏                                                          | 32/117 [02:03<05:27,  3.85s/it]





 28%|██████████████████████▊                                                          | 33/117 [02:06<05:23,  3.85s/it]





 29%|███████████████████████▌                                                         | 34/117 [02:10<05:19,  3.85s/it]





 30%|████████████████████████▏                                                        | 35/117 [02:15<05:16,  3.86s/it]





 31%|████████████████████████▉                                                        | 36/117 [02:18<05:12,  3.86s/it]





 32%|█████████████████████████▌                                                       | 37/117 [02:22<05:08,  3.86s/it]





 32%|██████████████████████████▎                                                      | 38/117 [02:26<05:05,  3.86s/it]





 33%|███████████████████████████                                                      | 39/117 [02:30<05:00,  3.86s/it]





 34%|███████████████████████████▋                                                     | 40/117 [02:34<04:56,  3.85s/it]





 35%|████████████████████████████▍                                                    | 41/117 [02:37<04:52,  3.85s/it]





 36%|█████████████████████████████                                                    | 42/117 [02:41<04:48,  3.85s/it]





 37%|█████████████████████████████▊                                                   | 43/117 [02:45<04:44,  3.85s/it]





 38%|██████████████████████████████▍                                                  | 44/117 [02:49<04:40,  3.84s/it]





 38%|███████████████████████████████▏                                                 | 45/117 [02:52<04:36,  3.84s/it]





 39%|███████████████████████████████▊                                                 | 46/117 [02:56<04:32,  3.84s/it]





 40%|████████████████████████████████▌                                                | 47/117 [03:00<04:28,  3.83s/it]





 41%|█████████████████████████████████▏                                               | 48/117 [03:03<04:24,  3.83s/it]





 42%|█████████████████████████████████▉                                               | 49/117 [03:07<04:20,  3.83s/it]





 43%|██████████████████████████████████▌                                              | 50/117 [03:11<04:16,  3.83s/it]





 44%|███████████████████████████████████▎                                             | 51/117 [03:15<04:12,  3.82s/it]





 44%|████████████████████████████████████                                             | 52/117 [03:18<04:08,  3.82s/it]





 45%|████████████████████████████████████▋                                            | 53/117 [03:22<04:04,  3.82s/it]





 46%|█████████████████████████████████████▍                                           | 54/117 [03:26<04:00,  3.82s/it]





 47%|██████████████████████████████████████                                           | 55/117 [03:29<03:56,  3.82s/it]





 48%|██████████████████████████████████████▊                                          | 56/117 [03:33<03:52,  3.82s/it]





 49%|███████████████████████████████████████▍                                         | 57/117 [03:37<03:48,  3.81s/it]





 50%|████████████████████████████████████████▏                                        | 58/117 [03:41<03:44,  3.81s/it]





 50%|████████████████████████████████████████▊                                        | 59/117 [03:44<03:41,  3.81s/it]





 51%|█████████████████████████████████████████▌                                       | 60/117 [03:48<03:37,  3.81s/it]





 52%|██████████████████████████████████████████▏                                      | 61/117 [03:52<03:33,  3.81s/it]





 53%|██████████████████████████████████████████▉                                      | 62/117 [03:55<03:29,  3.81s/it]





 54%|███████████████████████████████████████████▌                                     | 63/117 [03:59<03:25,  3.80s/it]





 55%|████████████████████████████████████████████▎                                    | 64/117 [04:03<03:21,  3.80s/it]





 56%|█████████████████████████████████████████████                                    | 65/117 [04:07<03:17,  3.80s/it]





 56%|█████████████████████████████████████████████▋                                   | 66/117 [04:10<03:13,  3.80s/it]





 57%|██████████████████████████████████████████████▍                                  | 67/117 [04:14<03:09,  3.80s/it]





 58%|███████████████████████████████████████████████                                  | 68/117 [04:18<03:05,  3.80s/it]





 59%|███████████████████████████████████████████████▊                                 | 69/117 [04:21<03:02,  3.79s/it]





 60%|████████████████████████████████████████████████▍                                | 70/117 [04:25<02:58,  3.79s/it]





 61%|█████████████████████████████████████████████████▏                               | 71/117 [04:29<02:54,  3.79s/it]





 62%|█████████████████████████████████████████████████▊                               | 72/117 [04:32<02:50,  3.79s/it]





 62%|██████████████████████████████████████████████████▌                              | 73/117 [04:36<02:46,  3.79s/it]





 63%|███████████████████████████████████████████████████▏                             | 74/117 [04:40<02:42,  3.79s/it]





 64%|███████████████████████████████████████████████████▉                             | 75/117 [04:43<02:38,  3.79s/it]





 65%|████████████████████████████████████████████████████▌                            | 76/117 [04:47<02:35,  3.79s/it]





 66%|█████████████████████████████████████████████████████▎                           | 77/117 [04:51<02:31,  3.78s/it]





 67%|██████████████████████████████████████████████████████                           | 78/117 [04:55<02:27,  3.78s/it]





 68%|██████████████████████████████████████████████████████▋                          | 79/117 [04:58<02:23,  3.78s/it]





 68%|███████████████████████████████████████████████████████▍                         | 80/117 [05:02<02:19,  3.78s/it]





 69%|████████████████████████████████████████████████████████                         | 81/117 [05:06<02:16,  3.78s/it]





 70%|████████████████████████████████████████████████████████▊                        | 82/117 [05:09<02:12,  3.78s/it]





 71%|█████████████████████████████████████████████████████████▍                       | 83/117 [05:13<02:08,  3.78s/it]





 72%|██████████████████████████████████████████████████████████▏                      | 84/117 [05:17<02:04,  3.78s/it]





 73%|██████████████████████████████████████████████████████████▊                      | 85/117 [05:20<02:00,  3.78s/it]





 74%|███████████████████████████████████████████████████████████▌                     | 86/117 [05:24<01:57,  3.77s/it]





 74%|████████████████████████████████████████████████████████████▏                    | 87/117 [05:28<01:53,  3.77s/it]





 75%|████████████████████████████████████████████████████████████▉                    | 88/117 [05:32<01:49,  3.77s/it]





 76%|█████████████████████████████████████████████████████████████▌                   | 89/117 [05:35<01:45,  3.77s/it]





 77%|██████████████████████████████████████████████████████████████▎                  | 90/117 [05:39<01:41,  3.77s/it]





 78%|███████████████████████████████████████████████████████████████                  | 91/117 [05:43<01:38,  3.77s/it]





 79%|███████████████████████████████████████████████████████████████▋                 | 92/117 [05:46<01:34,  3.77s/it]





 79%|████████████████████████████████████████████████████████████████▍                | 93/117 [05:50<01:30,  3.77s/it]





 80%|█████████████████████████████████████████████████████████████████                | 94/117 [05:54<01:26,  3.77s/it]





 81%|█████████████████████████████████████████████████████████████████▊               | 95/117 [05:57<01:22,  3.77s/it]





 82%|██████████████████████████████████████████████████████████████████▍              | 96/117 [06:01<01:19,  3.77s/it]





 83%|███████████████████████████████████████████████████████████████████▏             | 97/117 [06:05<01:15,  3.77s/it]





 84%|███████████████████████████████████████████████████████████████████▊             | 98/117 [06:08<01:11,  3.76s/it]





 85%|████████████████████████████████████████████████████████████████████▌            | 99/117 [06:12<01:07,  3.76s/it]





 85%|████████████████████████████████████████████████████████████████████▍           | 100/117 [06:16<01:03,  3.76s/it]





 86%|█████████████████████████████████████████████████████████████████████           | 101/117 [06:19<01:00,  3.76s/it]





 87%|█████████████████████████████████████████████████████████████████████▋          | 102/117 [06:23<00:56,  3.76s/it]





 88%|██████████████████████████████████████████████████████████████████████▍         | 103/117 [06:27<00:52,  3.76s/it]





 89%|███████████████████████████████████████████████████████████████████████         | 104/117 [06:30<00:48,  3.76s/it]





 90%|███████████████████████████████████████████████████████████████████████▊        | 105/117 [06:34<00:45,  3.76s/it]





 91%|████████████████████████████████████████████████████████████████████████▍       | 106/117 [06:38<00:41,  3.76s/it]





 91%|█████████████████████████████████████████████████████████████████████████▏      | 107/117 [06:42<00:37,  3.76s/it]





 92%|█████████████████████████████████████████████████████████████████████████▊      | 108/117 [06:45<00:33,  3.76s/it]





 93%|██████████████████████████████████████████████████████████████████████████▌     | 109/117 [06:49<00:30,  3.76s/it]





 94%|███████████████████████████████████████████████████████████████████████████▏    | 110/117 [06:53<00:26,  3.76s/it]





 95%|███████████████████████████████████████████████████████████████████████████▉    | 111/117 [06:56<00:22,  3.76s/it]





 96%|████████████████████████████████████████████████████████████████████████████▌   | 112/117 [07:00<00:18,  3.76s/it]





 97%|█████████████████████████████████████████████████████████████████████████████▎  | 113/117 [07:04<00:15,  3.75s/it]





 97%|█████████████████████████████████████████████████████████████████████████████▉  | 114/117 [07:08<00:11,  3.75s/it]





 98%|██████████████████████████████████████████████████████████████████████████████▋ | 115/117 [07:11<00:07,  3.75s/it]





 99%|███████████████████████████████████████████████████████████████████████████████▎| 116/117 [07:15<00:03,  3.75s/it]





100%|████████████████████████████████████████████████████████████████████████████████| 117/117 [07:19<00:00,  3.75s/it]
INFO:root:Epoch 50:
INFO:root:avg auROC:   0.8665 avg auPRC:   0.7644
INFO:root:Recall@5%/10%/25%/50%: 0.0000 0.6322 31.7805 76.2451
INFO:root:tp:      4597 fn:      1421 tn:     52304 fp:      1582
INFO:root:tp|###############_____|fn   tn|###################_|fp



Epoch 00051: val_f1 did not improve from 0.71591
Epoch 52/200


INFO:root:Showing metric in 49 epochs



Epoch 00052: val_f1 did not improve from 0.71591
Epoch 53/200


INFO:root:Showing metric in 48 epochs



Epoch 00053: val_f1 did not improve from 0.71591
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 200, 6)            0         
_________________________________________________________________
model_10 (Model)             (None, 200, 4)            27780     
_________________________________________________________________
crf_ext_4 (ClassWrapper)     (None, 200, 4)            44        
Total params: 27,824
Trainable params: 27,824
Non-trainable params: 0
_________________________________________________________________


INFO:root:Getting test y prediction






INFO:root:Converting test y prediction to categorical


In [None]:
'done'