diff --git a/README.md b/README.md index 622a738..683efbb 100644 --- a/README.md +++ b/README.md @@ -2,11 +2,11 @@ ## Methodology of DeepMod -DeepMod is a computational tool which takes long-read signals as input and outputs modification summary for each genomic position in a reference genome together with modification prediction for each base in a long read. The modification prediction model in DeepMod is a well-trained bidirectional recurrent neural network (RNN) with long short-term memory (LSTM) units. LSTM RNN is a class of artificial neural network for modeling sequential behaviors with LSTM to preclude vanishing gradient problem. To detect DNA modifications, normalized signals of events in a long read were rescaled from -5 and 5, and signal mean, standard deviation and the number of signals together with base information (denoted by 7-feature description) were obtained for each event as input of a LSTM unit with 100 hidden nodes. In DeepMod with 3 hidden layers in RNN. Predicted modification summary for each position would be generated in a BED format, suggesting how many reads cover genomic positions, how many mapped bases in long reads were predicted to be modified and the coverage percentage of prediction modifications. This modification prediction by DeepMod is strand-sensitive and single-nucleotide based. +DeepMod is a computational tool which takes long-read signals as input and outputs modification summary for each genomic position in a reference genome together with modification prediction for each base in a long read. The modification prediction model in DeepMod is a well-trained bidirectional recurrent neural network (RNN) with long short-term memory (LSTM) units. LSTM RNN is a class of artificial neural network for modeling sequential behaviors with LSTM to preclude vanishing gradient problem. 
To detect DNA modifications, normalized signals of events in a long read were rescaled from -5 to 5, and signal mean, standard deviation and the number of signals together with base information (denoted by 7-feature description) were obtained for each event as input of a LSTM unit with 100 hidden nodes. DeepMod uses 3 hidden layers in the RNN. Predicted modification summary for each position would be generated in a BED format, suggesting how many reads cover genomic positions, how many mapped bases in long reads were predicted to be modified and the coverage percentage of predicted modifications. This modification prediction by DeepMod is strand-sensitive and single-nucleotide based. ### Inputs of DeepMod -The input of DeepMod is Nanopore long read data together a refrence genome. +The input of DeepMod is Nanopore long read data together with a reference genome. ## System Requirements ### Hardware requirements @@ -14,12 +14,12 @@ DeepMod is based on deep learning framework, and needs to access raw data of Nan * RAM: 20+ GB per thread * GPU or CPU with 8+ cores * HDD or better with SSD. Dependent on how large raw data is (for 30X E coli data, it might need 10+GB, while for 30X human data, it might need 10+TB) - + ### Software requirements The developmental version of DeepMod has been tested on Linux operating system: CentOS 7.0 with both CPU and GPU machines. ### Future improvement -Now, DeepMod does not support multi-fast5 and Move table. For multi-fast5 issue, one can use API at https://github.com/nanoporetech/ont_fast5_api to convert multi-fast5 to single fast5 file, and then re-basecall to get event information as input of DeepMod. We have been working on improvement of DeepMod to support both multi-fast5 and Move table. +Now, DeepMod supports basecalled data with either event tables or move tables. But it does not support multi-fast5. 
For multi-fast5 issue, one can use API at https://github.com/nanoporetech/ont_fast5_api to convert multi-fast5 to single fast5 file, and then re-basecall to get event information as input of DeepMod. We have been working on improvement of DeepMod to support multi-fast5. ## Installation Please refer to [Installation](https://github.com/WGLab/DeepMod/blob/master/docs/Install.md) for how to install DeepMod. @@ -38,7 +38,7 @@ For release history, please visit [here](https://github.com/WGLab/NanoDeepMod/re ## Contact -If you have any questions/issues/bugs, please post them on [GitHub](https://github.com/WGLab/DeepMod/issues). They would also be helpful to other users. +If you have any questions/issues/bugs, please post them on [GitHub](https://github.com/WGLab/DeepMod/issues). They would also be helpful to other users. ## Reference **Please cite the publication below if you use our tool:** diff --git a/bin/DeepMod.py b/bin/DeepMod.py index b819092..e3fe3b8 100644 --- a/bin/DeepMod.py +++ b/bin/DeepMod.py @@ -63,7 +63,7 @@ def mCommonParam(margs): if moptions['outFolder']==None or (not os.path.isdir(moptions['outFolder'])): try: os.system('mkdir -p '+moptions['outFolder']); - except: + except: ErrorMessage = ErrorMessage + ("\n\tThe output folder (%s) does not exist and cannot be created." 
% moptions['outFolder']) # check all data in a recurive way @@ -85,6 +85,8 @@ def mCommonParam(margs): moptions['SignalGroup'] = margs.SignalGroup; + moptions['move'] = margs.move + return [moptions, ErrorMessage] # @@ -99,7 +101,7 @@ def mDetect(margs): moptions['basecall_1d'] = margs.basecall_1d moptions['basecall_2strand'] = margs.basecall_2strand # Whether consider those chromosome which contain -_:/ - # default: yes; + # default: yes; moptions['ConUnk'] = margs.ConUnk # output layer information for deep learning moptions['outputlayer'] = margs.outputlayer @@ -168,7 +170,7 @@ def mDetect(margs): parser.print_help(); parser.parse_args(['detect', '--help']); sys.exit(1) - + from scripts import myDetect myDetect.mDetect_manager(moptions) @@ -211,7 +213,7 @@ def mTrain(margs): if moptions['test'][0] in ['-']: moptions['test'][1] = int(moptions['test'][1]) * (10**6) moptions['test'][2] = int(moptions['test'][2]) * (10**6) - else: moptions['test'][1] = int(moptions['test'][1])/100.0 + else: moptions['test'][1] = int(moptions['test'][1])/100.0 else: moptions['test'] = ['N', '100'] # print help document if necessary options are not provided. @@ -232,7 +234,7 @@ def mTrain(margs): # def mGetFeatures(margs): from scripts import myGetFeatureBasedPos - + # get common options moptions, ErrorMessage = mCommonParam(margs) # motif-based data: positive or negative control data @@ -254,7 +256,7 @@ def mGetFeatures(margs): rsp = margs.region.split(':') for rv_ind in range(len(rsp)): rsp[rv_ind] = rsp[rv_ind].strip(); - if not rsp[rv_ind]=='': + if not rsp[rv_ind]=='': moptions['region'][rv_ind] = rsp[rv_ind] # referene genome @@ -309,6 +311,7 @@ def mGetFeatures(margs): com_group_for_comparison.add_argument("--windowsize", type=int, default=21, help="The window size to extract features. Default: 51") com_group_for_comparison.add_argument("--alignStr", type=str, default='minimap2', choices=["bwa","minimap2"], help="Alignment tools (bwa or minimap2 is supported). 
Default: minimap2") com_group_for_comparison.add_argument("--SignalGroup", type=str, default='simple', choices=["simple","rundif"], help="How to associate signals to each called bases. Default: simple") +com_group_for_comparison.add_argument("--move", default=False, action="store_true", help="Whether the basecalled data use move tables instead of event tables. Default: False") # add detection options parser_detect = subparsers.add_parser('detect', parents=[parent_parser], help="Detect modifications at a genomic scale", description="Detect modifications by integrating all long reads for a genome", epilog="For example, \n \ @@ -323,7 +326,7 @@ def mGetFeatures(margs): parser_detect.add_argument("--basecall_1d", default="Basecall_1D_000", help="Path for basecall_1d. Default: Basecall_1D_000") parser_detect.add_argument("--basecall_2strand", default="BaseCalled_template", help="Path for basecall_2strand. Default: BaseCalled_template") parser_detect.add_argument("--region", default=None, help="The region of interest: for example, chr:1:100000;chr2:10000"); -parser_detect.add_argument("--ConUnk", default=True, choices=[False, True], help="Whether contain unknown chromosome"); +parser_detect.add_argument("--ConUnk", default=True, choices=[False, True], help="Whether contain unknown chromosome"); parser_detect.add_argument("--outputlayer", default="", choices=["", "sigmoid"], help="how to put activation function for output layer") parser_detect.add_argument("--Base", type=str, default='C', choices=['A', 'C', 'G', 'T'], help="Interest of bases"); parser_detect.add_argument("--mod_cluster", default=0, choices=[0,1], help="1: CpG cluster effect; 0: not"); @@ -373,4 +376,3 @@ def mGetFeatures(margs): else: args = parser.parse_args() args.func(args); - diff --git a/bin/scripts/MoveTable.py b/bin/scripts/MoveTable.py new file mode 100644 index 0000000..81dc6f6 --- /dev/null +++ b/bin/scripts/MoveTable.py @@ -0,0 +1,79 @@ + +import os,sys +import numpy as np +import h5py + + +def 
getMove_Info(moptions, sp_param, move_data): + ''' + sp_param.keys: fq_seq, raw_signals, first_sample_template, duration_template + ''' + + #sp_param['first_sample_template'] = sp_param['f5reader']['/Analyses/Segmentation_001/Summary/segmentation'].attrs['first_sample_template'] + #sp_param['duration_template'] = sp_param['f5reader']['/Analyses/Segmentation_001/Summary/segmentation'].attrs['duration_template'] + + seg = "Segmentation_" + moptions['basecall_1d'].split('_')[-1] + attr_path = '/'.join(['', 'Analyses', seg, 'Summary', 'segmentation']) + #mv_str = '/'.join(['', 'Analyses', moptions['basecall_1d'], moptions['basecall_2strand'], 'Move']) + sp_param['first_sample_template'] = sp_param['f5reader'][attr_path].attrs['first_sample_template'] + sp_param['duration_template'] = sp_param['f5reader'][attr_path].attrs['duration_template'] + #move_data = sp_param['f5reader'][mv_str][()] + nrow = len(sp_param['fq_seq']) # row number of event_info; equals to the base number + nsig = len(sp_param['raw_signals']) + first = int(sp_param['first_sample_template']) + duration = int(sp_param['duration_template']) + move_info = np.empty([nrow], dtype=[('mean', '0: # for non-stay event @@ -178,7 +199,7 @@ def getEvent(moptions, sp_param): cal_st = (events_data['start'][pre_i]-events_data['start'][move0_left])*sp_param["channel_info"]["sampling_rate"]+based_ind if cal_st<0: print("Wanging Less than 0") if cal_st>0 and cal_st - (m_event[-1][2]+ m_event[-1][3])>0 and (cal_st - (m_event[-1][2]+ m_event[-1][3])).astype('uint64')>0: - if (cal_st - (m_event[-1][2]+ m_event[-1][3])).astype('uint64')>2: # + if (cal_st - (m_event[-1][2]+ m_event[-1][3])).astype('uint64')>2: # m_event.append((round(events_data['mean'][pre_i],3), round(events_data['stdv'][pre_i],3), m_event[-1][2]+ m_event[-1][3], (cal_st - (m_event[-1][2]+ m_event[-1][3])).astype('uint64'), events_data['model_state'][pre_i].upper())) m_event.append((round(events_data['mean'][pre_i],3), round(events_data['stdv'][pre_i],3), 
cal_st.astype('uint64'), cur_length, events_data['model_state'][pre_i].upper())) else: # for a normal event @@ -189,11 +210,11 @@ def getEvent(moptions, sp_param): if not convertError: print ('ex: %.7f*%d=%.0f' % (events_data['start'][move0_left].astype(np.float64), sp_param["channel_info"]["sampling_rate"], events_data['start'][move0_left].astype(np.float64)*sp_param["channel_info"]["sampling_rate"])), sp_param['raw_attributes']['start_time'], sp_param['mfile_path'], m_event[-1][2], m_event[-1][3] convertError = True; - pre_i = i; + pre_i = i; cur_length=(events_data['length'][i]*sp_param["channel_info"]["sampling_rate"]).astype('uint64'); else: # for stay event cur_length += (events_data['length'][i]*sp_param["channel_info"]["sampling_rate"]).astype('uint64') - if sp_param['f5status'] == "": # for the last event + if sp_param['f5status'] == "": # for the last event # calculated position cal_st = (events_data['start'][pre_i]-events_data['start'][move0_left])*sp_param["channel_info"]["sampling_rate"]+based_ind if cal_st<0: print("Wanging Less than 0") @@ -249,7 +270,7 @@ def mnormalized(moptions, sp_param): mscale = np.median(np.abs(sp_param['raw_signals'][sp_param['m_event']['start'][0]:(sp_param['m_event']['start'][-1]+sp_param['m_event']['length'][-1])]-mshift)); # standardize sp_param['raw_signals'] = (sp_param['raw_signals'] - mshift)/mscale - # get meand + # get meand read_med = np.median(sp_param['raw_signals'][sp_param['m_event']['start'][0]:(sp_param['m_event']['start'][-1]+sp_param['m_event']['length'][-1])]) read_mad = np.median(np.abs(sp_param['raw_signals'][sp_param['m_event']['start'][0]:(sp_param['m_event']['start'][-1]+sp_param['m_event']['length'][-1])] - read_med)) lower_lim = read_med - (read_mad * 5) @@ -268,7 +289,7 @@ def getRawInfo(moptions, sp_param): for raw_data in sp_param['f5reader'][fast5_rawReads].values(): pass; sp_param['raw_attributes'] = dict(raw_data.attrs.items()) - sp_param['raw_signals'] = raw_data['Signal'].value + 
sp_param['raw_signals'] = raw_data['Signal'][()] except: raiseError(("No Raw_reads/Signal data %s" % (fast5_rawReads)), sp_param, "No Raw_reads/Signal") @@ -292,26 +313,28 @@ def getFast5Info(moptions, sp_param): fq_data = sp_param['f5reader'][fq_path][()] except: raiseError('No Fastq data', sp_param, "No Fastq data") - if sp_param['f5status']=="": fq_data = (fq_data.decode(encoding="utf-8")).split('\n') sp_param['read_id'] = (fq_data[0][1:] if fq_data[0][0]=='@' else fq_data[0]).replace(" ", ":::").replace("\t", "|||") sp_param['fq_seq'] = fq_data[1]; - # get raw signals getRawInfo(moptions, sp_param) # get events - if sp_param['f5status']=="": getEvent(moptions, sp_param) + if sp_param['f5status']=="": + getEvent(moptions, sp_param) # normalize signals. - if sp_param['f5status']=="": mnormalized(moptions, sp_param) + if sp_param['f5status']=="": + mnormalized(moptions, sp_param) if sp_param['f5status']=="": - # get mean, std for each event + # get mean, std for each event for i in range(len(sp_param['m_event'])): if (len(sp_param['raw_signals'][sp_param['m_event']['start'][i]:sp_param['m_event']['start'][i]+sp_param['m_event']['length'][i]])==0): print ('Signal out of range {}: {}-{} {};{} for {}'.format(i, sp_param['m_event']['start'][i], sp_param['m_event']['length'][i], len(sp_param['m_event']), len(sp_param['raw_signals']), sp_param['mfile_path'])) - if i>500: sp_param['m_event'] = sp_param['m_event'][:i-1] - else: sp_param['f5status']=="Less event" + if i>500: + sp_param['m_event'] = sp_param['m_event'][:i-1] + else: + sp_param['f5status']=="Less event" break; sp_param['m_event']['mean'][i] = round(np.mean(sp_param['raw_signals'][sp_param['m_event']['start'][i]:sp_param['m_event']['start'][i]+sp_param['m_event']['length'][i]]), 3) sp_param['m_event']['stdv'][i] = round(np.std(sp_param['raw_signals'][sp_param['m_event']['start'][i]:sp_param['m_event']['start'][i]+sp_param['m_event']['length'][i]]), 3) @@ -334,7 +357,6 @@ def get_Event_Signals(moptions, 
sp_options, f5files): sp_param['mfile_path'] = f5f sp_param['f5reader'] = mf5 sp_param['f5status'] = ""; - # get associated events of signals getFast5Info(moptions, sp_param) if 'get_albacore_version' in sp_param: sp_options["get_albacore_version"][str(sp_param['get_albacore_version'])] += 1 @@ -344,10 +366,11 @@ def get_Event_Signals(moptions, sp_options, f5files): f5data[sp_param['read_id']] = (sp_param['m_event_basecall'], sp_param['m_event'], sp_param['raw_signals'], f5f, sp_param['left_right_skip']) else: sp_options["Error"][sp_param['f5status']].append(f5f) - # for outputing progress + # for outputing progress if moptions['outLevel']<=myCom.OUTPUT_DEBUG: runnum += 1; - if runnum%500==0: + + if runnum%500==0: end_time = time.time(); print ("%d consuming time %d" % (runnum, end_time-start_time)) except: @@ -372,11 +395,11 @@ def mDetect1(moptions, sp_options, f5files): f5keys = sorted(f5data.keys()); #f5keys.sort() for f5k in f5keys: temp_fa.write(''.join(['>', f5k, '\n', f5data[f5k][0], '\n'])) - temp_fa.flush(); + temp_fa.flush(); if moptions['outLevel']<=myCom.OUTPUT_DEBUG: end_time = time.time(); print ("Write consuming time %d" % (end_time-start_time)) - + # run alignmen tools of bwa-mem or minimap2 temp_sam = tempfile.NamedTemporaryFile() if moptions['alignStr']=='bwa': @@ -406,7 +429,7 @@ def mDetect1(moptions, sp_options, f5files): sp_param['ref_info'] = defaultdict() if moptions['outLevel']<=myCom.OUTPUT_DEBUG:start_time = time.time(); - ilid = 0; + ilid = 0; # get alignment records while ilid < len(align_info): if len(align_info[ilid])==0 or align_info[ilid][0]=='@': @@ -475,7 +498,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): # output chromosome of interest if (not moptions['ConUnk']) and ((not rname.find('_')==-1) or (not rname.find('-')==-1) or (not rname.find('/')==-1) or (not rname.find(':')==-1)): continue; - isinreg = False; + isinreg = False; # check the region of interest for cur_mr in moptions['region']: if 
(cur_mr[0] in ['', None, rname]): @@ -512,7 +535,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): rightclip += numinfo[-1]; readseq = readseq[:-numinfo[-1]] if mdiinfo[-1] in ['H']: rightclip += numinfo[-1] numinfo = numinfo[:-1]; mdiinfo = mdiinfo[:-1] - if forward_reverse=='+': # remove clipped events + if forward_reverse=='+': # remove clipped events if rightclip>0: m_event = f5data[readk][1][leftclip:-rightclip] else: m_event = f5data[readk][1][leftclip:] else: @@ -531,7 +554,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): if not isinreg: continue; - lastmatch = None; firstmatch = None; + lastmatch = None; firstmatch = None; first_match_pos = None; last_match_pos = None last_al_match = None; first_al_match = None lasmtind = 0; @@ -560,7 +583,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): numinsert += 1 elif mdi == 'D': base_map_info.append((refseq[pos], '-', pos, read_ind, 0)) - pos += 1; + pos += 1; numdel += 1 elif mdi == 'N': base_map_info.append((refseq[pos], '-', pos, read_ind, 0)) @@ -582,16 +605,16 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): if first_match_pos==None: first_match_pos = pos if last_match_pos==None or last_match_pos1: leftclip += len(m_event)-lastmatch-1 - + if forward_reverse=='+': if len(m_event)-lastmatch>1: m_event = m_event[firstmatch:(lastmatch+1-len(m_event))] @@ -629,7 +652,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): base_map_info = base_map_info[first_al_match:(last_al_match+1-len(base_map_info))] elif first_al_match>0: base_map_info = base_map_info[first_al_match:] - + # format base base_map_info = np.array(base_map_info, dtype=[('refbase', 'U1'), ('readbase', 'U1'), ('refbasei', np.uint64), ('readbasei', np.uint64), ('mod_pred', np.int)]) if forward_reverse=='-': @@ -645,7 +668,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): read_align_list = [bt.decode(encoding="utf-8") for 
bt in mf5[read_align_key]] ref_align_list = [bt.decode(encoding="utf-8") for bt in mf5[ref_align_key]] for rali in range(len(read_align_list)): - if not read_align_list[rali]==base_map_info['readbase'][rali]: + if not read_align_list[rali]==base_map_info['readbase'][rali]: print ("Error not equal1! %s %s %d %s" % (read_align_list[rali], base_map_info['readbase'][rali], rali, f5data[readk][3])) if not ref_align_list[rali]==base_map_info['refbase'][rali]: print ("Error not equal2! %s %s %d %s" % (ref_align_list[rali], base_map_info['refbase'][rali], rali, f5data[readk][3])) @@ -662,7 +685,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): while ali + addali < len(base_map_info): if base_map_info['readbase'][ali+addali]=='-' and base_map_info['refbase'][ali+addali]=='G': addali += 1; else: break; - if ali + addali < len(base_map_info) and base_map_info['readbase'][ali+addali]=='G' and base_map_info['refbase'][ali+addali]=='G': + if ali + addali < len(base_map_info) and base_map_info['readbase'][ali+addali]=='G' and base_map_info['refbase'][ali+addali]=='G': base_map_info['readbase'][ali+1], base_map_info['readbase'][ali+addali] = base_map_info['readbase'][ali+addali], base_map_info['readbase'][ali+1] if base_map_info['refbase'][ali]=='G' and base_map_info['readbase'][ali]=='G': if ali-1>-1 and base_map_info['readbase'][ali-1]=='-' and base_map_info['refbase'][ali-1]=='C': @@ -701,7 +724,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): pred_group = base_group.create_group(pred_f5_key) # save mapped chr, strand, positions - pred_group.attrs['mapped_chr'] = rname + pred_group.attrs['mapped_chr'] = rname pred_group.attrs['mapped_strand'] = forward_reverse pred_group.attrs['mapped_start'] = base_map_info['refbasei'][0] if forward_reverse=='+' else base_map_info['refbasei'][-1] pred_group.attrs['mapped_end'] = base_map_info['refbasei'][-1] if forward_reverse=='+' else base_map_info['refbasei'][0] @@ -709,7 +732,7 @@ def 
handle_record(moptions, sp_options, sp_param, f5align, f5data): if forward_reverse=='+': pred_group.attrs['clipped_bases_start'] = leftclip pred_group.attrs['clipped_bases_end'] = rightclip - else: + else: pred_group.attrs['clipped_bases_start'] = rightclip pred_group.attrs['clipped_bases_end'] = leftclip @@ -725,7 +748,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): pred_group.attrs['readk'] = readk base_map_info = np.array(base_map_info, dtype=[('refbase', 'S1'), ('readbase', 'S1'), ('refbasei', np.uint64), ('readbasei', np.uint64), ('mod_pred', np.int)]) pred_group.create_dataset('predetail', data=base_map_info, compression="gzip") - + try: save_data.flush(); save_data.close(); @@ -741,7 +764,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): cur_chr = None; cur_writer = None; for mfi in sp_options['Mod']: if cur_chr==None or (not cur_chr == mfi[0]): - if not cur_chr==None: + if not cur_chr==None: cur_writer.flush(); cur_writer.close() cur_chr = mfi[0] @@ -750,16 +773,16 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): for mfidetail in mfi: cur_m_f.append(str(mfidetail)) cur_m_f.append('\n') - cur_writer.write(' '.join(cur_m_f)) - if not cur_writer==None: + cur_writer.write(' '.join(cur_m_f)) + if not cur_writer==None: cur_writer.flush(); - cur_writer.close() + cur_writer.close() # # make modificatoin prediction for a long read -# +# def mPredict1(moptions, sp_options, sp_param, mfeatures, base_map_info, readk, start_clip, end_clip): - # + # modevents = sp_param['f5data'][readk][1] # get features. 
labels might be all zero t0, ty, tx = np.split(mfeatures, [1,3], axis=1); @@ -770,14 +793,14 @@ def mPredict1(moptions, sp_options, sp_param, mfeatures, base_map_info, readk, s if ie>=start_clip and ie rnn_pred_batch_size*1.2: x_sub_group = np.array_split(test_feature, int(len(test_feature)/rnn_pred_batch_size)) @@ -792,21 +815,21 @@ def mPredict1(moptions, sp_options, sp_param, mfeatures, base_map_info, readk, s else: mfpred_output = np.concatenate((mfpred_output, (sp_options['rnn'][0].run([sp_options['rnn'][4]], \ feed_dict={sp_options['rnn'][1]:x_sub_group[subi], sp_options['rnn'][2]:y_sub_group[subi]}))[0]), axis=0); - + # associate the prediction with reference positions and read positions modevents = sp_param['f5data'][readk][1] aligni = 0; pred_mod_num = 0; for ie in range(start_clip, len(modevents)-end_clip): - while base_map_info['readbase'][aligni]=='-': aligni += 1 + while base_map_info['readbase'][aligni]=='-': aligni += 1 if not base_map_info['readbase'][aligni] == modevents['model_state'][ie][2]: print ('Error Does not match', base_map_info['readbase'][aligni], modevents['model_state'][ie][2], aligni, ie) if mfpred_output[ie-start_clip]==1: base_map_info['mod_pred'][aligni] = 1; pred_mod_num += 1; - - aligni += 1 + + aligni += 1 return pred_mod_num - + # # get feature for a long read # @@ -818,7 +841,7 @@ def get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, start_cl align_ref_pos = mapped_start_pos else: align_ref_pos = mapped_start_pos + len(base_map_info) - num_insertions - 1 - + # initialize feature matrix if moptions['fnum']==57: mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (binnum+3+3+4))); @@ -899,12 +922,12 @@ def calculate_mean_std(m_event, event_ind, forward_reverse, raw_pv, moptions, sp # # get required information from a mapped record -# +# def handle_line(moptions, sp_param, f5align): lsp = sp_param['line'].split('\t') qname, flag, rname, pos, mapq, cigar, _, _, _, seq, _ = lsp[:11] # check query 
name, map quality, reference position, cigar and reference name - if qname=='*': sp_param['f5status'] = "qname is *" + if qname=='*': sp_param['f5status'] = "qname is *" elif int(mapq)==255: sp_param['f5status'] = "mapq is 255" elif int(pos)==0: sp_param['f5status'] = "pos is 0" elif cigar=='*': sp_param['f5status'] = "cigar is *" @@ -920,39 +943,39 @@ def handle_line(moptions, sp_param, f5align): # the worker of the detection in a multiprocessing way # def detect_handler(moptions, h5files_Q, failed_Q, file_map_info_q): - + _, init_l, _, _, _, X, Y, _, _, _, _, mfpred = myMultiBiRNN.mCreateSession(moptions['fnum'], moptions['hidden'], moptions['windowsize'], moptions) config = tf.ConfigProto() config.gpu_options.allow_growth = True - sess = tf.Session(config=config) + sess = tf.Session(config=config) # load module new_saver = tf.train.import_meta_graph(moptions['modfile'][0]+'.meta') new_saver.restore(sess,tf.train.latest_checkpoint(moptions['modfile'][1])) - + while not h5files_Q.empty(): cur_start_time = time.time() try: # get fast5 file f5files, ctfolderid, batchid = h5files_Q.get(block=False) except: - break; + break; sp_options = defaultdict(); sp_options['ctfolderid'] = ctfolderid sp_options['ctfolder'] = moptions['outFolder']+moptions['FileID']+'/'+str(ctfolderid) if not os.path.isdir(sp_options['ctfolder']): os.system('mkdir '+sp_options['ctfolder']) - #if moptions['testrnn']: + #if moptions['testrnn']: sp_options['rnn'] = (sess, X, Y, init_l, mfpred) sp_options['batchid'] = batchid - + sp_options['Mod'] = []; # make modification prediction for each fast5 mDetect1(moptions, sp_options, f5files) # outputing errors for errtype, errfiles in sp_options["Error"].items(): failed_Q.put((errtype, errfiles)); - + print ("Cur Prediction consuming time %d for %d %d" % (time.time() - cur_start_time, ctfolderid, batchid)) sess.close() @@ -969,7 +992,7 @@ def read_file_list(cur_cif, cur_chr, cur_strand, sp_options): if len(line)>0: lsp = line.split(); if line[0]=='#': - 
if lsp[1][0] not in ['/', '\\']: + if lsp[1][0] not in ['/', '\\']: lsp[1] = lsp[1] + '/' if lsp[0]=='#base_folder_fast5': sp_options['base_folder_fast5'] = lsp[1]; elif lsp[0]=='#base_folder_output': sp_options['base_folder_output'] = lsp[1]; @@ -979,7 +1002,7 @@ def read_file_list(cur_cif, cur_chr, cur_strand, sp_options): if not lsp[0]==cur_chr: print ('Warning!!! The chr should be %s but % is found.' % (cur_chr, lsp[0])) line = mr.readline(); - sp_options['handlingList'] = cur_list + sp_options['handlingList'] = cur_list # # get prediction detail @@ -988,7 +1011,7 @@ def read_pred_detail(moptions, sp_options, f5info): f5pred_file = sp_options['base_folder_output'] + '/' + f5info[5] f5_pred_key = ('/pred/%s/predetail' % f5info[3]) # get prediction detail from saved prediction file - # each file contains predictions for multiple fast5 + # each file contains predictions for multiple fast5 with h5py.File(f5pred_file, 'r') as mr: m_pred = mr[f5_pred_key].value; mapped_chrom = mr['/pred/%s' % f5info[3]].attrs['mapped_chr'] #.decode(encoding="utf-8") @@ -1068,7 +1091,7 @@ def sum_handler(moptions, chr_strand_Q): sp_options['4NA'][m_pred['refbase'][mi]][(cur_chr, cur_strand, int(m_pred['refbasei'][mi]) )] = [0, 0, m_pred['refbase'][mi]] if not (m_pred['refbase'][mi] == sp_options['4NA'][m_pred['refbase'][mi]][(cur_chr, cur_strand, int(m_pred['refbasei'][mi]) )][2]): print ('Error !!!! 
NA not equal %s == %s' % (m_pred['refbase'][mi], sp_options['4NA'][m_pred['refbase'][mi]][(cur_chr, cur_strand, int(m_pred['refbasei'][mi]) )][2])) - if not m_pred['readbase'][mi]=='-': + if not m_pred['readbase'][mi]=='-': sp_options['4NA'][m_pred['refbase'][mi]][(cur_chr, cur_strand, int(m_pred['refbasei'][mi]) )][0] += 1 if -0.1 < m_pred['mod_pred'][mi]-1 < 0.1: sp_options['4NA'][m_pred['refbase'][mi]][(cur_chr, cur_strand, int(m_pred['refbasei'][mi]) )][1] += 1 @@ -1103,7 +1126,7 @@ def mDetect_manager(moptions): # need to make prediction of modification if moptions['predDet']==1: - + # get well-trained model if moptions['modfile'].rfind('/')==-1: moptions['modfile'] = [moptions['modfile'], './'] @@ -1118,8 +1141,8 @@ def mDetect_manager(moptions): f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*.fast5" ))) f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*.fast5" ))) f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*/*.fast5" ))) - - print('Total files=%d' % len(f5files)) + + print('Total files=%d' % len(f5files)) # output folder if not os.path.isdir(moptions['outFolder']+moptions['FileID']): @@ -1131,7 +1154,7 @@ def mDetect_manager(moptions): failed_Q = pmanager.Queue() # spliting fast5 files into different lists - h5_batch = []; h5batchind = 0; + h5_batch = []; h5batchind = 0; sub_folder_size = 100; sub_folder_id = 0; for f5f in f5files: h5_batch.append(f5f); @@ -1159,7 +1182,7 @@ def mDetect_manager(moptions): try: errk, fns = failed_Q.get(block=False); failed_files[errk].extend(fns) - + except: time.sleep(1); continue; @@ -1202,7 +1225,7 @@ def mDetect_manager(moptions): moptions['outFolder'] = moptions['outFolder']+moptions['FileID'] end_time = time.time(); print ("Per-read Prediction consuming time %d" % (end_time-start_time)) - + ### for summarizing modificatoin prediction start_time = time.time(); # get all index files of prediction @@ -1215,9 +1238,9 @@ def mDetect_manager(moptions): for cur_cif in 
all_chr_ind_files: chr_strand_Q.put((cur_cif, cur_cif.split(pre_base_str)[-1][1:], '+')) chr_strand_Q.put((cur_cif, cur_cif.split(pre_base_str)[-1][1:], '-')) - jobnum +=2 - - # star to summarize modificaiton prediction of reference genomes of interest + jobnum +=2 + + # star to summarize modificaiton prediction of reference genomes of interest share_var = (moptions, chr_strand_Q) handlers = [] for hid in range(moptions['threads'] if moptions['threads']', f5k, '\n', f5data[f5k][0], '\n'])) - temp_fa.flush(); + temp_fa.flush(); if moptions['outLevel']<=myCom.OUTPUT_DEBUG: end_time = time.time(); print ("Write consuming time %d" % (end_time-start_time)) - + # alignment using bwa-mem or minimap2 temp_sam = tempfile.NamedTemporaryFile() if moptions['alignStr']=='bwa': @@ -70,7 +70,7 @@ def mGetFeature1(moptions, sp_options, f5files): sp_param['ref_info'] = defaultdict() if moptions['outLevel']<=myCom.OUTPUT_DEBUG:start_time = time.time(); - ilid = 0; + ilid = 0; # for each record in sam, get alignment information while ilid < len(align_info): if len(align_info[ilid])==0 or align_info[ilid][0]=='@': @@ -134,7 +134,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): if not ( (rname in moptions['fulmodlist'] and len(moptions['fulmodlist'][rname])>0) or \ ((not moptions['anymodlist']==None) and rname in moptions['anymodlist'] and len(moptions['anymodlist'][rname])>0) or \ - ((not moptions['nomodlist']==None) and rname in moptions['nomodlist'] and len(moptions['nomodlist'][rname])>0) ): + ((not moptions['nomodlist']==None) and rname in moptions['nomodlist'] and len(moptions['nomodlist'][rname])>0) ): continue; # get reference sequece @@ -164,7 +164,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): rightclip += numinfo[-1]; readseq = readseq[:-numinfo[-1]] if mdiinfo[-1] in ['H']: rightclip += numinfo[-1] numinfo = numinfo[:-1]; mdiinfo = mdiinfo[:-1] - if forward_reverse=='+': + if forward_reverse=='+': if rightclip>0: m_event = 
f5data[readk][1][leftclip:-rightclip] else: m_event = f5data[readk][1][leftclip:] else: @@ -181,7 +181,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): continue; # associate mapped reference positions with read positions - lastmatch = None; firstmatch = None; + lastmatch = None; firstmatch = None; first_match_pos = None; last_match_pos = None last_al_match = None; first_al_match = None lasmtind = 0; @@ -209,7 +209,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): numinsert += 1 elif mdi == 'D': base_map_info.append((refseq[pos], '-', pos, read_ind)) - pos += 1; + pos += 1; numdel += 1 elif mdi == 'N': base_map_info.append((refseq[pos], '-', pos, read_ind)) @@ -231,16 +231,16 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): if first_match_pos==None: first_match_pos = pos if last_match_pos==None or last_match_pos1: rightclip += len(m_event)-lastmatch-1 - # remove events whose bases are not mapped. + # remove events whose bases are not mapped. 
if forward_reverse=='+': if len(m_event)-lastmatch>1: m_event = m_event[firstmatch:(lastmatch+1-len(m_event))] @@ -274,8 +274,8 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): base_map_info = base_map_info[first_al_match:(last_al_match+1-len(base_map_info))] elif first_al_match>0: base_map_info = base_map_info[first_al_match:] - - # post-process mapping information + + # post-process mapping information base_map_info = np.array(base_map_info, dtype=[('refbase', 'U1'), ('readbase', 'U1'), ('refbasei', np.uint64), ('readbasei', np.uint64)]) if forward_reverse=='-': base_map_info = np.flipud(base_map_info) @@ -290,7 +290,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): read_align_list = [bt.decode(encoding="utf-8") for bt in mf5[read_align_key]] ref_align_list = [bt.decode(encoding="utf-8") for bt in mf5[ref_align_key]] for rali in range(len(read_align_list)): - if not read_align_list[rali]==base_map_info['readbase'][rali]: + if not read_align_list[rali]==base_map_info['readbase'][rali]: print ("Error not equal1! %s %s %d %s" % (read_align_list[rali], base_map_info['readbase'][rali], rali, f5data[readk][3])) if not ref_align_list[rali]==base_map_info['refbase'][rali]: print ("Error not equal2! 
%s %s %d %s" % (ref_align_list[rali], base_map_info['refbase'][rali], rali, f5data[readk][3])) @@ -307,7 +307,7 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): while ali + addali < len(base_map_info): if base_map_info['readbase'][ali+addali]=='-' and base_map_info['refbase'][ali+addali]=='G': addali += 1; else: break; - if ali + addali < len(base_map_info) and base_map_info['readbase'][ali+addali]=='G' and base_map_info['refbase'][ali+addali]=='G': + if ali + addali < len(base_map_info) and base_map_info['readbase'][ali+addali]=='G' and base_map_info['refbase'][ali+addali]=='G': base_map_info['readbase'][ali+1], base_map_info['readbase'][ali+addali] = base_map_info['readbase'][ali+addali], base_map_info['readbase'][ali+1] if base_map_info['refbase'][ali]=='G' and base_map_info['readbase'][ali]=='G': if ali-1>-1 and base_map_info['readbase'][ali-1]=='-' and base_map_info['refbase'][ali-1]=='C': @@ -319,10 +319,10 @@ def handle_record(moptions, sp_options, sp_param, f5align, f5data): base_map_info['readbase'][ali-1], base_map_info['readbase'][ali-addali] = base_map_info['readbase'][ali-addali], base_map_info['readbase'][ali-1] # too short reads if len(m_event)<500: - sp_options["Error"]["Less(<500) events"].append(f5data[readk][3]) + sp_options["Error"]["Less(<500) events"].append(f5data[readk][3]) continue; - # get feautre + # get feautre mfeatures,isdif = get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, leftclip, rightclip, base_map_info, forward_reverse, rname, first_match_pos, numinsert, numdel) if isdif and moptions['outLevel']<=myCom.OUTPUT_WARNING: print("Dif is true") @@ -361,7 +361,7 @@ def get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, start_cl align_ref_pos = mapped_start_pos else: align_ref_pos = mapped_start_pos + len(base_map_info) - num_insertions - 1 - + # initialize feature matrix for all events. 
if moptions['fnum']==57: #mfeatures = np.zeros((len(modevents)-end_clip+100-(start_clip-100), (binnum+3+3+4))); @@ -467,7 +467,7 @@ def get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, start_cl cur_base = base_map_info['refbase'][aligni] # the second/third column is for negative/positive labels of methylation if moptions['posneg'] == 0: # for a data without any modification - if ( rname in moptions['nomodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['nomodlist'][rname] ): + if ( (not moptions['anymodlist']==None) and rname in moptions['nomodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['nomodlist'][rname] ): mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0 elif (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['fulmodlist'][rname]: mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0 @@ -478,7 +478,7 @@ def get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, start_cl mfeatures[cur_row_num][1] = 0; mfeatures[cur_row_num][2] = 1 else: if (forward_reverse, base_map_info['refbasei'][aligni]) not in cgpos[1]: - if moptions['anymodlist']==None: + if moptions['anymodlist']==None: if moptions['nomodlist']==None or ( rname in moptions['nomodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['nomodlist'][rname] ): mfeatures[cur_row_num][1] = 1; mfeatures[cur_row_num][2] = 0 elif rname in moptions['anymodlist'] and (forward_reverse, base_map_info['refbasei'][aligni]) in moptions['anymodlist'][rname]: @@ -491,7 +491,7 @@ def get_Feature(moptions, sp_options, sp_param, f5align, f5data, readk, start_cl else: align_ref_pos -= 1 aligni += 1 - # for bin features + # for bin features if ie>=0 and ie10 or currs<-10: print ('Error raw signal', currs, ie, modevents['start'][ie], modevents['length'][ie]) @@ -542,7 +542,7 @@ def handle_line(moptions, sp_param, f5align): lsp = sp_param['line'].split('\t') qname, flag, rname, pos, mapq, 
cigar, _, _, _, seq, _ = lsp[:11] # checked query name - if qname=='*': sp_param['f5status'] = "qname is *" + if qname=='*': sp_param['f5status'] = "qname is *" # check mapping quality elif int(mapq)==255: sp_param['f5status'] = "mapq is 255" # check mapped positions @@ -559,7 +559,7 @@ def handle_line(moptions, sp_param, f5align): return qname # -# feature handler/workder for multiprocessing +# feature handler/workder for multiprocessing # def getFeature_handler(moptions, h5files_Q, failed_Q, version_Q): while not h5files_Q.empty(): @@ -590,7 +590,7 @@ def readFA(mfa, t_chr=None): with open(mfa, 'r') as mr: cur_chr = None; line = mr.readline(); - while line: + while line: # remove empty spaces line = line.strip(); if len(line)>0: @@ -654,13 +654,13 @@ def getFeature_manager(moptions): start_time = time.time(); # multipprocessing manager pmanager = multiprocessing.Manager(); - + # prepare output folder if os.path.isdir(moptions['outFolder']): os.system('rm -dr '+moptions['outFolder']) if not os.path.isdir(moptions['outFolder']): os.system('mkdir '+moptions['outFolder']) - + moptions['size_per_batch'] = moptions['size_per_batch'] * (10**7) # read reference information @@ -668,7 +668,7 @@ def getFeature_manager(moptions): if moptions['motifORPos']==1: # get motif-based positions for modifications moptions['fulmodlist'], moptions['nomodlist'] = readMotifMod(fadict, moptions['motif'][0], moptions['motif'][1], moptions['region'][0], moptions['region'][1], moptions['region'][2]) moptions['anymodlist'] = None - moptions['nomodlist'] = {_mk:{} for _mk in moptions['nomodlist'].keys()}; # add for simple process + moptions['nomodlist'] = None; # add for simple process elif moptions['motifORPos']==2: # modification position is specified by the files fuldfiles = glob.glob(moptions["fulmod"]); moptions['fulmodlist'] = defaultdict(lambda: defaultdict()); @@ -685,7 +685,7 @@ def getFeature_manager(moptions): mthreadin = [moptions['fulmodlist'], moptions['anymodlist'], 
moptions['nomodlist']] mthfiles = [fuldfiles, anydfiles, nodfiles] # read completely modified positions, partially modified positions, completely un-modified positions from files - for mthi in range(len(mthreadin)): + for mthi in range(len(mthreadin)): curmeth = mthreadin[mthi]; curfilelist = mthfiles[mthi] if curmeth==None or curfilelist==None: continue; for curmthf in curfilelist: @@ -698,7 +698,7 @@ def getFeature_manager(moptions): line = mreader.readline(); for tchr in moptions['fulmodlist'] if moptions['anymodlist']==None else moptions['anymodlist']: if len(moptions['fulmodlist'][tchr])>0 or ((not moptions['anymodlist']==None) and len(moptions['anymodlist'][tchr])>0): - print ('%s fulmod=%d anymod=%d nomod=%d' % (tchr, len(moptions['fulmodlist'][tchr]), len(moptions['anymodlist'][tchr]) if (not moptions['anymodlist']==None) else -1, len(moptions['nomodlist'][tchr]) if (not moptions['nomodlist']==None) else -1)) + print ('%s fulmod=%d anymod=%d nomod=%d' % (tchr, len(moptions['fulmodlist'][tchr]), len(moptions['anymodlist'][tchr]) if (not moptions['anymodlist']==None) else -1, len(moptions['nomodlist'][tchr]) if (not moptions['nomodlist']==None) else -1)) if True: #False: # get all input fast5 files @@ -708,8 +708,8 @@ def getFeature_manager(moptions): f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*.fast5" ))) f5files.extend(glob.glob(os.path.join(moptions['wrkBase'],"*/*/*/*.fast5" ))) - - print('Total files=%d' % len(f5files)) + + print('Total files=%d' % len(f5files)) h5files_Q = pmanager.Queue(); failed_Q = pmanager.Queue() version_Q = pmanager.Queue() @@ -745,7 +745,7 @@ def getFeature_manager(moptions): except: time.sleep(1); continue; - + # output failure information if len(failed_files)>0: print ('Error information for different fast5 files:') @@ -756,7 +756,7 @@ def getFeature_manager(moptions): end_time = time.time(); print ("Total consuming time %d" % (end_time-start_time)) - + # for indepdent testing of code if 
__name__=='__main__': @@ -774,11 +774,9 @@ def getFeature_manager(moptions): moptions['fnum'] = 53; moptions['hidden'] = 100; moptions['windowsize'] = 21; - + moptions['threads'] = 8 moptions['threads'] = 1 moptions['files_per_thread'] = 500 mDetect_manager(moptions) - - diff --git a/bin/scripts/myMultiBiRNN.py b/bin/scripts/myMultiBiRNN.py index cf3569a..03abf95 100644 --- a/bin/scripts/myMultiBiRNN.py +++ b/bin/scripts/myMultiBiRNN.py @@ -38,7 +38,7 @@ def mCreateSession(num_input, num_hidden, timesteps, moptions): def BiRNN(x, weights, biases): x = tf.unstack(x, timesteps, 1); - # define the LSTM cells + # define the LSTM cells lstm_fw_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(num_hidden, forget_bias=1.0) for _ in range(numlayers)]); lstm_bw_cell = rnn.MultiRNNCell([rnn.BasicLSTMCell(num_hidden, forget_bias=1.0) for _ in range(numlayers)]); @@ -58,7 +58,7 @@ def BiRNN(x, weights, biases): logits = BiRNN(X, weights, biases); prediction = tf.nn.softmax(logits) - mfpred=tf.argmax(prediction,1) + mfpred=tf.argmax(prediction,1) ## with different class-weights or not if 'unbalanced' in moptions and (not moptions['unbalanced']==None) and moptions['unbalanced']==1: # class_weights @@ -67,7 +67,7 @@ def BiRNN(x, weights, biases): loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=Y)) # - # for optimizer + # for optimizer optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate); train_op = optimizer.minimize(loss_op); @@ -87,7 +87,7 @@ def BiRNN(x, weights, biases): init_l = tf.local_variables_initializer() saver = tf.train.Saver(); - + return (init, init_l, loss_op, accuracy, train_op, X, Y, saver, auc_op, mpre, mspf, mfpred) # @@ -115,7 +115,7 @@ def train_save_model(filelists, num_input, mhidden, timesteps, moptions): with tf.Session(config=config) as sess: # initialization sess.run(init); - sess.run(init_l) + sess.run(init_l) start_time = time.time(); start_c_time = time.time(); io_time = 0; @@ -130,7 +130,7 @@ def 
train_save_model(filelists, num_input, mhidden, timesteps, moptions): # for each input groups. # usually two groups: one positive group and one negative group - # might also one group containing both positive and negative labelling data + # might also one group containing both positive and negative labelling data featurelist = [[[], []] for _ in range(len(filelists))]; minsize = None; cur_batch_num = None; # get data from all groups until 'minsize' data is loaded. @@ -139,7 +139,7 @@ def train_save_model(filelists, num_input, mhidden, timesteps, moptions): minsize = batchsize * sumpsize else: minsize = batchsize * cur_batch_num; while len(featurelist[ifl][0])3 else len(featurelist)-1 - if (file_group_id[0]+1) - last_desplay_files_num >= desplay_files: + if (file_group_id[0]+1) - last_desplay_files_num >= desplay_files: sess.run(init_l) try: # print some testing information as progress indicators @@ -197,9 +197,9 @@ def train_save_model(filelists, num_input, mhidden, timesteps, moptions): opstr = [] for tol in tok: opstr.append(str(round(tol, 2))) - print("\t\t\t"+','.join(opstr)) + print("\t\t\t"+','.join(opstr)) sys.exit(1) - + # adjust progress output information ifl=3 if len(featurelist)>3 else len(featurelist)-1 @@ -220,7 +220,7 @@ def train_save_model(filelists, num_input, mhidden, timesteps, moptions): os.system('mkdir -p '+moptions['outFolder']+str(step-1)+savp); saver.save(sess, moptions['outFolder']+str(step-1)+savp+'/'+moptions['FileID']); # for each epoch, store the trained model - if (not os.path.isdir(moptions['outFolder']+str(step))): + if (not os.path.isdir(moptions['outFolder']+str(step))): os.system('mkdir -p '+moptions['outFolder']+str(step)); saver.save(sess, moptions['outFolder']+str(step)+'/'+moptions['FileID']); print("Training Finished!") @@ -229,14 +229,14 @@ def train_save_model(filelists, num_input, mhidden, timesteps, moptions): # # get all data files in a folder -# +# def getTFiles1(folder1, moptions): t1files = 
glob.glob(os.path.join(folder1, "*.xy.gz")) # get all data in a recursive way if moptions['recursive']==1: t1files.extend(glob.glob(os.path.join(folder1, "*/*.xy.gz"))) - t1files.extend(glob.glob(os.path.join(folder1, "*/*/*.xy.gz"))); - t1files.extend(glob.glob(os.path.join(folder1, "*/*/*/*.xy.gz"))); + t1files.extend(glob.glob(os.path.join(folder1, "*/*/*.xy.gz"))); + t1files.extend(glob.glob(os.path.join(folder1, "*/*/*/*.xy.gz"))); t1files.extend(glob.glob(os.path.join(folder1, "*/*/*/*/*.xy.gz"))); print("Get folder1"); # for read-based independent testing @@ -266,7 +266,7 @@ def getTFiles(folder1, folder2, moptions): print(t1files.__sizeof__(), len(t1files)) if moptions['test'][0] == '0': if moptions['test'][1]>0.5: - t1files = t1files[:int(len(t1files)*moptions['test'][1])] + t1files = t1files[:int(len(t1files)*moptions['test'][1])] else: t1files = t1files[-int(len(t1files)*moptions['test'][1]):] print(t1files.__sizeof__(), len(t1files)) sys.stdout.flush(); @@ -336,7 +336,7 @@ def getDataFromFile_new(fn, moptions, mfind0ld=None): if has_nan_value: if fn in nan_file: pass else: - print ("Warning-nan-value {}".format(fn)) + print ("Warning-nan-value {}".format(fn)) nan_file.append(fn); else: m_y.append(ty[mind]) @@ -348,7 +348,7 @@ def getDataFromFile_new(fn, moptions, mfind0ld=None): ptofkeys = sorted(list(pos_to_file_dict.keys())) for npk_ind in range(len(ptofkeys)): if (npk_ind+1