In [1]:
import bioluigi.tasks 
from bioluigi.utils import get_ext
from bioluigi.decorators import requires, inherits
import luigi
import os,re
from luigi import LocalTarget, Parameter
import pandas as pd
import numpy as np

In [2]:
def parseHISATLog(logfile, lib):
    '''Parse a HISAT summary log made of ``key: value`` lines.

    :param logfile: path to the HISAT log file of the library
    :param lib: library identifier, stored under 'Library' in the result
    :returns: pandas.Series (object dtype) holding, where present in the log,
              'datetime' (text of the 'Completed' line), every field listed
              in ``keep`` as an int count plus a '<field> %' float, and
              'Overall alignment rate' as a float percentage.
              'Overall alignment rate' is always present: it is NaN when the
              log is missing, lacks the line, or the value cannot be parsed.
              When the log file does not exist every ``keep`` field is NaN.
    '''

    keep = [
        'Total pairs',
        'Aligned concordantly 1 time',
        'Aligned concordantly >1 times',
        'Aligned 1 time',
        'Aligned >1 times',
    ]
    rate_key = 'Overall alignment rate'
    nan = float('nan')

    # Raw strings so the regex escapes are not treated as (invalid) string escapes.
    count_re = re.compile(r'^([0-9]+)')
    percent_re = re.compile(r'(\d+\.\d+)%')

    fields = {}
    try:
        with open(logfile, 'r') as f:
            for line in f:
                key, sep, value = line.strip().partition(':')
                if not sep:
                    # Not a "key: value" line; nothing to extract.
                    continue
                key, value = key.strip(), value.strip()

                if key == 'Completed':
                    fields['datetime'] = value
                elif key in keep:
                    count = count_re.search(value)
                    if count:
                        fields[key] = int(count.group(1))
                    pc = percent_re.search(value)
                    if pc:
                        fields[key + ' %'] = float(pc.group(1))
                elif key == rate_key:
                    pc = percent_re.search(value)
                    fields[key] = float(pc.group(1)) if pc else nan

    except FileNotFoundError:
        # A library that has not been aligned yet still gets a full row of NaNs.
        fields = dict.fromkeys(keep, nan)

    # Callers index the rate unconditionally, so make sure the key exists.
    fields.setdefault(rate_key, nan)
    fields['Library'] = lib
    return pd.Series(fields, dtype=object)

In [3]:
# Time-course RNA-seq libraries: the 42 consecutive accessions
# ERR1224553 .. ERR1224594 (inclusive).
timecourse_libs = ['ERR%d' % accession for accession in range(1224553, 1224595)]

# Directory holding the <library>_1.fastq / <library>_2.fastq read files.
lib_path = '/nbi/Research-Groups/JIC/Diane-Saunders/YR_2018_genome_annotation/timecourse_rnaseq_data'
# Scratch space used for the per-library BAM files.
scratch_dir = '/nbi/scratch/buntingd/genome_annotation_PST104/'
# Root directory for logs and final annotation outputs.
base_dir = '/usr/users/JIC_a1/buntingd/GenomeAnnotation/PST104/data/'


class GetFastq(luigi.ExternalTask):
    '''External task for the paired FASTQ files of one library under lib_path.'''

    library = Parameter()

    def output(self):
        '''Return the <library>_1.fastq and <library>_2.fastq targets, in that order.'''
        return [LocalTarget(os.path.join(lib_path, self.library + suffix))
                for suffix in ('_1.fastq', '_2.fastq')]
    
class ReferenceFasta(luigi.ExternalTask):
    '''External task wrapping the reference file whose path is given by `reference`.'''

    reference = luigi.Parameter()

    def output(self):
        '''Return a LocalTarget pointing at the `reference` path.'''
        fasta_path = self.reference
        return LocalTarget(fasta_path)

@requires(ReferenceFasta)
class RepeatMasker(bioluigi.tasks.repeatmodeler.RepeatMasker):
    '''bioluigi RepeatMasker task with ReferenceFasta declared as its upstream task.

    Adds no behaviour of its own; presumably @requires follows luigi.util.requires
    semantics (inherits the upstream parameters) -- confirm in bioluigi.decorators.
    '''
    pass

@requires(ReferenceFasta)
class HISATIndexGenome(bioluigi.tasks.hisat.HISATIndexGenome):
    '''bioluigi HISATIndexGenome task with ReferenceFasta declared as its upstream task.

    Adds no behaviour of its own beyond the dependency wiring.
    '''
    pass

@requires(reads=GetFastq, genome=HISATIndexGenome)
class HISAT(bioluigi.tasks.hisat.HISAT):
    '''bioluigi HISAT task fed by GetFastq (reads) and HISATIndexGenome (genome).'''

    def output(self):
        '''Return the per-library targets: BAM on scratch, HISAT log under base_dir.'''
        lib = self.library
        bam_path = os.path.join(scratch_dir, lib, lib + '.bam')
        log_path = os.path.join(base_dir, lib, lib + '.hisat.log')
        return {
            'hisat_bam': LocalTarget(bam_path),
            'hisat_log': LocalTarget(log_path),
        }

@requires(HISAT)
class Adaptor(luigi.WrapperTask):
    '''Wrapper that exposes only the 'hisat_bam' entry of the HISAT output.'''

    def output(self):
        '''Return the 'hisat_bam' target taken from this task's input.'''
        hisat_outputs = self.input()
        return hisat_outputs['hisat_bam']

@requires(Adaptor)
class SamtoolsSort(bioluigi.tasks.samtools.SamtoolsSort):
    '''bioluigi SamtoolsSort task with Adaptor (the HISAT BAM) declared as its upstream task.

    Adds no behaviour of its own beyond the dependency wiring.
    '''
    pass

@requires(SamtoolsSort)
class StringTie(bioluigi.tasks.stringtie.StringTie):
    '''bioluigi StringTie task with SamtoolsSort declared as its upstream task.

    Adds no behaviour of its own beyond the dependency wiring.
    '''
    pass

@inherits(StringTie, HISAT)
class StringTieMerge(bioluigi.tasks.stringtie.StringTieMerge):
    '''Merge the StringTie results of every time-course library that passes
    the HISAT alignment-rate filter into a single GTF under base_dir.'''

    # This task spans all libraries, so the class attribute is set to None here
    # (presumably masking the per-library parameter pulled in by @inherits --
    # confirm against bioluigi.decorators).
    library = None

    def requires(self):
        '''Return one StringTie task per library in timecourse_libs whose HISAT
        log reports an overall alignment rate above the filter threshold.

        A library whose rate is unavailable (NaN, or the field is absent from
        the parsed log) is excluded instead of raising KeyError.
        '''
        nan = float('nan')
        rates = []
        for lib in timecourse_libs:
            log_path = self.clone(HISAT, library=lib).output()['hisat_log'].path
            parsed = parseHISATLog(log_path, lib)
            # .get() tolerates logs that carry no overall rate (e.g. missing file).
            rates.append((lib, parsed.get('Overall alignment rate', nan)))

        # NOTE(review): parseHISATLog reports the rate as a percentage, yet the
        # filter below uses 0.1 while the diagnostic print uses 10 -- confirm
        # which threshold is intended. Both values are kept as they were.
        filtered = [lib for lib, rate in rates if rate > 0.1]
        print([(lib, rate) for lib, rate in rates if rate > 10])
        return [self.clone(StringTie, library=lib) for lib in filtered]

    def output(self):
        '''Return the merged annotation target, base_dir/stringtie.gtf.'''
        return LocalTarget(os.path.join(base_dir, 'stringtie.gtf'))

@requires(StringTieMerge)
class GTFtoGFF3(bioluigi.tasks.misc.GTFtoGFF3):
    '''bioluigi GTFtoGFF3 task with StringTieMerge declared as its upstream task.

    Adds no behaviour of its own beyond the dependency wiring.
    '''
    pass            

@requires(fasta=RepeatMasker, gff=GTFtoGFF3)
class CodingQuarry(bioluigi.tasks.codingquarry.CodingQuarry):
    '''bioluigi CodingQuarry task fed by RepeatMasker (fasta) and GTFtoGFF3 (gff).'''

    def output(self):
        '''Return the base_dir/CodingQuarry target.'''
        # NOTE(review): CodingQuarryPM declares this very same target path --
        # confirm the two tasks are meant to share one output location.
        out_dir = os.path.join(base_dir, 'CodingQuarry')
        return LocalTarget(out_dir)

@requires(fasta=RepeatMasker, gff=GTFtoGFF3)
class CodingQuarryPM(bioluigi.tasks.codingquarry.CodingQuarryPM):
    '''bioluigi CodingQuarryPM task fed by RepeatMasker (fasta) and GTFtoGFF3 (gff).'''

    def output(self):
        '''Return the base_dir/CodingQuarry target.'''
        # NOTE(review): CodingQuarry declares this very same target path --
        # confirm the two tasks are meant to share one output location.
        out_dir = os.path.join(base_dir, 'CodingQuarry')
        return LocalTarget(out_dir)

@requires(CodingQuarryPM)
class ConcatGFF(bioluigi.tasks.codingquarry.ConcatGFF):
    '''bioluigi ConcatGFF task with CodingQuarryPM declared as its upstream task.

    Adds no behaviour of its own beyond the dependency wiring.
    '''
    pass

@requires(ConcatGFF)
class FixGFF(bioluigi.tasks.codingquarry.FixGFF):
    '''bioluigi FixGFF task with ConcatGFF declared as its upstream task.

    This is the task handed to luigi.build in this notebook; it adds no
    behaviour of its own beyond the dependency wiring.
    '''
    pass

In [6]:
# Schedule FixGFF (and, through its requirements, the upstream tasks) for the
# given reference FASTA. Uses 25 worker processes and the central scheduler on
# host 'j512n1' rather than a local one. The call's return value is the cell output.
luigi.build([FixGFF(reference='/usr/users/JIC_a1/buntingd/GenomeAnnotation/PST104/reference/Pst_104E_v13_p_ctg.fa')],
            workers=25,
            local_scheduler=False,
            scheduler_host='j512n1',
            log_level='INFO')

INFO: Informed scheduler that task   FixGFF__usr_users_JIC_a_2c99a42331   has status   DONE
INFO: Done scheduling tasks
INFO: Running Worker with 25 processes
INFO: Worker Worker(salt=077415708, workers=25, host=n128n20.nbicluster, username=buntingd, pid=11917) was stopped. Shutting down Keep-Alive thread
INFO: 
===== Luigi Execution Summary =====

Scheduled 1 tasks of which:
* 1 present dependencies were encountered:
    - 1 FixGFF(...)

Did not run any tasks
This progress looks :) because there were no failed tasks or missing external dependencies

===== Luigi Execution Summary =====



True

In [4]:
@requires(ReferenceFasta)
class AddGenome(bioluigi.tasks.jbrowse.AddGenome):
    '''bioluigi JBrowse AddGenome task with ReferenceFasta declared as its upstream task.

    Adds no behaviour of its own beyond the dependency wiring.
    '''
    pass

class PST104Annotation(luigi.ExternalTask):
    '''External task for the existing Pst_104E_v13_p_ctg.anno.gff3 annotation file.'''

    def output(self):
        '''Return a LocalTarget for the fixed annotation GFF3 path.'''
        gff_path = "/usr/users/JIC_a1/buntingd/GenomeAnnotation/PST104/reference/Pst_104E_v13_p_ctg.anno.gff3"
        return LocalTarget(gff_path)
    
@requires(gff=PST104Annotation, genome=AddGenome)
class AddGFF1(bioluigi.tasks.jbrowse.AddGFF):
    '''bioluigi JBrowse AddGFF task fed by PST104Annotation (gff) and AddGenome (genome).

    Adds no behaviour of its own beyond the dependency wiring.
    '''
    pass

@requires(gff=FixGFF, genome=AddGenome)
class AddGFF2(bioluigi.tasks.jbrowse.AddGFF):
    '''bioluigi JBrowse AddGFF task fed by FixGFF (gff) and AddGenome (genome).

    Adds no behaviour of its own beyond the dependency wiring.
    '''
    pass

In [8]:
# Invoke the task's run() method directly instead of going through luigi.build.
# NOTE(review): a direct run() call does not go through the scheduler, so the
# upstream tasks (PST104Annotation, AddGenome) are not built or checked here --
# confirm their outputs already exist before running this cell.
AddGFF1(reference='/usr/users/JIC_a1/buntingd/GenomeAnnotation/PST104/reference/Pst_104E_v13_p_ctg.fa',
         jbrowse_dir='/usr/users/JIC_a1/buntingd/GenomeAnnotation/Jbrowse/JBrowse-1.15.0').run()