Notebook for running the SwissProt, MEROPS, and dbCAN annotation searches from funannotate

In [1]:
from __future__ import division
import os, subprocess, logging, sys, argparse, inspect, csv, time, re, shutil, datetime, glob, platform, multiprocessing, itertools, hashlib
from natsort import natsorted
import warnings
from Bio import SeqIO
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    from Bio import SearchIO

In [2]:
# Hard-coded, machine-specific settings; change these paths to match your own installation.
# DB is the directory that the search functions join with 'uniprot', 'MEROPS' and 'dbCAN.hmm'.
DB = '/home/benjamin/anaconda3/envs/funannotate/.linuxbrew/Cellar/funannotate/0.3.10/libexec/DB'
# Number of threads/CPUs handed to the external search tools.
cpus = 8
# Lineage directory passed to runBUSCO as its DB argument.
BUSCO_DB = '/home/benjamin/anaconda3/envs/Genome_assess/lineages/basidiomycota_odb9'

In [6]:
# Name of the genome being annotated and the two base directories it lives under.
p_genome_filtered = 'DK_0911_v04LT_p_ctg'
BASE_AA_PATH = '/home/benjamin/genome_assembly/Warrior/DK0911_v04'
BASE_A_PATH = '/home/benjamin/genome_assembly/Warrior/genome_v04'

# Final annotation files go to OUT_PATH, intermediate search output to TMP_PATH.
OUT_PATH = os.path.join(
    BASE_AA_PATH, 'protein_annotation', p_genome_filtered, 'funannotate', 'parsed')
TMP_PATH = os.path.join(
    BASE_AA_PATH, 'protein_annotation', p_genome_filtered, 'funannotate', 'tmp')

# Create whichever of the two working directories is still missing.
for _required_dir in (OUT_PATH, TMP_PATH):
    if not os.path.exists(_required_dir):
        os.makedirs(_required_dir)

# Protein FASTA that every search below uses as its query.
protein_input_file = os.path.join(BASE_A_PATH, p_genome_filtered + '.protein.fa')

In [4]:
def SwissProtBlast(input, cpus, evalue, tmpdir, output):
    """Annotate proteins with the description of their best SwissProt hit.

    Runs ``blastp`` against the ``uniprot`` database located under the
    module-level ``DB`` directory, keeping at most one target per query, and
    writes one tab-separated ``<ID>, prot_desc, <description>`` line per
    accepted query.

    A query is accepted only when the first HSP of its first hit has at least
    60% identity over the alignment and the alignment spans at least 60% of
    the query length.

    Args:
        input: Path to the protein FASTA file used as the BLAST query.
        cpus: Number of threads passed to ``blastp``.
        evalue: E-value cutoff passed to ``blastp``.
        tmpdir: Directory that receives the raw ``uniprot.xml`` BLAST output.
        output: Path of the annotation file to write (overwritten).
    """
    blast_tmp = os.path.join(tmpdir, 'uniprot.xml')
    blastdb = os.path.join(DB, 'uniprot')
    cmd = ['blastp', '-db', blastdb, '-outfmt', '5', '-out', blast_tmp,
           '-num_threads', str(cpus), '-max_target_seqs', '1',
           '-evalue', str(evalue), '-query', input]
    # Silence the tool's output; the context manager guarantees the devnull
    # handle is closed again.
    # NOTE(review): the exit status of blastp is not checked here.
    with open(os.devnull, 'w') as devnull:
        subprocess.call(cmd, stdout=devnull, stderr=devnull)

    # Words that are dropped from the description before it is written out.
    bad_words = ['(Fragment)', 'homolog', 'homolog,']

    # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11.
    with open(output, 'w') as out:
        with open(blast_tmp, 'r') as results:
            for qresult in SearchIO.parse(results, "blast-xml"):
                hits = qresult.hits
                if not hits:
                    continue
                best_hsp = hits[0].hsps[0]
                length = best_hsp.aln_span
                # Require >= 60% identity across the aligned region ...
                if best_hsp.ident_num / float(length) < 0.6:
                    continue
                # ... and an alignment covering >= 60% of the query.
                if length / float(qresult.seq_len) < 0.6:
                    continue
                # Keep the text before the first '=' and strip the dangling
                # ' OS' key that precedes it.
                hdescript = hits[0].description.split("=")[0].replace(' OS', '')
                final_desc = ' '.join(
                    word for word in hdescript.split(' ')
                    if word not in bad_words)
                # Annotations are keyed on the gene ID, i.e. without '-T1'.
                ID = qresult.id
                if ID.endswith('-T1'):
                    ID = ID.replace('-T1', '')
                out.write("%s\tprot_desc\t%s\n" % (ID, final_desc))

In [5]:
def MEROPSBlast(input, cpus, evalue, tmpdir, output):
    """Annotate proteins with the ID of their best MEROPS hit.

    Runs ``blastp`` against the ``MEROPS`` database located under the
    module-level ``DB`` directory, keeping at most one target per query, and
    writes one tab-separated ``<ID>, note, MEROPS:<hit id>`` line for every
    query whose first HSP has an e-value not above ``evalue``.

    Args:
        input: Path to the protein FASTA file used as the BLAST query.
        cpus: Number of threads passed to ``blastp``.
        evalue: E-value cutoff, used both for ``blastp`` and for filtering.
        tmpdir: Directory that receives the raw ``merops.xml`` BLAST output.
        output: Path of the annotation file to write (overwritten).
    """
    blast_tmp = os.path.join(tmpdir, 'merops.xml')
    blastdb = os.path.join(DB, 'MEROPS')
    cmd = ['blastp', '-db', blastdb, '-outfmt', '5', '-out', blast_tmp,
           '-num_threads', str(cpus), '-max_target_seqs', '1',
           '-evalue', str(evalue), '-query', input]
    # Silence the tool's output; the context manager guarantees the devnull
    # handle is closed again.
    # NOTE(review): the exit status of blastp is not checked here.
    with open(os.devnull, 'w') as devnull:
        subprocess.call(cmd, stdout=devnull, stderr=devnull)

    # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11.
    with open(output, 'w') as out:
        with open(blast_tmp, 'r') as results:
            for qresult in SearchIO.parse(results, "blast-xml"):
                hits = qresult.hits
                if not hits:
                    continue
                if hits[0].hsps[0].evalue > evalue:
                    continue
                sseqid = hits[0].id
                # Annotations are keyed on the gene ID, i.e. without '-T1'.
                ID = qresult.id
                if ID.endswith('-T1'):
                    ID = ID.replace('-T1', '')
                out.write("%s\tnote\tMEROPS:%s\n" % (ID, sseqid))

In [6]:
def dbCANsearch(input, cpus, evalue, tmpdir, output):
    """Annotate proteins with CAZyme families found by hmmscan against dbCAN.

    Runs ``hmmscan`` with the ``dbCAN.hmm`` profile database located under the
    module-level ``DB`` directory and parses its domain table. A hit is kept
    when its e-value is not above ``evalue`` and its first HSP covers at least
    45% of the HMM length. Kept hits are written twice: as a detailed row in
    ``dbCAN.filtered.txt`` inside ``tmpdir`` and as a tab-separated
    ``<ID>, note, CAZy:<family>`` line in ``output``.

    Args:
        input: Path to the protein FASTA file that is scanned.
        cpus: Number of CPUs passed to ``hmmscan``.
        evalue: E-value cutoff, used both for ``hmmscan`` and for filtering.
        tmpdir: Directory for ``dbCAN.txt`` (raw) and ``dbCAN.filtered.txt``.
        output: Path of the annotation file to write (overwritten).
    """
    HMM = os.path.join(DB, 'dbCAN.hmm')
    dbCAN_out = os.path.join(tmpdir, 'dbCAN.txt')
    dbCAN_filtered = os.path.join(tmpdir, 'dbCAN.filtered.txt')
    cmd = ['hmmscan', '--domtblout', dbCAN_out, '--cpu', str(cpus),
           '-E', str(evalue), HMM, input]
    # Silence the tool's output; the context manager guarantees the devnull
    # handle is closed again.
    # NOTE(review): the exit status of hmmscan is not checked here.
    with open(os.devnull, 'w') as devnull:
        subprocess.call(cmd, stdout=devnull, stderr=devnull)

    # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11.
    with open(output, 'w') as out:
        with open(dbCAN_filtered, 'w') as filtered:
            filtered.write("#HMM_family\tHMM_len\tQuery_ID\tQuery_len\tE-value\tHMM_start\tHMM_end\tQuery_start\tQuery_end\tCoverage\n")
            with open(dbCAN_out, 'r') as results:
                for qresult in SearchIO.parse(results, "hmmscan3-domtab"):
                    query_length = qresult.seq_len
                    for hit in qresult.hits:
                        hit_evalue = hit.evalue
                        if hit_evalue > evalue:
                            continue
                        family = hit.id
                        hmmLen = hit.seq_len
                        # Only the first HSP of each hit is evaluated.
                        first_hsp = hit.hsps[0]
                        hmm_aln = int(first_hsp.hit_end) - int(first_hsp.hit_start)
                        # Fraction of the HMM that the alignment covers.
                        coverage = hmm_aln / float(hmmLen)
                        if coverage < 0.45:
                            continue
                        query = hit.query_id
                        filtered.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%f\n" % (
                            family, hmmLen, query, query_length, hit_evalue,
                            first_hsp.hit_start, first_hsp.hit_end,
                            first_hsp.query_start, first_hsp.query_end,
                            coverage))
                        # Annotations are keyed on the gene ID, i.e. without '-T1'.
                        if query.endswith('-T1'):
                            query = query.replace('-T1', '')
                        out.write("%s\tnote\tCAZy:%s\n" % (query, family))

In [4]:
def runBUSCO(input, DB, cpus, tmpdir, output):
    """Turn an existing BUSCO full table into an annotation file.

    The BUSCO invocation itself is currently disabled (see the commented-out
    call below), so this function only parses
    ``<tmpdir>/run_<name>/full_table_<name>.tsv``, where ``<name>`` is the
    file name of ``input``. For every row whose status is ``Complete`` or
    ``Duplicated`` it writes one tab-separated
    ``<ID>, note, BUSCO:<busco id>`` line to ``output``.

    Args:
        input: Path to the protein FASTA file; only its file name is used.
        DB: BUSCO lineage directory (only used by the disabled BUSCO call).
        cpus: CPU count (only used by the disabled BUSCO call).
        tmpdir: Directory that contains the ``run_<name>`` BUSCO output.
        output: Path of the annotation file to write (overwritten).
    """
    # Executable for the disabled call below; kept so it can be re-enabled.
    BUSCO = '/home/benjamin/anaconda3/envs/Genome_assess/bin/run_busco'
    proteins = os.path.basename(input)
    #subprocess.call([BUSCO, '-i', input, '-m', 'prot', '-l', DB, '-o', proteins, '-c', str(cpus), '-f'], cwd = tmpdir, stdout = FNULL, stderr = FNULL)
    full_table = os.path.join(
        tmpdir, 'run_%s' % proteins, 'full_table_%s.tsv' % proteins)
    # Plain 'r' instead of 'rU': the 'U' flag was removed in Python 3.11.
    with open(output, 'w') as out:
        with open(full_table, 'r') as busco:
            for line in busco:
                col = line.rstrip('\r\n').split('\t')
                # Skip header/comment lines and rows that carry no sequence
                # column (blank lines, rows with only an id and a status).
                if col[0].startswith('#') or len(col) < 3:
                    continue
                if col[1] in ('Complete', 'Duplicated'):
                    # Annotations are keyed on the gene ID, i.e. without '-T1'.
                    ID = col[2]
                    if ID.endswith('-T1'):
                        ID = ID.replace('-T1', '')
                    out.write("%s\tnote\tBUSCO:%s\n" % (ID, col[0]))

In [8]:
#this doesn't really fit here as it generates a tablist file, but hey
def get_names_from_fasta(input, output):
    """Write a tab list naming every record of a FASTA file.

    Each record of ``input`` produces one line in ``output`` that holds the
    record ID, a tab, and the fixed label ``SignalP3``.

    Args:
        input: Path to the FASTA file to read.
        output: Path of the tab list to write (overwritten).
    """
    with open(output, 'w') as out_fh:
        out_fh.writelines(
            '%s\tSignalP3\n' % record.id
            for record in SeqIO.parse(input, 'fasta'))

In [None]:
# SwissProt annotation of the protein set: e-value cutoff 1e-5, scratch files in TMP_PATH.
blast_out = os.path.join(OUT_PATH, 'annotations.swissprot.txt')
SwissProtBlast(protein_input_file, cpus, 1e-5, TMP_PATH,blast_out)

In [None]:
# dbCAN (CAZy) annotation of the protein set: e-value cutoff 1e-17, scratch files in TMP_PATH.
dbCAN_out = os.path.join(OUT_PATH, 'annotations.dbCAN.txt')
dbCANsearch(protein_input_file, cpus, 1e-17, TMP_PATH, dbCAN_out)

In [None]:
# MEROPS annotation of the protein set: e-value cutoff 1e-5, scratch files in TMP_PATH.
merops_out = os.path.join(OUT_PATH, 'annotations.merops.txt')
MEROPSBlast(protein_input_file, cpus, 1e-5, TMP_PATH, merops_out)

In [7]:
# BUSCO annotation: parses the full table found under TMP_PATH for the protein file.
busco_out = os.path.join(OUT_PATH, 'annotations.busco.txt')
runBUSCO(protein_input_file, BUSCO_DB, cpus, TMP_PATH, busco_out )



In [13]:
# Build the SignalP3 tab list files; this cell was run twice, once for p and once for h.
# NOTE(review): the variables are named p_secretome_* but both paths point at
# h_ctg files - presumably the state left over from the second run; confirm.
p_secretome_fn = '/home/benjamin/genome_assembly/Warrior/DK0911/Secretome/DK_0911_v04_h_ctg.protein.fa'
p_secretome_out_fn = '/home/benjamin/genome_assembly/Warrior/DK0911_v04/protein_annotation/DK_0911_v04LT_h_ctg/combined/DK_0911_v04LT_h_ctg.SignalP3.tablist'
get_names_from_fasta(p_secretome_fn, p_secretome_out_fn)