In [39]:
'''
1. choose the largest contig/chromosome of each species
2. filter annotation according to the contig
3. run genemark.hmm on the contig
4. convert genemark prediction to gtf
5. Compare gtf result with annotation
'''
import os
import subprocess

import numpy as np
import pandas as pd
from Bio import SeqIO

In [3]:
# Table describing every species; rows are indexed by short name, and the
# shortName column is kept as a regular column as well.
full_info = pd.read_csv('../1_DataSmall/fullTableInfoGff3GffRNAESwithDensity20150829.csv')
full_info.index = full_info.shortName

# Every entry of the RNA-seq result directory is taken as a species short name;
# entries whose name contains 'Clafu1' are skipped.
# os.listdir returns entries in arbitrary order; the per-species lists built in
# later cells all follow the order of this list.
rna_seq_result_path = '/storage3/w/richard/meta2015/placeForRNASeq/'
# Built as a real list because later cells index it and iterate over it several times.
rna_seq_result_shortName = [entry for entry in os.listdir(rna_seq_result_path) if 'Clafu1' not in entry]

# Fail early, naming the culprits, instead of carrying missing rows into later cells.
missing_shortNames = [name for name in rna_seq_result_shortName if name not in full_info.index]
if missing_shortNames:
    raise KeyError('No row in the species table for: {}'.format(', '.join(missing_shortNames)))

cols = ['shortName','longName','gc','intronDensityGff','fastaName','gffName','gff3Name']
supported_set = full_info.loc[rna_seq_result_shortName]  # rna seq data supported set
supported_set[cols].head(4)

Unnamed: 0_level_0,shortName,longName,gc,intronDensityGff,fastaName,gffName,gff3Name
shortName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aspoch1,Aspoch1,Aspergillus ochraceoroseus IBT 24754 v1.0,44.2,2.23,Aspoch1_AssemblyScaffolds.fasta.gz,Aspoch1_GeneCatalog_genes_20140127.gff.gz,Aspoch1.filtered_proteins.FilteredModels1.gff3.gz
Disac1,Disac1,Dissoconium aciculare v1.0,52.7,1.17,Disac1_AssemblyScaffolds.fasta.gz,Disac1_GeneCatalog_genes_20130805.gff.gz,Disac1.filtered_proteins.FilteredModels2.gff3.gz
Cenge3,Cenge3,Cenococcum geophilum 1.58 v2.0,37.5,1.93,Cenge3_AssemblyScaffolds.fasta.gz,Cenge3_GeneCatalog_genes_20130403.gff.gz,Cenge3.filtered_proteins.FilteredModels1.gff3.gz
Conli1,Conli1,Coniochaeta ligniaria CBS 111746,51.9,1.98,Conli1_AssemblyScaffolds.fasta.gz,Conli1_GeneCatalog_genes_20130808.gff.gz,Conli1.filtered_proteins.ExternalModels.gff3.gz


In [12]:
# 1. choose the largest contig/chromosome of each species
# 1.1 read in all genomes
def _genome_fasta_path(species):
    """Return the path of the FASTA file read for the species `species`."""
    # The table stores names such as "<name>_AssemblyScaffolds.fasta.gz";
    # the last three characters (".gz") are dropped to get the file that is read.
    unzipped_name = supported_set.loc[species].fastaName[:-3]
    return '{}{}/data/{}'.format(rna_seq_result_path, species, unzipped_name)


# genomes[k] is the list of all FASTA records of the k-th species,
# in the order of rna_seq_result_shortName.
genomes = [list(SeqIO.parse(_genome_fasta_path(species), 'fasta'))
           for species in rna_seq_result_shortName]

In [18]:
# 1.2 calculate length of each contig in each genome and find the argmax for each genome
# One list of record lengths per genome ...
genome_contig_lengths = [[len(record) for record in contigs] for contigs in genomes]
# ... and, per genome, the position of the longest record in that list.
genome_argmax_contig_length = [np.argmax(lengths) for lengths in genome_contig_lengths]

In [25]:
# 1.3 extract contig with longest sequence
# Pair every genome with the position of its longest record and pick that record.
genomes_maxlength_contigs = [contigs[longest]
                             for contigs, longest in zip(genomes, genome_argmax_contig_length)]
## get contigs name
name_maxlength_contigs = [record.id for record in genomes_maxlength_contigs]

In [121]:
# 1.4 write extracted contig with longest sequence
# One FASTA file per species, named "<shortName>_<contig id>.fasta".
outputPath = '/storage3/w/richard/meta2015/finalTestSet20151002/longestContigs/'
for shortName, contigName, longestContig in zip(rna_seq_result_shortName,
                                                name_maxlength_contigs,
                                                genomes_maxlength_contigs):
    filePath = "{}{}_{}.fasta".format(outputPath, shortName, contigName)
    SeqIO.write(longestContig, filePath, 'fasta')

In [41]:
##############################
# 2. filter annotation according to the contig
# For every species, run the external filter_annotation.py script with three
# arguments: input gtf, output gtf and the id of the species' longest contig
# (presumably it keeps only the records of that contig -- confirm in the script).
annotationPath = "/storage3/w/richard/meta2015/finalTestSet20151002/gtf_converted_from_gff/"
# gffName[:-7] drops the last seven characters (".gff.gz" in the table);
# the converted annotation files carry the ".gtf" extension instead.
annotationNames = [gffName[:-7]+".gtf" for gffName in supported_set.gffName]

outputPath = '/storage3/w/richard/meta2015/finalTestSet20151002/longestContigsAnnotation/'
command = 'python /home/richard/research/4_Tools/filter_annotation.py'
for i in range(len(annotationNames)):
    shortName = rna_seq_result_shortName[i]
    contigName = name_maxlength_contigs[i]
    outputName = '{}{}_{}.gtf'.format(outputPath,shortName,contigName)
    inputName = annotationPath+annotationNames[i]
    # Pass an argument list and no shell: contig ids come straight from FASTA
    # headers and may contain characters (|, spaces, ;) a shell would interpret.
    # check_call still raises CalledProcessError on a non-zero exit status.
    subprocess.check_call(command.split() + [inputName, outputName, contigName])

In [58]:
##############################
# 3. run genemark.hmm on the contigs
# This cell only writes a shell script with one gmhmme3 command per species;
# it does not run the script.
speciesCount = len(annotationNames)  # number of per-species entries to generate

fastaPath = "/storage3/w/richard/meta2015/finalTestSet20151002/longestContigs/"
fastaNames = ["{}_{}.fasta".format(rna_seq_result_shortName[k], name_maxlength_contigs[k])
              for k in range(speciesCount)]

modelPath = '/home/richard/largeDataSet/ES_Run_modfileAug2015/'
modelNames = ['{}.ES_C_4.mod'.format(species) for species in rna_seq_result_shortName]

outputPath = '/storage3/w/richard/meta2015/finalTestSet20151002/longestContigsGenemarkhmm/'
# fasta[:-6] removes the ".fasta" suffix before the prediction suffix is added.
outputNames = [fasta[:-6] + '.gmhmme3' for fasta in fastaNames]

base = '/home/richard/research/4_Tools/essuite/gmhmme3 -m '
commandLines = []
for k in range(speciesCount):
    modelFile = modelPath + modelNames[k]
    fastaFile = fastaPath + fastaNames[k]
    predictionFile = outputPath + outputNames[k]
    # "<gmhmme3> -m <model> <fasta> -o <prediction>"
    commandLines.append(base + modelFile + " " + fastaFile + " -o " + predictionFile + "\n")
script = ''.join(commandLines)

with open("/home/richard/tempFile/gmhmmContigs.sh", 'w') as f:
    f.write(script)

In [84]:
# print script

In [85]:
# gmhmm3Names

In [125]:
##############################
## 4. convert genemark prediction to gtf
# For every species, run gmhmme3_to_gtf.py with three arguments:
# prediction file, output gtf file and the id of the longest contig.
gmhmm3Path = '/storage3/w/richard/meta2015/finalTestSet20151002/longestContigsGenemarkhmm/'
# Same names as the prediction files requested in step 3 (".fasta" -> ".gmhmme3").
gmhmm3Names = [fasta[:-6] + '.gmhmme3' for fasta in fastaNames]

outputPath = '/storage3/w/richard/meta2015/finalTestSet20151002/longestContigsGenemarkhmmToGTF/'
outputNames = [prediction + ".gtf" for prediction in gmhmm3Names]

base = "python /home/richard/research/4_Tools/gmhmme3_to_gtf.py "
for i in range(len(gmhmm3Names)):
    contigName = name_maxlength_contigs[i]
    # Argument list, no shell: contig ids may contain shell metacharacters.
    arguments = base.split() + [gmhmm3Path + gmhmm3Names[i], outputPath + outputNames[i], contigName]
    print(' '.join(arguments))
    try:
        subprocess.check_call(arguments)
    except (subprocess.CalledProcessError, OSError) as error:
        # Best effort: report the real failure and continue with the next species.
        # Only process failures are caught, so Ctrl-C still interrupts the loop.
        print('Conversion to gtf failed for {}: {}'.format(gmhmm3Names[i], error))

python /home/richard/research/4_Tools/gmhmme3_to_gtf.py  /storage3/w/richard/meta2015/finalTestSet20151002/longestContigsGenemarkhmm/Aspoch1_scaffold_1.gmhmme3 /storage3/w/richard/meta2015/finalTestSet20151002/longestContigsGenemarkhmmToGTF/Aspoch1_scaffold_1.gmhmme3.gtf scaffold_1
python /home/richard/research/4_Tools/gmhmme3_to_gtf.py  /storage3/w/richard/meta2015/finalTestSet20151002/longestContigsGenemarkhmm/Disac1_scaffold_1.gmhmme3 /storage3/w/richard/meta2015/finalTestSet20151002/longestContigsGenemarkhmmToGTF/Disac1_scaffold_1.gmhmme3.gtf scaffold_1
python /home/richard/research/4_Tools/gmhmme3_to_gtf.py  /storage3/w/richard/meta2015/finalTestSet20151002/longestContigsGenemarkhmm/Cenge3_scaffold_1.gmhmme3 /storage3/w/richard/meta2015/finalTestSet20151002/longestContigsGenemarkhmmToGTF/Cenge3_scaffold_1.gmhmme3.gtf scaffold_1
python /home/richard/research/4_Tools/gmhmme3_to_gtf.py  /storage3/w/richard/meta2015/finalTestSet20151002/longestContigsGenemarkhmm/Conli1_NODE_1580.gmhmm

In [126]:
## 5. Compare gtf result with annotation
# This cell only writes a shell script with one evaluate_gtf.pl command per
# species (reference gtf, predicted gtf, stdout redirected to an ".eval" file);
# it does not run the script.
referenceAnnotationPath = '/storage3/w/richard/meta2015/finalTestSet20151002/longestContigsAnnotation/'
referenceAnnotationNames = ["{}_{}.gtf".format(species, contig)
                            for species, contig in zip(rna_seq_result_shortName, name_maxlength_contigs)]

gtf_from_gmhmme3_path = '/storage3/w/richard/meta2015/finalTestSet20151002/longestContigsGenemarkhmmToGTF/'
gtf_from_gmhmme3_names = ["{}_{}.gmhmme3.gtf".format(species, contig)
                          for species, contig in zip(rna_seq_result_shortName, name_maxlength_contigs)]

# Trailing slash required: file names are appended by plain concatenation.
outputPath = '/storage3/w/richard/meta2015/finalTestSet20151002/longestContigsEval/'
# evaluate_gtf.pl is given as a relative path, so it is resolved against the
# working directory in which the generated script is run.
base = "perl evaluate_gtf.pl "
script = ''.join(
    # reference[:-3] + "eval" turns "<name>.gtf" into "<name>.eval"
    base + referenceAnnotationPath + reference + " " + gtf_from_gmhmme3_path + predicted
    + " >" + outputPath + reference[:-3] + "eval\n"
    for reference, predicted in zip(referenceAnnotationNames, gtf_from_gmhmme3_names)
)

with open("/home/richard/tempFile/largestContigsEval.sh", 'w') as f:
    f.write(script)

In [128]:
# print script