In [256]:
'''
Cut Genomes according to 'good' gene filtered by /home/braker/bin/filterGenemark.pl
'''
import os

import numpy as np
import pandas as pd
from Bio import SeqIO

In [3]:
from Bio import SeqIO

In [109]:
# Load the per-genome metadata table and index it by shortName so that rows
# can be selected with the directory names found under the RNA-Seq folder.
full_info = pd.read_csv('../1_DataSmall/fullTableInfoGff3GffRNAESwithDensity20150829.csv')
full_info.index = full_info.shortName

rna_seq_result_path = '/storage3/w/richard/meta2015/placeForRNASeq/'
# Entry names in the RNA-Seq folder are used as shortName keys; entries whose
# name contains 'Clafu1' are left out (the reason is not recorded here).
# A real list is built (instead of a lazy filter object) so the selection can
# be reused and behaves the same on Python 2 and Python 3.
rna_seq_result_shortName = [name for name in os.listdir(rna_seq_result_path)
                            if 'Clafu1' not in name]

cols = ['shortName','longName','gc','intronDensityGff','fastaName','gffName','gff3Name']
supported_set = full_info.loc[rna_seq_result_shortName]  # rna seq data supported set
supported_set[cols].head(4)

Unnamed: 0_level_0,shortName,longName,gc,intronDensityGff,fastaName,gffName,gff3Name
shortName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Aspoch1,Aspoch1,Aspergillus ochraceoroseus IBT 24754 v1.0,44.2,2.23,Aspoch1_AssemblyScaffolds.fasta.gz,Aspoch1_GeneCatalog_genes_20140127.gff.gz,Aspoch1.filtered_proteins.FilteredModels1.gff3.gz
Disac1,Disac1,Dissoconium aciculare v1.0,52.7,1.17,Disac1_AssemblyScaffolds.fasta.gz,Disac1_GeneCatalog_genes_20130805.gff.gz,Disac1.filtered_proteins.FilteredModels2.gff3.gz
Cenge3,Cenge3,Cenococcum geophilum 1.58 v2.0,37.5,1.93,Cenge3_AssemblyScaffolds.fasta.gz,Cenge3_GeneCatalog_genes_20130403.gff.gz,Cenge3.filtered_proteins.FilteredModels1.gff3.gz
Conli1,Conli1,Coniochaeta ligniaria CBS 111746,51.9,1.98,Conli1_AssemblyScaffolds.fasta.gz,Conli1_GeneCatalog_genes_20130808.gff.gz,Conli1.filtered_proteins.ExternalModels.gff3.gz


In [6]:
def filter_fasta(fastaFile, annotationFile, fastaFileFiltered, annotationFileUpdated):
    '''Wrapper function to filter multifasta file according to annotation.

    Steps:
      1. read every record of ``fastaFile``;
      2. read the per-gene (start, end) spans from ``annotationFile``;
      3. cut each record down to its annotated spans, drop records with no
         annotated span, and write the result to ``fastaFileFiltered``;
      4. compute the coordinate shift of every gene and write the re-based
         annotation to ``annotationFileUpdated``.

    Returns None; prints the location of the filtered fasta file.
    '''
    seqRecords = read_fasta(fastaFile)
    annotation_boundary, annotation_items = read_annotation_location(annotationFile)
    # deal with fasta
    cut_records = [cut_fasta_by_annotation(record, annotation_boundary)
                   for record in seqRecords]
    # Identity test on purpose: `!= None` would go through the record's own
    # comparison operators instead of simply checking for the None marker.
    SeqRecords_filtered = [record for record in cut_records if record is not None]
    SeqIO.write(SeqRecords_filtered, fastaFileFiltered, "fasta")
    # deal with annotation
    items_updated = calculate_location_shift(annotation_items)
    update_annotation(items_updated, annotationFile, annotationFileUpdated)
    # Single pre-formatted argument so the output is the same on Python 2 and 3.
    print("The filtered fasta file is stored at location: %s" % fastaFileFiltered)




def read_annotation_location(annotationFile):
    '''Version 0.1, cut start and stop region.

    Collect, for every gene, the span covered by its start_codon and
    stop_codon lines.

    Only lines whose third whitespace-separated field is ``start_codon`` or
    ``stop_codon`` are used; blank, comment and other short lines are skipped.
    A gene is keyed by ``(field 0, field 9)``, i.e. the sequence name and the
    raw token that follows ``gene_id`` (quotes and semicolon included).

    The span is ``(min, max)`` over all collected coordinates, so a codon
    split over several lines still yields the full gene span. A gene with
    fewer than two codon lines is reported with a printed message and kept
    with the span of what was found.

    Returns two sorted lists:
      * ``[(fasta_id, (start, end)), ...]``
      * ``[((fasta_id, gene_id), (start, end)), ...]``
    '''
    codon_features = ('start_codon', 'stop_codon')
    gene_info = {}
    with open(annotationFile) as f:
        for line in f:
            info = line.split()
            if len(info) < 10 or info[2] not in codon_features:
                continue
            key = (info[0], info[9])
            gene_info.setdefault(key, []).extend((int(info[3]), int(info[4])))

    items = []
    for key, locs in gene_info.items():
        if len(locs) < 4:
            # Only one codon line was found for this gene.
            print('there is a problem with: %s' % (key,))
        # Always a tuple, so every entry has the same shape for the callers.
        items.append((key, (min(locs), max(locs))))
    result = [(key[0], span) for key, span in items]
    return sorted(result), sorted(items)

def calculate_location_shift(annotation_items):
    '''
    Compute how far every gene moves to the left once everything outside the
    gene spans has been cut out of its sequence.

    input: iterable of (key, (start, end)) with key = (fasta_id, gene_id);
           coordinates are 1-based and inclusive
    return: list of (key, left_shift, (new_start, new_end)), ordered by
            fasta_id and then by position

    Genes are processed in positional order within each fasta_id, because the
    running total of kept bases is only meaningful in the order in which the
    pieces are concatenated (sorted by fasta id, then by (start, end)).
    The order of the incoming items (e.g. sorted by gene id) is not relied on.
    '''
    ordered = sorted(annotation_items,
                     key=lambda item: (item[0][0], tuple(item[1]), item[0]))

    kept_so_far = {}  # fasta_id -> number of bases already kept on that sequence
    items_updated = []
    for key, (start, end) in ordered:
        fasta_id = key[0]
        kept = kept_so_far.get(fasta_id, 0)
        # Bases in front of the gene in the original, minus those that are kept.
        left_shift = (start - 1) - kept
        kept_so_far[fasta_id] = kept + (end - start + 1)
        # The shifted span is included for verification purposes.
        items_updated.append((key, left_shift, (start - left_shift, end - left_shift)))
    return items_updated

def update_annotation(items_updated,annotationFile,annotationFileUpdated):
    '''
    Write a copy of annotationFile whose start/end columns are shifted left.

    items_updated: iterable of (key, left_shift, _) with
                   key = (sequence name, token following gene_id)
    annotationFile: tab-separated annotation to read
    annotationFileUpdated: path of the shifted annotation to write

    Blank lines, lines starting with '#', and lines with fewer than ten
    whitespace-separated fields carry no gene key and are copied unchanged.
    Raises KeyError when a feature line belongs to a gene that is missing
    from items_updated.
    '''
    #transform tuples into dictionary
    lookup_table = {key: shift for key, shift, _ in items_updated}
    with open(annotationFile) as f, open(annotationFileUpdated, 'w') as f1:
        for line in f:
            info = line.split()
            if len(info) < 10 or line.startswith('#'):
                f1.write(line)
                continue
            # Columns are rewritten on the tab-split form so that the rest of
            # the line (including the attribute column) is kept as is.
            data = line.split('\t')
            shift_val = lookup_table[(info[0], info[9])]
            data[3] = str(int(data[3]) - shift_val)
            data[4] = str(int(data[4]) - shift_val)
            f1.write('\t'.join(data))

def cut_fasta_by_annotation(seqRecord, filteredAnnotation):
    '''
    20151003
    Cut one sequence record down to its annotated spans.

    seqRecord: one record of the (multi)fasta file; it must expose ``.id``,
               support slicing, and support ``+`` between slices
    filteredAnnotation: iterable of (fasta_id, (start, end)) for all records,
                        with 1-based inclusive coordinates

    Returns the concatenation of the slices whose fasta_id equals the record
    id, in the order in which they appear in filteredAnnotation, or None when
    no span belongs to this record.
    '''
    idx = seqRecord.id
    # start-1:end converts the 1-based inclusive span into a Python slice.
    pieces = [seqRecord[start - 1:end]
              for fasta_id, (start, end) in filteredAnnotation
              if fasta_id == idx]
    if not pieces:
        return None
    merged = pieces[0]
    for piece in pieces[1:]:
        merged = merged + piece
    return merged

def read_fasta(fastaFile):
    '''Parse fastaFile in "fasta" format with SeqIO and return all records as a list.'''
    records = []
    for record in SeqIO.parse(fastaFile, 'fasta'):
        records.append(record)
    return records

In [7]:
## testing
# End-to-end run of filter_fasta on the Conli1 assembly: reads the scaffolds
# and the ".f.good.gtf" annotation, then writes the cut FASTA and the updated
# GTF into the Oct03ValidateGffToGTF test directory.
fastaFile = '/storage3/w/richard/meta2015/placeForRNASeq/Conli1/data/Conli1_AssemblyScaffolds.fasta'
annotationFile = '/storage3/w/richard/meta2015/finalTestSet20151002/gtf_converted_from_gff/Conli1_GeneCatalog_genes_20130808.f.good.gtf'
fastaFileFiltered = '/home/richard/research/5_Tests/Oct03ValidateGffToGTF/Conli1_AssemblyScaffolds.filtered.fasta'
annotationFileUpdated = '/home/richard/research/5_Tests/Oct03ValidateGffToGTF/Conli1_GeneCatalog_genes_20130808.f.good.updated.test.gtf'
filter_fasta(fastaFile, annotationFile, fastaFileFiltered,annotationFileUpdated)
## passed, seems correct

The filtered fasta file is stored at location: /home/richard/research/5_Tests/Oct03ValidateGffToGTF/Conli1_AssemblyScaffolds.filtered.fasta


In [257]:
### Testing code###########
### Testing code###########
### Testing code###########

In [266]:
# Find the longest record of the filtered FASTA written by filter_fasta.
testRecs = list(SeqIO.parse('/home/richard/research/5_Tests/Oct03ValidateGffToGTF/Conli1_AssemblyScaffolds.filtered.fasta','fasta'))
lens = [len(rec) for rec in testRecs]

# Use the computed index instead of a hard-coded one, so the three lines
# printed below always describe the same record (the recorded run gave 209).
longest_idx = int(np.argmax(lens))
print(longest_idx)
print('%s is the longest sequence' % testRecs[longest_idx].id)
print(len(testRecs[longest_idx]))


209
NODE_1580 is the longest sequence
156541


In [269]:
## update annotation for NODE_1580, part I, extract information
annotationFile = '/storage3/w/richard/meta2015/finalTestSet20151002/gtf_converted_from_gff/Conli1_GeneCatalog_genes_20130808.f.good.gtf'
annotationFileNODE_1580 = '/home/richard/research/5_Tests/Oct03ValidateGffToGTF/Conli1_GeneCatalog_genes_20130808.f.good.NODE_1580.gtf'
# Keep the lines whose sequence-name column (first field) is exactly
# NODE_1580. A substring test on the whole line would also accept other
# sequences whose name merely starts with it, e.g. NODE_15801.
with open(annotationFile) as f, open(annotationFileNODE_1580, 'w') as f1:
    for line in f:
        fields = line.split()
        if fields and fields[0] == 'NODE_1580':
            f1.write(line)

In [322]:
# Scratch check: splitting a sample annotation line on tabs keeps the whole
# attribute column (gene_id ... transcript_id ...) together as the last field.
line = 'NODE_1010	JGI	exon	3990	4270	0	+	.	gene_id ""NODE_1010_length_32477_cov_21..g1887.t1""; transcript_id ""NODE_1010_length_32477_cov_21..g1887.t1"";'
line.split('\t')

['NODE_1010',
 'JGI',
 'exon',
 '3990',
 '4270',
 '0',
 '+',
 '.',
 'gene_id ""NODE_1010_length_32477_cov_21..g1887.t1""; transcript_id ""NODE_1010_length_32477_cov_21..g1887.t1"";']

In [320]:
## update annotation for NODE_1580, part II, update
# Only the second return value (the per-gene items keyed by (fasta_id, gene_id))
# is needed here; the first one is discarded.
_, annotation_items = read_annotation_location(annotationFile)
# for item in annotation_items[:100]: print item
# print 
def calculate_location_shift(annotation_items):
    '''
    Compute how far every gene moves to the left once everything outside the
    gene spans has been cut out of its sequence.

    input: iterable of (key, (start, end)) with key = (fasta_id, gene_id);
           coordinates are 1-based and inclusive
    return: list of (key, left_shift, (new_start, new_end)), ordered by
            fasta_id and then by position

    Genes are processed in positional order within each fasta_id, because the
    running total of kept bases is only meaningful in the order in which the
    pieces are concatenated (sorted by fasta id, then by (start, end)).
    The order of the incoming items (e.g. sorted by gene id) is not relied on.
    '''
    ordered = sorted(annotation_items,
                     key=lambda item: (item[0][0], tuple(item[1]), item[0]))

    kept_so_far = {}  # fasta_id -> number of bases already kept on that sequence
    items_updated = []
    for key, (start, end) in ordered:
        fasta_id = key[0]
        kept = kept_so_far.get(fasta_id, 0)
        # Bases in front of the gene in the original, minus those that are kept.
        left_shift = (start - 1) - kept
        kept_so_far[fasta_id] = kept + (end - start + 1)
        # The shifted span is included for verification purposes.
        items_updated.append((key, left_shift, (start - left_shift, end - left_shift)))
    return items_updated

# for item in calculate_location_shift(annotation_items)[:100]: print item
# for item in update_annotation(annotation_items): print item
# Per-gene shifts, kept in a variable for the annotation-update step.
items_updated = calculate_location_shift(annotation_items)

In [326]:
# Input annotation and the path the shifted annotation is written to.
annotationFile = '/storage3/w/richard/meta2015/finalTestSet20151002/gtf_converted_from_gff/Conli1_GeneCatalog_genes_20130808.f.good.gtf'
annotationFileUpdated = '/home/richard/research/5_Tests/Oct03ValidateGffToGTF/Conli1_GeneCatalog_genes_20130808.f.good.updated.gtf'
def update_annotation(items_updated,annotationFile,annotationFileUpdated):
    '''
    Write a copy of annotationFile whose start/end columns are shifted left.

    items_updated: iterable of (key, left_shift, _) with
                   key = (sequence name, token following gene_id)
    annotationFile: tab-separated annotation to read
    annotationFileUpdated: path of the shifted annotation to write

    Blank lines, lines starting with '#', and lines with fewer than ten
    whitespace-separated fields carry no gene key and are copied unchanged.
    Raises KeyError when a feature line belongs to a gene that is missing
    from items_updated.
    '''
    #transform tuples into dictionary
    lookup_table = {key: shift for key, shift, _ in items_updated}
    with open(annotationFile) as f, open(annotationFileUpdated, 'w') as f1:
        for line in f:
            info = line.split()
            if len(info) < 10 or line.startswith('#'):
                f1.write(line)
                continue
            # Columns are rewritten on the tab-split form so that the rest of
            # the line (including the attribute column) is kept as is.
            data = line.split('\t')
            shift_val = lookup_table[(info[0], info[9])]
            data[3] = str(int(data[3]) - shift_val)
            data[4] = str(int(data[4]) - shift_val)
            f1.write('\t'.join(data))

# Write the shifted annotation using the shifts held in items_updated.
update_annotation(items_updated,annotationFile,annotationFileUpdated)                

In [225]:
### check the repairing ability of eval with only CDS and Exon given
### check to see if it can add start and stop codon
annotationFile = '/storage3/w/richard/meta2015/finalTestSet20151002/gtf_converted_from_gff/Conli1_GeneCatalog_genes_20130808.f.good.gtf'
annotationFile_withOnlyCDSandExon = '/home/richard/tempFile/Conli1_GeneCatalog_genes_20130808.f.good.ExonCDS.gtf'
# Select on the feature column (third field) instead of a substring of the
# whole line, so an identifier that happens to contain "CDS" or "exon" cannot
# pull in lines of another feature type.
with open(annotationFile) as f, open(annotationFile_withOnlyCDSandExon, 'w') as f1:
    for line in f:
        fields = line.split()
        if len(fields) > 2 and fields[2] in ('CDS', 'exon'):
            f1.write(line)
## conclusion: no, eval can't add start and stop automatically

In [118]:
## check to see if annotation with no start and stop affect the prediction accuracy
annotationFile = '/home/richard/research/5_Tests/Sep5modelWithLengthDistribution/Schpo1_converted_fixed.gtf'
annotationFile_withOnlyCDSandExon = '/home/richard/tempFile/Schpo1_converted_fixed.ExonCDS.gtf'
# Select on the feature column (third field) instead of a substring of the
# whole line, so an identifier that happens to contain "CDS" or "exon" cannot
# pull in lines of another feature type.
with open(annotationFile) as f, open(annotationFile_withOnlyCDSandExon, 'w') as f1:
    for line in f:
        fields = line.split()
        if len(fields) > 2 and fields[2] in ('CDS', 'exon'):
            f1.write(line)
## conclusion: no, eval can't add start and stop automatically
# NOTE(review): this conclusion is word-for-word the one from the Conli1 check
# and does not answer the accuracy question asked at the top of this cell - confirm.

In [None]:
## test code for creating filter annotation

In [119]:
# Sample CDS annotation line used for ad-hoc inspection of its fields.
line = 'NODE_11529	JGI	CDS	40886	40945	.	+	0	gene_id ""NODE_11529_length_236519_cov_2.g9430.t1""; transcript_id ""NODE_11529_length_236519_cov_2.g9430.t1.a"";'

In [236]:
# Show every whitespace-separated field of `line` together with its index.
# NOTE(review): the recorded output shows a contig_99 start_codon line, so
# `line` held a different value than the NODE_11529 sample when this last ran.
print(list(enumerate(line.split())))

[(0, 'contig_99'), (1, 'JGI'), (2, 'start_codon'), (3, '1919'), (4, '1921'), (5, '.'), (6, '-'), (7, '0'), (8, 'gene_id'), (9, '""PNEJI1_003517g.01"";'), (10, 'transcript_id'), (11, '""PNEJI1_003517g.01"";')]


In [237]:
## collect information by gene_id
annotationFile = '/storage3/w/richard/meta2015/finalTestSet20151002/gtf_converted_from_gff/Conli1_GeneCatalog_genes_20130808.f.good.gtf'
# annotationFile = '/home/richard/research/5_Tests/Sep5modelWithLengthDistribution/Schpo1_converted_fixed.gtf'
# annotationFile = '/home/richard/research/5_Tests/Sep7modelWithLengthDistributionNCrassa/eval/NCrassaContig12.1.fixed.gtf'
# annotationFile = '/storage3/w/richard/meta2015/finalTestSet20151002/gff/Pneji1_GeneCatalog_genes_20130607.gff'
# annotationFile = '/storage3/w/richard/meta2015/finalTestSet20151002/gtf_converted_from_gff/Pneji1_GeneCatalog_genes_20130607.f.good.gtf'

# gene_info maps gene_id -> {feature tag -> raw line}. When a gene has several
# lines with the same tag, only the last one read is kept.
gene_info = {}
with open(annotationFile) as f:
    for line in f:
        # Lines of ten characters or fewer are skipped.
        if len(line) <= 10:
            continue
        data = line.split()
        tag, gene_id = data[2], data[9]
        gene_info.setdefault(gene_id, {})[tag] = line

In [241]:
# Print tag, strand and (start, end) of each stored line for the first five genes.
# for gene_id in gene_info.keys():
for gene_id in list(gene_info)[:5]:
    features = gene_info[gene_id]
    if features:
        for tag, raw_line in features.items():
            fields = raw_line.split()
            print('%s %s %s' % (tag, fields[6], (fields[3], fields[4])))
    print('###########')

start_codon - ('51891', '51893')
exon - ('51891', '51893')
stop_codon - ('51245', '51247')
CDS - ('51891', '51893')
###########
start_codon - ('161428', '161430')
exon - ('161127', '161430')
stop_codon - ('160412', '160414')
CDS - ('161127', '161430')
###########
start_codon + ('29465', '29467')
exon + ('29465', '29704')
stop_codon + ('29702', '29704')
CDS + ('29465', '29704')
###########
start_codon + ('25649', '25651')
exon + ('25649', '27088')
stop_codon + ('27086', '27088')
CDS + ('25649', '27088')
###########
start_codon + ('467', '469')
exon + ('1597', '1787')
stop_codon + ('1785', '1787')
CDS + ('1597', '1787')
###########


In [252]:
# test read_annotation_location
# NOTE(review): the recorded output below is a single list of (fasta_id, (start, end)),
# while read_annotation_location as defined in this notebook returns a pair of
# lists - the output looks like it comes from an earlier version; confirm.
annotationFile = '/storage3/w/richard/meta2015/finalTestSet20151002/gtf_converted_from_gff/Conli1_GeneCatalog_genes_20130808.f.good.gtf'
read_annotation_location(annotationFile)

[('NODE_1010', (474, 2881)),
 ('NODE_1010', (3990, 5371)),
 ('NODE_1010', (5488, 6834)),
 ('NODE_1010', (7074, 7406)),
 ('NODE_1010', (8084, 8332)),
 ('NODE_1010', (8748, 10578)),
 ('NODE_1010', (11266, 12642)),
 ('NODE_1010', (13747, 16446)),
 ('NODE_1010', (19540, 20490)),
 ('NODE_1010', (20660, 21627)),
 ('NODE_1010', (26507, 27742)),
 ('NODE_1011', (5781, 7484)),
 ('NODE_1011', (10160, 11196)),
 ('NODE_1012', (791, 1668)),
 ('NODE_1012', (7750, 8013)),
 ('NODE_1012', (12991, 13651)),
 ('NODE_1012', (13776, 14577)),
 ('NODE_1012', (15649, 16329)),
 ('NODE_1012', (24434, 25650)),
 ('NODE_1012', (25799, 26864)),
 ('NODE_1012', (34530, 36320)),
 ('NODE_1012', (37725, 38698)),
 ('NODE_1013', (7671, 8392)),
 ('NODE_1013', (8546, 10142)),
 ('NODE_1013', (10662, 11959)),
 ('NODE_102', (2086, 3058)),
 ('NODE_102', (5309, 7776)),
 ('NODE_102', (9555, 10396)),
 ('NODE_102', (16664, 17918)),
 ('NODE_102', (18018, 19278)),
 ('NODE_102', (21170, 23571)),
 ('NODE_102', (27778, 29460)),
 ('NODE_10

In [None]:
# def read_annotation_location(annotationFile):
#     '''
#     Get all the location to be cut
    
#     Observation:
#     JGI annotation has start/stop codon intersect with CDS
#     NODE_1010	JGI	start_codon	26507	26509
#     NODE_1010	JGI	CDS	26507	27742
#     NODE_1010	JGI	stop_codon	27740	27742
    
#     Conclusion:
#     Only need to layout the CDS(from start to stop) region and cut
#     '''
#     locations = []
#     with open(annotationFile) as f:
#         for line in f:
#             if "CDS" in line:
#                 info = line.split()
# #                 locations.append((info[0],info[2],(info[3],info[4])))
#                 locations.append((info[0],(int(info[3]),int(info[4]))))
#     return sorted(locations)

In [328]:
def filter_annotation(annotationFile,outputFile,keyword):
    '''Copy every line of annotationFile that contains keyword into outputFile.

    The test is a plain substring match on the whole line; outputFile is
    created or overwritten.
    '''
    with open(annotationFile) as source, open(outputFile, 'w') as sink:
        sink.writelines(row for row in source if keyword in row)

In [329]:
# Extract from the updated annotation the lines that contain 'NODE_1580'
# and write them to a separate file.
annotationFile = '/home/richard/research/5_Tests/Oct03ValidateGffToGTF/Conli1_GeneCatalog_genes_20130808.f.good.updated.gtf'
outputFile = '/home/richard/research/5_Tests/Oct03ValidateGffToGTF/Conli1_GeneCatalog_genes_20130808.f.good.updated.NODE_1580.gtf'
keyword = 'NODE_1580'
filter_annotation(annotationFile,outputFile,keyword)