In [1]:
import pandas as pd
import re
import os

In [2]:
BLAST_IN_FILE = '/home/gamran/genome_analysis/Warrior/blast_Repbase/DK_0911_v03_ph_ctg.RM407.blastp.out'
TPSI_IN_FILE = '/home/gamran/genome_analysis/Warrior/Richard/transposonPSI/DK_0911_v03_ph_ctg.protein.fa.TPSI.topHits.onlyHits'

v04_OUT_PATH = '/home/gamran/genome_analysis/Warrior/Richard/output/genome_v04/'
GENOME_V03_PATH = '/home/gamran/genome_analysis/Warrior/Richard/output/genome_v03'
H_CTG_GFF_PATH = os.path.join(GENOME_V03_PATH, 'DK_0911_v03_h_ctg.anno.gff3')
P_CTG_GFF_PATH = os.path.join(GENOME_V03_PATH, 'DK_0911_v03_p_ctg.anno.gff3')

In [3]:
blast_header = ['Query', 'Target', 'PctID', 'AlnLgth', 'NumMis', 'NumGap', 'StartQuery', 'StopQuery', 'StartTarget',\
              'StopTarget', 'e-value','BitScore']
blast_df = pd.read_csv(BLAST_IN_FILE, sep='\t', header=None, names=blast_header)
blast_df = blast_df[blast_df['e-value'] < 1e-10]
blast_df['contigLoc'] = blast_df['Query'].apply(lambda s: s.replace('evm.model.', ''))

tpsi_df = pd.read_csv(TPSI_IN_FILE, sep='\t', header=None)
tpsi_df = tpsi_df[tpsi_df[19] < 10**-10]
tpsi_df['contigLoc'] = tpsi_df[5].apply(lambda s: s.replace('evm.model.', ''))

In [4]:
pCtg_gff_df = pd.read_table(P_CTG_GFF_PATH, header = None, index_col = None, \
                  names = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'])
htg_gff_df = pd.read_table(H_CTG_GFF_PATH, header = None, index_col = None, \
                  names = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']) 

pCtg_gff_df.head()

ID_SEARCH = re.compile(r'^.*ID=(.*?)(;|$)')
P_CONTIG_LOC_SEARCH = re.compile(r'^.*\.(pcontig_\d{3}\.\d+)(?:\.|$)')
H_CONTIG_LOC_SEARCH = re.compile(r'^.*\.((?:p|h)contig_\d{3}(?:_\d{3})?\.\d+)(?:\.|$)')

# P_CONTIG_LOC_SEARCH.match('cds.evm.model.pcontig_000.1').group(1) will yield 'pcontig_000.1'
# H_CONTIG_LOC_SEARCH.match('evm.model.hcontig_006_028.2').group(1) will yield 'hcontig_006_028.2'

pCtg_gff_df['contigID'] = pCtg_gff_df['attributes'].apply(lambda s: ID_SEARCH.match(s).group(1))
pCtg_gff_df['contigLoc'] = pCtg_gff_df['contigID'].apply(lambda s: P_CONTIG_LOC_SEARCH.match(s).group(1))

htg_gff_df['contigID'] = htg_gff_df['attributes'].apply(lambda s: ID_SEARCH.match(s).group(1))
htg_gff_df['contigLoc'] = htg_gff_df['contigID'].apply(lambda s: H_CONTIG_LOC_SEARCH.match(s).group(1))

In [5]:
## THIS PART OF THE CODE IS NOT RE-USABLE FOR FUTURE STRAINS.
## however, the re-writing of this code should be straightforward to adapt to a blast database
## with proper labelling.

## Discovered bug due to incomplete renaming when re-assigning pwoh to htgs; changed seqid
## but did not change attributes column. Thus, the ID tag of the attributes column was 
## still the same as the initial primary contigs (pcontig_xxx.x instead of hcontig_xxx_xxx.x)
## blast dataframe was already generated with this incorrect labelling, so we filter based on
## these incorrect labels, and then fix the labels after (below).

filtered_htg_gff_df = htg_gff_df[(~htg_gff_df['contigLoc'].isin(blast_df['contigLoc'])) & (~htg_gff_df['contigLoc'].isin(tpsi_df['contigLoc']))]
filtered_pCtg_gff_df = pCtg_gff_df[(~pCtg_gff_df['contigLoc'].isin(blast_df['contigLoc'])) & (~pCtg_gff_df['contigLoc'].isin(tpsi_df['contigLoc']))]

# fix attributes column before writing to gff.
def fixHtgAttributes(row):
    attributes = row['attributes']
    contigLoc = row['contigLoc']
    seqid = row['seqid']
    
    numberSuffix = contigLoc.split('.')[-1]
    if attributes.find('pcontig') == -1:
        return attributes
    return attributes.replace(contigLoc, '%s.%s' %(seqid, str(numberSuffix)))

# fix attributes column (may not be required in future versions)
filtered_htg_gff_df['attributes'] = filtered_htg_gff_df.apply(fixHtgAttributes, axis=1)

# report how many elements were filtered
sumBeforeFilter = htg_gff_df['contigLoc'].unique().size + pCtg_gff_df['contigLoc'].unique().size
sumAfterFilter = filtered_htg_gff_df['contigLoc'].unique().size + filtered_pCtg_gff_df['contigLoc'].unique().size
print("Number of elements filtered out: %s" % (sumBeforeFilter - sumAfterFilter))

# drop created columns
filtered_htg_gff_df.drop(['contigID', 'contigLoc'], inplace=True, axis=1)
filtered_pCtg_gff_df.drop(['contigID', 'contigLoc'], inplace=True, axis=1)

filtered_htg_gff_df.to_csv(os.path.join(v04_OUT_PATH, 'DK_0911_v04_h_ctg.anno.gff3'), sep='\t', header=None, index=None)
filtered_pCtg_gff_df.to_csv(os.path.join(v04_OUT_PATH, 'DK_0911_v04_p_ctg.anno.gff3'), sep='\t', header=None, index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Number of elements filtered out: 5344


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [6]:
print("Number of unique TE hits found from BLAST (e < 1e-10): %s" % blast_df['Query'].unique().size)
print("Number of unique TE hits from TPSI (e < 1e-10): %s" % tpsi_df[5].unique().size)
print("Number of unique TE hits from BLAST or TPSI (e < 1e-10): %s" % pd.concat([tpsi_df[5], blast_df['Query']], axis=0).unique().size)

Number of unique TE hits found from BLAST (e < 1e-10): 5042
Number of unique TE hits from TPSI (e < 1e-10): 3758
Number of unique TE hits from BLAST or TPSI (e < 1e-10): 5344
