# Filter Transposable Elements

- Inputs: **BLAST** & **transposonPSI** DataFrames, .*gff3* files.
- Programs: N/A
- Purpose: generate new *gff3* files without transposable elements.

After this, re-run code from `DK_0911_generate_fasta_files_from_gff3` onwards for genomic analysis of alleles without transposable elements.

In [4]:
import pandas as pd
import re
import os

In [5]:
GENOME_IN_VERSION = 'v03'
GENOME_OUT_VERSION = 'v04'

BLAST_IN_FILE = '/home/gamran/genome_analysis/Warrior/Richard/TE_filtering/blast_Repbase/DK_0911_%s_ph_ctg.RM407.blastp.out' % GENOME_IN_VERSION
TPSI_IN_FILE = '/home/gamran/genome_analysis/Warrior/Richard/TE_filtering/transposonPSI/DK_0911_%s_ph_ctg.protein.fa.TPSI.topHits.onlyHits' % GENOME_IN_VERSION

UNFILTERED_IN_PATH = '/home/gamran/genome_analysis/Warrior/Richard/output/genome_%s' % GENOME_IN_VERSION
FILTERED_OUT_PATH = '/home/gamran/genome_analysis/Warrior/Richard/output/genome_%s/' % GENOME_OUT_VERSION

GENOME_IN = 'DK_0911_%s' % GENOME_IN_VERSION
GENOME_OUT = 'DK_0911_%s' % GENOME_OUT_VERSION

H_CTG_GFF_PATH = os.path.join(UNFILTERED_IN_PATH, GENOME_IN + '_h_ctg.anno.gff3')
P_CTG_GFF_PATH = os.path.join(UNFILTERED_IN_PATH, GENOME_IN + '_p_ctg.anno.gff3')

eValueCutoff = 1e-10

In [6]:
# Read in DataFrames (BLAST, transposonPSI, GFF)
blast_header = ['Query', 'Target', 'PctID', 'AlnLgth', 'NumMis', 'NumGap', 'StartQuery', 'StopQuery', 'StartTarget', 'StopTarget', 'e-value','BitScore']
blast_df = pd.read_csv(BLAST_IN_FILE, sep='\t', header=None, names=blast_header)
blast_df = blast_df[blast_df['e-value'] < eValueCutoff]

tpsi_df = pd.read_csv(TPSI_IN_FILE, sep='\t', header=None)
tpsi_df = tpsi_df[tpsi_df[19] < eValueCutoff] # column 19 is the e-value column

pCtg_gff_df = pd.read_table(P_CTG_GFF_PATH, header = None, index_col = None, names = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes'])
htg_gff_df = pd.read_table(H_CTG_GFF_PATH, header = None, index_col = None, names = ['seqid', 'source', 'type', 'start', 'end', 'score', 'strand', 'phase', 'attributes']) 

In [8]:
blast_df.head()

Unnamed: 0,Query,Target,PctID,AlnLgth,NumMis,NumGap,StartQuery,StopQuery,StartTarget,StopTarget,e-value,BitScore
0,evm.model.hcontig_006_028.2,I-2_BF_pol#LINE/Jockey,31.47,143,74,1,25,143,1092,1234,1e-14,71.2
1,evm.model.hcontig_006_028.2,Outcast-1_BF_pol#LINE/Jockey,31.21,141,73,1,25,141,1135,1275,3e-14,70.1
2,evm.model.hcontig_006_028.2,I-2_AC_pol#LINE/I,31.88,138,83,4,10,141,1105,1237,7e-13,65.9
3,evm.model.hcontig_006_028.2,Tad1-1_EP_pol#LINE/Tad1,27.33,172,92,5,5,143,1112,1283,4e-12,63.5
4,evm.model.hcontig_006_028.2,Nimb-3_DR_pol#LINE/I-Nimb,27.88,165,95,3,3,143,1050,1214,7e-11,59.7


In [4]:
# Extract 'contigLoc' (e.g. hcontig_006_028.2) from ID in attributes and assign to a
# temporary new column for ease of access & comparison.
ID_SEARCH = re.compile(r'^.*ID=(.*?)(;|$)')
P_CONTIG_LOC_SEARCH = re.compile(r'^.*\.(pcontig_\d{3}\.\d+)(?:\.|$)') # P_CONTIG_LOC_SEARCH.match('cds.evm.model.pcontig_000.1').group(1) will yield 'pcontig_000.1'
H_CONTIG_LOC_SEARCH = re.compile(r'^.*\.((?:p|h)contig_\d{3}(?:_\d{3})?\.\d+)(?:\.|$)') # H_CONTIG_LOC_SEARCH.match('evm.model.hcontig_006_028.2').group(1) will yield 'hcontig_006_028.2'

pCtg_gff_df['contigID'] = pCtg_gff_df['attributes'].apply(lambda s: ID_SEARCH.match(s).group(1))
htg_gff_df['contigID'] = htg_gff_df['attributes'].apply(lambda s: ID_SEARCH.match(s).group(1))
pCtg_gff_df['contigLoc'] = pCtg_gff_df['contigID'].apply(lambda s: P_CONTIG_LOC_SEARCH.match(s).group(1))
htg_gff_df['contigLoc'] = htg_gff_df['contigID'].apply(lambda s: H_CONTIG_LOC_SEARCH.match(s).group(1))

tpsi_df['contigLoc'] = tpsi_df[5].apply(lambda s: s.replace('evm.model.', ''))
blast_df['contigLoc'] = blast_df['Query'].apply(lambda s: s.replace('evm.model.', ''))

In [5]:
# filter out any elements that are in either the blast_df or the tpsi_df (note these DataFrames have already been filtered by e-value)
filtered_htg_gff_df = htg_gff_df[(~htg_gff_df['contigLoc'].isin(blast_df['contigLoc'])) & (~htg_gff_df['contigLoc'].isin(tpsi_df['contigLoc']))]
filtered_pCtg_gff_df = pCtg_gff_df[(~pCtg_gff_df['contigLoc'].isin(blast_df['contigLoc'])) & (~pCtg_gff_df['contigLoc'].isin(tpsi_df['contigLoc']))]

## THIS PART OF THE CODE IS NOT NEEDED FOR FUTURE STRAINS.
########## START ##########

## Discovered bug due to incomplete renaming when re-assigning pwoh to htgs; changed seqid
## but did not change attributes column. Thus, the ID tag of the attributes column was 
## still the same as the initial primary contigs (pcontig_xxx.x instead of hcontig_xxx_xxx.x)
## blast dataframe was already generated with this incorrect labelling, so we filter FIRST 
## (above) based on these incorrect labels, and then fix the labels after (below).
# fix attributes column before writing to gff.
def fixHtgAttributes(row):
    attributes = row['attributes']
    contigLoc = row['contigLoc']
    seqid = row['seqid']
    
    numberSuffix = contigLoc.split('.')[-1]
    if attributes.find('pcontig') == -1:
        return attributes
    return attributes.replace(contigLoc, '%s.%s' %(seqid, str(numberSuffix)))

# fix attributes column (may not be required in future versions)
filtered_htg_gff_df['attributes'] = filtered_htg_gff_df.apply(fixHtgAttributes, axis=1)

########## END ##########

# report how many elements were filtered
sumBeforeFilter = htg_gff_df['contigLoc'].unique().size + pCtg_gff_df['contigLoc'].unique().size
sumAfterFilter = filtered_htg_gff_df['contigLoc'].unique().size + filtered_pCtg_gff_df['contigLoc'].unique().size
print("Number of elements filtered out: %s" % (sumBeforeFilter - sumAfterFilter))

# drop temporarily-created columns
filtered_htg_gff_df.drop(['contigID', 'contigLoc'], inplace=True, axis=1)
filtered_pCtg_gff_df.drop(['contigID', 'contigLoc'], inplace=True, axis=1)

# write to new CSV
filtered_htg_gff_df.to_csv(os.path.join(FILTERED_OUT_PATH, GENOME_OUT + '_h_ctg.anno.gff3'), sep='\t', header=None, index=None)
filtered_pCtg_gff_df.to_csv(os.path.join(FILTERED_OUT_PATH, GENOME_OUT + '_p_ctg.anno.gff3'), sep='\t', header=None, index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Number of elements filtered out: 5344


In [6]:
print("Number of unique TE hits found from BLAST (e < 1e-10): %s" % blast_df['Query'].unique().size)
print("Number of unique TE hits from TPSI (e < 1e-10): %s" % tpsi_df[5].unique().size)
print("Number of unique TE hits from BLAST or TPSI (e < 1e-10): %s" % pd.concat([tpsi_df[5], blast_df['Query']], axis=0).unique().size)

Number of unique TE hits found from BLAST (e < 1e-10): 5042
Number of unique TE hits from TPSI (e < 1e-10): 3758
Number of unique TE hits from BLAST or TPSI (e < 1e-10): 5344
