### Notebook for homology searches for DNA meythlation machinery

This needs to have Java 11 in the path and for example run in the pycoMeth environment only

In [1]:
import os
from Bio import SeqIO
import pandas as pd
import numpy as np
import re

In [2]:
notebook_path = os.path.abspath(".")

In [3]:
IN_DIR = os.path.abspath('../../analyses/methylation_machinery/')
OUT_DIR = os.path.abspath('../../analyses/methylation_machinery/')
GENOME_DIR = os.path.abspath('../../data/genomic_resources/')

In [4]:
Pgt_protein_fn = os.path.abspath('../../data/genomic_resources/Puccinia_graminis_tritici_21-0.proteins.fa')
FivemC_seeds_fn = os.path.abspath('../../analyses/methylation_machinery/5mC_methylation_query.fasta')


In [5]:
n_threads = 20
blast_outfmt6_headers = "qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore".split(' ')

In [6]:
###write a function that takes the interpro TSV and returns a dict of domains for a specific search engine
def interpro_accession_dict(fn):
    header = ['P-ID', 'md5', 'len', 'analysis', 'accession', 'description', 'start', 'stop', 'score', 'status' , 'date',
         'Interpro_accession', 'Interpro_description']
    df = pd.read_csv(fn, sep='\t', header =None, names= header).dropna()
    return dict(zip(df.groupby('P-ID')['Interpro_accession'].unique().index, df.groupby('P-ID')['Interpro_accession'].unique()))

In [7]:
###write a function that takes the interpro TSV and returns a dict of domains for a specific search engine
def interpro_analysis_dict(fn, analysis):
    header = ['P-ID', 'md5', 'len', 'analysis', 'accession', 'description', 'start', 'stop', 'score', 'status' , 'date',
         'Interpro_accession', 'Interpro_description']
    df = pd.read_csv(fn, sep='\t', header =None, names= header).dropna()
    grouped = df[df.analysis == analysis].groupby('P-ID')
    return dict(zip(grouped['analysis'].unique().index, grouped['accession'].unique()))

### Here the blast analysis starts

In [8]:
os.chdir(OUT_DIR)

In [None]:
!makeblastdb -dbtype prot -in {Pgt_protein_fn}

In [None]:
!blastp -help

In [9]:
#define file names
FivemC_outfmt_6_fn = 'Puccinia_graminis_tritici_21-0.proteins.5mC_methylation_query.blastp.outfmt6'
FivemC_outfmt_6_fn = os.path.join(OUT_DIR, FivemC_outfmt_6_fn)

In [10]:
#run blast
!blastp -num_threads 20 -outfmt 6 -query {FivemC_seeds_fn} -db {Pgt_protein_fn} > {FivemC_outfmt_6_fn}

In [11]:
!head {FivemC_outfmt_6_fn}

tr|A8NEZ8|A8NEZ8_COPC7	PGT21_014413-T1	25.967	905	481	33	469	1238	266	1116	6.85e-55	209
tr|A8NEZ8|A8NEZ8_COPC7	PGT21_012711-T1	29.111	450	227	15	744	1122	114	542	3.11e-39	154
tr|A8NEZ8|A8NEZ8_COPC7	PGT21_014167-T1	32.558	43	29	0	688	730	32	74	4.7	29.6
tr|A8NEZ8|A8NEZ8_COPC7	PGT21_014158-T1	32.558	43	29	0	688	730	32	74	4.7	29.6
tr|A8NEZ8|A8NEZ8_COPC7	PGT21_017775-T1	32.857	70	45	2	619	688	6	73	5.3	28.9
tr|Q96W73|Q96W73_NEUCS	PGT21_012711-T1	28.621	290	161	12	834	1085	105	386	1.43e-18	91.3
tr|Q96W73|Q96W73_NEUCS	PGT21_014413-T1	28.425	292	163	12	832	1085	564	847	1.98e-18	92.0
tr|Q96W73|Q96W73_NEUCS	PGT21_037052-T1	25.397	126	83	2	907	1031	293	408	0.016	40.0
tr|Q96W73|Q96W73_NEUCS	PGT21_036642-T1	25.397	126	83	2	907	1031	292	407	0.017	39.7
tr|Q96W73|Q96W73_NEUCS	PGT21_028169-T1	24.299	107	76	2	859	964	349	451	0.094	37.4


### Downstream filtering of blast resutls

In [12]:
FivemC_blast_df = pd.read_csv(FivemC_outfmt_6_fn, header = None, names=blast_outfmt6_headers, sep='\t' )

In [13]:
FivemC_blast_df.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,tr|A8NEZ8|A8NEZ8_COPC7,PGT21_014413-T1,25.967,905,481,33,469,1238,266,1116,6.85e-55,209.0
1,tr|A8NEZ8|A8NEZ8_COPC7,PGT21_012711-T1,29.111,450,227,15,744,1122,114,542,3.1100000000000004e-39,154.0
2,tr|A8NEZ8|A8NEZ8_COPC7,PGT21_014167-T1,32.558,43,29,0,688,730,32,74,4.7,29.6
3,tr|A8NEZ8|A8NEZ8_COPC7,PGT21_014158-T1,32.558,43,29,0,688,730,32,74,4.7,29.6
4,tr|A8NEZ8|A8NEZ8_COPC7,PGT21_017775-T1,32.857,70,45,2,619,688,6,73,5.3,28.9


In [14]:
#filtering of blast_df
FivemC_stringent_blast_df =  FivemC_blast_df[FivemC_blast_df.evalue < 1e-10].copy() 

In [15]:
FivemC_stringent_blast_df.groupby('qseqid')['sseqid'].count()

qseqid
sp|P26358|DNMT1_HUMAN             2
sp|Q94F88|CMT3_ARATH              2
tr|A0A4D6FZ77|A0A4D6FZ77_IXORI    2
tr|A8NEZ8|A8NEZ8_COPC7            2
tr|O42731|O42731_ASCIM            2
tr|Q96W73|Q96W73_NEUCS            2
Name: sseqid, dtype: int64

In [16]:
FivemC_stringent_blast_df.sseqid.unique()

array(['PGT21_014413-T1', 'PGT21_012711-T1'], dtype=object)

In [17]:
FivemC_seeds_ids = []
for seq in SeqIO.parse(FivemC_seeds_fn, 'fasta'):
    FivemC_seeds_ids.append(seq.id)

In [18]:
not_present = set(FivemC_seeds_ids) - set(FivemC_stringent_blast_df.qseqid.unique())

In [19]:
not_present

{'sp|Q9M548|DRM2_ARATH', 'tr|O13369|O13369_ASCIM'}

In [20]:
set(FivemC_seeds_ids) - set(FivemC_blast_df[FivemC_blast_df.evalue < 1e-2].qseqid.unique())

{'sp|Q9M548|DRM2_ARATH'}

In [21]:
##pull out fasta sequence of all the hits
e_value = 0.01
FivemC_Pgt_protein_hit_fn = 'Puccinia_graminis_tritici_21-0.proteins.5mC_methylation_query.blastp-%s.fasta' % e_value
FivemC_Pgt_protein_hit_fn = os.path.join(OUT_DIR, FivemC_Pgt_protein_hit_fn)

In [22]:
blast_df = FivemC_blast_df

In [23]:
###get all the hits once and subset the blast with the e-value selected
hit_ids = blast_df[blast_df.evalue < e_value].sseqid.unique()
hit_list = []
sub_blast_df = blast_df[blast_df.evalue < e_value].copy()
for seq in SeqIO.parse(Pgt_protein_fn, 'fasta'):
    if seq.id in hit_ids:
        print(seq.id)
        hit_list.append(seq)
SeqIO.write(hit_list, FivemC_Pgt_protein_hit_fn, 'fasta')

PGT21_012711-T1
PGT21_014413-T1
PGT21_030718-T1
PGT21_036642-T1
PGT21_037052-T1


5

In [24]:
sub_blast_df

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,tr|A8NEZ8|A8NEZ8_COPC7,PGT21_014413-T1,25.967,905,481,33,469,1238,266,1116,6.85e-55,209.0
1,tr|A8NEZ8|A8NEZ8_COPC7,PGT21_012711-T1,29.111,450,227,15,744,1122,114,542,3.1100000000000004e-39,154.0
5,tr|Q96W73|Q96W73_NEUCS,PGT21_012711-T1,28.621,290,161,12,834,1085,105,386,1.43e-18,91.3
6,tr|Q96W73|Q96W73_NEUCS,PGT21_014413-T1,28.425,292,163,12,832,1085,564,847,1.98e-18,92.0
14,tr|O13369|O13369_ASCIM,PGT21_012711-T1,20.465,215,132,5,231,407,115,328,0.000119,44.7
15,tr|O13369|O13369_ASCIM,PGT21_014413-T1,20.465,215,132,5,231,407,576,789,0.000147,44.7
19,sp|P26358|DNMT1_HUMAN,PGT21_012711-T1,31.25,272,152,6,1138,1376,110,379,2.8199999999999997e-30,127.0
20,sp|P26358|DNMT1_HUMAN,PGT21_014413-T1,25.506,494,259,15,971,1376,368,840,8.089999999999999e-30,129.0
21,sp|P26358|DNMT1_HUMAN,PGT21_014413-T1,28.108,185,68,7,1412,1584,973,1104,4.43e-08,58.2
22,sp|P26358|DNMT1_HUMAN,PGT21_014413-T1,35.0,60,38,1,779,838,256,314,0.000316,45.4


### Pull in haplotype information

In [25]:
pgt_gff3_fn = os.path.join('../../data/genomic_resources/Puccinia_graminis_tritici_21-0.gff3')

In [26]:
with open(pgt_gff3_fn, 'r') as fh:
    haplotype_dict = {}
    for line in fh:
        line = line.rstrip()
        if any(s in line for s in hit_ids):
            for hit in hit_ids:
                if hit in line:
                    haplotype_dict[hit] = line.split('\t')[0][-1]

In [27]:
len(haplotype_dict.values()) == len(hit_ids)

True

In [28]:
sub_blast_df['shaplotype'] = sub_blast_df.sseqid.map(haplotype_dict)

In [29]:
#get the locus id for loci with multiple transcripts
sub_blast_df['sseqid_locus'] = [x.split('-')[0] for x in sub_blast_df.sseqid]

In [30]:
#only keep the transcript witht the best hit
sub_blast_df.drop_duplicates(['qseqid', 'sseqid_locus'], keep='first', inplace = True)

### Do Interpro scan on command line

In [31]:
interpro5 = '/home/jamila/anaconda3/downloads/interproscan-5.42-78.0/interproscan.sh'

In [32]:
TMP_DIR = os.path.join(OUT_DIR, 'tmp')
if not os.path.exists(TMP_DIR):
    os.mkdir(TMP_DIR)

In [33]:
Pgt_protein_hit_intrpro_fn = os.path.join(TMP_DIR, os.path.basename(FivemC_Pgt_protein_hit_fn).replace('.fasta', '.interpro5.tsv'))
FivemC_seeds_intrpro_fn = os.path.join(TMP_DIR, os.path.basename(FivemC_seeds_fn).replace('.fasta', '.interpro5.tsv'))

Run interpro on both set of protein files

In [34]:
!head {FivemC_Pgt_protein_hit_fn}

>PGT21_012711-T1 PGT21_012711
MTNQEQLVDVKSQLEGKCEIVHLFKDKTRQFVTVGSHREVSEESIINRDNCFYSILPTRK
QAKYFCSVPIIDDGWEKAIISQEPEWVKIEHESSKGNQYADLVKNYGCGRIRHLELFGGI
GSMSVALIELGLASQDETMFIDFSIPACQTLSTNFPRSTIICADVNEVLALMINGKTESG
QDFLVDQRTGKMICVNELPRPGDFDLITAGFPCGSHSTLNVLRKANDSKNALCATALSFI
AYLKPDYLFFENVRGLLKTSFINPGNDSVLNKAFLRIINGALISLGYQVQFGVLQAAQFG
SPQARRRIIFAGTRHGLTAIKLPEPTHHYPDEGLAILLPTNDEKSDRNGHRLVRADYRKC
SSGALKAITIHDAISDLPEFEYVNPDRIMAEYRSKRPHRIRQNDDDQELITGRSELVPQL
NRLVSFSSTLNHSTISLIGFDEFEYLTEPMNRYQSWLRKPLEWKPLVEAFIPKYPRIDNS
DPQNVQQRRRTRYDDEDGDHYALGSDCRIRNWHVTPRFSAKVTERICNIPLKPNADHRLS


In [35]:
!bash {interpro5} -cpu 4 -i {FivemC_Pgt_protein_hit_fn} -f tsv -iprlookup -o {Pgt_protein_hit_intrpro_fn}

29/04/2020 17:59:39:140 Welcome to InterProScan-5.42-78.0
29/04/2020 17:59:39:141 Running InterProScan v5 in STANDALONE mode... on Linux
29/04/2020 17:59:53:710 Loading file /home/jamila/jamila_Storage/analyses/methylation_machinery/Puccinia_graminis_tritici_21-0.proteins.5mC_methylation_query.blastp-0.01.fasta
29/04/2020 17:59:53:711 Running the following analyses:
[CDD-3.17,Coils-2.2.1,Gene3D-4.2.0,Hamap-2020_01,MobiDBLite-2.0,PANTHER-14.1,Pfam-32.0,PIRSF-3.02,PRINTS-42.0,ProSitePatterns-2019_11,ProSiteProfiles-2019_11,SFLD-4,SMART-7.1,SUPERFAMILY-1.75,TIGRFAM-15.0]
Available matches will be retrieved from the pre-calculated match lookup service.

Matches for any sequences that are not represented in the lookup service will be calculated locally.
29/04/2020 18:00:47:548 25% completed
29/04/2020 18:01:17:256 51% completed
29/04/2020 18:01:27:010 75% completed
29/04/2020 18:01:35:438 91% completed
29/04/2020 18:01:38:582 100% done:  InterProScan analyses completed 



In [36]:
!bash {interpro5} -cpu 4 -i {FivemC_seeds_fn} -f tsv -iprlookup -o {FivemC_seeds_intrpro_fn}

29/04/2020 18:01:40:200 Welcome to InterProScan-5.42-78.0
29/04/2020 18:01:40:202 Running InterProScan v5 in STANDALONE mode... on Linux
29/04/2020 18:01:55:844 Loading file /home/jamila/jamila_Storage/analyses/methylation_machinery/5mC_methylation_query.fasta
29/04/2020 18:01:55:846 Running the following analyses:
[CDD-3.17,Coils-2.2.1,Gene3D-4.2.0,Hamap-2020_01,MobiDBLite-2.0,PANTHER-14.1,Pfam-32.0,PIRSF-3.02,PRINTS-42.0,ProSitePatterns-2019_11,ProSiteProfiles-2019_11,SFLD-4,SMART-7.1,SUPERFAMILY-1.75,TIGRFAM-15.0]
Available matches will be retrieved from the pre-calculated match lookup service.

Matches for any sequences that are not represented in the lookup service will be calculated locally.
29/04/2020 18:02:44:967 25% completed
29/04/2020 18:03:15:715 50% completed
29/04/2020 18:03:32:296 75% completed
29/04/2020 18:03:51:159 93% completed
29/04/2020 18:03:57:681 100% done:  InterProScan analyses completed 



In [37]:
#pull in interpro results and add them to the dataframe
sub_blast_df['q_pfam'] = sub_blast_df.qseqid.map(interpro_analysis_dict(FivemC_seeds_intrpro_fn, 'Pfam'))
sub_blast_df['q_interpro'] = sub_blast_df.qseqid.map(interpro_accession_dict(FivemC_seeds_intrpro_fn))
sub_blast_df['s_pfam'] = sub_blast_df.sseqid.map(interpro_analysis_dict(Pgt_protein_hit_intrpro_fn, 'Pfam'))
sub_blast_df['s_interpro'] = sub_blast_df.sseqid.map(interpro_accession_dict(Pgt_protein_hit_intrpro_fn))


In [38]:
#do some cosmetics on the the dataframe for proteins without interpro /pfam domains because pandas is wierd sometimes.
for cln in ['q_pfam', 'q_interpro', 's_pfam','s_interpro']:
    if sub_blast_df[cln].isna().sum():
        sub_blast_df.loc[sub_blast_df[sub_blast_df[cln].isna()].index, cln] = [ [[]] * sub_blast_df[cln].isna().sum() ]

In [39]:
#calculate the fraction of overlapping interpro/pfam domains between query sequences and hits
sub_blast_df['pfam_int'] = sub_blast_df.apply(lambda row: set(row['q_pfam']).intersection(set(row['s_pfam'])) , axis=1)
sub_blast_df['pfam_int_frac'] = sub_blast_df['pfam_int'].apply(lambda x: len(x)) / sub_blast_df['q_pfam'].apply(lambda x: len(x))
sub_blast_df['interpro_int'] = sub_blast_df.apply(lambda row: set(row['q_interpro']).intersection(set(row['s_interpro'])) , axis=1)
sub_blast_df['interpro_int_frac'] = sub_blast_df['interpro_int'].apply(lambda x: len(x)) / sub_blast_df['q_interpro'].apply(lambda x: len(x))

In [40]:
sub_blast_df.iloc[:,[0,1,10, 17, 18,19]].head(30)

Unnamed: 0,qseqid,sseqid,evalue,s_interpro,pfam_int,pfam_int_frac
0,tr|A8NEZ8|A8NEZ8_COPC7,PGT21_014413-T1,6.85e-55,[IPR001525],{PF00145},0.5
1,tr|A8NEZ8|A8NEZ8_COPC7,PGT21_012711-T1,3.1100000000000004e-39,[IPR001525],{PF00145},0.5
5,tr|Q96W73|Q96W73_NEUCS,PGT21_012711-T1,1.43e-18,[IPR001525],{PF00145},1.0
6,tr|Q96W73|Q96W73_NEUCS,PGT21_014413-T1,1.98e-18,[IPR001525],{PF00145},1.0
14,tr|O13369|O13369_ASCIM,PGT21_012711-T1,0.000119,[IPR001525],{PF00145},1.0
15,tr|O13369|O13369_ASCIM,PGT21_014413-T1,0.000147,[IPR001525],{PF00145},1.0
19,sp|P26358|DNMT1_HUMAN,PGT21_012711-T1,2.8199999999999997e-30,[IPR001525],{PF00145},0.2
20,sp|P26358|DNMT1_HUMAN,PGT21_014413-T1,8.089999999999999e-30,[IPR001525],{PF00145},0.2
23,sp|P26358|DNMT1_HUMAN,PGT21_036642-T1,0.001,"[IPR001525, IPR014001, IPR001650, IPR000330]",{PF00145},0.2
24,sp|P26358|DNMT1_HUMAN,PGT21_037052-T1,0.001,"[IPR000330, IPR014001, IPR001650, IPR001525]",{PF00145},0.2


In [41]:
#filter the dataframe to have only hits that have the best possible interpro domains fractions
pfam_filt_df = sub_blast_df[sub_blast_df.groupby('qseqid')['interpro_int_frac'].transform(max) == sub_blast_df['interpro_int_frac']]

In [42]:
##look at how many hits per query sequence are still left
pfam_filt_df.groupby('qseqid')['sseqid'].count()

qseqid
sp|P26358|DNMT1_HUMAN             4
sp|Q94F88|CMT3_ARATH              2
tr|A0A4D6FZ77|A0A4D6FZ77_IXORI    5
tr|A8NEZ8|A8NEZ8_COPC7            2
tr|O13369|O13369_ASCIM            2
tr|O42731|O42731_ASCIM            4
tr|Q96W73|Q96W73_NEUCS            2
Name: sseqid, dtype: int64

In [43]:
best_sseq_df = pfam_filt_df[pfam_filt_df.groupby('sseqid')['interpro_int_frac'].transform(max) == pfam_filt_df['interpro_int_frac']]

In [44]:
pgt_match_list = []
DNA_seed_list = []
haplotype_list = []
match_type_list = []
for seed_gene, pgt_gene  in zip(best_sseq_df.qseqid, best_sseq_df.sseqid): 
    if not pgt_gene.endswith('-T2'):
        DNA_seed_list.append(seed_gene)
        pgt_match_list.append(pgt_gene)
        match_type_list.append('blast')

In [45]:
pgt_match_series = pd.Series(pgt_match_list, name="Pgt_match")
DNA_seed_series = pd.Series(DNA_seed_list, name='Seed_ID')
haplotype_series = pd.Series(haplotype_list, name='haplotype')
match_type_series = pd.Series(match_type_list, name='Match_type')

  This is separate from the ipykernel package so we can avoid doing imports until


In [46]:
out_df = pd.concat([DNA_seed_series, pgt_match_series, haplotype_series, match_type_series], axis =1)

In [47]:
out_fn = os.path.join(OUT_DIR, '%s_orthologs.Pgt21-0.tsv' %os.path.basename(FivemC_seeds_fn).replace('.fasta', '') )

In [48]:
out_df.to_csv(out_fn, sep='\t', index=None)

In [49]:
!head {out_fn}

Seed_ID	Pgt_match	haplotype	Match_type
tr|O13369|O13369_ASCIM	PGT21_012711-T1		blast
tr|O13369|O13369_ASCIM	PGT21_014413-T1		blast
tr|A0A4D6FZ77|A0A4D6FZ77_IXORI	PGT21_036642-T1		blast
tr|A0A4D6FZ77|A0A4D6FZ77_IXORI	PGT21_037052-T1		blast
tr|A0A4D6FZ77|A0A4D6FZ77_IXORI	PGT21_030718-T1		blast
