### Notebook for homology searches for DNA repair machinery

This needs to have Java 11 in the path and for example run in the pycoMeth environment only

In [314]:
import os
from Bio import SeqIO
import pandas as pd
import numpy as np
import re

In [2]:
notebook_path = os.path.abspath(".")

In [3]:
IN_DIR = os.path.abspath('../../analyses/dna_repair_machinery')
OUT_DIR = os.path.abspath('../../analyses/dna_repair_machinery')
GENOME_DIR = os.path.abspath('../../data/genomic_resources/')

In [4]:
Pgt_protein_fn = os.path.abspath('../../data/genomic_resources/Puccinia_graminis_tritici_21-0.proteins.fa')
DNA_repair_seeds_fn = os.path.abspath('../../analyses/dna_repair_machinery/uniprot-yourlist_M20200319A2A5A37CD3FF71F97605B695F360A9FA08E70B1.fasta')

In [5]:
n_threads = 20
blast_outfmt6_headers = "qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore".split(' ')

In [340]:
###write a function that takes the interpro TSV and returns a dict of domains for a specific search engine
def interpro_accession_dict(fn):
    header = ['P-ID', 'md5', 'len', 'analysis', 'accession', 'description', 'start', 'stop', 'score', 'status' , 'date',
         'Interpro_accession', 'Interpro_description']
    df = pd.read_csv(fn, sep='\t', header =None, names= header).dropna()
    return dict(zip(df.groupby('P-ID')['Interpro_accession'].unique().index, df.groupby('P-ID')['Interpro_accession'].unique()))

In [341]:
###write a function that takes the interpro TSV and returns a dict of domains for a specific search engine
def interpro_analysis_dict(fn, analysis):
    header = ['P-ID', 'md5', 'len', 'analysis', 'accession', 'description', 'start', 'stop', 'score', 'status' , 'date',
         'Interpro_accession', 'Interpro_description']
    df = pd.read_csv(fn, sep='\t', header =None, names= header).dropna()
    grouped = df[df.analysis == analysis].groupby('P-ID')
    return dict(zip(grouped['analysis'].unique().index, grouped['accession'].unique()))

### Here the blast analysis starts

In [6]:
os.chdir(OUT_DIR)

In [7]:
!makeblastdb -dbtype prot -in {Pgt_protein_fn}



Building a new DB, current time: 03/31/2020 17:54:19
New DB name:   /home/jamila/jamila_Storage/data/genomic_resources/Puccinia_graminis_tritici_21-0.proteins.fa
New DB title:  /home/jamila/jamila_Storage/data/genomic_resources/Puccinia_graminis_tritici_21-0.proteins.fa
Sequence type: Protein
Deleted existing Protein BLAST database named /home/jamila/jamila_Storage/data/genomic_resources/Puccinia_graminis_tritici_21-0.proteins.fa
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 37832 sequences in 0.929826 seconds.


In [None]:
!blastp -help

In [8]:
outfmt_6_fn = 'Puccinia_graminis_tritici_21-0.proteins.DNA_repair_machinery_uniprot.blastp.outfmt6'
outfmt_6_fn = os.path.join(OUT_DIR, outfmt_6_fn)
outfmt_1_fn = outfmt_6_fn.replace('.outfmt6', '.outfmt1')

In [9]:
!blastp -num_threads {n_threads} -outfmt 6 -query {DNA_repair_seeds_fn} -db {Pgt_protein_fn} > {outfmt_6_fn}

In [10]:
!blastp -num_threads {n_threads} -query {DNA_repair_seeds_fn} -db {Pgt_protein_fn} > {outfmt_1_fn}

In [11]:
blast_outfmt6_headers

['qseqid',
 'sseqid',
 'pident',
 'length',
 'mismatch',
 'gapopen',
 'qstart',
 'qend',
 'sstart',
 'send',
 'evalue',
 'bitscore']

In [12]:
!head {outfmt_6_fn}

sp|P31378|NTH1_YEAST	PGT21_031718-T1	41.026	234	116	4	93	313	69	293	1.82e-47	168
sp|P31378|NTH1_YEAST	PGT21_033004-T1	41.026	234	116	4	93	313	69	293	7.76e-47	166
sp|P31378|NTH1_YEAST	PGT21_003802-T1	31.746	63	40	1	259	321	362	421	0.22	33.9
sp|P31378|NTH1_YEAST	PGT21_004151-T1	34.545	55	33	1	260	314	360	411	0.41	32.7
sp|P31378|NTH1_YEAST	PGT21_000308-T1	19.565	92	60	2	44	135	17	94	2.4	29.3
sp|P31378|NTH1_YEAST	PGT21_008054-T1	29.487	78	54	1	234	311	197	273	2.9	30.0
sp|P31378|NTH1_YEAST	PGT21_032633-T1	34.524	84	48	3	46	129	47	123	4.7	29.3
sp|P31378|NTH1_YEAST	PGT21_002873-T1	34.524	84	48	3	46	129	47	123	5.6	28.9
sp|P31378|NTH1_YEAST	PGT21_030196-T1	41.176	34	20	0	325	358	270	303	7.0	28.9
sp|P31378|NTH1_YEAST	PGT21_028400-T1	41.176	34	20	0	325	358	270	303	7.0	28.9


### Downstream filtering of blast resutls

In [13]:
blast_df = pd.read_csv(outfmt_6_fn, header = None, names=blast_outfmt6_headers, sep='\t' )

In [14]:
#filtering of blast_df
stringent_blast_df =  blast_df[blast_df.evalue < 1e-10].copy() 

In [15]:
stringent_blast_df.groupby('qseqid')['sseqid'].count()

qseqid
sp|P04819|DNLI1_YEAST     8
sp|P06777|RAD1_YEAST      4
sp|P06778|RAD52_YEAST     2
sp|P06779|RAD7_YEAST      2
sp|P06838|RAD10_YEAST     1
sp|P06839|RAD3_YEAST      4
sp|P07276|RAD2_YEAST     10
sp|P12689|REV1_YEAST      4
sp|P12753|RAD50_YEAST     4
sp|P14242|PMS1_YEAST      9
sp|P14284|DPOZ_YEAST     12
sp|P14736|RAD4_YEAST      2
sp|P22134|MAG_YEAST       1
sp|P25301|RAD57_YEAST     4
sp|P25336|MSH3_YEAST     15
sp|P25847|MSH2_YEAST     16
sp|P28519|RAD14_YEAST     2
sp|P31378|NTH1_YEAST      2
sp|P32628|RAD23_YEAST     2
sp|P32776|TFB1_YEAST      2
sp|P32829|MRE11_YEAST     4
sp|P32849|RAD5_YEAST     65
sp|P38086|RDH54_YEAST    58
sp|P38111|ATR_YEAST       7
sp|P40352|RAD26_YEAST    64
sp|P40469|MET18_YEAST     2
sp|P40965|MSH4_YEAST     14
sp|P53397|OGG1_YEAST      2
sp|Q00578|RAD25_YEAST     4
sp|Q03834|MSH6_YEAST     14
sp|Q04049|POLH_YEAST      3
sp|Q12175|MSH5_YEAST     15
Name: sseqid, dtype: int64

In [16]:
DNA_repair_seeds_ids = []
for seq in SeqIO.parse(DNA_repair_seeds_fn, 'fasta'):
    DNA_repair_seeds_ids.append(seq.id)

In [17]:
not_present = set(DNA_repair_seeds_ids) - set(stringent_blast_df.qseqid.unique())

In [18]:
not_present

{'sp|P10862|RAD18_YEAST',
 'sp|P22936|APN1_YEAST',
 'sp|P25615|DPO4_YEAST',
 'sp|P38207|APN2_YEAST',
 'sp|P38953|RAD55_YEAST',
 'sp|P48581|RAD17_YEAST',
 'sp|Q04231|RAD33_YEAST',
 'sp|Q06665|RAD34_YEAST',
 'sp|Q08702|APTX_YEAST',
 'sp|Q12223|RAD59_YEAST'}

In [19]:
set(DNA_repair_seeds_ids) - set(blast_df[blast_df.evalue < 1e-2].qseqid.unique())

{'sp|P22936|APN1_YEAST',
 'sp|P25615|DPO4_YEAST',
 'sp|P38207|APN2_YEAST',
 'sp|P38953|RAD55_YEAST',
 'sp|Q04231|RAD33_YEAST',
 'sp|Q08702|APTX_YEAST'}

In [20]:
##pull out fasta sequence of all the hits
e_value = 0.01
Pgt_protein_hit_fn = 'Puccinia_graminis_tritici_21-0.proteins.DNA_repair_machinery_uniprot.blastp-%s.fasta' % e_value
Pgt_protein_hit_fn = os.path.join(OUT_DIR, Pgt_protein_hit_fn)

In [62]:
###get all the hits once and subset the blast with the e-value selected
hit_ids = blast_df[blast_df.evalue < e_value].sseqid.unique()
hit_list = []
sub_blast_df = blast_df[blast_df.evalue < e_value].copy()
for seq in SeqIO.parse(Pgt_protein_fn, 'fasta'):
    if seq.id in hit_ids:
        print(seq.id)
        hit_list.append(seq)
SeqIO.write(hit_list, Pgt_protein_hit_fn, 'fasta')

PGT21_001046-T1
PGT21_001224-T1
PGT21_001410-T1
PGT21_001420-T1
PGT21_001479-T1
PGT21_001518-T1
PGT21_001585-T1
PGT21_001604-T1
PGT21_001683-T1
PGT21_001685-T1
PGT21_001816-T1
PGT21_001862-T1
PGT21_001937-T1
PGT21_001955-T1
PGT21_002175-T1
PGT21_002207-T1
PGT21_002374-T1
PGT21_002375-T1
PGT21_002481-T1
PGT21_002580-T1
PGT21_002714-T1
PGT21_002732-T1
PGT21_003250-T1
PGT21_003294-T1
PGT21_003627-T1
PGT21_003877-T1
PGT21_003898-T1
PGT21_003973-T1
PGT21_004258-T1
PGT21_004354-T1
PGT21_005000-T1
PGT21_005150-T1
PGT21_005168-T1
PGT21_005374-T1
PGT21_005374-T3
PGT21_005678-T1
PGT21_006560-T1
PGT21_007053-T1
PGT21_007126-T1
PGT21_007126-T2
PGT21_007215-T1
PGT21_007215-T2
PGT21_008482-T1
PGT21_008504-T1
PGT21_008704-T1
PGT21_008716-T1
PGT21_008815-T1
PGT21_008821-T1
PGT21_009126-T1
PGT21_009203-T1
PGT21_009319-T1
PGT21_009388-T1
PGT21_009924-T1
PGT21_010090-T1
PGT21_010181-T1
PGT21_010761-T1
PGT21_010891-T1
PGT21_011016-T1
PGT21_011159-T1
PGT21_011159-T2
PGT21_011280-T1
PGT21_011280-T3
PGT21_01

197

### Pull in haplotype information

In [344]:
pgt_gff3_fn = os.path.join('../../data/genomic_resources/Puccinia_graminis_tritici_21-0.gff3')

In [346]:
with open(pgt_gff3_fn, 'r') as fh:
    haplotype_dict = {}
    for line in fh:
        line = line.rstrip()
        if any(s in line for s in hit_ids):
            for hit in hit_ids:
                if hit in line:
                    haplotype_dict[hit] = line.split('\t')[0][-1]

In [350]:
len(haplotype_dict.values()) == len(hit_ids)

True

In [353]:
sub_blast_df['shaplotype'] = sub_blast_df.sseqid.map(haplotype_dict)

In [363]:
#get the locus id for loci with multiple transcripts
sub_blast_df['sseqid_locus'] = [x.split('-')[0] for x in sub_blast_df.sseqid]

In [373]:
#only keep the transcript witht the best hit
sub_blast_df.drop_duplicates(['qseqid', 'sseqid_locus'], keep='first', inplace = True)

### Do Interpro scan on command line

In [25]:
interpro5 = '/home/jamila/anaconda3/downloads/interproscan-5.42-78.0/interproscan.sh'

In [22]:
TMP_DIR = os.path.join(OUT_DIR, 'tmp')
if not os.path.exists(TMP_DIR):
    os.mkdir(TMP_DIR)

In [31]:
Pgt_protein_hit_intrpro_fn = os.path.join(TMP_DIR, os.path.basename(Pgt_protein_hit_fn).replace('.fasta', '.interpro5.tsv'))
DNA_repair_seeds_intrpro_fn = os.path.join(TMP_DIR, os.path.basename(DNA_repair_seeds_fn).replace('.fasta', '.interpro5.tsv'))

Run interpro on both set of protein files

In [None]:
!bash {interpro5} -cpu 4 -i {Pgt_protein_hit_fn} -f tsv -iprlookup -o {Pgt_protein_hit_intrpro_fn}

31/03/2020 18:03:23:663 Welcome to InterProScan-5.42-78.0
31/03/2020 18:03:23:666 Running InterProScan v5 in STANDALONE mode... on Linux
31/03/2020 18:03:59:818 Loading file /home/jamila/jamila_Storage/analyses/dna_repair_machinery/Puccinia_graminis_tritici_21-0.proteins.DNA_repair_machinery_uniprot.blastp-0.01.fasta
31/03/2020 18:03:59:839 Running the following analyses:
[CDD-3.17,Coils-2.2.1,Gene3D-4.2.0,Hamap-2020_01,MobiDBLite-2.0,PANTHER-14.1,Pfam-32.0,PIRSF-3.02,PRINTS-42.0,ProSitePatterns-2019_11,ProSiteProfiles-2019_11,SFLD-4,SMART-7.1,SUPERFAMILY-1.75,TIGRFAM-15.0]
Available matches will be retrieved from the pre-calculated match lookup service.

Matches for any sequences that are not represented in the lookup service will be calculated locally.
31/03/2020 18:10:27:132 25% completed
31/03/2020 18:12:49:792 51% completed


In [None]:
!bash {interpro5} -cpu 4 -i {DNA_repair_seeds_fn} -f tsv -iprlookup -o {DNA_repair_seeds_intrpro_fn}

In [374]:
#pull in interpro results and add them to the dataframe
sub_blast_df['q_pfam'] = sub_blast_df.qseqid.map(interpro_analysis_dict(DNA_repair_seeds_intrpro_fn, 'Pfam'))
sub_blast_df['q_interpro'] = sub_blast_df.qseqid.map(interpro_accession_dict(DNA_repair_seeds_intrpro_fn))
sub_blast_df['s_pfam'] = sub_blast_df.sseqid.map(interpro_analysis_dict(Pgt_protein_hit_intrpro_fn, 'Pfam'))
sub_blast_df['s_interpro'] = sub_blast_df.sseqid.map(interpro_accession_dict(Pgt_protein_hit_intrpro_fn))


In [375]:
#do some cosmetics on the the dataframe for proteins without interpro /pfam domains because pandas is wierd sometimes.
for cln in ['q_pfam', 'q_interpro', 's_pfam','s_interpro']:
    if sub_blast_df[cln].isna().sum():
        sub_blast_df.loc[sub_blast_df[sub_blast_df[cln].isna()].index, cln] = [ [[]] * sub_blast_df[cln].isna().sum() ]

In [376]:
#calculate the fraction of overlapping interpro/pfam domains between query sequences and hits
sub_blast_df['pfam_int'] = sub_blast_df.apply(lambda row: set(row['q_pfam']).intersection(set(row['s_pfam'])) , axis=1)
sub_blast_df['pfam_int_frac'] = sub_blast_df['pfam_int'].apply(lambda x: len(x)) / sub_blast_df['q_pfam'].apply(lambda x: len(x))
sub_blast_df['interpro_int'] = sub_blast_df.apply(lambda row: set(row['q_interpro']).intersection(set(row['s_interpro'])) , axis=1)
sub_blast_df['interpro_int_frac'] = sub_blast_df['interpro_int'].apply(lambda x: len(x)) / sub_blast_df['q_interpro'].apply(lambda x: len(x))

In [377]:
sub_blast_df.iloc[:,[0,1,10, 17, 18,19]].head(30)

Unnamed: 0,qseqid,sseqid,evalue,interpro_int,pfam_int_frac,interpro_int_frac
0,sp|P31378|NTH1_YEAST,PGT21_031718-T1,1.8200000000000001e-47,"{IPR003265, IPR000445, IPR030841}",1.0,0.75
1,sp|P31378|NTH1_YEAST,PGT21_033004-T1,7.76e-47,"{IPR003265, IPR000445, IPR030841}",1.0,0.75
12,sp|P25847|MSH2_YEAST,PGT21_008716-T1,0.0,"{IPR000432, IPR007861, IPR007696}",0.6,0.6
13,sp|P25847|MSH2_YEAST,PGT21_008482-T1,1.0299999999999999e-109,"{IPR007860, IPR007861, IPR007696}",0.6,0.6
14,sp|P25847|MSH2_YEAST,PGT21_008504-T1,1.07e-87,{IPR000432},0.2,0.2
15,sp|P25847|MSH2_YEAST,PGT21_011016-T1,5.9399999999999993e-64,"{IPR007861, IPR007696, IPR007860, IPR007695, I...",1.0,1.0
16,sp|P25847|MSH2_YEAST,PGT21_009924-T1,4.79e-63,"{IPR007861, IPR007860, IPR007696, IPR007695, I...",1.0,1.0
17,sp|P25847|MSH2_YEAST,PGT21_010090-T1,9.73e-51,"{IPR007695, IPR000432, IPR007696}",0.6,0.6
18,sp|P25847|MSH2_YEAST,PGT21_012155-T1,1.7599999999999998e-50,"{IPR007695, IPR000432, IPR007696}",0.6,0.6
19,sp|P25847|MSH2_YEAST,PGT21_011159-T1,9.27e-40,"{IPR000432, IPR007696}",0.4,0.4


In [378]:
#filter the dataframe to have only hits that have the best possible interpro domains fractions
pfam_filt_df = sub_blast_df[sub_blast_df.groupby('qseqid')['interpro_int_frac'].transform(max) == sub_blast_df['interpro_int_frac']]

In [379]:
##look at how many hits per query sequence are still left
pfam_filt_df.groupby('qseqid')['sseqid'].count()

qseqid
sp|P04819|DNLI1_YEAST     4
sp|P06777|RAD1_YEAST      1
sp|P06778|RAD52_YEAST     2
sp|P06838|RAD10_YEAST     1
sp|P06839|RAD3_YEAST      2
sp|P07276|RAD2_YEAST      2
sp|P10862|RAD18_YEAST     8
sp|P12689|REV1_YEAST      2
sp|P12753|RAD50_YEAST     2
sp|P14242|PMS1_YEAST      2
sp|P14284|DPOZ_YEAST      4
sp|P14736|RAD4_YEAST      2
sp|P22134|MAG_YEAST       2
sp|P25301|RAD57_YEAST     4
sp|P25336|MSH3_YEAST      2
sp|P25847|MSH2_YEAST      2
sp|P28519|RAD14_YEAST     2
sp|P31378|NTH1_YEAST      2
sp|P32628|RAD23_YEAST     2
sp|P32776|TFB1_YEAST      2
sp|P32829|MRE11_YEAST     2
sp|P32849|RAD5_YEAST      6
sp|P38086|RDH54_YEAST    40
sp|P38111|ATR_YEAST       2
sp|P40352|RAD26_YEAST    40
sp|P40469|MET18_YEAST     2
sp|P40965|MSH4_YEAST      2
sp|P48581|RAD17_YEAST     2
sp|P53397|OGG1_YEAST      2
sp|Q00578|RAD25_YEAST     2
sp|Q03834|MSH6_YEAST      1
sp|Q04049|POLH_YEAST      3
sp|Q06665|RAD34_YEAST     2
sp|Q12175|MSH5_YEAST      9
sp|Q12223|RAD59_YEAST     2
Name: sseqid,

In [380]:
pfam_filt_df[pfam_filt_df.qseqid == 'sp|P38086|RDH54_YEAST'].sort_values(['evalue']).evalue.tolist()

[8.629999999999998e-147,
 1.1599999999999996e-146,
 1.0599999999999997e-126,
 1.4399999999999996e-126,
 2.08e-57,
 2.4e-57,
 1.19e-54,
 1.2399999999999998e-54,
 7.709999999999999e-53,
 7.769999999999999e-53,
 1.1699999999999999e-52,
 1.09e-50,
 3.08e-50,
 2.23e-46,
 8.329999999999998e-45,
 1.74e-44,
 6.48e-37,
 1.36e-32,
 6.3999999999999995e-31,
 3.3499999999999996e-30,
 3.41e-30,
 7.639999999999999e-30,
 6e-26,
 6.64e-26,
 8.050000000000001e-25,
 8.329999999999998e-25,
 1.3600000000000002e-24,
 1.63e-24,
 1.5699999999999999e-21,
 2.85e-21,
 2.7300000000000004e-20,
 3.14e-20,
 5.25e-20,
 1.1199999999999999e-19,
 5.760000000000001e-19,
 1.27e-18,
 1.3e-17,
 4.48e-06,
 0.00055,
 0.0005639999999999999]

### incorporate orthofinder

In [381]:
orthofinder_yeast_fn = os.path.abspath('orthofinder/fastas/OrthoFinder/Results_Apr02/Orthologues/Orthologues_uniprot-proteome_UP000002311/uniprot-proteome_UP000002311__v__Puccinia_graminis_tritici_21-0.proteins.tsv')

In [426]:
with open(orthofinder_yeast_fn, 'r') as fh:
    yeast_og_dict = {}
    yeast_to_pgt_dict = {}
    pgt_og_dict = {}
    for line in fh:
        if any(s in line for s in sub_blast_df.qseqid.unique()):
            line = line.rstrip()
            og = line.split('\t')[0]
            yeast_hits = line.split('\t')[1].split(',')
            pgt_hits = line.split('\t')[2].split(',')
            real_hit = ''
            for hit in yeast_hits:
                yeast_og_dict[hit.lstrip()] = og
                yeast_to_pgt_dict[hit] = [x.lstrip() for x in pgt_hits]
            for hit in pgt_hits:

                #print(hit.lstrip())
                pgt_og_dict[hit.lstrip()] = og
            #print(line)
#print(pgt_og_dict, yeast_og_dict)

In [440]:
pfam_filt_df['q_OG'] = pfam_filt_df.qseqid.map(yeast_og_dict)
pfam_filt_df['s_OG'] = pfam_filt_df.sseqid.map(pgt_og_dict)
pfam_filt_df['OG_match'] = pfam_filt_df.apply(lambda row: row['q_OG'] == row['s_OG'], axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [485]:
####generate a couple of series that can be used to capture the identification method of orthologs
pgt_match_list = []
DNA_seed_list = []
haplotype_list = []
match_type_list = []
for key, values in yeast_to_pgt_dict.items():
    if key in sub_blast_df.qseqid.unique():
        for value in values:
            if not value.endswith('-T2'):
                DNA_seed_list.append(key)
                pgt_match_list.append(value)
                match_type_list.append('OG')

In [486]:
for pgt_match in pgt_match_list:
    with open(pgt_gff3_fn, 'r') as fh:
        for line in fh:
            line = line.rstrip()
            if pgt_match in line:
                haplotype_list.append(line.split('\t')[0][-1])
                break

In [487]:
len(haplotype_list) == len(pgt_match_list)

True

In [488]:
###DNA repair seeds that were not identified by orthofinder and have good blast hits
pfam_filt_df[(~pfam_filt_df.qseqid.isin(set(DNA_seed_list))) & (pfam_filt_df.pfam_int_frac > 0)].qseqid.unique()

array(['sp|P31378|NTH1_YEAST', 'sp|P40352|RAD26_YEAST',
       'sp|P25301|RAD57_YEAST', 'sp|Q12223|RAD59_YEAST',
       'sp|P38086|RDH54_YEAST', 'sp|Q06665|RAD34_YEAST'], dtype=object)

In [489]:
for _id in pfam_filt_df[(~pfam_filt_df.qseqid.isin(set(DNA_seed_list))) & (pfam_filt_df.pfam_int_frac > 0)].qseqid.unique():
    print(pfam_filt_df[pfam_filt_df.qseqid == _id].iloc[:,[0,10, 18,19, 20]].head(30))

                 qseqid        evalue  pfam_int_frac  interpro_int_frac  \
0  sp|P31378|NTH1_YEAST  1.820000e-47            1.0               0.75   
1  sp|P31378|NTH1_YEAST  7.760000e-47            1.0               0.75   

  shaplotype  
0          A  
1          B  
                    qseqid        evalue  pfam_int_frac  interpro_int_frac  \
278  sp|P40352|RAD26_YEAST  1.480000e-80            1.0                1.0   
279  sp|P40352|RAD26_YEAST  1.870000e-80            1.0                1.0   
280  sp|P40352|RAD26_YEAST  1.930000e-77            1.0                1.0   
281  sp|P40352|RAD26_YEAST  2.150000e-77            1.0                1.0   
283  sp|P40352|RAD26_YEAST  1.920000e-75            1.0                1.0   
284  sp|P40352|RAD26_YEAST  2.000000e-75            1.0                1.0   
285  sp|P40352|RAD26_YEAST  5.050000e-75            1.0                1.0   
286  sp|P40352|RAD26_YEAST  5.150000e-75            1.0                1.0   
290  sp|P40352|RAD26_YEAST 

In [490]:
tmp_pgt_list = []
tmp_dna_list = []
tmp_match_type_list = []
for _id in pfam_filt_df[(~pfam_filt_df.qseqid.isin(set(DNA_seed_list))) & (pfam_filt_df.pfam_int_frac > 0)].qseqid.unique():
    if _id == 'sp|P40352|RAD26_YEAST':
        tmp_df = pfam_filt_df[(pfam_filt_df.qseqid == _id) & (pfam_filt_df.evalue < 1e-50)]
        tmp_pgt_list = tmp_pgt_list + tmp_df.sseqid.tolist()
        tmp_dna_list = tmp_dna_list + tmp_df.qseqid.tolist()
        tmp_match_type_list = tmp_match_type_list + ['blast'] * len(tmp_df.qseqid.tolist())
    if _id == 'sp|P25301|RAD57_YEAST':
        tmp_df = pfam_filt_df[(pfam_filt_df.qseqid == _id) & (pfam_filt_df.evalue < 1e-4)]
        tmp_pgt_list = tmp_pgt_list + tmp_df.sseqid.tolist()
        tmp_dna_list = tmp_dna_list + tmp_df.qseqid.tolist()
        tmp_match_type_list = tmp_match_type_list + ['blast'] * len(tmp_df.qseqid.tolist())
    if _id == 'sp|P38086|RDH54_YEAST':
        tmp_df = pfam_filt_df[(pfam_filt_df.qseqid == _id) & (pfam_filt_df.evalue < 1e-50)]
        tmp_pgt_list = tmp_pgt_list + tmp_df.sseqid.tolist()
        tmp_dna_list = tmp_dna_list + tmp_df.qseqid.tolist()
        tmp_match_type_list = tmp_match_type_list + ['blast'] * len(tmp_df.qseqid.tolist())
    else:
        tmp_df = pfam_filt_df[(pfam_filt_df.qseqid == _id)]
        tmp_pgt_list = tmp_pgt_list + tmp_df.sseqid.tolist()
        tmp_dna_list = tmp_dna_list + tmp_df.qseqid.tolist()
        tmp_match_type_list = tmp_match_type_list + ['blast'] * len(tmp_df.qseqid.tolist())

In [491]:
tmp_haplotype_list = []
for pgt_match in tmp_pgt_list:
    with open(pgt_gff3_fn, 'r') as fh:
        for line in fh:
            line = line.rstrip()
            if pgt_match in line:
                tmp_haplotype_list.append(line.split('\t')[0][-1])
                break

In [492]:
pgt_match_list += tmp_pgt_list
DNA_seed_list += tmp_dna_list
haplotype_list += tmp_haplotype_list
match_type_list += tmp_match_type_list

In [519]:
pgt_match_series = pd.Series(pgt_match_list, name="Pgt_match")
DNA_seed_series = pd.Series(DNA_seed_list, name='Seed_ID')
haplotype_series = pd.Series(haplotype_list, name='haplotype')
match_type_series = pd.Series(match_type_list, name='Match_type')

In [520]:
out_df = pd.concat([DNA_seed_series, pgt_match_series, haplotype_series, match_type_series], axis =1)

In [521]:
out_df.groupby('Seed_ID').count()

Unnamed: 0_level_0,Pgt_match,haplotype,Match_type
Seed_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sp|P04819|DNLI1_YEAST,2,2,2
sp|P06777|RAD1_YEAST,2,2,2
sp|P06778|RAD52_YEAST,2,2,2
sp|P06779|RAD7_YEAST,2,2,2
sp|P06838|RAD10_YEAST,2,2,2
sp|P06839|RAD3_YEAST,1,1,1
sp|P07276|RAD2_YEAST,2,2,2
sp|P12689|REV1_YEAST,2,2,2
sp|P12753|RAD50_YEAST,2,2,2
sp|P14242|PMS1_YEAST,2,2,2


In [518]:
#strange sp|P06839|RAD3_YEAST is missing a haplotype
out_df[out_df.Seed_ID == 'sp|P06839|RAD3_YEAST']

Unnamed: 0,Seed_ID,Pgt_match,haplotype,Match_type,0
66,sp|P06839|RAD3_YEAST,PGT21_030959-T1,A,OG,


In [None]:
pd.Series

In [526]:
pd.Series(['sp|P06839|RAD3_YEAST','PGT21_034602-T1', 'B', 'blast'], index=out_df.columns)

Seed_ID       sp|P06839|RAD3_YEAST
Pgt_match          PGT21_034602-T1
haplotype                        B
Match_type                   blast
dtype: object

In [530]:
out_df = out_df.append(pd.Series(['sp|P06839|RAD3_YEAST','PGT21_034602-T1', 'B', 'blast'], index=out_df.columns), ignore_index=True)

In [531]:
out_fn = os.path.abspath('DNA_repair_machinery_orthologs.Pgt21-0.tsv')

In [532]:
out_df.to_csv(out_fn, sep='\t', index=None)

In [533]:
!tail {out_fn}

sp|P38086|RDH54_YEAST	PGT21_019144-T1	B	blast
sp|P38086|RDH54_YEAST	PGT21_017365-T1	A	blast
sp|P38086|RDH54_YEAST	PGT21_005374-T3	B	blast
sp|P38086|RDH54_YEAST	PGT21_005678-T1	A	blast
sp|P38086|RDH54_YEAST	PGT21_025967-T1	B	blast
sp|P38086|RDH54_YEAST	PGT21_011475-T1	A	blast
sp|P38086|RDH54_YEAST	PGT21_023882-T1	A	blast
sp|Q06665|RAD34_YEAST	PGT21_016716-T1	B	blast
sp|Q06665|RAD34_YEAST	PGT21_016568-T1	A	blast
sp|P06839|RAD3_YEAST	PGT21_034602-T1	B	blast


In [534]:
out_df.groupby('Seed_ID').count()

Unnamed: 0_level_0,Pgt_match,haplotype,Match_type
Seed_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sp|P04819|DNLI1_YEAST,2,2,2
sp|P06777|RAD1_YEAST,2,2,2
sp|P06778|RAD52_YEAST,2,2,2
sp|P06779|RAD7_YEAST,2,2,2
sp|P06838|RAD10_YEAST,2,2,2
sp|P06839|RAD3_YEAST,2,2,2
sp|P07276|RAD2_YEAST,2,2,2
sp|P12689|REV1_YEAST,2,2,2
sp|P12753|RAD50_YEAST,2,2,2
sp|P14242|PMS1_YEAST,2,2,2
