In [1]:
import os
import pandas as pd
from core import read_dom_table

In [11]:
# PADLOC HMM metadata table, read from the master branch of the padloc-db repository.
# It supplies the per-HMM threshold columns merged onto the PADLOC hits further down.
PADLOC_HMM_META_URL = 'https://raw.githubusercontent.com/padlocbio/padloc-db/refs/heads/master/hmm_meta.txt'
padloc_hmm_info = pd.read_table(PADLOC_HMM_META_URL)

## Set up HMM databases

In [2]:
# Paths to the two HMM profile databases searched in this notebook.
# `padloc_db` is interpolated into the PADLOC hmmsearch command further down, so it has
# to be defined for a fresh-kernel run. The default is the location that used to be
# commented out here; set the PADLOC_DB environment variable to point somewhere else
# on a machine where the PADLOC data directory lives in a different place.
padloc_db = os.environ.get('PADLOC_DB',
                           '/home/gridsan/pdeweirdt/.conda/envs/padloc/data/hmm/padlocdb.hmm')
# Combined CasFinder + DefenseFinder profile file (created below when it is missing).
df_db = '../data3/interim/defense_finder_db.hmm'

In [43]:
# Directory holding the CasFinder profile HMMs.
top_dir = '/home/deweirdt/.macsyfinder/models/CasFinder/profiles/'
# Profiles deliberately left out of the combined database.
# NOTE(review): the reason for excluding these three files is not recorded here; confirm.
skipped_cas_profiles = {'cas12a_V-A_4.hmm', 'cas4_V_1.hmm', 'cas9_II-B_3.hmm'}
cas_finder_hmms = []
for profile_name in os.listdir(top_dir):
    if not profile_name.endswith('.hmm'):
        continue
    if profile_name in skipped_cas_profiles:
        continue
    cas_finder_hmms.append(os.path.join(top_dir, profile_name))

In [44]:
# Build the combined CasFinder + DefenseFinder HMM database once and reuse it afterwards.
# The existence check uses `df_db` itself so it cannot drift from the path being written.
if not os.path.exists(df_db):
    cat_status = os.system(f"cat {' '.join(cas_finder_hmms)} /home/deweirdt/.macsyfinder/models/defense-finder-models/profiles/*.hmm > {df_db}")
    if cat_status != 0:
        # The shell redirect creates `df_db` even when `cat` fails; remove the partial
        # file so a later run does not mistake it for a finished database.
        if os.path.exists(df_db):
            os.remove(df_db)
        raise RuntimeError(f"Building {df_db} failed (shell status {cat_status})")

## Search E. coli 3000

In [6]:
# Load the E. coli 3k prediction table and keep only the rows whose `hit_category`
# is 'Predicted novel defense gene', renumbering the index from zero.
ecoli_3k_struct_df = pd.read_parquet('../data3/interim/ecoli3k_predicted_defense_struct.pq')
is_predicted_novel = ecoli_3k_struct_df['hit_category'] == 'Predicted novel defense gene'
ecoli_3k_novel_df = ecoli_3k_struct_df.loc[is_predicted_novel].reset_index(drop=True)
len(ecoli_3k_novel_df)

1041

In [7]:
# Text file that receives the `product_accession` values of the novel genes
# (written by the next cell without header or index).
ecoli_3k_novel_ids = '../data3/interim/ecoli_novel_ids.txt'

In [8]:
# Write only the accession column, with no header and no index, as an ID list.
ecoli_3k_novel_df.to_csv(ecoli_3k_novel_ids, columns=['product_accession'],
                         header=False, index=False)

In [48]:
# Input FASTA that the seqtk call below subsets.
# NOTE(review): this lives under ../data/ while most other paths here use ../data3/; confirm that is intended.
ecoli_3k_seqs = '../data/ecoli/interim/unique_seqs.faa'

In [3]:
# FASTA produced by the seqtk call below; it is the sequence input to both hmmsearch runs.
ecoli_3k_novel_seqs = '../data3/interim/ecoli_3k_unique_novel_seqs.faa'

In [11]:
# Pull the records named in `ecoli_3k_novel_ids` out of `ecoli_3k_seqs` (seqtk from the
# `beaker` conda env) and redirect them into `ecoli_3k_novel_seqs`.
!conda run -n beaker seqtk subseq {ecoli_3k_seqs} {ecoli_3k_novel_ids} > {ecoli_3k_novel_seqs}

In [6]:
# Outputs of the PADLOC hmmsearch run: the per-domain table that is parsed later, and
# the file that receives hmmsearch's main text output.
ecoli_3k_padloc_out = '../data3/interim/ecoli_3k_padloc_domtbl.txt'
# Search-specific scratch name, so this file is not shared with the DefenseFinder search.
ecoli_3k_padloc_temp_out = '../data/tmp/temp_ecoli_3k_padloc'

In [13]:
# Search the PADLOC profiles (`padloc_db`) against the novel sequences. The per-domain table
# goes to `ecoli_3k_padloc_out`; the main text output goes to `ecoli_3k_padloc_temp_out`.
# `padloc_db` must be defined in the kernel before this cell runs.
# NOTE(review): this runs hmmsearch from the `beaker` env while the DefenseFinder search uses
# the `defensefinder` env; confirm both provide the same HMMER build.
!conda run -n beaker hmmsearch --cpu 40 --acc --noali --domtblout {ecoli_3k_padloc_out} -o {ecoli_3k_padloc_temp_out} {padloc_db} {ecoli_3k_novel_seqs}


In [4]:
# Outputs of the DefenseFinder hmmsearch run: the per-domain table that is parsed later,
# and the file that receives hmmsearch's main text output.
ecoli_3k_df_out = '../data3/interim/ecoli_3k_df_domtbl.txt'
# Search-specific scratch name, so this file is not shared with the PADLOC search.
ecoli_3k_df_temp_out = '../data/tmp/temp_ecoli_3k_df'

In [5]:
# Search the combined CasFinder + DefenseFinder profiles (`df_db`) against the novel sequences.
# The per-domain table goes to `ecoli_3k_df_out`; the main text output goes to `ecoli_3k_df_temp_out`.
!conda run -n defensefinder hmmsearch --cpu 40 --acc --noali --domtblout {ecoli_3k_df_out} -o {ecoli_3k_df_temp_out} {df_db} {ecoli_3k_novel_seqs}


## Analyze hits

In [7]:
# Parse both hmmsearch domain tables and tag every row with the HMM database it came
# from (`hmm_db`) and the sequence set that was searched (`seq_db`).
ecoli_3k_padloc_domtbl = (read_dom_table(ecoli_3k_padloc_out)
                          .assign(hmm_db='padloc', seq_db='ecoli_3k'))
ecoli_3k_df_domtbl = (read_dom_table(ecoli_3k_df_out)
                      .assign(hmm_db='df', seq_db='ecoli_3k'))

In [8]:
# Number of distinct target sequences with any DefenseFinder hit, before filtering.
ecoli_3k_df_domtbl['target'].nunique()

348

In [9]:
# Add span-length and coverage columns to both domain tables (the frames are modified in place).
for dom_df in (ecoli_3k_padloc_domtbl, ecoli_3k_df_domtbl):
    # Inclusive lengths of the hmm_from..hmm_to and env_from..env_to ranges.
    hmm_span = dom_df['hmm_to'] - dom_df['hmm_from'] + 1
    env_span = dom_df['env_to'] - dom_df['env_from'] + 1
    dom_df['hmm_ali_len'] = hmm_span
    dom_df['env_ali_len'] = env_span
    # Spans expressed as a fraction of `qlen` and of `tlen` respectively.
    dom_df['hmm_cov'] = hmm_span / dom_df['qlen']
    dom_df['t_cov'] = env_span / dom_df['tlen']


In [12]:
# Attach PADLOC's per-HMM thresholds to every PADLOC hit. The join is on the HMM accession
# and is an inner join, so hits whose accession has no metadata row are dropped.
padloc_thresholds = (
    padloc_hmm_info[['hmm.accession', 'e.val.threshold',
                     'hmm.coverage.threshold', 'target.coverage.threshold']]
    .rename(columns={'hmm.accession': 'query_accession'})
)
merged_ecoli_3k_padloc_domtbl = pd.merge(
    ecoli_3k_padloc_domtbl,
    padloc_thresholds,
    how='inner',
    on='query_accession',
)

In [13]:
# Summary statistics for the observed coverage / E-value columns next to their thresholds.
padloc_summary_cols = ['hmm.coverage.threshold', 'target.coverage.threshold',
                       'hmm_cov', 't_cov', 'dom_i_evalue', 'e.val.threshold']
merged_ecoli_3k_padloc_domtbl[padloc_summary_cols].agg(['mean', 'median', 'std', 'max', 'min'])

Unnamed: 0,hmm.coverage.threshold,target.coverage.threshold,hmm_cov,t_cov,dom_i_evalue,e.val.threshold
mean,0.528323,0.497367,0.307379,0.463579,18.295088,6.549725e-06
median,0.5,0.5,0.222475,0.395789,0.000435,1e-05
std,0.162951,0.169773,0.24879,0.284221,69.079561,9.080199e-06
max,0.9,0.8,1.0,1.0,1000.0,0.0001
min,0.3,0.15,0.002404,0.030896,0.0,9.999999999999999e-101


In [14]:
# Keep PADLOC hits that pass all three per-HMM thresholds (strict inequalities) and whose
# HMM name does not start with 'PDC'; then count the distinct targets that remain.
# NOTE(review): the reason for excluding 'PDC*' profiles is not stated in the notebook; confirm.
padloc_hits = merged_ecoli_3k_padloc_domtbl
passes_evalue = padloc_hits['dom_i_evalue'] < padloc_hits['e.val.threshold']
passes_hmm_cov = padloc_hits['hmm_cov'] > padloc_hits['hmm.coverage.threshold']
passes_target_cov = padloc_hits['t_cov'] > padloc_hits['target.coverage.threshold']
is_pdc_profile = padloc_hits['query'].str.startswith('PDC')
filtered_ecoli_3k_padloc_domtbl = padloc_hits[passes_evalue & passes_hmm_cov
                                              & passes_target_cov & ~is_pdc_profile]
filtered_ecoli_3k_padloc_domtbl['target'].nunique()

28

In [15]:
# For each PADLOC HMM (`query`), show its passing hit with the lowest `dom_i_evalue`.
padloc_hits_by_evalue = filtered_ecoli_3k_padloc_domtbl.sort_values('dom_i_evalue')
padloc_hits_by_evalue.groupby('query').head(1)

Unnamed: 0,target,target_accession,tlen,query,query_accession,qlen,seq_evalue,seq_score,seq_bias,dom_n,...,targ_description,hmm_db,seq_db,hmm_ali_len,env_ali_len,hmm_cov,t_cov,e.val.threshold,hmm.coverage.threshold,target.coverage.threshold
1807,WP_001310496.1,-,688.0,REase_III_00008,PDLC03654,1154.0,0.0,1456.6,17.2,1.0,...,-,padloc,ecoli_3k,680.0,685.0,0.589255,0.99564,1e-05,0.5,0.3
996,WP_001375260.1,-,366.0,HEC-08_WP_093018341.1,PDLC04162,358.0,8.9e-175,576.7,6.6,1.0,...,-,padloc,ecoli_3k,351.0,366.0,0.980447,1.0,1e-20,0.7,0.7
1298,WP_064764923.1,-,390.0,pAgo-associated_WP_077117439.1,PDLC02404,368.0,7.7e-133,439.5,0.0,1.0,...,-,padloc,ecoli_3k,366.0,379.0,0.994565,0.971795,1e-08,0.5,0.5
739,WP_064764923.1,-,390.0,Aga_Helical-REase,PDLC02411,378.0,4.6e-107,354.2,0.0,1.0,...,-,padloc,ecoli_3k,376.0,378.0,0.994709,0.969231,1e-08,0.5,0.5
1614,WP_032142271.1,-,333.0,REase_II_00053,PDLC03451,293.0,2.5999999999999998e-86,285.1,0.8,1.0,...,-,padloc,ecoli_3k,268.0,285.0,0.914676,0.855856,1e-05,0.5,0.3
1196,WP_039023233.1,-,280.0,Aga_Nuclease,PDLC02413,267.0,2.6e-73,242.3,0.0,1.0,...,-,padloc,ecoli_3k,265.0,271.0,0.992509,0.967857,1e-08,0.5,0.5
1672,WP_182289208.1,-,615.0,JukA_WP_144076013.1,PDLC04111,250.0,1.4e-47,158.0,1.4,1.0,...,-,padloc,ecoli_3k,232.0,234.0,0.928,0.380488,1e-05,0.5,0.3
809,WP_182289208.1,-,615.0,JukA_WP_224020616.1,PDLC04113,245.0,1.6e-45,151.4,0.6,1.0,...,-,padloc,ecoli_3k,234.0,233.0,0.955102,0.378862,1e-05,0.5,0.3
45,WP_064764923.1,-,390.0,pAgo-associated_WP_015898278.1,PDLC02407,401.0,3.8e-32,108.0,0.0,1.0,...,-,padloc,ecoli_3k,244.0,321.0,0.608479,0.823077,1e-08,0.5,0.5
1942,WP_072644308.1,-,224.0,HEC-09_WP_196893758.1,PDLC04163,203.0,4.2e-31,103.9,0.1,1.0,...,-,padloc,ecoli_3k,200.0,224.0,0.985222,1.0,1e-20,0.7,0.7


In [16]:
# Cutoffs applied to every DefenseFinder hit: one E-value / HMM-coverage pair for all
# profiles, unlike the per-HMM thresholds used for PADLOC.
# NOTE(review): the provenance of these two values is not recorded in the notebook; confirm.
df_i_eval, df_hmm_cov = 1e-3, 0.4

In [17]:
# Keep DefenseFinder hits below the E-value cutoff and above the HMM-coverage cutoff
# (both strict), then count the distinct targets that remain.
df_passes_evalue = ecoli_3k_df_domtbl['dom_i_evalue'].lt(df_i_eval)
df_passes_hmm_cov = ecoli_3k_df_domtbl['hmm_cov'].gt(df_hmm_cov)
filtered_ecoli_3k_df_domtbl = ecoli_3k_df_domtbl[df_passes_evalue & df_passes_hmm_cov]
filtered_ecoli_3k_df_domtbl['target'].nunique()

200

In [21]:
# For each target sequence, keep its passing DefenseFinder hit with the lowest `dom_i_evalue`.
df_hits_by_evalue = filtered_ecoli_3k_df_domtbl.sort_values('dom_i_evalue')
top_ecoli_3k = df_hits_by_evalue.groupby('target').head(1)

In [22]:
# One row per target, so this should equal the distinct-target count of the filtered table.
len(top_ecoli_3k)

200

In [24]:
# Count top hits whose HMM name contains the substring 'DS' anywhere in the name.
# NOTE(review): presumably meant to count the 'DS-*' profiles; the pattern is unanchored,
# so any other name containing 'DS' is counted too. Confirm.
top_ecoli_3k['query'].str.contains('DS').sum()

123

In [26]:
# How many targets have each HMM as their best hit, most frequent first.
top_ecoli_3k['query'].value_counts()

query
VCA0374                 10
DS-1__DS-1A              8
DS-36__DS-36             7
DS-13__DS-13B            7
gcu24__gcu24             7
                        ..
DS-6__DS-6B              1
DS-29__DS-29             1
DS-32__DS-32B            1
AspAB__AspB              1
Thoeris__ThsB_Global     1
Name: count, Length: 80, dtype: int64

In [25]:
# Fraction of the predicted-novel proteins that have a passing DefenseFinder hit.
# Computed from the frames rather than hand-copied counts, so it cannot go stale when
# the upstream tables change.
top_ecoli_3k['target'].nunique() / len(ecoli_3k_novel_df)

0.19212295869356388