In [2]:
import pandas as pd
import os
import glob
import xml.etree.ElementTree as ET
import networkx as nx
from tqdm import tqdm
from core import read_mmseqs_results

In [3]:
def get_xml_file_paths(top_directory):
    pattern = f"{top_directory}/**/*.xml"
    file_paths = glob.glob(pattern, recursive=True)    
    return file_paths

In [4]:
def get_exhchangable_profiles(file_path):
    model = os.path.basename(file_path).split('.')[0]
    tree = ET.parse(os.path.expanduser(file_path))
    root = tree.getroot()
    data = []
    exchangeable_names = []
    parent_names = []
    for gene in root.findall('.//gene'):
        gene_name = gene.attrib.get('name', '')
        exchangeables = gene.findall('.//exchangeables')
        if exchangeables:
            parent_names.append(gene_name)
            for exch in exchangeables:
                exch_genes = exch.findall('.//gene')
                for sub_gene in exch_genes:
                    exch_name = sub_gene.attrib.get('name', '')
                    data.append({'model': model, 'parent_profile': gene_name, 'profile': exch_name})
                    exchangeable_names.append(exch_name)
        data.append({'model': model, 'parent_profile': gene_name, 'profile': gene_name})
    df = pd.DataFrame(data)
    rm_exchangable = [x for x in exchangeable_names if x not in parent_names]
    df = df[~df['parent_profile'].isin(rm_exchangable)].reset_index(drop=True)
    return df

## Part 1: Get defense genes to use for modeling

In [5]:
defense_finder_genes_df = pd.read_parquet('../data/interim/defense_finder_genes_genomes.pq')
defense_homolog_df = pd.read_parquet('../data/interim/defense_finder_homologs_profile_names.pq')
select_cluster_df = pd.read_parquet('../data3/interim/refseq_cover_clusters.pq')

In [6]:
true_defense_seq_ids = defense_finder_genes_df['seq_id'].drop_duplicates()
print(len(true_defense_seq_ids))
defense_homolog_seq_ids = defense_homolog_df['seq_id'].drop_duplicates()
print(len(defense_homolog_seq_ids))

241596
2272123


In [7]:
%%time
all_seqs = pd.read_parquet('../data/interim/refseq_seq_ids.pq')

CPU times: user 1min 9s, sys: 47.3 s, total: 1min 56s
Wall time: 2min 29s


In [8]:
minimal_true_defense_seq_ids = true_defense_seq_ids[true_defense_seq_ids
                                                    .isin(select_cluster_df['cluster_id'])]

In [9]:
len(minimal_true_defense_seq_ids)

14623

In [10]:
select_defense_finder_genes = (defense_finder_genes_df[defense_finder_genes_df['seq_id']
                                                       .isin(minimal_true_defense_seq_ids)]
                               .groupby('seq_id')
                               .sample(n=1, random_state=7))

In [11]:
select_defense_finder_genes['gene_name'].value_counts().head(20)

gene_name
RM_Type_IV__Type_IV_REases                     1363
RM_Type_II__Type_II_REases                      495
RM__Type_I_S                                    451
RM_Type_IIG__Type_IIG                           373
RM_Type_II__Type_II_MTases                      258
Shedu__SduA                                     195
RloC__RloC                                      183
Borvo__BovA_addition                            179
AbiH__AbiH                                      164
Septu__PtuA                                     160
AbiD__AbiD                                      155
RM__Type_I_REases                               134
Septu__PtuB                                     120
Lit__Lit                                        108
Borvo__BovA                                     101
Eleos__LeoBC                                     97
Lamassu-Fam__LmuA_effector_Cap4_nuclease_II      93
Rst_PARIS__DUF4435                               93
Mokosh_TypeI__MkoA                               90
Sp

## Part2: Get HMM clusters

### Edge type: exchangable

In [12]:
definition_file_paths = get_xml_file_paths(os.path.expanduser('~/defense-finder-models/definitions/'))

In [13]:
len(definition_file_paths)

229

In [14]:
gene_group_list = []
for f in tqdm(definition_file_paths):
    gene_group_list.append(get_exhchangable_profiles(f))
gene_group_df = pd.concat(gene_group_list)

100%|██████████| 229/229 [00:20<00:00, 11.04it/s]


In [15]:
gene_group_edges = (gene_group_df.merge(gene_group_df, how='inner', on='parent_profile')
                    .query('profile_x != profile_y')
                    [['profile_x', 'profile_y']]
                    .drop_duplicates()
                    .rename(columns={'profile_x': 'gene_name_x', 
                                     'profile_y': 'gene_name_y'}))

In [16]:
len(gene_group_edges)

16686

In [17]:
all_genes = gene_group_df['profile'].drop_duplicates()
len(all_genes)

1018

In [18]:
G = nx.from_pandas_edgelist(gene_group_edges, 'gene_name_x', 'gene_name_y')
nodes = list(G.nodes)
missing_nodes = all_genes[~all_genes.isin(nodes)].to_list()
G.add_nodes_from(missing_nodes)
print('# Edges:', G.number_of_edges())
print('# Nodes:', G.number_of_nodes())
components = nx.connected_components(G)
component_list = []
for i, comp in enumerate(components):
    subgraph = G.subgraph(comp)
    max_degree_node = max(subgraph, key=lambda node: subgraph.degree(node))
    for name in comp:
        component_list.append({'gene_name': name, 
                               'component': i, 
                               'component_name': max_degree_node})
print('# Components:', i)
exchangable_component_df = pd.DataFrame(component_list)
exchangable_component_df['component_name'].value_counts().head(20)

# Edges: 8343
# Nodes: 1018
# Components: 389


component_name
Cas__cas8a3_I-A_4                76
Menshen__NsnC_2632405575         41
Cas__csm2gr11_III-A_3            37
Cas__cas5_I-E_13                 36
Cas__cmr1gr7_III-B_III-C_2       31
Cas__cas6_I_II_III_IV_V_VI_10    27
Cas__cas12c_V-C_2                26
Cas__cse2gr11_I-E_3              23
Cas__cas7_I-A_1                  22
Cas__csx1_III_17                 19
Cas__csx19_III-D_2               19
Cas__cmr3gr5_III-B_III-C_4       19
Cas__csx21_III-D_1               19
CBASS__Sensing_SAVED             17
RosmerTA__RmrT_2623274509        14
RosmerTA__RmrA_2617826694        14
Cas__csn2_II-A_13                13
Cas__cas3HD_I_3                  13
Cas__cas2_I_II_III_IV_V_VI_6     12
Cas__cas10_III-D_1               12
Name: count, dtype: int64

In [19]:
exchangable_component_df

Unnamed: 0,gene_name,component,component_name
0,Septu__PtuA_2,0,Septu__PtuA
1,Septu__PtuA,0,Septu__PtuA
2,Retron_I_A__ATPase_TypeIA,0,Septu__PtuA
3,Septu__PtuB_2,1,Septu__PtuB_2
4,Retron_I_A__HNH_TIGR02646,1,Septu__PtuB_2
...,...,...,...
1013,PD-T4-2__PD-T4-2_B,385,PD-T4-2__PD-T4-2_B
1014,PD-T4-3__PD-T4-3,386,PD-T4-3__PD-T4-3
1015,PD-T4-4__PD-T4-4_A,387,PD-T4-4__PD-T4-4_A
1016,PD-T4-4__PD-T4-4_B,388,PD-T4-4__PD-T4-4_B


In [20]:
exchangable_component_df = exchangable_component_df.rename(columns={'component': 'defense_exchangable_num',
                                                                    'component_name': 'defense_exchangable_name'})

In [21]:
defense_seqs = all_seqs[all_seqs['seq_id'].isin(true_defense_seq_ids)]

In [22]:
merged_select_genes = (select_defense_finder_genes
                       .merge(exchangable_component_df, how='inner',
                              on='gene_name')
                       .merge(defense_seqs, how='inner',
                              on='seq_id'))

In [24]:
defense_seq_f = '../data3/interim/select_defense_seqs.faa'

In [25]:
with open(defense_seq_f, 'w') as f:
    for _, row in merged_select_genes.iterrows():
        print('>' + row['seq_id'], file=f)
        print(row['seq'], file=f)

In [26]:
defense_edges = (merged_select_genes[['seq_id', 'defense_exchangable_name']]
                 .merge(merged_select_genes[['seq_id', 'defense_exchangable_name']], 
                        how='inner', on='defense_exchangable_name'))

In [24]:
defense_edges.to_parquet('../data3/interim/defense_exchangable_edges.pq', index=False)

In [29]:
merged_select_genes.to_parquet('../data3/interim/defense_finder_model_seq_info.pq', index=False)