In [1]:
import pandas as pd
from Bio import SeqIO
import os
from tqdm import tqdm
from joblib import Parallel, delayed
from functools import reduce
import operator

In [2]:
# Interim inputs: the model sequence table, the DefenseFinder gene hits, and the
# sequence clustering (a two-column table read with explicit column names).
cluster_columns = ['cluster_id', 'seq_id']
model_seqs = pd.read_parquet('../data/interim/model_seqs.pq')
defense_finder_genes_df = pd.read_parquet('../data/interim/defense_finder_genes_genomes.pq')
cluster_df = pd.read_table('../data/interim/refseqs_clusters_mode1.tsv', names=cluster_columns)

In [3]:
%%time
seq_id_assemblies = (pd.read_csv('../data/interim/seq_assemblies.csv', 
                                 names=['product_accession', 'seq_id', 'sequence', 'genome_id']))

CPU times: user 4min 57s, sys: 31.4 s, total: 5min 29s
Wall time: 5min 30s


In [4]:
# Row counts of the two tables and the share of defense-gene rows among all sequence rows.
n_defense = defense_finder_genes_df.shape[0]
n_seqs = seq_id_assemblies.shape[0]
for label, value in (('# Defense genes:', n_defense),
                     ('# Sequences: ', n_seqs),
                     ('Frac defense: ', n_defense/n_seqs)):
    print(label, value)

# Defense genes: 244022
# Sequences:  68258704
Frac defense:  0.0035749580009605806


In [4]:
# The 'sequence' column is not used by any later cell shown here, so drop it.
# errors='ignore' keeps the cell idempotent: because the result is assigned back
# to the same name, re-running it without this flag raises KeyError once the
# column is already gone.
seq_id_assemblies = seq_id_assemblies.drop(columns=['sequence'], errors='ignore')

In [5]:
# Keep only the rows assigned to the 'test' split, with a fresh 0..n-1 index.
is_test_split = model_seqs['split'].eq('test')
test_model_seqs = model_seqs.loc[is_test_split].reset_index(drop=True)

In [6]:
# Distinct cluster ids that occur in the test split; the cell displays how many there are.
test_cluster_ids = test_model_seqs.loc[:, 'cluster_id'].unique()
len(test_cluster_ids)

102002

In [7]:
# Restrict the clustering table to members of test-split clusters.
in_test_cluster = cluster_df['cluster_id'].isin(test_cluster_ids)
filtered_cluster_df = cluster_df.loc[in_test_cluster].reset_index(drop=True)
filtered_cluster_df.shape[0]

2372993

In [8]:
# Attach a cluster id to every (accession, genome) row whose sequence belongs to
# a test cluster; rows without a matching seq_id are discarded by the inner join.
filtered_seq_id_assemblies = (
    seq_id_assemblies
    .merge(right=filtered_cluster_df, on=['seq_id'], how='inner')
    .reset_index(drop=True)
)
filtered_seq_id_assemblies.shape[0]

2466300

In [9]:
# Distinct (system id, gene name) pairs seen among the DefenseFinder hits.
sys_gene_columns = ['sys_id', 'gene_name']
defense_genes_systems = defense_finder_genes_df.loc[:, sys_gene_columns].drop_duplicates()

In [10]:
# Distinct (cluster, gene name) pairs from the test split that carry a gene name.
test_cluster_genes = (test_model_seqs.loc[:, ['cluster_id', 'gene_name']]
                      .dropna()
                      .drop_duplicates())
# Map each such cluster to every system id that contains one of its gene names.
cluster_system_pairs = test_cluster_genes.merge(defense_genes_systems, on='gene_name', how='inner')
test_clust_defense_systems = cluster_system_pairs[['cluster_id', 'sys_id']].drop_duplicates()
# Clusters with at least one associated system, in first-seen order.
test_defense_clusts = test_clust_defense_systems['cluster_id'].drop_duplicates().to_list()
len(test_defense_clusts)

6362

In [11]:
# Distinct non-null gene names present in the test split.
test_gene_names = test_model_seqs['gene_name'].dropna()
test_defense_genes = test_gene_names.drop_duplicates().to_list()
len(test_defense_genes)

97

In [12]:
# Remove every DefenseFinder hit whose gene name also occurs in the test split,
# then display the fraction of hits that remain.
is_test_gene = defense_finder_genes_df['gene_name'].isin(test_defense_genes)
train_defense_finder_genes_df = defense_finder_genes_df.loc[~is_test_gene].reset_index(drop=True)
train_defense_finder_genes_df.shape[0]/defense_finder_genes_df.shape[0]

0.9301415446148298

In [13]:
# Directory of per-genome protein FASTA files; collect the entries ending in '.faa'.
faa_base_dir = '../data/genome_downloads/faa/'
faa_files = [name for name in os.listdir(faa_base_dir) if name.endswith('.faa')]
len(faa_files)

17454

In [14]:
def get_genome_cluster_counts(f, faa_base_dir=faa_base_dir, filtered_seq_id_assemblies=filtered_seq_id_assemblies, 
                              train_defense_finder_genes_df=train_defense_finder_genes_df, 
                              test_defense_clusts=test_defense_clusts,
                              test_clust_defense_systems=test_clust_defense_systems,
                              neighbor_window=11):
    """Flag, for one genome, which test-cluster proteins sit near a defense gene.

    The FASTA file is walked in order and every record is given a 1-based
    position. For each record whose id is a test-cluster accession of this
    genome, the record counts as having a defense neighbor when some retained
    defense hit of the genome has ``|hit_pos - position| < neighbor_window``.
    NOTE(review): this assumes ``hit_pos`` uses the same 1-based record
    numbering as the FASTA file -- confirm against the DefenseFinder output.

    Parameters
    ----------
    f : str
        File name inside ``faa_base_dir``; the text before the first '.' is
        used as the genome id.
    faa_base_dir : str
        Directory containing the FASTA files.
    filtered_seq_id_assemblies : DataFrame
        Needs columns 'genome_id', 'product_accession' and 'cluster_id'.
    train_defense_finder_genes_df : DataFrame
        Defense hits; needs columns 'genome', 'sys_id' and 'hit_pos'.
    test_defense_clusts : iterable
        Cluster ids that have associated systems in ``test_clust_defense_systems``.
    test_clust_defense_systems : DataFrame
        Needs columns 'cluster_id' and 'sys_id'. Previously read as a global;
        now bound like the other inputs.
    neighbor_window : int
        Exclusive upper bound on the position distance (default 11, the value
        that was hard-coded before).

    Returns
    -------
    list of dict
        One ``{'cluster_id': ..., 'defense_neighbor': 0 or 1}`` per matching
        FASTA record, in file order.
    """
    genome = f.split('.')[0]
    genome_df = filtered_seq_id_assemblies[filtered_seq_id_assemblies['genome_id'] == genome]
    if genome_df.empty:
        # No test-cluster protein in this genome: the result is necessarily
        # empty, so skip reading the FASTA file altogether.
        return []
    # One dict lookup per record replaces a list scan plus a boolean-mask
    # `.item()` call. Unlike `.item()`, this tolerates repeated rows for the
    # same accession (the last row wins).
    accession_to_cluster = dict(zip(genome_df['product_accession'], genome_df['cluster_id']))
    genome_systems = train_defense_finder_genes_df[train_defense_finder_genes_df['genome'] == genome]
    defense_clusters = set(test_defense_clusts)  # O(1) membership tests
    genome_cluster_count_list = []
    genome_records = SeqIO.parse(os.path.join(faa_base_dir, f), 'fasta')
    # Positions count every record in the file, matched or not.
    for i, r in enumerate(genome_records, start=1):
        r_id = r.id
        if r_id not in accession_to_cluster:
            continue
        r_cluster = accession_to_cluster[r_id]
        if r_cluster in defense_clusters: # remove systems that are co-operonic w/ cluster before counting
            cluster_systems = test_clust_defense_systems.loc[test_clust_defense_systems['cluster_id'] == r_cluster, 
                                                             'sys_id'].to_list()
            r_genome_systems = genome_systems[~genome_systems['sys_id'].isin(cluster_systems)]
        else:
            r_genome_systems = genome_systems
        r_defense = 0
        if len(r_genome_systems):
            if (r_genome_systems['hit_pos'] - i).abs().min() < neighbor_window:
                r_defense = 1
        genome_cluster_count_list.append({'cluster_id': r_cluster, 
                                          'defense_neighbor': r_defense})
    return genome_cluster_count_list

In [15]:
# Process every FASTA file serially (with a progress bar); the result holds one list per genome.
all_cluster_count_list = list(map(get_genome_cluster_counts, tqdm(faa_files, position=0)))

100%|██████████| 17454/17454 [2:12:53<00:00,  2.19it/s]  


In [16]:
# Flatten the per-genome lists into one list of records in a single linear pass.
# reduce(operator.concat, ...) builds a new, ever-longer list at every step, so
# it is quadratic in the total number of records. The isinstance guard makes the
# cell idempotent: because the flat result is assigned back to the same name, a
# re-run sees dict records instead of lists and passes them through unchanged.
all_cluster_count_list = [record
                          for item in all_cluster_count_list
                          for record in (item if isinstance(item, list) else [item])]

In [17]:
# One row per counted protein occurrence.
all_cluster_count_df = pd.DataFrame(all_cluster_count_list)
# Per cluster: the number of occurrences, the number flagged as having a defense
# neighbor, and their ratio (stored under 'defense_percent' as a 0-1 fraction).
cluster_groups = all_cluster_count_df.groupby('cluster_id')
agg_cluster_count = (
    cluster_groups
    .agg(total=('cluster_id', 'count'),
         defense_total=('defense_neighbor', 'sum'))
    .reset_index()
    .assign(defense_percent=lambda counts: counts['defense_total']/counts['total'])
)

In [18]:
# Attach the per-cluster counts to every test sequence. The assertion checks
# that the inner join left the row count unchanged.
merged_test_model_seqs = test_model_seqs.merge(right=agg_cluster_count, on='cluster_id', how='inner')
assert merged_test_model_seqs.shape[0] == test_model_seqs.shape[0]

In [19]:
# Persist the scored test table, without writing the index as a column.
merged_test_model_seqs.to_csv(
    path_or_buf='../data/interim/cluster_defense_neighbor_test_scores.csv',
    index=False,
)

In [20]:
# Show the merged test table with the per-cluster 'total', 'defense_total' and
# 'defense_percent' columns added by the merge.
merged_test_model_seqs

Unnamed: 0,protein_id,seq_id,seq,assembly,cluster_id,split,defense,gene_name,protein_context_id,total,defense_total,defense_percent
0,WP_082687150.1,0000131212b640442569537f5a84e127f7b807991f612d...,MSLRSSYESLLWRSIALAGADGSVERKMLAAVGLQFLGAVAMAAFA...,GCF_001488575,0000131212b640442569537f5a84e127f7b807991f612d...,test,False,,WP_082687150.1|NZ_LN831302.1|473518|-,4,0,0.0
1,WP_010902217.1,76091e4b264b0db4ab9012f0dd30ea94d649f6bfac0d7a...,MNITQAYKRSLWWSMDMVGATGSVERKMLTAVGLQFLAAGGMAFLT...,GCF_004799605,0000131212b640442569537f5a84e127f7b807991f612d...,test,False,,WP_010902217.1|NZ_CP038631.1|448935|-,4,0,0.0
2,WP_269785431.1,8cb035bc5a3d2c19290cff24b7ed4fc67a0e17baf17577...,MTFRGTYESLLWRSIALAGADGSVERKMLTAVGLQFASAVAMGAFA...,GCF_021233435,0000131212b640442569537f5a84e127f7b807991f612d...,test,False,,WP_269785431.1|NZ_CP089468.1|444977|-,4,0,0.0
3,WP_232570346.1,93c941d9b3e66212ecbe3a8d67356eaab319bf6237cc22...,MSIRTAYEAWLWRSISLVGADGSVEGKMLTAVGLQFAGAVAMAALA...,GCF_021233415,0000131212b640442569537f5a84e127f7b807991f612d...,test,False,,WP_232570346.1|NZ_CP089466.1|2004297|+,4,0,0.0
4,WP_059107350.1,0000443dc15d579934c57e4b2b1533f58a44a3040003e8...,MKPLERQLELQELQQLFSHLGALSQIERNVVECLLNDYKPREIAAQ...,GCF_016028295,0000443dc15d579934c57e4b2b1533f58a44a3040003e8...,test,False,,WP_059107350.1|NZ_CP065712.1|372047|+,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
176366,WP_011012457.1,846ac31093fb1aeda44ad9f98f06dced408fb0d38eaba0...,MMKFCPNCGSLMTSRETPRGTIFVCRGCEMEILRTEDIETIIKLPL...,GCF_008245085,846ac31093fb1aeda44ad9f98f06dced408fb0d38eaba0...,test,True,ShosTA__ShosT,WP_011012457.1|NZ_CP023154.1|1220687|+,1,0,0.0
176367,WP_205097683.1,019fe0d4bc415b735b1bcfa7304d13b86b3691db3f11cb...,MFKKIKINVNTLTPLYIGGTDSEIPISEYVKKTEVDSKGKSVVQLL...,GCF_016908145,019fe0d4bc415b735b1bcfa7304d13b86b3691db3f11cb...,test,True,Cas__csm5gr7_III-A_3,WP_205097683.1|NZ_JAFBDI010000001.1|103574|-,1,0,0.0
176368,WP_243119700.1,4612b203f1a4d9cc09927511a068c31ecb6adaa22cfdb4...,MPAYWYWEFFENWLIEPQDGEIQLFKLGHNGPVQEARMHASIRPDT...,GCF_004369225,4612b203f1a4d9cc09927511a068c31ecb6adaa22cfdb4...,test,True,Cas__cmr3gr5_III-B_III-C_8,WP_243119700.1|NZ_QFFZ01000003.1|157157|-,1,0,0.0
176369,WP_187302819.1,bb26bb85d04cad15da2cecfa25aeedd26d7c7542b37eff...,MKHFLETHQVKLTTIGPVFIGSGELLMKKEYILDRKKRRVSIVNPN...,GCF_014333425,bb26bb85d04cad15da2cecfa25aeedd26d7c7542b37eff...,test,True,Cas__csm5gr7_III-A_2,WP_187302819.1|NZ_JACRYT010000006.1|82334|+,1,1,1.0
