In [None]:
import hashlib
import os
import shutil
import subprocess
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm

In [8]:
def unzip_file(file):
    """Decompress ``file`` in place by running the external ``gunzip`` tool.

    The path is handed to ``gunzip`` as a single argv entry (after ``--``)
    instead of being spliced into a shell command string, so paths containing
    spaces, quotes or other shell metacharacters are taken literally.

    Parameters
    ----------
    file : str
        Path of the gzip-compressed file to decompress.

    Returns
    -------
    int
        Exit status of ``gunzip`` (0 on success). A non-zero status is not
        raised, so a single bad archive does not abort a parallel batch;
        callers that care can inspect the returned value.

    Raises
    ------
    FileNotFoundError
        If no ``gunzip`` executable can be found on PATH.
    """
    # '--' ends option parsing so a file name starting with '-' is not read as a flag.
    return subprocess.run(['gunzip', '--', file]).returncode


def encode_protein(p):
    """Return the hexadecimal SHA-224 digest of the string ``p`` (UTF-8 encoded).

    The digest is used as a content-derived identifier: equal strings always
    produce the same value.
    """
    hasher = hashlib.sha224()
    hasher.update(p.encode('utf-8'))
    return hasher.hexdigest()


def process_seq(seq, name, assembly, f):
    """Write one comma-separated record for a sequence to the open handle ``f``.

    The row layout is ``name,seq_id,seq,assembly`` where ``seq_id`` is
    ``encode_protein(seq)``. Fields are joined with bare commas; no quoting or
    escaping is applied, so every field must itself be comma-free.
    """
    row = (name, encode_protein(seq), seq, assembly)
    print(','.join(row), file=f)


def process_faa_file(faa_dir, faa_file, out_dir):
    """Convert one protein FASTA file into a per-assembly CSV of its records.

    Each FASTA record with a non-empty sequence becomes one row written through
    ``process_seq`` into ``<out_dir>/<assembly>.csv``.

    Parameters
    ----------
    faa_dir : str
        Directory containing ``faa_file``.
    faa_file : str
        File name of the FASTA file. The assembly label is everything before
        the first ``.`` in this name.
    out_dir : str
        Directory the CSV is written into (must already exist).

    Notes
    -----
    * A record starts at a line *beginning* with ``>``; its name is the first
      whitespace-delimited token after the ``>``, so a header without a
      description no longer carries its trailing newline into the CSV.
    * Records whose sequence is empty are skipped, including a trailing one.
    * Sequence lines appearing before the first header are ignored, and an
      empty input file yields an empty CSV instead of raising.
    """
    assembly = faa_file.split('.')[0]
    with open(os.path.join(out_dir, assembly + '.csv'), 'w') as f:
        # Context manager so the input handle is closed deterministically.
        with open(os.path.join(faa_dir, faa_file), 'r') as faa:
            name = None  # None until the first header line has been seen
            seq = ''
            for line in faa:
                if line.startswith('>'):
                    # A new header terminates the record collected so far.
                    if seq and name is not None:
                        process_seq(seq, name, assembly, f)
                    seq = ''
                    header_fields = line[1:].split()
                    name = header_fields[0] if header_fields else ''
                else:
                    seq += line.strip()
            # Flush the final record, which has no following header to trigger it.
            if seq and name is not None:
                process_seq(seq, name, assembly, f)


def process_unique_split(out_dir, unique_seq_df, i):
    """Write the rows of ``unique_seq_df`` as FASTA to ``<out_dir><i>.faa``.

    For every row, a ``>`` header line built from the ``seq_id`` column is
    followed by a line holding the ``seq`` column. ``out_dir`` is prepended
    verbatim, so it must already end with a path separator.
    """
    fasta_path = ''.join([out_dir, str(i), '.faa'])
    with open(fasta_path, 'w') as fasta_out:
        for _, record in unique_seq_df.iterrows():
            header = '>' + record['seq_id']
            print(header, file=fasta_out)
            print(record['seq'], file=fasta_out)

In [3]:
# Input: directory holding the protein FASTA files (gzipped or already unzipped).
faa_dir = '../data/ecoli/raw/faa/'
# Output: directory that receives one CSV per assembly (see process_faa_file).
seq_assembly_dir = '../data/ecoli/interim/seq_assemblies/'

In [4]:
print('Unzipping faa files')
# Select by suffix, not substring: only names that actually end in '.gz' are
# archives gunzip can handle (a substring test would also pick up e.g. 'x.gz.bak').
faa_files = [file for file in os.listdir(faa_dir) if file.endswith('.gz')]
# Decompress in parallel; each job unzips one archive in place.
_ = Parallel(n_jobs=48)(delayed(unzip_file)(faa_dir + f) for f in tqdm(faa_files))

Unzipping faa files


100%|██████████| 3000/3000 [00:04<00:00, 717.00it/s]


In [6]:
import os
from joblib import Parallel, delayed
from tqdm import tqdm

In [9]:
print('Unzipping ft files')
ft_dir = '../data/ecoli/raw/ft/'
# Select by suffix, not substring: only names that actually end in '.gz' are
# archives gunzip can handle (a substring test would also pick up e.g. 'x.gz.bak').
ft_files = [file for file in os.listdir(ft_dir) if file.endswith('.gz')]
# Decompress in parallel; each job unzips one archive in place.
_ = Parallel(n_jobs=48)(delayed(unzip_file)(ft_dir + f) for f in tqdm(ft_files))

Unzipping ft files



  0%|          | 0/3000 [00:00<?, ?it/s][A
  2%|▏         | 48/3000 [00:00<00:25, 116.88it/s][A
  3%|▎         | 96/3000 [00:00<00:18, 156.27it/s][A
  5%|▍         | 144/3000 [00:00<00:13, 204.57it/s][A
  6%|▋         | 192/3000 [00:00<00:11, 251.71it/s][A
 10%|▉         | 288/3000 [00:01<00:07, 352.79it/s][A
 13%|█▎        | 384/3000 [00:01<00:05, 472.03it/s][A
 16%|█▌        | 480/3000 [00:01<00:04, 536.32it/s][A
 19%|█▉        | 576/3000 [00:01<00:04, 585.92it/s][A
 22%|██▏       | 672/3000 [00:01<00:04, 569.83it/s][A
 26%|██▌       | 768/3000 [00:01<00:04, 555.95it/s][A
 29%|██▉       | 864/3000 [00:02<00:04, 528.61it/s][A
 32%|███▏      | 960/3000 [00:02<00:03, 536.43it/s][A
 35%|███▌      | 1056/3000 [00:02<00:03, 557.48it/s][A
 38%|███▊      | 1152/3000 [00:02<00:03, 568.85it/s][A
 42%|████▏     | 1248/3000 [00:02<00:03, 557.90it/s][A
 45%|████▍     | 1344/3000 [00:02<00:03, 503.37it/s][A
 48%|████▊     | 1440/3000 [00:03<00:02, 527.94it/s][A
 51%|█████     | 

In [10]:
print('Unzipping fna files')
fna_dir = '../data/ecoli/raw/fna/'
# Select by suffix, not substring: only names that actually end in '.gz' are
# archives gunzip can handle (a substring test would also pick up e.g. 'x.gz.bak').
fna_files = [file for file in os.listdir(fna_dir) if file.endswith('.gz')]
# Decompress in parallel; each job unzips one archive in place.
_ = Parallel(n_jobs=48)(delayed(unzip_file)(fna_dir + f) for f in tqdm(fna_files))

Unzipping fna files



  0%|          | 0/3000 [00:00<?, ?it/s][A
  3%|▎         | 96/3000 [00:00<00:04, 590.80it/s][A
  6%|▋         | 192/3000 [00:00<00:05, 547.69it/s][A
 10%|▉         | 288/3000 [00:00<00:06, 413.82it/s][A
 13%|█▎        | 384/3000 [00:01<00:08, 302.08it/s][A
 16%|█▌        | 480/3000 [00:01<00:08, 281.19it/s][A
 19%|█▉        | 576/3000 [00:01<00:09, 267.02it/s][A
 22%|██▏       | 672/3000 [00:02<00:09, 255.65it/s][A
 26%|██▌       | 768/3000 [00:02<00:09, 243.48it/s][A
 29%|██▉       | 864/3000 [00:03<00:09, 220.80it/s][A
 32%|███▏      | 960/3000 [00:03<00:09, 221.61it/s][A
 35%|███▌      | 1056/3000 [00:04<00:08, 223.04it/s][A
 38%|███▊      | 1152/3000 [00:04<00:08, 214.08it/s][A
 42%|████▏     | 1248/3000 [00:05<00:08, 218.04it/s][A
 45%|████▍     | 1344/3000 [00:05<00:07, 230.36it/s][A
 48%|████▊     | 1440/3000 [00:05<00:06, 231.90it/s][A
 51%|█████     | 1536/3000 [00:06<00:06, 243.09it/s][A
 54%|█████▍    | 1632/3000 [00:06<00:05, 246.93it/s][A
 58%|█████▊   

In [5]:
# Remove output left over from a previous run so stale per-assembly CSVs cannot
# leak into the merge. The check uses seq_assembly_dir itself (rather than a
# hard-coded directory name) and the removal is done in-process, without a shell.
if os.path.isdir(seq_assembly_dir):
    shutil.rmtree(seq_assembly_dir)

In [6]:
print('Deduplicating fasta files')
# Suffix match: a substring test on '.faa' would also select '*.faa.gz' archives
# that failed to unzip, and those would then be parsed as text.
faa_files = [file for file in os.listdir(faa_dir) if file.endswith('.faa')]
# Create the output directory in-process; an already existing one is fine.
os.makedirs(seq_assembly_dir, exist_ok=True)
# One job per FASTA file; each writes its own <assembly>.csv into seq_assembly_dir.
_ = Parallel(n_jobs=48)(delayed(process_faa_file)(faa_dir, faa_file, seq_assembly_dir) for faa_file in tqdm(faa_files))


Deduplicating fasta files


100%|██████████| 3000/3000 [00:03<00:00, 873.67it/s] 


In [7]:
# Single CSV that all per-assembly CSVs are concatenated into.
seq_assemblies_f = '../data/ecoli/interim/seq_assemblies.csv'

In [8]:
# os.listdir returns entries in arbitrary order; sorting fixes the row order of
# the merged CSV (and of everything derived from it) across runs and machines.
seq_assembly_fs = sorted(f for f in os.listdir(seq_assembly_dir) if f.endswith('.csv'))

In [9]:
# Drop a merged CSV left over from an earlier run. The test is made on the
# target path itself, and the file is removed in-process rather than via a shell.
if os.path.isfile(seq_assemblies_f):
    os.remove(seq_assemblies_f)

In [10]:
def concatenate_file(f):
    """Return the complete raw bytes of file ``f`` located in ``seq_assembly_dir``."""
    csv_path = os.path.join(seq_assembly_dir, f)
    with open(csv_path, mode='rb') as handle:
        payload = handle.read()
    return payload

In [11]:
# Files are read concurrently by the thread pool, but executor.map yields the
# results in input order, so the chunks are appended in seq_assembly_fs order.
with open(seq_assemblies_f, 'wb') as merged_csv, ThreadPoolExecutor(max_workers=48) as pool:
    chunks = pool.map(concatenate_file, seq_assembly_fs)
    for chunk in tqdm(chunks, total=len(seq_assembly_fs)):
        merged_csv.write(chunk)

100%|██████████| 3000/3000 [00:05<00:00, 502.87it/s]


In [12]:
# The per-assembly CSVs have been merged into seq_assemblies_f; delete the directory.
!rm -r {seq_assembly_dir}

In [13]:
%%time
# The merged CSV has no header row, so column names are supplied here; they
# follow the field order written by process_seq (name, seq_id, seq, assembly).
seq_assemblies_df = pd.read_csv(seq_assemblies_f, 
                                names=['product_accession', 'seq_id', 'seq', 'assembly'])

CPU times: user 46.4 s, sys: 2.74 s, total: 49.1 s
Wall time: 49.2 s


In [14]:
# One row per distinct (product_accession, assembly) pair.
unique_accession_assemblies = seq_assemblies_df[['product_accession', 'assembly']].drop_duplicates()

In [15]:
# Sanity check: number of distinct assemblies represented (3000 in the recorded run).
unique_accession_assemblies['assembly'].nunique()

3000

In [16]:
# Persist the accession/assembly pairs as Parquet, without the DataFrame index.
unique_accession_assemblies.to_parquet('../data/ecoli/interim/unique_accession_assemblies.pq', index=False)

In [17]:
# One row per distinct (product_accession, seq_id, seq) triple; the assembly
# column is dropped, so a protein seen in many assemblies appears once.
unique_seq_assemblies = seq_assemblies_df[['product_accession', 'seq_id', 'seq']].drop_duplicates()

In [18]:
# Persist the unique sequence table as Parquet, without the DataFrame index.
unique_seq_assemblies.to_parquet('../data/ecoli/interim/unique_seqs.pq', index=False)

In [None]:
# Preview only the first rows; the full table has on the order of a million rows.
unique_seq_assemblies.head()

In [19]:
# (rows, columns) of the unique sequence table; (1070803, 3) in the recorded run.
unique_seq_assemblies.shape

(1070803, 3)

In [5]:
# FASTA file written from unique_seq_assemblies; it is the input to `mmseqs createdb`.
unique_seq_fa = '../data/ecoli/interim/unique_seqs.faa'

In [6]:
# Residue letters rewritten before export: X is dropped, U -> C, B -> N, J -> L.
# NOTE(review): presumably these normalise ambiguous/rare residue codes for the
# downstream tools -- confirm. Dropping X shortens the sequence, so the exported
# sequence can differ from the one seq_id was hashed from.
residue_fixes = str.maketrans('UBJ', 'CNL', 'X')

with open(unique_seq_fa, 'w') as f:
    # Iterate the two needed columns directly; this avoids building a Series
    # per row (as iterrows does) for the ~1M records.
    records = zip(unique_seq_assemblies['product_accession'], unique_seq_assemblies['seq'])
    for accession, seq in tqdm(records, total=len(unique_seq_assemblies)):
        print('>' + accession, file=f)
        # One translate call applies all four substitutions in a single pass.
        print(seq.translate(residue_fixes), file=f)

100%|██████████| 1070803/1070803 [00:54<00:00, 19797.20it/s]


In [23]:
# Recreate an empty working directory for the MMseqs2 databases. Removal is done
# in-process (no shell), and the existence check is made on the directory itself.
mmseqs_dir = '../data/ecoli/interim/mmseqs/'
if os.path.isdir(mmseqs_dir):
    shutil.rmtree(mmseqs_dir)
os.makedirs(mmseqs_dir)

In [24]:
# Path of the MMseqs2 sequence database built from unique_seq_fa.
unique_seq_db = '../data/ecoli/interim/mmseqs/unique_seqs.db'

In [25]:
%%time
# Build the MMseqs2 database from the FASTA file, run inside the `beaker` conda
# environment. `--dbtype 1` is reported by mmseqs as "Database type: Aminoacid".
!conda run -n beaker \
mmseqs createdb {unique_seq_fa} {unique_seq_db} \
--dbtype 1

createdb ../data/ecoli/interim/unique_seqs.faa ../data/ecoli/interim/mmseqs/unique_seqs.db --dbtype 1 

MMseqs Version:       	13.45111
Database type         	1
Shuffle input database	true
Createdb mode         	0
Write lookup file     	1
Offset of numeric ids 	0
Compressed            	0
Verbosity             	3

Converting sequences
Time for merging to unique_seqs.db_h: 0h 0m 0s 838ms
Time for merging to unique_seqs.db: 0h 0m 0s 682ms
Database type: Aminoacid
Time for processing: 0h 0m 6s 594ms

CPU times: user 91.6 ms, sys: 122 ms, total: 214 ms
Wall time: 8.82 s


In [26]:
%%time
# Index the sequence database. No --threads is passed here; the recorded run
# reports "Threads 96". NOTE(review): ../data/tmp is presumably the scratch
# directory for mmseqs temporary files -- confirm it exists before running.
!conda run -n beaker mmseqs createindex {unique_seq_db}  ../data/tmp

createindex ../data/ecoli/interim/mmseqs/unique_seqs.db ../data/tmp 

MMseqs Version:          	13.45111
Seed substitution matrix 	nucl:nucleotide.out,aa:VTML80.out
k-mer length             	0
Alphabet size            	nucl:5,aa:21
Compositional bias       	1
Max sequence length      	65535
Max results per query    	300
Mask residues            	1
Mask lower case residues 	0
Spaced k-mers            	1
Spaced k-mer pattern     	
Sensitivity              	7.5
k-score                  	0
Check compatible         	0
Search type              	0
Split database           	0
Split memory limit       	0
Verbosity                	3
Threads                  	96
Min codons in orf        	30
Max codons in length     	32734
Max orf gaps             	2147483647
Contig start mode        	2
Contig end mode          	2
Orf start mode           	1
Forward frames           	1,2,3
Reverse frames           	1,2,3
Translation table        	1
Translate orf            	0
Use all table starts     	false
Offset

In [27]:
# Parameters interpolated into the `mmseqs cluster` command.
cov = 0.8  # passed as -c; mmseqs reports it as "Coverage threshold"
min_seq_id = 0.3  # passed as --min-seq-id
threads = 48  # passed as --threads
cluster_mode = 1  # passed as --cluster-mode; NOTE(review): confirm which algorithm mode 1 selects

In [28]:
# Path of the MMseqs2 clustering result database produced by `mmseqs cluster`.
clust_db = '../data/ecoli/interim/mmseqs/unique_seqs_clust.db'

In [30]:
%%time 
# Cluster the unique sequences, writing the result to clust_db and using
# ../data/tmp as the third positional argument (NOTE(review): presumably the
# mmseqs scratch directory -- confirm). Thresholds come from the parameter cell.
!conda run -n beaker \
mmseqs cluster {unique_seq_db} {clust_db} ../data/tmp \
-c {cov} --min-seq-id {min_seq_id} --threads {threads} --cluster-mode {cluster_mode}

cluster ../data/ecoli/interim/mmseqs/unique_seqs.db ../data/ecoli/interim/mmseqs/unique_seqs_clust.db ../data/tmp -c 0.8 --min-seq-id 0.3 --threads 48 --cluster-mode 1 

MMseqs Version:                     	13.45111
Substitution matrix                 	nucl:nucleotide.out,aa:blosum62.out
Seed substitution matrix            	nucl:nucleotide.out,aa:VTML80.out
Sensitivity                         	4
k-mer length                        	0
k-score                             	2147483647
Alphabet size                       	nucl:5,aa:21
Max sequence length                 	65535
Max results per query               	20
Split database                      	0
Split mode                          	2
Split memory limit                  	0
Coverage threshold                  	0.8
Coverage mode                       	0
Compositional bias                  	1
Diagonal scoring                    	true
Exact k-mer matching                	0
Mask residues                       	1
Mask lower case residues 

In [31]:
# Tab-separated text export of the clustering result (written by `mmseqs createtsv`).
clust_out = '../data/ecoli/interim/seq_clusters.txt'

In [32]:
%%time
# Export the clustering database to a TSV file. The sequence database is passed
# twice because the same database serves as both query and target.
!conda run -n beaker \
mmseqs createtsv {unique_seq_db} {unique_seq_db} {clust_db} {clust_out}

createtsv ../data/ecoli/interim/mmseqs/unique_seqs.db ../data/ecoli/interim/mmseqs/unique_seqs.db ../data/ecoli/interim/mmseqs/unique_seqs_clust.db ../data/ecoli/interim/seq_clusters.txt 

MMseqs Version:                 	13.45111
First sequence as representative	false
Target column                   	1
Add full header                 	false
Sequence source                 	0
Database output                 	false
Threads                         	96
Compressed                      	0
Verbosity                       	3

Time for merging to seq_clusters.txt: 0h 0m 1s 431ms
Time for processing: 0h 0m 3s 560ms

CPU times: user 33.4 ms, sys: 131 ms, total: 164 ms
Wall time: 5.12 s


In [34]:
# Load the two-column cluster TSV (no header row, so names are supplied).
# NOTE(review): the first column is presumably the accession of each cluster's
# representative sequence and the second a member accession -- confirm against
# the MMseqs2 createtsv documentation.
cluster_df = pd.read_table(clust_out, names=['cluster_id', 'product_accession'])

In [35]:
# Number of distinct clusters (71334 in the recorded run).
cluster_df['cluster_id'].nunique()

71334