# Filter Pfam-A.seed

## Convert Pfam-A.seed (stockholm) to FASTA

In [None]:
# Load the Intel and OpenMPI environment modules
# (presumably required by the Easel binary used below -- TODO confirm).
module load intel/23.2.0-fasrc01 openmpi/4.1.5-fasrc03
data_dir="data"
fasta_file="Pfam_A.seed.fasta"
# Create the output directory if it is missing so the -o target below can be written.
mkdir -p "${data_dir}"
# Use esl-reformat to turn the Pfam-A.seed (stockholm format) into fasta format
/n/eddy_lab/software/bin/esl-reformat -o "${data_dir}/${fasta_file}" fasta /n/eddy_lab/data/pfam-35.0/Pfam-A.seed

## Split Pfam_A.seed.fasta into smaller FASTA files

In [None]:
import sys

# The helper module lives outside this notebook's directory, so put its
# folder at the front of the import path before importing it.
sys.path.insert(0, '../library')
import hmmscan_utils as utils

# Inputs for the split step: how many pieces to produce, which FASTA file
# to split, and the directory the data files live in.
num_jobs = 50
fasta_file = "Pfam_A.seed.fasta"
data_dir = "data"

# Presumably writes one split_{i}_<fasta_file> per job into data_dir,
# matching the file names read back later in this notebook -- confirm
# against hmmscan_utils.split_fasta_file.
utils.split_fasta_file(fasta_file, data_dir, num_jobs)

Split files will be named `split_{i}_Pfam_A.seed.fasta`

## HMM Scan the split FASTA files

In [None]:
data_dir="data"
fasta_file="Pfam_A.seed.fasta"
# Number of array tasks; must equal the number of split FASTA files (50).
# It has to be defined here because shell cells do not see Python variables;
# left unset, the array spec would expand to the invalid "1-".
num_jobs=50
# Submit one array task per split file (task ids 1..num_jobs).
# NOTE(review): hmmscan.sh presumably selects its split file from the array
# task id -- confirm in bash_scripts/hmmscan.sh.
sbatch --array=1-"${num_jobs}" ./bash_scripts/hmmscan.sh "${data_dir}" "${fasta_file}"

Output will be in the format `split_{i}_Pfam_A.seed.fasta_scan.txt`

Scans are performed with an E-value inclusion threshold of 0.01

## Find sequences that did not have any hmmscan hits

In [31]:
import sys
# The helper module lives outside this notebook's directory.
sys.path.insert(0, '../library')
import hmmscan_utils as utils
from tqdm import tqdm

data_dir = "data"
# Number of split files / hmmscan array tasks. Keep this equal to the value
# used when splitting the FASTA file, otherwise some splits are skipped or
# nonexistent files are requested.
num_jobs = 50

# Gather, across every split, the sequences reported as missing from the
# corresponding hmmscan output.
all_missing_seqs = []
for i in tqdm(range(1, num_jobs + 1)):  # split files are numbered from 1
    fasta_file = f"{data_dir}/split_{i}_Pfam_A.seed.fasta"
    hmm_scan = f"{data_dir}/split_{i}_Pfam_A.seed.fasta_scan.txt"
    missing_seqs = utils.find_missing_sequences(hmm_scan, fasta_file)
    all_missing_seqs.extend(missing_seqs)

# Write one entry per line. The output goes to the current working
# directory, not data_dir.
with open("Pfam_A.seed.no_hits.txt", "w") as file:
    for sequence in all_missing_seqs:
        file.write(sequence + "\n")

100%|██████████| 50/50 [06:52<00:00,  8.24s/it]


## Clean up split files

In [None]:
data_dir="data"
# Remove the per-split files (anything named split* inside data_dir).
# The variable must be data_dir: an undefined name would expand to the empty
# string and turn the glob into /split* at the filesystem root.
# ${data_dir:?} aborts the command if the variable is ever unset or empty.
rm -- "${data_dir:?}"/split*
