## Create Examples with Real Domains + Shuffled sections

In [5]:
from Bio import SeqIO
from Bio.Seq import Seq
import os
import glob
import pickle
import random
from tqdm import tqdm
import sys
# sys.path.insert(0, '../library')
import hmmscan_utils as hu

# Fixed seed so the shuffled output is reproducible from run to run.
random.seed(100)

# Load maps
# NOTE: pickle.load can execute arbitrary code; only use a trusted maps.pkl.
with open("../data/maps.pkl", "rb") as f:
    maps = pickle.load(f)

# Directory containing the fasta files
pid = 40
dir_path = f"../data/test_fasta_{pid}"
out_path = f"../data/test_fasta_{pid}_shuffled"
# Create the output directory up front; otherwise the first write below
# fails with FileNotFoundError on a fresh checkout.
os.makedirs(out_path, exist_ok=True)

# Clan index whose positions are shuffled.
# NOTE(review): presumably 656 is the index `maps` assigns to the
# "no domain" clan -- confirm against maps.pkl.
target_clan = 656

# Iterate over each fasta file in the directory
for fasta_file in tqdm(glob.glob(os.path.join(dir_path, 'split_*_test_ids_full.fasta'))):
    # Get the corresponding scan (basename is portable across path separators)
    file_suffix = os.path.basename(fasta_file)
    hmmscan_dict = hu.parse_hmmscan_results(f"../data/test_scan_{pid}/{file_suffix}_scan.txt")

    # Load every record into memory: records are edited in place and then
    # written back out together.
    sequences = list(SeqIO.parse(fasta_file, "fasta"))

    for seq in sequences:
        # Get clan_vector
        try:
            _, clan_vector = hu.generate_domain_position_list(hmmscan_dict, seq.id, maps)
        except Exception:
            # Best effort: a record whose position list cannot be built is
            # left unshuffled (it is still written out below). Catching
            # Exception rather than using a bare except keeps Ctrl-C working.
            continue

        # Positions where the clan vector equals the target clan
        indices = [i for i, clan in enumerate(clan_vector) if clan == target_clan]

        # If there are no such positions, leave the record unchanged
        if not indices:
            continue

        # Shuffle the residues only at the selected positions; every other
        # position keeps its original residue.
        sequence = list(seq.seq)
        values = [sequence[i] for i in indices]
        random.shuffle(values)
        for index, residue in zip(indices, values):
            sequence[index] = residue
        seq.seq = Seq("".join(sequence))

    # Write the (partially) shuffled sequences to a new fasta file
    with open(os.path.join(out_path, file_suffix), "w") as f:
        SeqIO.write(sequences, f, "fasta")


100%|██████████| 50/50 [26:30<00:00, 31.81s/it]
