In [19]:
import Bio

The goal is to build a fine-tuning dataset that combines negative examples (sampled from shuffled SwissProt sequences) with short positive examples (the domain region only).

## Load Negative and Positive examples

In [21]:
from Bio import SeqIO

# Positive examples: parse every record of domain_train.fasta into a list.
with open("../data/train_fasta/domain_train.fasta", "r") as positives_handle:
    domain_records = [record for record in SeqIO.parse(positives_handle, "fasta")]

# Negative examples: parse every record of shuffle_train.fasta into a list.
with open("../data/train_fasta/shuffle_train.fasta", "r") as negatives_handle:
    shuffle_records = [record for record in SeqIO.parse(negatives_handle, "fasta")]


## Merge and Shuffle

In [28]:
import random
# Fixed seed so the shuffle below produces the same order on every run.
random.seed(100)

# Pool the domain records and the shuffled records into one list, then
# randomise their order in place.
merged_records = [*domain_records, *shuffle_records]
random.shuffle(merged_records)

# Give every record a distinct label: keep the first whitespace-separated token
# of its description and append the record's position in the shuffled list.
# The sequence is upper-cased as well. Note that record.id itself is not changed.
for position, record in enumerate(merged_records):
    leading_token = record.description.split()[0]
    record.description = f"{leading_token} {position}"
    record.seq = record.seq.upper()

## Shard and Write

In [3]:
import os
# Create the output directory for the fine-tuning FASTA files.
# exist_ok=True keeps this cell idempotent: os.mkdir raised FileExistsError
# whenever the notebook was re-run after the directory had been created.
os.makedirs("../data/train_fasta_finetune", exist_ok=True)

In [29]:
# Write all merged records to a single FASTA file.
full_fasta_path = "../data/train_fasta_finetune/train_ids_full.fasta"
with open(full_fasta_path, "w") as fasta_out:
    SeqIO.write(merged_records, fasta_out, "fasta")

In [30]:
import sys
sys.path.insert(0, '../library')
import hmmscan_utils as hu

# Directory holding the merged FASTA written by the previous cell.
data_dir = "../data/train_fasta_finetune"
# File name (relative to data_dir, presumably -- confirm against hmmscan_utils).
fasta_file = "train_ids_full.fasta"
# NOTE(review): presumably the number of shards to produce -- confirm.
num_jobs = 50

# Shard the merged FASTA using the project helper.
# NOTE(review): output location and shard naming are decided inside
# hu.split_fasta_file and are not visible from this notebook.
hu.split_fasta_file(fasta_file, data_dir, num_jobs)

## Create Examples with Real Domains + Shuffled sections

In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
import os
import glob
import pickle
import random
from tqdm import tqdm
import sys
sys.path.insert(0, '../library')
import hmmscan_utils as hu

# Fixed seed: every residue shuffle below draws from the global `random` RNG.
random.seed(100)

# Load maps
# NOTE: pickle.load can execute arbitrary code; only use a maps.pkl from a trusted source.
with open("../data/maps.pkl", "rb") as f:
    maps = pickle.load(f)

# Directory containing the input fasta shards, and the directory to write to.
dir_path = "../data/train_fasta"
out_path = "../data/train_fasta_finetune"

# Clan-vector value marking the positions whose residues get shuffled.
# NOTE(review): what clan index 656 denotes is not visible here -- confirm against maps.pkl.
TARGET_CLAN = 656

# IDs whose domain lookup failed; reported once at the end instead of being
# dropped without a trace.
skipped_ids = []

# sorted(): glob order depends on the file system, and one seeded RNG is consumed
# across all files, so a stable file order is required for reproducible output.
shard_paths = sorted(glob.glob(os.path.join(dir_path, 'split_*_train_ids_full.fasta')))

# NOTE(review): shards are written to out_path under their input file name. The
# earlier hu.split_fasta_file call also targets out_path; if its shard names match
# this pattern they will be overwritten here -- confirm that this is intended.
for fasta_file in tqdm(shard_paths):
    # File name only (separator-agnostic, unlike splitting on "/"); used to find
    # the matching scan results and to name the output file.
    file_suffix = os.path.basename(fasta_file)
    hmmscan_dict = hu.parse_hmmscan_results(f"../data/train_scan/{file_suffix}_scan.txt")

    # Use BioPython to load the fasta file
    sequences = list(SeqIO.parse(fasta_file, "fasta"))

    for seq in sequences:
        # Get the per-position clan vector for this sequence.
        try:
            _, clan_vector = hu.generate_domain_position_list(hmmscan_dict, seq.id, maps)
        except Exception:
            # Best effort: the record stays in `sequences` unmodified and is still
            # written out. `except Exception` (not a bare except) lets
            # KeyboardInterrupt / SystemExit stop a long run.
            skipped_ids.append(seq.id)
            continue

        # Positions where the clan vector equals the target clan.
        indices = [i for i, x in enumerate(clan_vector) if x == TARGET_CLAN]

        # Nothing to shuffle for this record.
        if not indices:
            continue

        # Permute the residues among the selected positions only; every other
        # position keeps its original residue.
        sequence = list(seq.seq)
        values = [sequence[i] for i in indices]
        random.shuffle(values)
        for index, residue in zip(indices, values):
            sequence[index] = residue
        seq.seq = Seq("".join(sequence))

    # Write the (partially shuffled) sequences to a new fasta file
    with open(f"{out_path}/{file_suffix}", "w") as f:
        SeqIO.write(sequences, f, "fasta")

if skipped_ids:
    print(f"{len(skipped_ids)} sequences left unshuffled because the domain lookup failed")


  0%|          | 0/50 [00:00<?, ?it/s]