In [29]:
# Import the necessary libraries
import re
from Bio import SeqIO
import random
import json

In [30]:
# Seed the global RNG before any shuffling so that a fresh
# "Restart Kernel -> Run All" reproduces the same record order, the same
# E. coli / yeast pairing and the same fragment order in the output file.
SEED = 42
random.seed(SEED)

data_dir = "../data/ecoli_yeast.fasta"  # path to the input FASTA file (a file, despite the name)
output_dir = "../data/fragmented_ecoli_yeast.jsonl"  # path to the output JSONL file (a file, despite the name)

# Load every record into memory so the list can be shuffled in place.
records = list(SeqIO.parse(data_dir, "fasta"))
random.shuffle(records)  # randomise record order; the filtered lists below keep this order

# Split the records by organism using the species name in the FASTA description.
ecoli = [r for r in records if "Escherichia coli" in r.description]
yeast = [r for r in records if "Saccharomyces cerevisiae" in r.description]

In [31]:
def trypsin_digest(sequence):
    """Simulate a tryptic digest of a protein sequence.

    A cut is placed immediately after every K or R residue, except when the
    next residue is P.

    Args:
        sequence: Protein sequence as a string of one-letter residue codes.

    Returns:
        List of non-empty peptide strings in their original order; joining
        them reproduces ``sequence``. An empty input yields an empty list.
    """
    # Zero-width pattern: matches the position after K/R that is not before P.
    cut_sites = [site.end() for site in re.finditer(r"(?<=[KR])(?!P)", sequence)]

    # Bracket the cut offsets with the sequence start and end, then slice
    # between consecutive bounds.
    bounds = [0, *cut_sites, len(sequence)]

    # A cut at the very end of the sequence (C-terminal K/R) would give an
    # empty trailing slice; `left < right` drops it.
    return [
        sequence[left:right]
        for left, right in zip(bounds, bounds[1:])
        if left < right
    ]

In [32]:
# Digest each paired E. coli / yeast protein and pool the resulting peptides.
# zip() stops at the shorter of the two lists, so surplus records of the
# larger organism set are not used.
fragmented_sequences = []
for ecoli_record, yeast_record in zip(ecoli, yeast):
    pooled_peptides = [
        *trypsin_digest(str(ecoli_record.seq)),
        *trypsin_digest(str(yeast_record.seq)),
    ]
    # Mix the two proteins' peptides so their order no longer reveals the source.
    random.shuffle(pooled_peptides)
    fragmented_sequences.append((ecoli_record.id, yeast_record.id, pooled_peptides))
print(fragmented_sequences[0])

('sp|A7ZJV1|IF1_ECO24', 'sp|A6ZQH4|CIS3_YEAS7', ['AK', 'IGSIVANR', 'K', 'NYIR', 'NSGTLELTLK', 'ILTGDK', 'DSSCK', 'DVISQIGDGQVQATSAAATDSQVQASSTATPTSSEK', 'DGVLTDAK', 'TSSTNATSSSCATPSLK', 'IVFR', 'GR', 'ISSSASK', 'GR', 'SR', 'QFQFDGPPPQAGAIYAAGWSITEDGYLALGDSDVFYQCLSGNFYNLYDQNVAEQCSAIHLEAVSLVDC', 'VTVELTPYDLSK', 'NVALAASVAALSATASAEGYTPGEPWSTLTPTGSISCGAAEYTTTFGIAVQAITSSK', 'R', 'EDNIEMQGTVLETLPNTMFR', 'VELENGHVVTAHISGK', 'MR', 'MAK', 'MQFK'])


In [33]:
# Write the results as JSONL: one JSON object per line, one line per
# (E. coli, yeast) protein pair, with keys ecoli_id, yeast_id and fragments.
with open(output_dir, "w") as jsonl_file:
    jsonl_file.writelines(
        json.dumps(
            {
                "ecoli_id": pair_ecoli_id,
                "yeast_id": pair_yeast_id,
                "fragments": pair_peptides,
            }
        )
        + "\n"
        for pair_ecoli_id, pair_yeast_id, pair_peptides in fragmented_sequences
    )