In [40]:
# Import the necessary libraries
import re
from Bio import SeqIO
import random
import json

In [41]:
data_dir = "../data/ecoli_yeast.fasta"  # path to the input FASTA file
output_ecoli_dir = "../data/fragmented_ecoli.jsonl"  # path to the ecoli output
output_yeast_dir = "../data/fragmented_yeast.jsonl"  # path to the yeast output
output_mixture_dir = "../data/fragmented_mixture.jsonl"  # path to the mixture output
records = list(SeqIO.parse(data_dir, "fasta")) # parse the FASTA file and create a list of records
random.shuffle(records) # shuffle the records for random sampling
ecoli = [r for r in records if "Escherichia coli" in r.description]
yeast = [r for r in records if "Saccharomyces cerevisiae" in r.description]

In [42]:
def trypsin_digest(sequence):
    """Simulates trypsin digestion of a protein sequence."""
    cleavage_pattern = r"(?<=[KR])(?!P)" # cuts after K or R unless followed by P
    fragments = re.split(cleavage_pattern, sequence) # split the sequence based on the cleavage pattern
    return [f for f in fragments if len(f) > 0]

In [None]:
# Fragment ecoli and yeast separately, and a mixture of them
fragmented_ecoli = []
fragmented_yeast = []
fragmented_mixture = []

for r in ecoli:
    fragments = trypsin_digest(str(r.seq))
    random.shuffle(fragments) # shuffle for realism
    fragmented_ecoli.append({"ecoli_id": r.id, "fragments": fragments})

for r in yeast:
    fragments = trypsin_digest(str(r.seq))
    random.shuffle(fragments) # shuffle for realism
    fragmented_yeast.append({"yeast_id": r.id, "fragments": fragments})

for r1, r2 in zip(ecoli, yeast):
    mixture = trypsin_digest(str(r1.seq)) + trypsin_digest(str(r2.seq))
    random.shuffle(mixture) # shuffle for realism
    fragmented_mixture.append({"ecoli_id": r1.id, "yeast_id": r2.id, "fragments": mixture})

print(f"Ecoli: {len(fragmented_ecoli)}, Yeast: {len(fragmented_yeast)}, Mixture: {len(fragmented_mixture)}")
print(f"Sample Ecoli Record: {fragmented_ecoli[0]}")
print(f"Sample Yeast Record: {fragmented_yeast[0]}")
print(f"Sample Mixture Record: {fragmented_mixture[0]}")

Ecoli: 23285, Yeast: 7924, Mixture: 7924
Sample Ecoli Record: {'ecoli_id': 'sp|B7NPL8|SYM_ECO7I', 'fragments': ['HTIMVANLAPR', 'VNPFK', 'GDSVSFDEYWK', 'EFGK', 'GMFLPDR', 'ALYNR', 'YFYVWLDAPIGYMGSFK', 'DIFLLSPDAGAKPGHQVK', 'YQR', 'VDLR', 'VLMTYLKPVLPK', 'SAYPDPQALIGR', 'FGISEGMVMAAGPGGK', 'EIMALADLANR', 'KPTNLFVHGYVTVNGAK', 'QVEALVEASK', 'GTFIK', 'LTER', 'ASTWLNHFDADSLR', 'GTCPK', 'MSK', 'K', 'LLR', 'TFTDAAEVIGEAWESR', 'R', 'YVDEQAPWVVAK', 'DAPYFGFEIPNAPGK', 'SR', 'AEAFLNTELTWDGIQQPLLGHK', 'DADLQAICSMGINLFR', 'VALIENAEFVEGSDK', 'FDGVLASELADPQLYK', 'DIVYFHSLFWPAMLEGSNFR', 'IDDIDLNLEDFVQR', 'IDMK', 'DSTAELYHFIGK', 'NLCDK', 'VNADIVNK', 'DSEHFFFDLPSFSEMLQAWTR', 'LK', 'FVK', 'ENGFIK', 'LTLDLGGEK', 'LSSR', 'GHEVNFICADDAHGTPIMLK', 'K', 'R', 'NAGFINK', 'AAAAPVTGPLADDPIQETITFDDFAK', 'MR', 'AVR', 'NR', 'QLSELIYSR', 'TISQLYDPEK', 'VVNLASR', 'MQEWFESGLQQWDISR', 'K', 'AQQLGITPEQMIGEMSQEHQTDFAGFNISYDNYHSTHSEENR', 'SGALQEQVANK', 'MR', 'MTQVAK', 'EEVK', 'SPDQYGDNCEVCGATYSPTELIEPK', 'R', 'CK', 'SVVSGATP

In [None]:
# Save all three outputs as JSONL files
for path, data in [
    (output_ecoli_dir, fragmented_ecoli),
    (output_yeast_dir, fragmented_yeast),
    (output_mixture_dir, fragmented_mixture),
]:
    with open(path, "w") as f:
        for record in data:
            f.write(json.dumps(record) + "\n")
    print(f"Wrote {len(data)} records to {path}")

Wrote 23285 records to ../data/fragmented_ecoli.jsonl
Wrote 7924 records to ../data/fragmented_yeast.jsonl
Wrote 7924 records to ../data/fragmented_mixture.jsonl
