In [37]:
from Bio import SeqIO
import random
import json

In [38]:
# NOTE: despite the `_dir` suffix, both names hold *file* paths, not directories.
# The names are kept because other cells read them.
data_dir = "../data/ecoli_yeast.fasta"  # path to the input FASTA file
output_dir = "../data/tangled_ecoli_yeast.jsonl"  # path to the output JSONL file
SEED = 42  # fixed seed: the shuffle below decides which records get paired later

# Read every FASTA record into memory so the list can be shuffled in place.
records = list(SeqIO.parse(data_dir, "fasta"))

# Shuffle with a dedicated, seeded generator so a fresh "Restart & Run All"
# reproduces the same record order without touching the global RNG state.
random.Random(SEED).shuffle(records)

# Split by organism name found in the FASTA description line; each list keeps
# the shuffled relative order of its records.
ecoli = [r for r in records if "Escherichia coli" in r.description]
yeast = [r for r in records if "Saccharomyces cerevisiae" in r.description]

In [39]:
def equal_chunks(seq, splits):
    """Splits a sequence into approximately equal chunks."""
    n = len(seq)
    chunk_size = n // splits
    remainder = n % splits
    chunks = []
    start = 0
    for i in range(splits):
        end = start + chunk_size
        if i < remainder: # distribute the remainder evenly
            end += 1
        chunks.append(seq[start:end])
        start = end
    return chunks

def simple_tangle(seq1, seq2, splits):
    """Interleave two sequences chunk by chunk.

    Each sequence is cut into ``splits`` near-equal chunks with
    ``equal_chunks``; the result is built as
    ``seq1[chunk 0] + seq2[chunk 0] + seq1[chunk 1] + seq2[chunk 1] + ...``.

    Args:
        seq1: First sequence; its chunk leads every pair.
        seq2: Second sequence; its chunk follows the matching chunk of ``seq1``.
        splits: Number of chunks each sequence is cut into.

    Returns:
        The interleaved sequence, accumulated starting from an empty ``str``.
    """
    woven = ""
    # Both chunk lists have the same length, so zip pairs up every chunk.
    for piece_a, piece_b in zip(equal_chunks(seq1, splits), equal_chunks(seq2, splits)):
        woven += piece_a + piece_b
    return woven

In [40]:
# Pair the i-th E. coli record with the i-th yeast record and tangle them.
# zip() stops at the shorter list, so surplus records of the larger set are
# left unpaired.
tangled_sequences = []
for r1, r2 in zip(ecoli, yeast):
    # Each entry is (ecoli_id, yeast_id, interleaved sequence built from 10 chunks).
    tangled_sequences.append(
        (r1.id, r2.id, simple_tangle(str(r1.seq), str(r2.seq), splits=10))
    )

# Preview the first tangled pair as a sanity check.
print(tangled_sequences[0])

('sp|B7NFJ2|FDHE_ECOLU', 'sp|P47075|VTC4_YEAST', 'MSIRIIPQDELGSSEKRTADMIPPLLFPRLKMKFGEHLSKSLIRQYSYYYISYDDLKTELEDNLSKNNGQWTQELETDFLESLEIELDKVYTFCKVKHSEVFRRNLYNRRAERLRELAENNPLGDYLRFAALIAHVKEVQEQVQHTVRLLDSNNPPTQLDFEILEEELSDIIADVHDLAKFSRLNYTGFQKIIKKHDKKTGFILKPVAQEVVLYDHPLEMDLTTRIKEASAQGKPPLDFQVRLDSKPFFKENYDELVVKISQLYDIARTSGRPIKGDSSAGGKQQNFVRQTTKYWVHPDNITELKLIILKIHVLPRDKHWQKLLMALIAELKPEMSGPALAHLPVLVFNTNKEFEREDSAITSIYFDNENLDLYYGRLRKDEGAEAHRLRWYGGMSTDTIFVERKTHREDWTGVIENLEKASTQELEDMASALFASDFSSVSSDEKSVKARFALKERHVNDFLKGKYTVDQVFAKMRKEGKKPMNEIENLEALASEIQYVMLKKKLRPVVRSFYNRKAPFIWAALSLYWAQMANLIPGKARAEYGEQTAFQLPGDARVRISLDTELTMVREDNFDGVDRTHKNWRRTDIGVDWPFKQLDDKDICRFPYAVLEVKLQTQLRQYCPVCGSMPVSSMVQIGTTQGLRYLHCNLGQEPPEWVRELVGSHLVEPVPKFSKFIHGVATLLNDKVDSIPFWLPQMDVDIRKPPLPTNIEITRPGRSDNECETEWHVVRVKCSNCEQSGKLHYWSLDDEQADNDFDEDDEDDAALVAAMTNAPGNSLDIEESVGYGATSAPTSNTNHVVESANAAYYQRKIRNAENPISKKYYAIKAESCDDCGTYLKILYQEKEPKVEAVADDEIVAFFDHYFNGDQISKIPKGTTFDTQIRAPPGKTICVPVRVEPKVYFATERTYLSWLSISILLGGVSTTLLLASLVLDARMEQEGYARSSINP

In [41]:
# Persist the tangled pairs as JSON Lines: one JSON object per line with the
# keys "ecoli_id", "yeast_id" and "tangled_sequence".
with open(output_dir, "w") as f:
    for ecoli_id, yeast_id, tangled_seq in tangled_sequences:
        record = dict(
            ecoli_id=ecoli_id,
            yeast_id=yeast_id,
            tangled_sequence=tangled_seq,
        )
        # print() appends the trailing newline that terminates each JSONL record.
        print(json.dumps(record), file=f)