# Imports

In [7]:
from subprocess import Popen
from pathlib import Path

# Utility functions

Define utility functions.

In [12]:
def get_uuid():
    return str(uuid.uuid4())

def load_genome_sequence(filepath):
    with open(filepath, 'r') as f:
        lines = f.readlines()
    sequence = ''.join([line.strip() for line in lines if not line.startswith('>')])
    return sequence.upper()

def build_kmers(sequence, ksize):
    kmers = []
    n_kmers = len(sequence) - ksize + 1

    for i in range(n_kmers):
        kmer = sequence[i : i + ksize]
        kmers.append(kmer)

    return kmers

def jaccard_similarity(a, b):
    a = set(a)
    b = set(b)

    intersection = len(a.intersection(b))
    union = len(a.union(b))

    return intersection / union

# Reference simulation

Execute the reference simulation.

In [6]:
# Reference values

M = 0
c=str(Path("data", "random_genome_chr_index.csv"))
r=str(Path("data", "combined_curated_TE_lib_ATOSZM_selected.fasta"))
o="data"
maxidn=95
minidn=80
maxsd=20
minsd=1
a=0.7
b=0.5
i=0.001
m=5
n=1
k = 100   
batch_size = 64

p="reference_simulation"
cmd = f"tegenomesimulator -M {M} -p {p} -c {c} -r {r} -o {o} -a {a} -b {b} -i {i} -m {m} -n {n} --maxidn {maxidn} --minidn {minidn} --maxsd {maxsd} --minsd {minsd}"
Popen(cmd, shell=True).wait()

Mode: 0
Running Random Synthesized mode.
Prefix: reference_simulation
Repeat: data/combined_curated_TE_lib_ATOSZM_selected.fasta
Chromosome Index: data/random_genome_chr_index.csv
Genome File: None
Alpha: 0.7
Beta: 0.5
Max Copies: 5
Min Copies: 1
Upper bound of mean identity: 95
Lower bound of mean identity: 80
Upper bound of sd of mean identity: 20
Lower bound of sd of mean ideneity: 1
Max chance of intact insertion: 0.001
Seed: 1
Output Directory: data

TE library table generated successfully. Output logged to data/TEgenomeSimulator_reference_simulation_result/TEgenomeSimulator.log
mode=0, running prep_yml_config.py for Random Genome Mode.

Config file generated successfully. Output logged to data/TEgenomeSimulator_reference_simulation_result/TEgenomeSimulator.log

Genome with non-overlap random TE insertions was generated successfully. Output logged to data/TEgenomeSimulator_reference_simulation_result/TEgenomeSimulator.log

Genome with non-overlap random and nested TE insertions wa

0

Load the genome sequence data to string.

In [13]:
reference_genome_sequence_path = f"{o}/TEgenomeSimulator_{p}_result/{p}_genome_sequence_out_final.fasta"
reference_genome_sequence = load_genome_sequence(reference_genome_sequence_path)
build_kmers(reference_genome_sequence, 21)

['TGAAAAATTTTGACAGTTAAG',
 'GAAAAATTTTGACAGTTAAGC',
 'AAAAATTTTGACAGTTAAGCA',
 'AAAATTTTGACAGTTAAGCAG',
 'AAATTTTGACAGTTAAGCAGC',
 'AATTTTGACAGTTAAGCAGCC',
 'ATTTTGACAGTTAAGCAGCCA',
 'TTTTGACAGTTAAGCAGCCAA',
 'TTTGACAGTTAAGCAGCCAAA',
 'TTGACAGTTAAGCAGCCAAAC',
 'TGACAGTTAAGCAGCCAAACA',
 'GACAGTTAAGCAGCCAAACAT',
 'ACAGTTAAGCAGCCAAACATC',
 'CAGTTAAGCAGCCAAACATCT',
 'AGTTAAGCAGCCAAACATCTG',
 'GTTAAGCAGCCAAACATCTGA',
 'TTAAGCAGCCAAACATCTGAG',
 'TAAGCAGCCAAACATCTGAGC',
 'AAGCAGCCAAACATCTGAGCA',
 'AGCAGCCAAACATCTGAGCAG',
 'GCAGCCAAACATCTGAGCAGC',
 'CAGCCAAACATCTGAGCAGCG',
 'AGCCAAACATCTGAGCAGCGA',
 'GCCAAACATCTGAGCAGCGAG',
 'CCAAACATCTGAGCAGCGAGA',
 'CAAACATCTGAGCAGCGAGAT',
 'AAACATCTGAGCAGCGAGATC',
 'AACATCTGAGCAGCGAGATCA',
 'ACATCTGAGCAGCGAGATCAA',
 'CATCTGAGCAGCGAGATCAAA',
 'ATCTGAGCAGCGAGATCAAAA',
 'TCTGAGCAGCGAGATCAAAAG',
 'CTGAGCAGCGAGATCAAAAGA',
 'TGAGCAGCGAGATCAAAAGAA',
 'GAGCAGCGAGATCAAAAGAAT',
 'AGCAGCGAGATCAAAAGAATA',
 'GCAGCGAGATCAAAAGAATAT',
 'CAGCGAGATCAAAAGAATATA',
 'AGCGAGATCA