# Imports

In [12]:
from subprocess import Popen
import subprocess
from pathlib import Path
import numpy as np
import edlib
from Bio import SeqIO
import copy
import mappy as mp
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import SpectralEmbedding
from itertools import product
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from Bio import Align

aligner = Align.PairwiseAligner()
aligner.match_score = 1.0       
aligner.mismatch_score = -1.0  
aligner.gap_score = -2.5       
aligner.mode = "local"    

# Reference simulation

Execute the reference simulation.

In [35]:
M = 0
c=str(Path("data", "random_genome_chr_index.csv"))
r=str(Path("data", "combined_curated_TE_lib_ATOSZM_selected.fasta"))
o="data"
maxidn=95
minidn=80
maxsd=20
minsd=1
a=0.7
b=0.5
i=0.001
m=2
n=2
k = 10

ksize = 11
scaled = 10
min_abund = 2
track_abundance = True
seed = 42

p="reference_simulation"
cmd = f"tegenomesimulator -M {M} -p {p} -c {c} -r {r} -o {o} -a {a} -b {b} -i {i} -m {m} -n {n} --maxidn {maxidn} --minidn {minidn} --maxsd {maxsd} --minsd {minsd} -s {seed}"
Popen(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).wait()

0

In [39]:
! which tegenomesimulator

/home/jbris/.cache/pypoetry/virtualenvs/calisim-examples-yfUJKvMn-py3.10/bin/tegenomesimulator


In [14]:
def load_genome_sequence(filepath):
    with open(filepath, 'r') as f:
        lines = f.readlines()
    sequence = ''.join([line.strip() for line in lines if not line.startswith('>')])
    return sequence.upper()

In [15]:
def all_kmers(k):
    return [''.join(p) for p in product("ACGT", repeat=k)]

kmer_list = all_kmers(k)

# ------- K-mer frequency vector -------
def kmer_vector(seq, k, kmer_list):
    seq = str(seq).upper()
    kmers = [seq[i:i+k] for i in range(len(seq)-k+1) if 'N' not in seq[i:i+k]]
    freqs = Counter(kmers)
    total = sum(freqs.values())
    return np.array([freqs.get(kmer, 0)/total if total > 0 else 0 for kmer in kmer_list])

In [25]:
reference_genome_sequence_path = f"{o}/TEgenomeSimulator_{p}_result/{p}_genome_sequence_out_final.fasta"

seq1 = load_genome_sequence(reference_genome_sequence_path)
vec1 = kmer_vector(seq1, k, kmer_list).reshape(1, -1)

similarity = cosine_similarity(vec1, vec1)[0, 0]
print(f"Cosine similarity (k={k}): {similarity:.4f}")

Cosine similarity (k=10): 1.0000


In [17]:
vec1.shape

(1, 1048576)

Derive Sourmash signature. 

# Optimisation

Ground-truth parameter values

```python
maxidn=95
minidn=80
maxsd=20
minsd=1
a=0.7
b=0.5
```

In [37]:
reference_genome_sequence_path = f"{o}/TEgenomeSimulator_{p}_result/{p}_genome_sequence_out_final.fasta"
aligner = mp.Aligner(reference_genome_sequence_path, preset="asm5")
reference_genome_sequence = "".join([
    str(record.seq)
    for record in SeqIO.parse(reference_genome_sequence_path, "fasta")
])

In [36]:
genome_sequence = reference_genome_sequence

In [26]:
res = edlib.align(genome_sequence, reference_genome_sequence, mode = "HW", task = "path")
res['editDistance']

0

In [38]:
seq1 = load_genome_sequence(reference_genome_sequence_path)
vec1 = kmer_vector(seq1, k, kmer_list).reshape(1, -1)
vec2 = kmer_vector(genome_sequence, k, kmer_list).reshape(1, -1)
similarity = cosine_similarity(vec1, vec2)[0, 0]
print(f"Cosine similarity (k={k}): {similarity:.4f}")

Cosine similarity (k=10): 0.5986


In [11]:
import numpy as np
import pandas as pd

from calisim.data_model import (
    DistributionModel,
    ParameterDataType,
    ParameterSpecification,
)
from calisim.optimisation import OptimisationMethod, OptimisationMethodModel
from calisim.evolutionary import (
	EvolutionaryMethod,
	EvolutionaryMethodModel,
)

parameter_spec = ParameterSpecification(
parameters=[
    DistributionModel(
        name="maxidn",
        distribution_name="uniform",
        distribution_args=[90, 97],
        data_type=ParameterDataType.DISCRETE,
    ),
    DistributionModel(
        name="minidn",
        distribution_name="uniform",
        distribution_args=[75.0, 85.0],
        data_type=ParameterDataType.DISCRETE,
    ),
    DistributionModel(
        name="maxsd",
        distribution_name="uniform",
        distribution_args=[15, 25],
        data_type=ParameterDataType.DISCRETE,
    ),
    DistributionModel(
        name="minsd",
        distribution_name="uniform",
        distribution_args=[1, 3],
        data_type=ParameterDataType.DISCRETE,
    ),
    DistributionModel(
        name="a",
        distribution_name="uniform",
        distribution_args=[0.6, 0.8],
        data_type=ParameterDataType.CONTINUOUS,
    ),
    DistributionModel(
        name="b",
        distribution_name="uniform",
        distribution_args=[0.4, 0.6],
        data_type=ParameterDataType.CONTINUOUS,
    ),
]
)

def objective(
    parameters: dict, simulation_id: str, observed_data: np.ndarray | None
) -> float | list[float]:
    simulation_parameters = {}
    
    maxidn=parameters["maxidn"]
    minidn=parameters["minidn"]
    maxsd=parameters["maxsd"]
    minsd=parameters["minsd"]
    a=parameters["a"]
    b=parameters["b"]

    maxidn=95
    minidn=80
    maxsd=20
    minsd=1
    a=0.7
    b=0.5
    m=1
    n=1

    p=simulation_id
    cmd = f"tegenomesimulator -M {M} -p {p} -c {c} -r {r} -o {o} -a {a} -b {b} -i {i} -m {m} -n {n} --maxidn {maxidn} --minidn {minidn} --maxsd {maxsd} --minsd {minsd} -s {seed}"
    Popen(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).wait()


    # total_matches = 0
    # total_aligned = 0
    genome_sequence_path = f"{o}/TEgenomeSimulator_{p}_result/{p}_genome_sequence_out_final.fasta"
    genome_sequence = load_genome_sequence(genome_sequence_path)

    res = edlib.align(genome_sequence, reference_genome_sequence, mode = "HW", task = "path")
    return res['editDistance']
    
    # vec2 = kmer_vector(seq1, k, kmer_list).reshape(1, -1)
    # return vec2
    # similarity = cosine_similarity(vec1, vec2)[0, 0]
    #print(f"Cosine similarity (k={k}): {similarity:.4f}")
    return similarity
    # for name, seq, qual in mp.fastx_read(genome_sequence_path):
    #     for aln in aligner.map(seq):
    #         if aln.blen == 0:
    #             continue
    #         total_matches += aln.mlen
    #         total_aligned += aln.blen
            
    # if total_aligned == 0:
    #     return 0.0

    # identity = total_matches / total_aligned
    # return identity * 100  



specification = OptimisationMethodModel(
    experiment_name="optuna_optimisation",
    parameter_spec=parameter_spec,
    observed_data=vec1,
    method="gp",
    directions=["maximize"],
    output_labels=["Levenshtein Distance"],
    n_iterations=100,
    n_jobs=5,
    method_kwargs=dict(n_startup_trials=1),
)

calibrator = OptimisationMethod(
    calibration_func=objective, specification=specification, engine="optuna"
)

calibrator.specify().execute().analyze()


GPSampler is experimental (supported from v3.6.0). The interface can change in the future.

[I 2025-08-10 21:35:58,686] A new study created in memory with name: optuna_optimisation
[I 2025-08-10 21:36:19,424] Trial 3 finished with value: 4052.0 and parameters: {'maxidn': 91, 'minidn': 77, 'maxsd': 24, 'minsd': 1, 'a': 0.6105303486239482, 'b': 0.427156752068499}. Best is trial 3 with value: 4052.0.
[I 2025-08-10 21:36:22,764] Trial 0 finished with value: 4607.0 and parameters: {'maxidn': 90, 'minidn': 76, 'maxsd': 25, 'minsd': 3, 'a': 0.7911922717634579, 'b': 0.4323401214738847}. Best is trial 0 with value: 4607.0.
[I 2025-08-10 21:36:22,774] Trial 1 finished with value: 4488.0 and parameters: {'maxidn': 96, 'minidn': 82, 'maxsd': 21, 'minsd': 1, 'a': 0.7455358918916832, 'b': 0.41739795217859754}. Best is trial 0 with value: 4607.0.
[I 2025-08-10 21:36:22,791] Trial 2 finished with value: 4607.0 and parameters: {'maxidn': 96, 'minidn': 75, 'maxsd': 25, 'minsd': 2, 'a': 0.67734032536996

KeyboardInterrupt: 

# Optimisation

Ground-truth parameter values

```python
maxidn=95
minidn=80
maxsd=20
minsd=1
a=0.7
b=0.5
```

In [None]:
calibrator.get_parameter_estimates().estimates