# Imports

In [2]:
from subprocess import Popen
import subprocess
from pathlib import Path
import numpy as np
from sourmash import MinHash
from sourmash import signature as sig
from Bio import SeqIO
import copy

# Reference simulation

Execute the reference simulation.

In [3]:
M = 0
c=str(Path("data", "random_genome_chr_index.csv"))
r=str(Path("data", "combined_curated_TE_lib_ATOSZM_selected.fasta"))
o="data"
maxidn=95
minidn=80
maxsd=20
minsd=1
a=0.7
b=0.5
i=0.001
m=3
n=3

ksize = 51 
scaled = 100  
track_abundance = True
seed = 100

p="reference_simulation"
cmd = f"tegenomesimulator -M {M} -p {p} -c {c} -r {r} -o {o} -a {a} -b {b} -i {i} -m {m} -n {n} --maxidn {maxidn} --minidn {minidn} --maxsd {maxsd} --minsd {minsd} -s {seed}"
Popen(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).wait()

0

Derive Sourmash signature. 

# Optimisation

Ground-truth parameter values

```python
maxidn=95
minidn=80
maxsd=20
minsd=1
a=0.7
b=0.5
```

In [4]:
reference_genome_sequence_path = f"{o}/TEgenomeSimulator_{p}_result/{p}_genome_sequence_out_final.fasta"

mh = MinHash(n=0, ksize=ksize, scaled=scaled, track_abundance=track_abundance, seed=seed)
for record in SeqIO.parse(reference_genome_sequence_path, "fasta"):
    mh.add_sequence(str(record.seq), force=True)
reference_encoded_genome_sequence = sig.SourmashSignature(mh, name="reference_genome")
reference_encoded_genome_sequence

SourmashSignature('reference_genome', 945a20e9)

In [5]:
reference_encoded_genome_sequence.similarity(reference_encoded_genome_sequence)

0.9999999865842415

In [None]:
import numpy as np
import pandas as pd

from calisim.data_model import (
    DistributionModel,
    ParameterDataType,
    ParameterSpecification,
)
from calisim.optimisation import OptimisationMethod, OptimisationMethodModel

parameter_spec = ParameterSpecification(
parameters=[
    DistributionModel(
        name="maxidn",
        distribution_name="uniform",
        distribution_args=[88, 92],
        data_type=ParameterDataType.DISCRETE,
    ),
    DistributionModel(
        name="minidn",
        distribution_name="uniform",
        distribution_args=[78, 82],
        data_type=ParameterDataType.DISCRETE,
    ),
    DistributionModel(
        name="maxsd",
        distribution_name="uniform",
        distribution_args=[18, 22],
        data_type=ParameterDataType.DISCRETE,
    ),

]
)

def objective(
    parameters: dict, simulation_id: str, observed_data: np.ndarray | None
) -> float | list[float]:
    simulation_parameters = {}
    
    maxidn=parameters["maxidn"]
    minidn=parameters["minidn"]
    maxsd=parameters["maxsd"]

    p=simulation_id
    cmd = f"tegenomesimulator -M {M} -p {p} -c {c} -r {r} -o {o} -a {a} -b {b} -i {i} -m {m} -n {n} --maxidn {maxidn} --minidn {minidn} --maxsd {maxsd} --minsd {minsd} -s {seed}"
    Popen(cmd, shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL).wait()
    
    genome_sequence_path = f"{o}/TEgenomeSimulator_{p}_result/{p}_genome_sequence_out_final.fasta"

    mh_sampled = MinHash(n=0, ksize=ksize, scaled=scaled, track_abundance=track_abundance, seed=seed)
    for record in SeqIO.parse(genome_sequence_path, "fasta"):
        mh_sampled.add_sequence(str(record.seq), force=True)
    encoded_genome_sequence = sig.SourmashSignature(mh_sampled, name=p)
    similarity = encoded_genome_sequence.similarity(reference_encoded_genome_sequence)

    return similarity

specification = OptimisationMethodModel(
    experiment_name="optuna_optimisation",
    parameter_spec=parameter_spec,
    method="tpes",
    directions=["maximize"],
    output_labels=["Jaccard Similarity"],
    n_iterations=100,
    n_jobs=4,
    method_kwargs=dict(n_startup_trials=1),
)

calibrator = OptimisationMethod(
    calibration_func=objective, specification=specification, engine="optuna"
)

calibrator.specify().execute().analyze()

[I 2025-10-09 21:04:59,425] A new study created in memory with name: optuna_optimisation
[I 2025-10-09 21:05:28,181] Trial 1 finished with value: 0.0249145561854881 and parameters: {'maxidn': 91, 'minidn': 80, 'maxsd': 22}. Best is trial 1 with value: 0.0249145561854881.
[I 2025-10-09 21:05:28,271] Trial 0 finished with value: 0.024621770452090797 and parameters: {'maxidn': 91, 'minidn': 78, 'maxsd': 18}. Best is trial 1 with value: 0.0249145561854881.
[I 2025-10-09 21:05:28,296] Trial 3 finished with value: 0.02275302657571221 and parameters: {'maxidn': 91, 'minidn': 79, 'maxsd': 18}. Best is trial 1 with value: 0.0249145561854881.
[I 2025-10-09 21:05:28,373] Trial 2 finished with value: 0.022764785918841457 and parameters: {'maxidn': 89, 'minidn': 78, 'maxsd': 19}. Best is trial 1 with value: 0.0249145561854881.
[I 2025-10-09 21:05:46,727] Trial 6 finished with value: 0.022953429398612446 and parameters: {'maxidn': 88, 'minidn': 82, 'maxsd': 22}. Best is trial 1 with value: 0.0249145