# Imports

In [2]:
from subprocess import Popen
import subprocess
from pathlib import Path
import numpy as np
from sourmash import MinHash
from sourmash import signature as sig
from Bio import SeqIO

# Reference simulation

Execute the reference simulation.

In [3]:
# --- TEgenomeSimulator settings -------------------------------------------
# Each name below is forwarded to the `tegenomesimulator` CLI under the flag of
# the same name (e.g. `M` -> `-M`, `maxidn` -> `--maxidn`). Later cells read
# these globals, so the names are part of the notebook's interface.
M = 0
c = str(Path("data", "random_genome_chr_index.csv"))
r = str(Path("data", "combined_curated_TE_lib_ATOSZM_selected.fasta"))
o = "data"
maxidn = 95
minidn = 80
maxsd = 20
minsd = 1
a = 0.7
b = 0.5
i = 0.001
m = 5
n = 1

# --- Sourmash MinHash sketch settings --------------------------------------
ksize = 51
scaled = 100
track_abundance = True
# Shared seed: passed to the simulator via `-s` and to every MinHash sketch.
seed = 100

# Prefix of the reference run; it determines the result folder/file names.
p = "reference_simulation"

# Build the command as an argument list (no shell involved) so that paths with
# spaces or shell metacharacters are passed through verbatim.
cmd = [
    "tegenomesimulator",
    "-M", str(M),
    "-p", str(p),
    "-c", str(c),
    "-r", str(r),
    "-o", str(o),
    "-a", str(a),
    "-b", str(b),
    "-i", str(i),
    "-m", str(m),
    "-n", str(n),
    "--maxidn", str(maxidn),
    "--minidn", str(minidn),
    "--maxsd", str(maxsd),
    "--minsd", str(minsd),
    "-s", str(seed),
]

# `check=True` raises CalledProcessError on a non-zero exit status instead of
# letting a failed simulation go unnoticed; the cell still displays the return
# code (0 on success) as its output.
subprocess.run(
    cmd,
    check=True,
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
).returncode

0

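Next, derive the Sourmash (MinHash) signature of the reference genome. It is computed in the first code cell of the Optimisation section below and is the target that every simulated genome is compared against.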

# Optimisation

Ground-truth parameter values used to generate the reference simulation (the optimiser should recover values close to these):

```python
maxidn=95
minidn=80
maxsd=20
minsd=1
a=0.7
b=0.5
```

In [4]:
# FASTA produced by the reference simulation run (output dir `o`, prefix `p`).
reference_genome_sequence_path = (
    f"{o}/TEgenomeSimulator_{p}_result/{p}_genome_sequence_out_final.fasta"
)

# One scaled MinHash sketch (n=0) accumulates the k-mers of every FASTA record.
mh = MinHash(
    n=0,
    ksize=ksize,
    scaled=scaled,
    track_abundance=track_abundance,
    seed=seed,
)
for record in SeqIO.parse(reference_genome_sequence_path, "fasta"):
    # force=True is passed so that add_sequence does not reject the record;
    # presumably this concerns non-ACGT characters - confirm in sourmash docs.
    mh.add_sequence(str(record.seq), force=True)

# Wrap the sketch in a named signature; later cells compare trials against it.
reference_encoded_genome_sequence = sig.SourmashSignature(
    mh, name="reference_genome"
)
reference_encoded_genome_sequence

SourmashSignature('reference_genome', ed226136)

In [None]:
import numpy as np
import pandas as pd

from calisim.data_model import (
    DistributionModel,
    ParameterDataType,
    ParameterSpecification,
)
from calisim.optimisation import OptimisationMethod, OptimisationMethodModel

# Search space for the calibration: every parameter uses a "uniform"
# distribution with two arguments (treated here as the search-range bounds -
# NOTE(review): confirm the exact meaning against the calisim documentation).
# `minsd` is given identical arguments, i.e. it is effectively held at 1.
parameter_spec = ParameterSpecification(
    parameters=[
        DistributionModel(
            name=parameter_name,
            distribution_name="uniform",
            distribution_args=list(uniform_args),
            data_type=parameter_type,
        )
        for parameter_name, uniform_args, parameter_type in (
            ("maxidn", (90, 95), ParameterDataType.DISCRETE),
            ("minidn", (80, 83), ParameterDataType.DISCRETE),
            ("maxsd", (20, 25), ParameterDataType.DISCRETE),
            ("minsd", (1, 1), ParameterDataType.DISCRETE),
            ("a", (0.7, 0.9), ParameterDataType.CONTINUOUS),
            ("b", (0.5, 0.8), ParameterDataType.CONTINUOUS),
        )
    ]
)

def objective(
    parameters: dict, simulation_id: str, observed_data: np.ndarray | None
) -> float | list[float]:
    """Run one TEgenomeSimulator trial and score it against the reference genome.

    The sampled parameters are passed to the `tegenomesimulator` CLI together
    with the fixed notebook-level settings (`M`, `c`, `r`, `o`, `i`, `m`, `n`,
    `seed`). The resulting genome FASTA is sketched with the same MinHash
    settings as the reference and compared with
    `reference_encoded_genome_sequence`.

    Args:
        parameters: Sampled values; must contain the keys "maxidn", "minidn",
            "maxsd", "minsd", "a" and "b".
        simulation_id: Identifier of the trial. It is used as the simulator
            prefix (`-p`) and therefore names the result folder and files.
        observed_data: Not used by this objective.

    Returns:
        The similarity between the trial's signature and the reference
        signature, as returned by `SourmashSignature.similarity`.

    Raises:
        subprocess.CalledProcessError: If the simulator exits with a non-zero
            status.
    """
    # The simulation id doubles as the output prefix, so trials only write to
    # separate folders if ids are unique (assumed - verify against calisim).
    p = simulation_id

    # Argument list instead of a shell string: no quoting problems and no
    # shell interpretation of the interpolated values.
    cmd = [
        "tegenomesimulator",
        "-M", str(M),
        "-p", str(p),
        "-c", str(c),
        "-r", str(r),
        "-o", str(o),
        "-a", str(parameters["a"]),
        "-b", str(parameters["b"]),
        "-i", str(i),
        "-m", str(m),
        "-n", str(n),
        "--maxidn", str(parameters["maxidn"]),
        "--minidn", str(parameters["minidn"]),
        "--maxsd", str(parameters["maxsd"]),
        "--minsd", str(parameters["minsd"]),
        "-s", str(seed),
    ]
    # Fail loudly if the simulator fails, rather than scoring a missing or
    # stale FASTA file further down.
    subprocess.run(
        cmd,
        check=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )

    genome_sequence_path = f"{o}/TEgenomeSimulator_{p}_result/{p}_genome_sequence_out_final.fasta"

    # Sketch the simulated genome with the same settings as the reference.
    mh = MinHash(n=0, ksize=ksize, scaled=scaled, track_abundance=track_abundance, seed=seed)
    for record in SeqIO.parse(genome_sequence_path, "fasta"):
        mh.add_sequence(str(record.seq), force=True)
    encoded_genome_sequence = sig.SourmashSignature(mh, name=p)

    # NOTE(review): with track_abundance=True, sourmash documents `similarity`
    # as an abundance-weighted (angular) similarity rather than plain Jaccard
    # unless ignore_abundance=True is passed - confirm which metric is intended.
    return encoded_genome_sequence.similarity(reference_encoded_genome_sequence)

# Optimisation set-up: 100 trials of the "tpes" method, 4 jobs in parallel,
# maximising the single similarity score returned by `objective`.
# NOTE(review): the label says "Jaccard Similarity", but the sketches track
# abundance - confirm that the label matches the metric sourmash computes.
specification = OptimisationMethodModel(
    experiment_name="optuna_optimisation",
    method="tpes",
    # Presumably forwarded to the Optuna sampler - confirm in calisim docs.
    method_kwargs={"n_startup_trials": 1},
    parameter_spec=parameter_spec,
    n_iterations=100,
    n_jobs=4,
    directions=["maximize"],
    output_labels=["Jaccard Similarity"],
)

# Bind the objective to the specification using the Optuna engine.
calibrator = OptimisationMethod(
    calibration_func=objective,
    specification=specification,
    engine="optuna",
)

# Run the full workflow; the chained call's return value is the cell output.
calibrator.specify().execute().analyze()

[I 2025-09-30 14:02:04,688] A new study created in memory with name: optuna_optimisation
[I 2025-09-30 14:02:21,291] Trial 0 finished with value: 0.03972417128016814 and parameters: {'maxidn': 95, 'minidn': 82, 'maxsd': 22, 'minsd': 1, 'a': 0.7, 'b': 0.5}. Best is trial 0 with value: 0.03972417128016814.
[I 2025-09-30 14:02:21,323] Trial 2 finished with value: 0.03841529973658442 and parameters: {'maxidn': 95, 'minidn': 83, 'maxsd': 21, 'minsd': 1, 'a': 0.7, 'b': 0.5}. Best is trial 0 with value: 0.03972417128016814.
[I 2025-09-30 14:02:21,749] Trial 1 finished with value: 0.03150130927034278 and parameters: {'maxidn': 92, 'minidn': 80, 'maxsd': 20, 'minsd': 1, 'a': 0.7, 'b': 0.5}. Best is trial 0 with value: 0.03972417128016814.
[I 2025-09-30 14:02:22,260] Trial 3 finished with value: 0.03290853805296512 and parameters: {'maxidn': 90, 'minidn': 81, 'maxsd': 23, 'minsd': 1, 'a': 0.7, 'b': 0.5}. Best is trial 0 with value: 0.03972417128016814.
[I 2025-09-30 14:02:38,561] Trial 4 finishe

<calisim.optimisation.implementation.OptimisationMethod at 0x7f9fc027bc70>