In [17]:
# Load IPython's autoreload extension. Mode 2 re-imports all modules before each
# cell executes, so edits to imported code take effect without a kernel restart.
%load_ext autoreload
%autoreload 2

In [22]:
import sys
sys.path.append('../src')

import os.path
from calculator.models import CalculateDistancesForProjectWorkflowInput
from common.models import Enhancer3dProject, Enhancer3dProjectConfiguration, Enhancer3dProjectDataset, Enhancer3dEnhancerAtlasDatasetType, Enhancer3dGencodeAnnotationDatasetType, ChromatinRegion

In [25]:
# Read the bucket listing and keep only the non-blank, whitespace-trimmed lines.
with open("../data/projects/8k_models_project.list", "r") as f:
    listing_lines = [line.strip() for line in f]

listing_lines = [line for line in listing_lines if line]

# Each listing line ends with the file name; the model id is that file name up to
# its first dot, e.g.
# '2025-04-06 12:34:01    1905728 models3D_GM12878_Deni_models3D_GM12878_Deni_mod_results_GM12878_Deni_chr10_10300866_12302793.coordinates.npy'
#   -> 'models3D_GM12878_Deni_models3D_GM12878_Deni_mod_results_GM12878_Deni_chr10_10300866_12302793'
# Going through a set removes duplicate ids; the resulting list order is arbitrary.
models = list(set(line.split(" ")[-1].split(".")[0] for line in listing_lines))

len(models), models[0]

(8625,
 'models3D_GM12878_Deni_models3D_GM12878_Deni_mod_results_GM12878_Deni_chr7_54723172_57700542')

In [31]:
# models3D_GM12878_Deni_models3D_GM12878_Deni_mod_results_GM12878_Deni_chr7_54723172_57700542
#          ^cll    ^spc                       ^ref/mod                 ^chr ^start   ^end

# Rule 1: the dataset file names are determined by the cell line and the species tag
# (Deni = Denisovans, Nean = Neanderthals).
# if the species is Denisovans (Deni) and the cell line is GM12878, then
# - enhancer dataset: enhanceratlas2_liftovered_hg38_filtered_by_chrom_in_regions_GM12878_Deni_with_converted_regions.tsv
# - gene dataset: gencode.v40.annotation_genes_converted_in_regions_GM12878_Deni_with_mod_regions_labelled.tsv
# if the species is Neanderthals (Nean) and the cell line is GM12878, then
# - enhancer dataset: enhanceratlas2_liftovered_hg38_filtered_by_chrom_in_regions_GM12878_Nean_with_converted_regions.tsv
# - gene dataset: gencode.v40.annotation_genes_converted_in_regions_GM12878_Nean_with_mod_regions_labelled.tsv
# if the species is Denisovans (Deni) and the cell line is HFFC6, then
# - enhancer dataset: enhanceratlas2_liftovered_hg38_filtered_by_chrom_in_regions_HFFC6_Deni_with_converted_regions.tsv
# ... and so on for the remaining cell line / species combinations

# Rule 2
# if ref/mod flag is set to mod, then
# - enhancer dataset type: TSV_LIFTOVERED_MOD
# - gene dataset type: TSV_LIFTOVERED_MOD
# if ref/mod flag is set to ref, then
# - enhancer dataset type: TSV_LIFTOVERED_REF
# - gene dataset type: TSV_LIFTOVERED_REF

def extract_data_from_model_name(model_name):
    """Parse an underscore-delimited model name into its components.

    Two layouts are recognised, distinguished by the number of ``_``-separated
    fields:

    * 13 fields, e.g.
      ``models3D_GM12878_Deni_models3D_GM12878_Deni_mod_results_GM12878_Deni_chr7_54723172_57700542``:
      the 7th field is the ref/mod flag; the value ``"ref2"`` is normalised to
      ``"ref"``.
    * 12 fields: the name carries no flag field and the flag defaults to
      ``"ref"``.

    In both layouts the 2nd field is the cell line, the 3rd is the species and
    the last three are chromosome, start and end.

    Args:
        model_name: Model identifier string.

    Returns:
        dict with keys ``cell_line``, ``species``, ``ref_mod_flag``,
        ``chromosome`` (all ``str``) and ``start``, ``end`` (``int``).

    Raises:
        ValueError: If the name has neither 12 nor 13 fields, or if the
            start/end fields are not integers.
    """
    parts = model_name.split("_")

    if len(parts) == 13:
        _, cell_line, species, _, _, _, ref_mod_flag, _, _, _, chromosome, start, end = parts
        if ref_mod_flag == "ref2":
            ref_mod_flag = "ref"
    elif len(parts) == 12:
        _, cell_line, species, _, _, _, _, _, _, chromosome, start, end = parts
        ref_mod_flag = "ref"
    else:
        # Fail loudly instead of hitting an UnboundLocalError further down.
        raise ValueError(
            f"Unrecognised model name {model_name!r}: expected 12 or 13 "
            f"underscore-separated fields, got {len(parts)}"
        )

    return {
        "cell_line": cell_line,
        "species": species,
        "ref_mod_flag": ref_mod_flag,
        "chromosome": chromosome,
        "start": int(start),
        "end": int(end)
    }


def get_enhancer_atlas_dataset_name_from_model_name(model_name):
    """Derive the EnhancerAtlas dataset name and dataset type for a model.

    The model name is parsed with ``extract_data_from_model_name``. The dataset
    name is built from the cell line and species (Rule 1); the dataset type is
    the MOD variant when the ref/mod flag is ``"mod"`` and the REF variant
    otherwise (Rule 2).

    Returns:
        Tuple ``(dataset_name, dataset_type)``.
    """
    parsed = extract_data_from_model_name(model_name)
    cell_line, species = parsed["cell_line"], parsed["species"]
    is_mod = parsed["ref_mod_flag"] == "mod"

    # Rule 1, e.g.
    # enhanceratlas2_liftovered_hg38_filtered_by_chrom_in_regions_GM12878_Deni_with_converted_regions
    dataset_name = f"enhanceratlas2_liftovered_hg38_filtered_by_chrom_in_regions_{cell_line}_{species}_with_converted_regions"

    # Rule 2
    dataset_type = (
        Enhancer3dEnhancerAtlasDatasetType.TSV_LIFTOVERED_MOD
        if is_mod
        else Enhancer3dEnhancerAtlasDatasetType.TSV_LIFTOVERED_REF
    )

    return dataset_name, dataset_type


def get_gencode_annotation_dataset_name_from_model_name(model_name):
    """Derive the GENCODE annotation dataset name and dataset type for a model.

    The model name is parsed with ``extract_data_from_model_name``. The dataset
    name is built from the cell line and species (Rule 1); the dataset type is
    the MOD variant when the ref/mod flag is ``"mod"`` and the REF variant
    otherwise (Rule 2).

    Returns:
        Tuple ``(dataset_name, dataset_type)``.
    """
    parsed = extract_data_from_model_name(model_name)
    cell_line, species = parsed["cell_line"], parsed["species"]
    is_mod = parsed["ref_mod_flag"] == "mod"

    # Rule 1, e.g.
    # gencode.v40.annotation_genes_converted_in_regions_GM12878_Deni_with_mod_regions_labelled
    dataset_name = f"gencode.v40.annotation_genes_converted_in_regions_{cell_line}_{species}_with_mod_regions_labelled"

    # Rule 2
    dataset_type = (
        Enhancer3dGencodeAnnotationDatasetType.TSV_LIFTOVERED_MOD
        if is_mod
        else Enhancer3dGencodeAnnotationDatasetType.TSV_LIFTOVERED_REF
    )

    return dataset_name, dataset_type


In [32]:
def _build_project_dataset(model_name):
    """Build the Enhancer3dProjectDataset entry for a single model name.

    The name is parsed once and the enhancer / gene dataset lookups are each
    performed once, instead of being repeated for every field.
    """
    parsed = extract_data_from_model_name(model_name)
    enhancer_name, enhancer_type = get_enhancer_atlas_dataset_name_from_model_name(model_name)
    gencode_name, gencode_type = get_gencode_annotation_dataset_name_from_model_name(model_name)

    return Enhancer3dProjectDataset(
        ensemble_id=model_name,
        ensemble_region=ChromatinRegion(
            chromosome=parsed["chromosome"],
            start=parsed["start"],
            end=parsed["end"]
        ),
        enhancer_atlas_dataset_name=enhancer_name,
        enhancer_atlas_dataset_type=enhancer_type,
        gencode_annotation_dataset_name=gencode_name,
        gencode_annotation_dataset_type=gencode_type
    )


# Group model names by cell line. Membership is a substring match on the model
# name, so a name containing none of these cell lines ends up in no group.
models_by_cell_lines = {
    cell_line: [model for model in models if cell_line in model]
    for cell_line in ("GM12878", "HFFC6", "WTC11", "H1ESC")
}

# One workflow input (one project) per cell line. The loop variable is named
# `cell_line_models` so it does not shadow the notebook-level `models` list.
requests = [
    CalculateDistancesForProjectWorkflowInput(
        project=Enhancer3dProject(
            id=f"8k_models_project_{cell_line}",
            authors=["Nikita Kozlov", "Michał Własnowolski"],
            species=["Denisovans", "Neanderthals"],
            cell_lines=[cell_line]
        ),
        datasets=[_build_project_dataset(model) for model in cell_line_models],
        configuration=Enhancer3dProjectConfiguration()
    )
    for cell_line, cell_line_models in models_by_cell_lines.items()
]

In [34]:
# Write one JSON request file per project, named after the project id.
for request in requests:
    # The payload contains non-ASCII author names, so pin UTF-8 rather than
    # relying on the platform's default text encoding.
    with open(f"../data/{request.project.id}.json", "w", encoding="utf-8") as f:
        f.write(request.model_dump_json(indent=4))
