Feature cv training (#55)
* performance optimizations

* train multiple repeats on single node in parallel

* bug fix

* fix bug in indexing when subset_samples() removed something

* sleep between jobs; stop if any job fails

* format with black

* bug fixes

* add test for MultiphenoDataloader

* update environments

* uncomment rules

* bug fixes

* subset samples in training_dataset rule

* example config.yaml

* use gpu queue for compute_burdens

* bugfix since dask reading didn't work any more

* allow evaluation of all repeat combinations

* allow analysis of each n_repeats and for all repeat combinations

* option to provide burden file

* allow seed gene alpha to be defined in config

* change sorting order to get the best model

* adaptations to analyze multiple repeats and use the script without seed genes

* allow providing a sample file and do separate indexing for pheno and geno to ensure indices are correct

* automate generation of figure 3 (associations & replication)

* generate cv splits with related samples in the same split (see the grouping sketch after the commit message)

* average burdens

* average burdens

* cross-validation-like training

* add missing cv_utils

* write average burdens for each combination to a single zarr file to avoid zarr issues

* add logging information

* make maf column a param

* add logging

* pipeline replication and plotting

* evaluate all repeat combinations with and without seed genes

* update lsf.yaml

* small updates

* per-gene pval aggregation

* aggregate pval per gene

* bugfix: only load burdens if not skipping burdens

* logging info

* updates and fixes

* load burdens only for genes analysed in current chunk to save memory

* small changes to pipeline

* standardizing/qt-transform of combined test set x/y arrays

* my_quantile_transform for numpy arrays

* bugfix

* remove unnecessary code

* remove unnecessary wildcards

* make averaging part of associate.py

* allow seed genes/baselines to be missing (to allow assoc. testing for non-training phenotypes)

* updates

* gene-specific common variant covariates for conditional analysis

* bugfix

* post-hoc conditioning on common variants

* restructure pipelines

* removing redundant options

* add cv_utils cli

* simplify script (only evaluate one repeat combination/average burdens); aggregate baseline p-values; make Bonferroni correction the default

* removal of redundant wildcards, updates and fixes

* bugfixes

* baseline discoveries only required for training phenotypes

* remove unneeded code

* update configs

* formatting

* manually merge changes from feature-regenie to account for gene-specific annotations

* allow different sample orders in phenotype_df and genotypes.h5

* change sample ids to be bytes as in the real data

* update pipelines

* update gitignore

* pipeline updates

* manually update github actions to be like master

* bug fixes

* checkout tests from master

* make phenotype indices strings as they are in real data

* add 'gene_id' column

* manually merge with master so tests can pass

* bugfixes

* use gene_id column instead of gene_ids

* pipeline updates and fixes

* update test config

* adding age2 and age_sex to example data

* update config

* set tests folder to main version

* checkout preprocessing files from main

* checkout from main

* manually merge sample_id changes from main

* pipeline bugfixes and renamings

* fixup! Format Python code with psf/black pull_request

* remove gene_ids column

* integrating suggested PR changes

* fixup! Format Python code with psf/black pull_request

---------

Co-authored-by: Brian Clarke <brian.clarke@dkfz.de>
Co-authored-by: PMBio <PMBio@users.noreply.github.com>
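
Editor's note on the cv-split bullet above: one standard way to keep related samples in the same split is group-aware K-fold splitting. The sketch below uses scikit-learn's GroupKFold with a hypothetical family-id grouping; whether the pipeline's split generation works exactly this way is an assumption, not something shown in this diff.

# Illustrative sketch only: related samples share a (hypothetical) family id,
# and GroupKFold guarantees a family never ends up in both train and test.
import numpy as np
from sklearn.model_selection import GroupKFold

rng = np.random.default_rng(0)
family_ids = rng.integers(0, 30, size=100)  # 100 samples from 30 families
samples = np.arange(100)

gkf = GroupKFold(n_splits=5)
for fold, (train_idx, test_idx) in enumerate(gkf.split(samples, groups=family_ids)):
    # no family appears on both sides of the split
    assert not set(family_ids[train_idx]) & set(family_ids[test_idx])
    print(f"fold {fold}: {len(train_idx)} train / {len(test_idx)} test samples")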
3 people committed Feb 23, 2024
1 parent ada0aaa commit 4d9ef64
Showing 33 changed files with 2,151 additions and 326 deletions.
4 changes: 3 additions & 1 deletion .gitignore
@@ -164,4 +164,6 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/
/docs/apidocs/

pipelines/deprecated_pipelines

215 changes: 215 additions & 0 deletions deeprvat/cv_utils.py
@@ -0,0 +1,215 @@
import pandas as pd
import yaml
import os
import sys
from typing import Optional
import re

# import pickle
import logging
import click
import copy
import zarr
import numpy as np
from numcodecs import Blosc
from pathlib import Path
from deeprvat.utils import (
    standardize_series,
    my_quantile_transform,
)

logging.basicConfig(
    format="[%(asctime)s] %(levelname)s:%(name)s: %(message)s",
    level="INFO",
    stream=sys.stdout,
)
logger = logging.getLogger(__name__)
DATA_SLOT_DICT = {
    "deeprvat": ["data", "training_data"],
    "seed_genes": ["data"],
}

module_folder_dict = {
    "seed_genes": "baseline",
    "deeprvat": "deeprvat",
    "alternative_burdens": "alternative_burdens",
}


@click.group()
def cli():
    pass


@cli.command()
@click.option("--module", "-m", multiple=True)
@click.option("--fold", type=int)
@click.option("--fold-specific-baseline", is_flag=True)
@click.option("--n-folds", type=int, default=5)
@click.argument("input_config", type=click.Path(exists=True))
@click.argument("out_path", type=click.Path(), default="./")
def spread_config(
    input_config, out_path, module, fold_specific_baseline, fold, n_folds
):
    data_modules = module

    with open(input_config) as f:
        config_template = yaml.safe_load(f)
    split = "train"
    cv_path = f"{config_template['cv_path']}/{n_folds}_fold"
    for module in data_modules:
        config = copy.deepcopy(config_template)
        data_slots = DATA_SLOT_DICT[module]
        for data_slot in data_slots:
            sample_file = f"{cv_path}/samples_{split}{fold}.pkl"
            logger.info(f"setting sample file {sample_file}")
            config[data_slot]["dataset_config"]["sample_file"] = sample_file

        if (module == "deeprvat") | (module == "deeprvat_pretrained"):
            logger.info("Writing baseline directories")
            old_baseline = copy.deepcopy(config["baseline_results"])
            if fold_specific_baseline:
                config["baseline_results"] = [
                    {"base": f'{r["base"]}/cv_split{fold}/baseline', "type": r["type"]}
                    for r in old_baseline
                ]
            logger.info(config["baseline_results"])
        logger.info(f"Writing config for module {module}")
        with open(f"{out_path}/{module_folder_dict[module]}/config.yaml", "w") as f:
            yaml.dump(config, f)


@cli.command()
@click.option("--fold", type=int)
@click.option("--n-folds", type=int, default=5)
@click.argument("input_config", type=click.Path(exists=True))
@click.argument("out_file", type=click.Path())
def generate_test_config(input_config, out_file, fold, n_folds):
    with open(input_config) as f:
        config = yaml.safe_load(f)
    cv_path = f"{config['cv_path']}/{n_folds}_fold"
    split = "test"
    sample_file = f"{cv_path}/samples_{split}{fold}.pkl"
    logger.info(f"setting sample file {sample_file}")
    for data_slot in DATA_SLOT_DICT["deeprvat"]:
        config[data_slot]["dataset_config"]["sample_file"] = sample_file
    with open(out_file, "w") as f:
        yaml.dump(config, f)


@cli.command()
@click.option("--link-burdens", type=click.Path())
@click.option("--burden-dirs", "-b", multiple=True)
@click.argument("out_dir", type=click.Path(), default="./")
@click.argument("config_file", type=click.Path(exists=True))
def combine_test_set_burdens(
    out_dir,
    link_burdens,
    burden_dirs,
    config_file,
):
    with open(config_file) as f:
        config = yaml.safe_load(f)
    compression_level = 1
    skip_burdens = link_burdens is not None
    n_total_samples = []
    for burden_dir in burden_dirs:
        print(burden_dir)
        this_y = zarr.open(f"{burden_dir}/y.zarr")
        this_x = zarr.open(f"{burden_dir}/x.zarr")
        # this_burdens = zarr.open(f'{burden_dir}/burdens.zarr')

        assert this_y.shape[0] == this_x.shape[0]  # == this_burdens.shape[0]
        n_total_samples.append(this_y.shape[0])

    n_total_samples = np.sum(n_total_samples)
    print(f"Total number of samples {n_total_samples}")
    if not skip_burdens:
        this_burdens = zarr.open(
            f"{burden_dir}/burdens.zarr"
        )  # any burden tensor (here from the last file to get dims 1 - n)
        burdens = zarr.open(
            Path(out_dir) / "burdens.zarr",
            mode="a",
            shape=(n_total_samples,) + this_burdens.shape[1:],
            chunks=(1000, 1000),
            dtype=np.float32,
            compressor=Blosc(clevel=compression_level),
        )
        print(f"burdens shape: {burdens.shape}")
    else:
        burdens = None

    y = zarr.open(
        Path(out_dir) / "y.zarr",
        mode="a",
        shape=(n_total_samples,) + this_y.shape[1:],
        chunks=(None, None),
        dtype=np.float32,
        compressor=Blosc(clevel=compression_level),
    )
    x = zarr.open(
        Path(out_dir) / "x.zarr",
        mode="a",
        shape=(n_total_samples,) + this_x.shape[1:],
        chunks=(None, None),
        dtype=np.float32,
        compressor=Blosc(clevel=compression_level),
    )

    start_idx = 0

    for burden_dir in burden_dirs:
        this_y = zarr.open(f"{burden_dir}/y.zarr")[:]
        end_idx = start_idx + this_y.shape[0]
        this_x = zarr.open(f"{burden_dir}/x.zarr")[:]
        if not skip_burdens:
            logger.info("writing burdens")
            this_burdens = zarr.open(f"{burden_dir}/burdens.zarr")[:]
            burdens[start_idx:end_idx] = this_burdens
        print((start_idx, end_idx))
        y[start_idx:end_idx] = this_y
        x[start_idx:end_idx] = this_x
        start_idx = end_idx

    y_transformation = config["data"]["dataset_config"].get("y_transformation", None)
    standardize_xpheno = config["data"]["dataset_config"].get(
        "standardize_xpheno", True
    )

    ## Analogously to what is done in densegt
    if standardize_xpheno:
        this_x = x[:]
        logger.info(" Standardizing combined covariates")
        for col in range(this_x.shape[1]):
            this_x[:, col] = standardize_series(this_x[:, col])
        x[:] = this_x
    if y_transformation is not None:
        this_y = y[:]
        n_unique_y_val = np.count_nonzero(~np.isnan(np.unique(this_y)))
        if n_unique_y_val == 2:
            logger.warning(
                "Not applying y transformation because y only has two values and seems to be binary"
            )
            y_transformation = None
        if y_transformation is not None:
            if y_transformation == "standardize":
                logger.info(" Standardizing combined target phenotype (y)")
                for col in range(this_y.shape[1]):
                    this_y[:, col] = standardize_series(this_y[:, col])
            elif y_transformation == "quantile_transform":
                logger.info(" Quantile transforming combined target phenotype (y)")
                for col in range(this_y.shape[1]):
                    this_y[:, col] = my_quantile_transform(this_y[:, col])
        y[:] = this_y
    print("done")
    if link_burdens is not None:
        source_path = Path(out_dir) / "burdens.zarr"
        source_path.unlink(missing_ok=True)
        source_path.symlink_to(link_burdens)
    genes = np.load(f"{burden_dirs[0]}/genes.npy")
    np.save(Path(out_dir) / "genes.npy", genes)


if __name__ == "__main__":
    cli()
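
A hedged usage sketch for the CLI defined above: the option and argument names come from the click decorators in this file, but the file paths are placeholders, and the exact command spelling (spread-config vs. spread_config) depends on the installed click version. The input config is assumed to define cv_path and, for the deeprvat module, baseline_results; out_path is assumed to contain a deeprvat/ subdirectory.

# Sketch only: invoke the commands programmatically with click's test runner.
from click.testing import CliRunner

from deeprvat.cv_utils import cli  # assumes the deeprvat package is importable

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "spread-config",        # click >= 7 derives this name from spread_config
        "--module", "deeprvat",
        "--fold", "0",
        "--n-folds", "5",
        "config.yaml",          # input_config: must exist and define cv_path
        "cv_split0",            # out_path: must contain a deeprvat/ subdirectory
    ],
)
print(result.exit_code, result.output)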

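For context, standardize_series and my_quantile_transform are imported from deeprvat/utils.py, which is not part of this diff. The sketch below shows plausible per-column behavior (z-scoring and a rank-based inverse normal transform); it is an illustrative assumption, not the repository's actual implementation.

# Illustrative assumption of the two helpers used in combine_test_set_burdens;
# the real deeprvat.utils implementations may differ (NaN handling, seeding, etc.).
import numpy as np
from scipy.stats import norm, rankdata


def standardize_series(values: np.ndarray) -> np.ndarray:
    # z-score one column, ignoring NaNs
    return (values - np.nanmean(values)) / np.nanstd(values)


def my_quantile_transform(values: np.ndarray) -> np.ndarray:
    # rank-based inverse normal transform of one column, NaNs left untouched
    out = values.astype(np.float64).copy()
    mask = ~np.isnan(out)
    ranks = rankdata(out[mask])
    out[mask] = norm.ppf(ranks / (mask.sum() + 1))
    return out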