In [1]:
import hail
from pyspark import SparkContext, SparkConf
#!/usr/bin/env python3
import os
import json
from pathlib import Path
from math import ceil
import hail as hl
from src.data_processing.vcf.hail_metods import parse_empty  # helper for empty values
from src.utils.general.string_operations import trim_prefix  # helper for trimming prefixes
hl.stop()

In [2]:


hail_home = Path(hl.__file__).parent.__str__()

###########################################################################
# Spark/Hail Configuration
###########################################################################
def setup_spark_config(tmp_dir):
    path = Path.cwd().resolve().parents[0] / "src" / "config" / "spark_conf.json"
    with path.open() as f:
        conf_data = json.load(f)

    #conf_data["spark.jars"] = conf_data["spark.jars"].format(hail_home=hail_home)
    conf_data["spark.driver.extraClassPath"] = conf_data[
        "spark.driver.extraClassPath"
    ].format(hail_home=hail_home)
    conf_data["spark.local.dir"] = conf_data["spark.local.dir"].format(local_dir=tmp_dir)

    os.environ["SPARK_JAVA_OPTS"] = "-XX:-UsePerfData"
    conf = SparkConf().setAll(conf_data.items())
    return conf
    
def init_spark_and_run(conf, out_dir, tmp_dir):
    sc = SparkContext(conf=conf)

    conf.set("spark.jars", f"{hail_home}/backend/hail-all-spark.jar")
    conf.set("spark.driver.extraClassPath", f"{hail_home}/backend/hail-all-spark.jar")
    conf.set("spark.driver.extraJavaOptions","-XX:-UsePerfData")
    conf.set("spark.executor.extraJavaOptions", "-XX:-UsePerfData")
    conf.set("spark.eventLog.dir", f"{out_dir}")

    os.environ["TMPDIR"] = f"{tmp_dir}"
    hl.init(
        backend="spark",
        sc=sc,
        min_block_size=4096,
        tmp_dir=tmp_dir,
        local_tmpdir=tmp_dir,
        log=hl.utils.timestamp_path(f"{tmp_dir}/logfile", f".skat.log")
    )

    print(sc.getConf().getAll())


# Log output directory
out_dir = "/mnt/sdb/tmp-skat/"
tmp_dir = "/mnt/sdb/tmp-skat"
conf = setup_spark_config(tmp_dir)

init_spark_and_run(conf, out_dir, tmp_dir)
# Define the output directories.
skat_mtx_output_dir = "/mnt/sdb/tmp-skat/skat_mtx"
intermediate_dir = os.path.join(skat_mtx_output_dir, "intermediate_batches")


###########################################################################
# Define contig recoding dictionary for GRCh37.
###########################################################################
contig_recoding = {f"chr{i}": str(i) for i in range(1, 23)}
contig_recoding.update({"chrX": "X", "chrY": "Y", "chrM": "MT"})

SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
SLF4J: Defaulting to no-operation (NOP) logger implementation
SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.5.4
SparkUI available at http://tu-kliin-1-oligogen-1-virtual--1475-7.openstacklocal:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.133-4c60fddb171a
LOGGING: writing to /mnt/sdb/tmp-skat/logfile-20250604-0005.skat.log


[('spark.repl.local.jars', ''), ('spark.local.dir', '/mnt/sdb/tmp-skat'), ('spark.executor.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED --add-opens=java.base/sun.nio.ch=ALL-UNNAMED --add-opens=java.base/sun.nio.cs=ALL-UNNAMED --add-opens=java.base/sun.security.action=ALL-UNNAMED --add-opens=java.base/sun.util.calendar=ALL-UNNAMED --add-opens=java.security.jgss/sun.security.krb5=ALL-UNNAMED -Djdk.reflect.useDirectMethodHandle=false -XX:+UseG1GC'), ('spark.kryo.regi

In [3]:

###########################################################################
# Helper Functions
###########################################################################
def merge_matrix_tables_cols(matrix_tables):
    """
    Merge MatrixTables by columns (keeping only the essential entry fields).
    """
    combined_mt = matrix_tables[0].select_entries(
        AD=matrix_tables[0].AD,
        DP=matrix_tables[0].DP,
        GQ=matrix_tables[0].GQ,
        GT=matrix_tables[0].GT,
        PL=matrix_tables[0].PL,
        AC=matrix_tables[0].AC
    )
    for mt in matrix_tables[1:]:
        mt_common = mt.select_entries(
            AD=mt.AD,
            DP=mt.DP,
            GQ=mt.GQ,
            GT=mt.GT,
            PL=mt.PL,
            AC=mt.AC
        )
        combined_mt = combined_mt.union_cols(mt_common, row_join_type="outer")
    return combined_mt

def process_vcf(vcf_path, metadata=None, annotate=False):
    """
    Process a single VCF file.

    Since the VCF is already annotated, we extract the CSQ field from INFO to obtain
    the row-level "gene" annotation (and optional fields HGNC_ID and MAX_AF) required
    for SKAT grouping.

    Steps:
      1. Import the VCF using GRCh37 with force_bgz and contig recoding.
      2. Convert GQ to int32 and split multiallelic records.
      3. Create a temporary field "csq_array" by splitting the first INFO.CSQ string on "|".
         Then extract:
            - gene: csq_array[1] if present, else "Unknown".
            - HGNC_ID: parsed from csq_array[2] if numeric, else missing.
            - MAX_AF: parsed from csq_array[3] using parse_empty (defaults to 0.0).
      4. Compute entry-level annotations:
            - AC: Number of alternate alleles (from GT).
            - VF: Variant fraction (AD[1] / DP).
      5. Optionally annotate globals with metadata.
      6. Drop the INFO field and temporary csq_array, then filter entries with VF ≥ 0.3 and DP > 30.
      7. Select only the essential entry fields.
    """
    mt = hl.import_vcf(
        str(vcf_path),
        force_bgz=True,
        reference_genome='GRCh37',
        contig_recoding=contig_recoding
    )
    mt = mt.annotate_entries(GQ=hl.int32(mt.GQ))
    mt = hl.split_multi_hts(mt)

    # Extract VEP CSQ info and store in a temporary field "csq_array"
    mt = mt.annotate_rows(
        csq_array = hl.if_else(
            hl.is_missing(mt.info.CSQ) | (hl.len(mt.info.CSQ) == 0),
            hl.empty_array(hl.tstr),
            mt.info.CSQ.first().split("\\|")
        )
    )
    mt = mt.annotate_rows(
        gene = hl.if_else(hl.len(mt.csq_array) > 1, mt.csq_array[1], "Unknown"),
        HGNC_ID = hl.if_else(hl.len(mt.csq_array) > 2,
                              hl.if_else(mt.csq_array[2].matches(r'^-?\d+$'),
                                         hl.int32(mt.csq_array[2]),
                                         hl.missing(hl.tint32)),
                              hl.missing(hl.tint32)),
        MAX_AF = hl.if_else(hl.len(mt.csq_array) > 3,
                            hl.float(parse_empty(mt.csq_array[3])),
                            0.0)
    )
    
    # Compute entry-level annotations.
    mt = mt.filter_rows(mt.MAX_AF < 0.01) # Filter out common variants (>1%)
    mt = mt.annotate_entries(
        AC = mt.GT.n_alt_alleles(),
        VF = hl.float(mt.AD[1] / mt.DP)
    )

    # Annotate globals with metadata if provided.
    if metadata is not None:
        prefix = trim_prefix(Path(vcf_path).stem)
        phen, mut_val = metadata.get(prefix, ["NA", "NA"])
        if len(phen) == 0:
            phen = "NA"
        if len(mut_val) == 0:
            mut_val = "NA"
        mt = mt.annotate_globals(metadata=hl.struct(phenotype=phen, mutation=mut_val))
        mt = mt.annotate_cols(pheno = hl.literal(phen)[mt.s])
    # Clean up: Drop the INFO field and temporary csq_array; filter entries.
    mt = mt.drop(mt.info, mt.csq_array)
    mt = mt.filter_entries(mt.VF >= 0.3, keep=True)
    mt = mt.filter_entries(mt.DP > 30, keep=True)
    
    mt = mt.annotate_globals(prefix=trim_prefix(Path(vcf_path).stem))

    # Keep only the essential entry fields.
    mt = mt.select_entries(AD=mt.AD, DP=mt.DP, GQ=mt.GQ, GT=mt.GT, PL=mt.PL, AC=mt.AC)
    mt = mt.select_rows(mt.gene, mt.HGNC_ID, mt.MAX_AF)
    return mt

def process_directory_in_batches(directory, batch_size=1, metadata=None, annotate=False, pheno_value=1, batch_prefix="batch_"):
    """
    Process VCF files in the specified directory in batches.
    For each batch:
       - Process up to batch_size files using process_vcf.
       - Merge them by columns.
       - Checkpoint the resulting MatrixTable to disk.
    Returns a list of checkpoint file paths.
    """
    files = sorted([Path(directory) / f for f in os.listdir(directory) if f.endswith(".vcf") or f.endswith(".vcf.gz")])
    n_batches = ceil(len(files) / batch_size)
    batch_checkpoints = []
    for i in range(n_batches):
        batch_checkpoint_path = Path(intermediate_dir).joinpath(f"{batch_prefix}{i}.mt")
        batch_files = files[i * batch_size:(i + 1) * batch_size]
        mt_list = []
        if not batch_checkpoint_path.exists():
            for f in batch_files:
                mt = process_vcf(f, metadata=metadata, annotate=annotate)
                mt = mt.annotate_cols(pheno=pheno_value)
                mt_list.append(mt)
                #print(f"Processed file: {f} with entry schema: {mt.entry.dtype}")
            # Merge the batch.
            batch_mt = mt_list[0]
            for mt in mt_list[1:]:
                batch_mt = batch_mt.union_cols(mt)
            batch_mt = batch_mt.write(str(batch_checkpoint_path), overwrite=True)
            del batch_mt
            print(f"Saved batch {i} with {len(batch_files)} VCFs to {batch_checkpoint_path}")
        else:
            print(f"Batch {i} with {len(batch_files)} VCFs exists in {batch_checkpoint_path}. Skipping...")
        batch_checkpoints.append(batch_checkpoint_path)
    return batch_checkpoints

def merge_batches(checkpoint_paths):
    """
    Read intermediate batch MatrixTables and merge them by columns.
    """
    mt_list = [hl.read_matrix_table(str(path)) for path in checkpoint_paths]
    merged = mt_list[0]
    merged = merged.filter_rows(merged.MAX_AF < 0.01, keep=True)
    for mt in mt_list[1:]:
        merged = merged.union_cols(mt)
    return merged

In [4]:
import hail as hl
from pathlib import Path
from math import ceil

def merge_matrix_tables_cols(mt_paths, tmp_dir, tag, chunk_size=10, max_intermediates=10):
    """
    Merge many MatrixTables by columns using a two-level chunked strategy to avoid JVM IR limits.

    Parameters:
        mt_paths (list[str or MatrixTable]): Paths or MatrixTable objects to merge.
        tmp_dir (str or Path): Directory for intermediate checkpoints.
        chunk_size (int): Max number of MTS per merge chunk.
        max_intermediates (int): Max # of intermediate chunks before doing a second-level merge.
    """
    tmp_dir = Path(tmp_dir)
    tmp_dir.mkdir(parents=True, exist_ok=True)

    # Ensure all inputs are file paths (checkpoint in place if needed)
    prepared_paths = []
    for i, mt in enumerate(mt_paths):
        if isinstance(mt, hl.MatrixTable):
            path = tmp_dir / f"input_checkpoint_{i}.mt"
            mt = mt.checkpoint(str(path), overwrite=True)
            prepared_paths.append(str(path))
        else:
            prepared_paths.append(str(mt))

    def _merge_chunks(paths, level):
        chunk_paths = []
        num_chunks = ceil(len(paths) / chunk_size)

        for i in range(num_chunks):
            chunk = paths[i * chunk_size : (i + 1) * chunk_size]
            print(f"[Level {level}] Merging chunk {i+1}/{num_chunks} with {len(chunk)} files")

            mt = hl.read_matrix_table(chunk[0])
            for path in chunk[1:]:
                mt_next = hl.read_matrix_table(path)
                mt = mt.union_cols(mt_next, row_join_type="outer")

            chunk_path = tmp_dir / f"level{level}_chunk_{i}_{tag}.mt"
            mt = mt.checkpoint(str(chunk_path), overwrite=True)
            chunk_paths.append(str(chunk_path))
            del mt

        return chunk_paths

    # First-level merge
    intermediate_paths = _merge_chunks(prepared_paths, level=1)

    # If too many intermediates, do a second-level merge
    if len(intermediate_paths) > max_intermediates:
        print(f"Too many intermediates ({len(intermediate_paths)}), doing second-level merge...")
        intermediate_paths = _merge_chunks(intermediate_paths, level=2)

    # Final merge
    print("Final merge...")
    final_mt = hl.read_matrix_table(intermediate_paths[0])
    for path in intermediate_paths[1:]:
        mt = hl.read_matrix_table(path)
        final_mt = final_mt.union_cols(mt, row_join_type="outer")

    return final_mt


In [5]:

###########################################################################
# Main Processing for Positive and Negative Groups in Batches
###########################################################################
positive_dir = "/mnt/sdb/tmp3/positive_LIHAS_639"
negative_dir = "/mnt/sdb/tmp3/negative_LIHAS_9059"

# # Process negative group in batches.
neg_batch_paths = process_directory_in_batches(
     negative_dir, batch_size=10, metadata=None, annotate=False, pheno_value=0, batch_prefix="neg_batch_")
# Process positive group in batches.
pos_batch_paths = process_directory_in_batches(
    positive_dir, batch_size=10, metadata=None, annotate=False, pheno_value=1, batch_prefix="pos_batch_")

# Merge all batches for each group.
#positive path
#inp = Path("/mnt/sdb/tmp3/skat_mtx/intermediate_batches/")
#pos_batch_paths = sorted([Path(inp) / f for f in os.listdir(inp) if f.endswith(".mt") and f.startswith("pos")])
#neg_batch_paths = sorted([Path(inp) / f for f in os.listdir(inp) if f.endswith(".mt") and f.startswith("neg")])
#print(pos_batch_paths)
mt_positive = merge_matrix_tables_cols(pos_batch_paths, tmp_dir, tag="pos")
mt_positive = mt_positive.checkpoint(os.path.join(skat_mtx_output_dir, "positive_group.mt"), overwrite=True)

print(neg_batch_paths)
mt_negative = merge_matrix_tables_cols(neg_batch_paths, tmp_dir, tag="neg")
mt_negative = mt_negative.checkpoint(os.path.join(skat_mtx_output_dir, "negative_group.mt"), overwrite=True)

#mt_positive = hl.read_matrix_table("/mnt/sdb/tmp3/skat_mtx/positive_group.mt/")
#mt_positive = mt_positive.filter_rows(mt_positive.MAX_AF < 0.01, keep=True)

Batch 0 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_0.mt. Skipping...
Batch 1 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_1.mt. Skipping...
Batch 2 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_2.mt. Skipping...
Batch 3 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_3.mt. Skipping...
Batch 4 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_4.mt. Skipping...
Batch 5 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_5.mt. Skipping...
Batch 6 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_6.mt. Skipping...
Batch 7 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_7.mt. Skipping...
Batch 8 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_8.mt. Skipping...
Batch 9 with 10 VCFs exists in /mnt/s

SLF4J: Failed to load class "org.slf4j.impl.StaticMDCBinder".
SLF4J: Defaulting to no-operation MDCAdapter implementation.
SLF4J: See http://www.slf4j.org/codes.html#no_static_mdc_binder for further details.
                                                                                

Saved batch 768 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_768.mt


                                                                                

Saved batch 769 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_769.mt


                                                                                

Saved batch 770 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_770.mt


                                                                                

Saved batch 771 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_771.mt


                                                                                

Saved batch 772 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_772.mt


                                                                                

Saved batch 773 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_773.mt


                                                                                

Saved batch 774 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_774.mt


                                                                                

Saved batch 775 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_775.mt


                                                                                

Saved batch 776 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_776.mt


                                                                                

Saved batch 777 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_777.mt


                                                                                

Saved batch 778 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_778.mt


                                                                                

Saved batch 779 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_779.mt


                                                                                

Saved batch 780 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_780.mt


                                                                                

Saved batch 781 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_781.mt


                                                                                

Saved batch 782 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_782.mt


                                                                                

Saved batch 783 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_783.mt


                                                                                

Saved batch 784 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_784.mt


                                                                                

Saved batch 785 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_785.mt


                                                                                

Saved batch 786 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_786.mt


                                                                                

Saved batch 787 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_787.mt


                                                                                

Saved batch 788 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_788.mt


                                                                                

Saved batch 789 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_789.mt


                                                                                

Saved batch 790 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_790.mt


                                                                                

Saved batch 791 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_791.mt


                                                                                

Saved batch 792 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_792.mt


                                                                                

Saved batch 793 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_793.mt


                                                                                

Saved batch 794 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_794.mt


                                                                                

Saved batch 795 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_795.mt


                                                                                

Saved batch 796 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_796.mt


                                                                                

Saved batch 797 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_797.mt


                                                                                

Saved batch 798 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_798.mt


                                                                                

Saved batch 799 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_799.mt


                                                                                

Saved batch 800 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_800.mt


                                                                                

Saved batch 801 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_801.mt


                                                                                

Saved batch 802 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_802.mt


                                                                                

Saved batch 803 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_803.mt


                                                                                

Saved batch 804 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_804.mt


                                                                                

Saved batch 805 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_805.mt


                                                                                

Saved batch 806 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_806.mt


                                                                                

Saved batch 807 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_807.mt


                                                                                

Saved batch 808 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_808.mt


                                                                                

Saved batch 809 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_809.mt


                                                                                

Saved batch 810 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_810.mt


                                                                                

Saved batch 811 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_811.mt


                                                                                

Saved batch 812 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_812.mt


                                                                                

Saved batch 813 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_813.mt


                                                                                

Saved batch 814 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_814.mt


                                                                                

Saved batch 815 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_815.mt


                                                                                

Saved batch 816 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_816.mt


                                                                                

Saved batch 817 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_817.mt


                                                                                

Saved batch 818 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_818.mt


                                                                                

Saved batch 819 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_819.mt


                                                                                

Saved batch 820 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_820.mt


                                                                                

Saved batch 821 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_821.mt


                                                                                

Saved batch 822 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_822.mt


                                                                                

Saved batch 823 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_823.mt


                                                                                

Saved batch 824 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_824.mt


                                                                                

Saved batch 825 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_825.mt


                                                                                

Saved batch 826 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_826.mt


                                                                                

Saved batch 827 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_827.mt


                                                                                

Saved batch 828 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_828.mt


                                                                                

Saved batch 829 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_829.mt


                                                                                

Saved batch 830 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_830.mt


                                                                                

Saved batch 831 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_831.mt


                                                                                

Saved batch 832 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_832.mt


                                                                                

Saved batch 833 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_833.mt


                                                                                

Saved batch 834 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_834.mt


                                                                                

Saved batch 835 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_835.mt


                                                                                

Saved batch 836 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_836.mt


                                                                                

Saved batch 837 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_837.mt


                                                                                

Saved batch 838 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_838.mt


                                                                                

Saved batch 839 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_839.mt


                                                                                

Saved batch 840 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_840.mt


                                                                                

Saved batch 841 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_841.mt


                                                                                

Saved batch 842 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_842.mt


                                                                                

Saved batch 843 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_843.mt


                                                                                

Saved batch 844 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_844.mt


                                                                                

Saved batch 845 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_845.mt


                                                                                

Saved batch 846 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_846.mt


                                                                                

Saved batch 847 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_847.mt


                                                                                

Saved batch 848 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_848.mt


                                                                                

Saved batch 849 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_849.mt


                                                                                

Saved batch 850 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_850.mt


                                                                                

Saved batch 851 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_851.mt


                                                                                

Saved batch 852 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_852.mt


                                                                                

Saved batch 853 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_853.mt


                                                                                

Saved batch 854 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_854.mt


                                                                                

Saved batch 855 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_855.mt


                                                                                

Saved batch 856 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_856.mt


                                                                                

Saved batch 857 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_857.mt


                                                                                

Saved batch 858 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_858.mt


                                                                                

Saved batch 859 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_859.mt


                                                                                

Saved batch 860 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_860.mt


                                                                                

Saved batch 861 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_861.mt


                                                                                

Saved batch 862 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_862.mt


                                                                                

Saved batch 863 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_863.mt


                                                                                

Saved batch 864 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_864.mt


                                                                                

Saved batch 865 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_865.mt


                                                                                

Saved batch 866 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_866.mt


                                                                                

Saved batch 867 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_867.mt


                                                                                

Saved batch 868 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_868.mt


                                                                                

Saved batch 869 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_869.mt


                                                                                

Saved batch 870 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_870.mt


                                                                                

Saved batch 871 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_871.mt


                                                                                

Saved batch 872 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_872.mt


                                                                                

Saved batch 873 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_873.mt


                                                                                

Saved batch 874 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_874.mt


                                                                                

Saved batch 875 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_875.mt


                                                                                

Saved batch 876 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_876.mt


                                                                                

Saved batch 877 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_877.mt


                                                                                

Saved batch 878 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_878.mt


                                                                                

Saved batch 879 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_879.mt


                                                                                

Saved batch 880 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_880.mt


                                                                                

Saved batch 881 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_881.mt


                                                                                

Saved batch 882 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_882.mt


                                                                                

Saved batch 883 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_883.mt


                                                                                

Saved batch 884 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_884.mt


                                                                                

Saved batch 885 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_885.mt


                                                                                

Saved batch 886 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_886.mt


                                                                                

Saved batch 887 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_887.mt


                                                                                

Saved batch 888 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_888.mt


                                                                                

Saved batch 889 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_889.mt


                                                                                

Saved batch 890 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_890.mt


                                                                                

Saved batch 891 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_891.mt


                                                                                

Saved batch 892 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_892.mt


                                                                                

Saved batch 893 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_893.mt


                                                                                

Saved batch 894 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_894.mt


                                                                                

Saved batch 895 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_895.mt


                                                                                

Saved batch 896 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_896.mt


                                                                                

Saved batch 897 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_897.mt


                                                                                

Saved batch 898 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_898.mt


                                                                                

Saved batch 899 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_899.mt


                                                                                

Saved batch 900 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_900.mt


                                                                                

Saved batch 901 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_901.mt


                                                                                

Saved batch 902 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_902.mt


                                                                                

Saved batch 903 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_903.mt


                                                                                

Saved batch 904 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_904.mt


                                                                                

Saved batch 905 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_905.mt


                                                                                

Saved batch 906 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_906.mt


                                                                                

Saved batch 907 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_907.mt


                                                                                

Saved batch 908 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_908.mt


                                                                                

Saved batch 909 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_909.mt


                                                                                

Saved batch 910 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_910.mt


                                                                                

Saved batch 911 with 10 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_911.mt
Saved batch 912 with 1 VCFs to /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/neg_batch_912.mt
Batch 0 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/pos_batch_0.mt. Skipping...
Batch 1 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/pos_batch_1.mt. Skipping...
Batch 2 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/pos_batch_2.mt. Skipping...
Batch 3 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/pos_batch_3.mt. Skipping...
Batch 4 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/pos_batch_4.mt. Skipping...
Batch 5 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/pos_batch_5.mt. Skipping...
Batch 6 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/intermediate_batches/pos_batch_6.mt. Skipping...
Batch 7 with 10 VCFs exists in /mnt/sdb/tmp-skat/skat_mtx/

                                                                                

[Level 2] Merging chunk 2/10 with 10 files


                                                                                

[Level 2] Merging chunk 3/10 with 10 files
[Level 2] Merging chunk 4/10 with 10 files


                                                                                

[Level 2] Merging chunk 5/10 with 10 files
[Level 2] Merging chunk 6/10 with 10 files
[Level 2] Merging chunk 7/10 with 10 files
[Level 2] Merging chunk 8/10 with 10 files
[Level 2] Merging chunk 9/10 with 10 files
[Level 2] Merging chunk 10/10 with 2 files
Final merge...


                                                                                

In [6]:

print("Positive group entry schema:")
print(mt_positive.entry.dtype)
print("Negative group entry schema:")
print(mt_negative.entry.dtype)


# Merge the positive and negative groups.
mt_final = merge_matrix_tables_cols([mt_positive, mt_negative], tmp_dir, tag="final")
# Checkpoint the final merged MatrixTable.
mt_final = mt_final.checkpoint(os.path.join(skat_mtx_output_dir, "final_skat_matrix.mt"), overwrite=True)

# *** FIX KEYS: Unkey and rekey rows to ensure proper ordering. ***
mt_final = mt_final.key_rows_by()  # Remove existing row key.
mt_final = mt_final.key_rows_by(locus=mt_final.locus, alleles=mt_final.alleles)
mt_final = mt_final.repartition(100)  # Repartition as needed.


# Describe the final MatrixTable and show sample (column) table.
mt_final.describe()
mt_final.GT.show()

###########################################################################
# Step 2: SKAT Analysis
###########################################################################
# Compute PCA on the genotype (GT) values; compute 3 components.
eigenvalues, pca_scores, loadings = hl.hwe_normalized_pca(mt_final.GT, k=3)
print("PCA eigenvalues:", eigenvalues)

# Annotate each column with PCA scores.
mt_final = mt_final.annotate_cols(pca=pca_scores[mt_final.s])
print("Column annotation (PCA) sample:")
mt_final.cols().select("pca").show(5)

# Run variant QC.
mt_final = hl.variant_qc(mt_final)

# Annotate rows with a SKAT weight (using the Beta density based on the alternate allele frequency).
mt_final = mt_final.annotate_rows(weight=hl.dbeta(mt_final.variant_qc.AF[1], 1, 25))
mt_final = mt_final.annotate_rows(var_af=mt_final.variant_qc.AF)

###########################################################################
# Step 3: Run SKAT
###########################################################################
skat_results_ht = hl.skat(
    mt_final.gene,                 # Grouping key: gene (extracted from the CSQ annotation)
    mt_final.weight,               # Variant-level weight
    y=mt_final.pheno,              # Outcome: phenotype (e.g. 1 for case, 0 for control)
    x=mt_final.GT.n_alt_alleles(),  # Genotype: alternate allele count
    covariates=[
        1.0,                          # Intercept
        mt_final.pca.scores[0],       # PCA component 1
        mt_final.pca.scores[1],       # PCA component 2
        mt_final.pca.scores[2]        # PCA component 3
    ]
).checkpoint(
    f"{out_dir}/hgdp-tgp-rare-variants.skat_results.ht",
    overwrite=True
)
skat_results_ht.show(10)

###########################################################################
# Clean Up
###########################################################################
hl.stop()

Positive group entry schema:
struct{AD: array<int32>, DP: int32, GQ: int32, GT: call, PL: array<int32>, AC: int32}
Negative group entry schema:
struct{AD: array<int32>, DP: int32, GQ: int32, GT: call, PL: array<int32>, AC: int32}
[Level 1] Merging chunk 1/1 with 2 files


                                                                                

Final merge...
----------------------------------------
Global fields:
    'prefix': str
----------------------------------------
Column fields:
    's': str
    'pheno': int32
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'gene': str
    'HGNC_ID': int32
    'MAX_AF': float64
----------------------------------------
Entry fields:
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'GT': call
    'PL': array<int32>
    'AC': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


                                                                                

Unnamed: 0_level_0,Unnamed: 1_level_0,'E00643964_S5','E00737113_S4','E00995080_S6','E01002684_TSO0054'
locus,alleles,GT,GT,GT,GT
locus<GRCh37>,array<str>,call,call,call,call
1:1268003,"[""C"",""T""]",,,,
1:2235330,"[""C"",""T""]",,,,
1:2538417,"[""G"",""A""]",,,,
1:6111591,"[""G"",""A""]",,,,
1:6500676,"[""T"",""C""]",,,,
1:6505848,"[""G"",""C""]",,,,
1:6505851,"[""A"",""C""]",,,,
1:6505857,"[""G"",""C""]",,,,
1:6505881,"[""A"",""C""]",,,,
1:6505887,"[""A"",""C""]",,,,


                                                                                

PCA eigenvalues: [5.390712008304009, 4.080476437095585, 2.650305953467556]
Column annotation (PCA) sample:


Unnamed: 0_level_0,pca
s,scores
str,array<float64>
"""104220-023-001""","[3.77e-05,-2.18e-03,3.36e-04]"
"""104220-023-001""","[3.77e-05,-2.18e-03,3.36e-04]"
"""104220-023-002""","[-3.55e-08,2.84e-06,-5.86e-07]"
"""104220-023-003""","[1.13e-08,-9.83e-07,1.50e-07]"
"""104220-023-004""","[1.13e-08,-9.83e-07,1.50e-07]"


                                                                                

id,size,q_stat,p_value,fault
str,int32,float64,float64,int32
"""TTTY14""",1,3.09e-13,0.000445,0


In [12]:
init_spark_and_run(conf, out_dir, tmp_dir)
skat_results_ht = hl.read_table(f"{out_dir}/hgdp-tgp-rare-variants.skat_results.ht")

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.5.4
SparkUI available at http://tu-kliin-1-oligogen-1-virtual--1475-7.openstacklocal:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.133-4c60fddb171a
LOGGING: writing to /mnt/sdb/tmp-skat/logfile-20250604-0059.skat.log


[('spark.repl.local.jars', ''), ('spark.local.dir', '/mnt/sdb/tmp-skat'), ('spark.kryo.registrator', 'is.hail.kryo.HailKryoRegistrator'), ('spark.logConf', 'false'), ('spark.driver.host', 'tu-kliin-1-oligogen-1-virtual--1475-7.openstacklocal'), ('spark.driver.port', '44947'), ('spark.driver.memory', '21g'), ('spark.serializer', 'org.apache.spark.serializer.KryoSerializer'), ('spark.jars', '/opt/global-venv/lib64/python3.9/site-packages/hail/backend/hail-all-spark.jar'), ('spark.serializer.objectStreamReset', '100'), ('spark.executor.extraClassPath', './hail-all-spark.jar'), ('spark.eventLog.dir', '/mnt/sdb/tmp-skat/'), ('spark.master', 'local[*]'), ('spark.app.submitTime', '1748984719303'), ('spark.executor.cores', '3'), ('spark.submit.deployMode', 'client'), ('spark.driver.extraJavaOptions', '-Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lan

In [33]:
from hail.ggplot import *

skat_results_ht = skat_results_ht.annotate(
    p_value=hl.if_else(
        skat_results_ht.fault == 0,
        skat_results_ht.p_value,
        1,
    )
)
fig = ggplot(skat_results_ht) + geom_col(
    aes(
        x=skat_results_ht.id,
        y=-hl.log(skat_results_ht.p_value, base=10),
    )
)
fig._repr_html_(Path(skat_mtx_output_dir).joinpath("skat_html"))

TypeError: _repr_html_() takes 1 positional argument but 2 were given

In [9]:
skat_pd.sort_values(by="size", ascending=False)

Unnamed: 0,id,size,q_stat,p_value,fault,bonferroni_threshold,is_significant,log_p
0,TTTY14,1,0.0,0.000445,0,0.05,True,1.30103


In [10]:
mt_pre = mt_final.group_rows_by(mt_final.gene).aggregate(size=hl.agg.count()).cols()
mt_pre.summarize()

0,1
Non-missing,9782 (100.00%)
Missing,0
Min Size,8
Max Size,17
Mean Size,11.79
Sample Values,"['104220-023-001', '104220-023-001', '104220-023-002', '104220-023-003', '104220-023-004']"

0,1
Non-missing,9782 (100.00%)
Missing,0
Minimum,0
Maximum,1
Mean,0.07
Std Dev,0.25

0,1
Non-missing,9782 (100.00%)
Missing,0

0,1
Non-missing,9782 (100.00%)
Missing,0
Min Size,3
Max Size,3
Mean Size,3.00

0,1
Non-missing,29346 (100.00%)
Missing,0
Minimum,-0.31
Maximum,0.10
Mean,-0.00
Std Dev,0.02


In [14]:
skat_results_ht.summarize()

0,1
Non-missing,1 (100.00%)
Missing,0
Min Size,6
Max Size,6
Mean Size,6.00
Sample Values,['TTTY14']

0,1
Non-missing,1 (100.00%)
Missing,0
Minimum,1
Maximum,1
Mean,1.00
Std Dev,0.00

0,1
Non-missing,1 (100.00%)
Missing,0
Minimum,0.00
Maximum,0.00
Mean,0.00
Std Dev,0.00

0,1
Non-missing,1 (100.00%)
Missing,0
Minimum,0.00
Maximum,0.00
Mean,0.00
Std Dev,0.00

0,1
Non-missing,1 (100.00%)
Missing,0
Minimum,0
Maximum,0
Mean,0.00
Std Dev,0.00


In [17]:
mt = hl.read_matrix_table("/mnt/sdb/tmp-skat/skat_mtx/final_skat_matrix.mt/")

In [18]:
mt.describe()

----------------------------------------
Global fields:
    'prefix': str
----------------------------------------
Column fields:
    's': str
    'pheno': int32
----------------------------------------
Row fields:
    'locus': locus<GRCh37>
    'alleles': array<str>
    'gene': str
    'HGNC_ID': int32
    'MAX_AF': float64
----------------------------------------
Entry fields:
    'AD': array<int32>
    'DP': int32
    'GQ': int32
    'GT': call
    'PL': array<int32>
    'AC': int32
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------


In [24]:
n_variant_entries = mt.aggregate_entries(hl.agg.count_where(mt.GT.is_non_ref()))
print(f"Total variant entries: {n_variant_entries}")

Total variant entries: 174840


In [28]:
# Step 1: Annotate each column (sample) with its variant burden
mt = mt.annotate_cols(
    variant_burden = hl.agg.count_where(mt.GT.is_non_ref())
)

# Step 2: Convert to a Table so we can aggregate by phenotype
sample_table = mt.cols()

# Step 3: Group by phenotype and aggregate stats
burden_summary = sample_table.group_by(sample_table.pheno).aggregate(
    total_burden = hl.agg.sum(sample_table.variant_burden),
    mean_burden = hl.agg.mean(sample_table.variant_burden),
    median_burden = hl.agg.approx_median(sample_table.variant_burden),
    std_burden = hl.agg.stats(sample_table.variant_burden).stdev,
    n_samples = hl.agg.count()
)

# Step 4: Show the results
burden_summary.show()


pheno,total_burden,mean_burden,median_burden,std_burden,n_samples
int32,int64,float64,int64,float64,int64
0,160152,17.6,2,27.5,9121
1,14688,22.2,2,30.2,661
