In [ ]:
import hail as hl
import os

In [ ]:
# Initialize Hail with its default settings (no arguments are passed).
hl.init()

In [ ]:
# Example inputs for the vcfs_to_matrixtable function
vcf_path = "./tests-t/in_silico_sorted.vcf" # Path to a single example VCF file
destination_path = "./extra/output/mt/in_silico_sorted.mt" # Destination for the MatrixTable

In [ ]:
def _collect_vcf_paths(f):
    """Resolve ``f`` into a list of VCF paths (accepted forms: see vcfs_to_matrixtable)."""
    if isinstance(f, (list, tuple)):
        # Caller-supplied collections are passed through unchecked, as before.
        return list(f)

    path = os.fspath(f)
    # ".bgz" must be listed explicitly: "x.vcf.bgz" does not end with ".gz".
    if path.endswith((".vcf", ".gz", ".bgz")):
        if not os.path.exists(path):
            raise FileNotFoundError(f"Path {path} does not exist.")
        return [path]

    # Anything else is treated as a text file listing one VCF path per line.
    files = []
    with open(path) as vcflist:
        for line in vcflist:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline) are not paths; skip them.
                continue
            if not os.path.exists(stripped):
                raise FileNotFoundError(f"Path {stripped} does not exist.")
            files.append(stripped)
    return files


def vcfs_to_matrixtable(f, destination=None, write=True, annotate=True):
    """
    Converts VCF files to Hail MatrixTable format, optionally annotates with VEP, and writes to disk.

    Args:
    f (str, path-like, list or tuple): Path to a single VCF file (ending in ".vcf", ".gz" or
        ".bgz"), a list/tuple of VCF file paths, or a text file containing one VCF file path
        per line (blank lines are ignored).
    destination (str): Path where the MatrixTable should be written. If None, nothing is
        written even when ``write`` is True.
    write (bool): If True, writes the MatrixTable to the specified destination.
    annotate (bool): If True, drops rows whose first alternate allele is "*" and annotates
        the remaining rows with VEP.

    Returns:
    Hail MatrixTable: The loaded (and optionally annotated) MatrixTable.

    Raises:
    FileNotFoundError: If a single VCF path, or a path named in a list file, does not exist.
    ValueError: If no VCF paths were supplied.
    FileExistsError: If writing is requested and ``destination`` already exists.
    """
    files = _collect_vcf_paths(f)
    if not files:
        raise ValueError("No VCF files to import.")

    # Fail fast on an occupied destination, before any Hail work is set up.
    should_write = write and destination is not None
    if should_write and os.path.exists(destination):
        raise FileExistsError(f"Destination {destination} already exists.")

    # Setup contig recoding to standardize chromosome naming conventions
    # ("chr1" -> "1", ..., "chrX" -> "X", "chrY" -> "Y").
    contig_prefix = "chr"
    contig_recoding = {
        f"{contig_prefix}{name}": str(name) for name in [*range(1, 23), "X", "Y"]
    }

    # Import VCF(s) as MatrixTable
    mt = hl.import_vcf(files, force=True, reference_genome="GRCh37", contig_recoding=contig_recoding)

    if annotate:
        # Filter out star alleles as they can cause issues with certain annotations
        mt = mt.filter_rows(mt.alleles[1] != "*")
        # Annotate with VEP. TODO: point this at the correct VEP JSON config file.
        mt = hl.vep(mt, config="./src/config/vep_settings.json", csq=True)

    if should_write:
        parent = os.path.dirname(destination)
        if parent:
            # A bare name such as "out.mt" has no parent; os.makedirs("") would raise.
            os.makedirs(parent, exist_ok=True)
        mt.write(destination)

    return mt

# Convert the example VCF to a MatrixTable, annotate with VEP, and write to disk
mt = vcfs_to_matrixtable(vcf_path, destination=destination_path, write=True, annotate=True)
# Further analysis on the MatrixTable 'mt' can proceed from here