#####################################################

#### To compile Go modules with C types to work with Python run:
#### go build -o functions.so -buildmode=c-shared functions/functions.go

#####################################################

# Main function

In [None]:
import os
import sys

import hail as hl
import pyspark


hail_jars = os.path.join(
    os.sep.join(sys.executable.split(os.sep)[:6]),
    "lib",
    f"python{sys.version_info.major}.{sys.version_info.minor}",
    "site-packages",
    "hail",
    "backend",
    "hail-all-spark.jar",
)

CONF = pyspark.SparkConf().setAll(
    [
        ("spark.master", f"local[{20}]"),
        ("spark.app.name", "Hail_demonstration"),
        ("spark.jars", hail_jars),
        ("spark.driver.extraClassPath", hail_jars),
        ("spark.driver.memory", "30g"),
        ("spark.executor.extraClassPath", "./hail-all-spark.jar"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.kryo.registrator", "is.hail.kryo.HailKryoRegistrator"),
    ]
)

sc = pyspark.SparkContext(conf=CONF)

hl.init(default_reference="GRCh38", sc=sc)


Using hl.init with a default_reference argument is deprecated. To set a default reference genome after initializing hail, call `hl.default_reference` with an argument to set the default reference genome.

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.5.6
SparkUI available at http://10.200.8.19:4041
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.135-034ef3e08116
LOGGING: writing to /home/froschin/work/hail/hail-20250821-2007-0.2.135-034ef3e08116.log


2025-08-21 20:08:05.335 Hail: WARN: You are trying to read /home/froschin/work/hail/data/subset.vcf with *ONE* core of parallelism. This will be very slow. If this file is block-gzipped (bgzip-ed), use force_bgz=True instead.
2025-08-21 20:08:08.654 Hail: INFO: scanning VCF for sortedness...
2025-08-21 20:08:13.228 Hail: INFO: Coerced sorted VCF - no additional import work to do
2025-08-21 20:08:38.047 Hail: INFO: wrote matrix table with 16028 rows and 836 columns in 3 partitions to /home/froschin/work/hail/data/matrix_table.mt


In [8]:
from matrix_table_consumer import MatrixTableConsumer

In [9]:
vcf_mini_path = "/home/froschin/work/hail/data/test_vcf_for_hail.vcf"  # GRCh37
vcf_subset_path = "/home/froschin/work/hail/data/subset.vcf"  # GRCh38
vcf_big_path = "/mnt/Storage/testdata/Temp/kirillovba_ethno/P2.vcf.gz"  # GRCh38

matrix_table_path = "/home/froschin/work/hail/data/matrix_table.mt"
json_path = "/home/froschin/work/hail/data/matrix.json"

In [10]:
mt = hl.import_vcf(
    vcf_subset_path,
    reference_genome="GRCh38",
    array_elements_required=False,
    force=True,
)
mt.show()

                                                                                

locus,alleles
locus<GRCh38>,array<str>
chr1:1161907,"[""T"",""C""]"
chr1:1361836,"[""A"",""G""]"
chr1:1716247,"[""A"",""C""]"
chr1:1734301,"[""T"",""C""]"
chr1:2109497,"[""T"",""C""]"
chr1:4323996,"[""C"",""T""]"
chr1:4437070,"[""G"",""A""]"
chr1:4537795,"[""T"",""C""]"
chr1:4597062,"[""G"",""A""]"
chr1:4934881,"[""G"",""A""]"


In [11]:
consumer = MatrixTableConsumer()

# Save MatrixTable

In [12]:
content = {}
content = consumer.prepare_metadata_for_saving(json_path=json_path, mt=mt)

[21-08-2025 20:08:23] - INFO - Prepare metadata for saving
[21-08-2025 20:08:23] - INFO - Extract fields
[21-08-2025 20:08:23] - INFO - Extract fields end
[21-08-2025 20:08:23] - INFO - Save json
[21-08-2025 20:08:23] - INFO - End


# Download MatrixTable

In [13]:
mt_new = consumer.prepare_metadata_for_loading(json_path=json_path)

mt_new.write(matrix_table_path, overwrite=True)
mt_new.count()

[21-08-2025 20:08:26] - INFO - Prepare metadata for loading
[21-08-2025 20:08:26] - INFO - Compress fields
[21-08-2025 20:08:26] - INFO - Create matrix table


                                                                                

(16028, 836)

# Collect rows

In [14]:
consumer = MatrixTableConsumer()
rows = consumer.collect(num_rows=1000, vcf_path=vcf_big_path, is_gzip=True)
rows[:5]

[21-08-2025 20:08:47] - INFO - Collecting data
[21-08-2025 20:08:47] - INFO - Finish


[{'POS': 10018,
  'QUAL': 12,
  'CHROM': 'chr1',
  'ID': 'chr1_10018_C_CT',
  'REF': 'C',
  'ALT': 'CT',
  'FILTER': '.',
  'INFO': 'AF=0.00130719;AQ=12;NS=765;AN=1530;MAF=0.00130719;AC=2;AC_Het=0;AC_Hom=2;AC_Hemi=0;HWE=0.000654022;ExcHet=1'},
 {'POS': 10048,
  'QUAL': 10,
  'CHROM': 'chr1',
  'ID': 'chr1_10048_CT_C',
  'REF': 'CT',
  'ALT': 'C',
  'FILTER': '.',
  'INFO': 'AF=0.00622776;AQ=10;NS=562;AN=1124;MAF=0.00622776;AC=7;AC_Het=7;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=0.981383'},
 {'POS': 10054,
  'QUAL': 11,
  'CHROM': 'chr1',
  'ID': 'chr1_10054_C_T',
  'REF': 'C',
  'ALT': 'T',
  'FILTER': '.',
  'INFO': 'AF=0.000740741;AQ=11;NS=675;AN=1350;MAF=0.000740741;AC=1;AC_Het=1;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1'},
 {'POS': 10060,
  'QUAL': 11,
  'CHROM': 'chr1',
  'ID': 'chr1_10060_C_CT',
  'REF': 'C',
  'ALT': 'CT',
  'FILTER': '.',
  'INFO': 'AF=0.000725689;AQ=11;NS=689;AN=1378;MAF=0.000725689;AC=1;AC_Het=1;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1'},
 {'POS': 10072,
  'QUAL': 12,
  'CHROM': 'chr

In [15]:
consumer = MatrixTableConsumer()
rows = consumer.collect(num_rows=1, vcf_path=vcf_mini_path, is_gzip=False)
rows

[21-08-2025 20:08:52] - INFO - Collecting data
[21-08-2025 20:08:52] - INFO - Finish


[{'POS': 338615,
  'QUAL': 90,
  'CHROM': '1',
  'ID': '2659854',
  'REF': 'G',
  'ALT': 'A',
  'FILTER': '.',
  'INFO': 'ALLELEID=2823725;CLNDISDB=MedGen:C3661900;CLNDN=not_provided;CLNHGVS=NC_000024.10:g.338615G>A;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGSCV=SCV004164128;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=ClinGen:CA10328741;GENEINFO=PPP2R3B:28227;MC=SO:0001819|synonymous_variant;ORIGIN=1;RS=773605704'}]

In [16]:
rows = consumer.collect_all(vcf_path=vcf_subset_path, is_gzip=False)
rows[:2]

[21-08-2025 20:08:57] - INFO - Collecting data
[21-08-2025 20:08:58] - INFO - Finish


[{'POS': 1161907,
  'QUAL': 76,
  'CHROM': 'chr1',
  'ID': 'chr1_1161907_T_C',
  'REF': 'T',
  'ALT': 'C',
  'FILTER': 'PASS',
  'INFO': 'AF=0.909856;AQ=76;NS=832;AN=1664;MAF=0.0901442;AC=1514;AC_Het=128;AC_Hom=1386;AC_Hemi=0;HWE=0.0867318;ExcHet=0.9724'},
 {'POS': 1361836,
  'QUAL': 78,
  'CHROM': 'chr1',
  'ID': 'chr1_1361836_A_G',
  'REF': 'A',
  'ALT': 'G',
  'FILTER': 'PASS',
  'INFO': 'AF=0.906587;AQ=78;NS=835;AN=1670;MAF=0.0934132;AC=1514;AC_Het=134;AC_Hom=1380;AC_Hemi=0;HWE=0.14797;ExcHet=0.952941'}]

In [17]:
rows = consumer.collect_all(vcf_path=vcf_big_path, is_gzip=True)
rows[:2]

[21-08-2025 20:09:02] - INFO - Collecting data
[21-08-2025 20:09:03] - INFO - Finish


reading standard input: bufio.Scanner: token too long

[{'POS': 10018,
  'QUAL': 12,
  'CHROM': 'chr1',
  'ID': 'chr1_10018_C_CT',
  'REF': 'C',
  'ALT': 'CT',
  'FILTER': '.',
  'INFO': 'AF=0.00130719;AQ=12;NS=765;AN=1530;MAF=0.00130719;AC=2;AC_Het=0;AC_Hom=2;AC_Hemi=0;HWE=0.000654022;ExcHet=1'},
 {'POS': 10048,
  'QUAL': 10,
  'CHROM': 'chr1',
  'ID': 'chr1_10048_CT_C',
  'REF': 'CT',
  'ALT': 'C',
  'FILTER': '.',
  'INFO': 'AF=0.00622776;AQ=10;NS=562;AN=1124;MAF=0.00622776;AC=7;AC_Het=7;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=0.981383'}]

In [18]:
print(len(rows))

7409


In [19]:
rows_hail = consumer.convert_rows_to_hail(rows=rows, reference_genome="GRCh38")
rows_hail[:5]

[Struct(locus=Locus(contig=chr1, position=10018, reference_genome=GRCh38), alleles=['C', 'CT'], rsid='chr1_10018_C_CT', qual=12, filters=None, info=Struct(info='AF=0.00130719;AQ=12;NS=765;AN=1530;MAF=0.00130719;AC=2;AC_Het=0;AC_Hom=2;AC_Hemi=0;HWE=0.000654022;ExcHet=1'), entries=[]),
 Struct(locus=Locus(contig=chr1, position=10048, reference_genome=GRCh38), alleles=['CT', 'C'], rsid='chr1_10048_CT_C', qual=10, filters=None, info=Struct(info='AF=0.00622776;AQ=10;NS=562;AN=1124;MAF=0.00622776;AC=7;AC_Het=7;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=0.981383'), entries=[]),
 Struct(locus=Locus(contig=chr1, position=10054, reference_genome=GRCh38), alleles=['C', 'T'], rsid='chr1_10054_C_T', qual=11, filters=None, info=Struct(info='AF=0.000740741;AQ=11;NS=675;AN=1350;MAF=0.000740741;AC=1;AC_Het=1;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1'), entries=[]),
 Struct(locus=Locus(contig=chr1, position=10060, reference_genome=GRCh38), alleles=['C', 'CT'], rsid='chr1_10060_C_CT', qual=11, filters=None, info=Struct(inf

In [20]:
t = consumer.create_hail_table(rows=rows_hail, reference_genome="GRCh38")
t.show()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,info,Unnamed: 6_level_0
locus,alleles,rsid,qual,filters,info,entries
locus<GRCh38>,array<str>,str,int32,str,str,array<struct{}>
chr1:10018,"[""C"",""CT""]","""chr1_10018_C_CT""",12,,"""AF=0.00130719;AQ=12;NS=765;AN=1530;MAF=0.00130719;AC=2;AC_Het=0;AC_Hom=2;AC_Hemi=0;HWE=0.000654022;ExcHet=1""",[]
chr1:10048,"[""CT"",""C""]","""chr1_10048_CT_C""",10,,"""AF=0.00622776;AQ=10;NS=562;AN=1124;MAF=0.00622776;AC=7;AC_Het=7;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=0.981383""",[]
chr1:10054,"[""C"",""T""]","""chr1_10054_C_T""",11,,"""AF=0.000740741;AQ=11;NS=675;AN=1350;MAF=0.000740741;AC=1;AC_Het=1;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1""",[]
chr1:10060,"[""C"",""CT""]","""chr1_10060_C_CT""",11,,"""AF=0.000725689;AQ=11;NS=689;AN=1378;MAF=0.000725689;AC=1;AC_Het=1;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1""",[]
chr1:10072,"[""C"",""CT""]","""chr1_10072_C_CT""",12,,"""AF=0.000662252;AQ=12;NS=755;AN=1510;MAF=0.000662252;AC=1;AC_Het=1;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=1""",[]
chr1:10108,"[""C"",""CT""]","""chr1_10108_C_CT""",18,,"""AF=0.0808511;AQ=18;NS=355;AN=705;MAF=0.0808511;AC=57;AC_Het=48;AC_Hom=4;AC_Hemi=5;HWE=1;ExcHet=0.707804""",[]
chr1:10138,"[""T"",""G""]","""chr1_10138_T_G""",11,,"""AF=0.00134953;AQ=11;NS=741;AN=1482;MAF=0.00134953;AC=2;AC_Het=2;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=0.999325""",[]
chr1:10144,"[""T"",""G""]","""chr1_10144_T_G""",12,,"""AF=0.00549451;AQ=12;NS=637;AN=1274;MAF=0.00549451;AC=7;AC_Het=3;AC_Hom=4;AC_Hemi=0;HWE=6.47933e-05;ExcHet=1""",[]
chr1:10146,"[""AC"",""A""]","""chr1_10146_AC_A""",16,,"""AF=0.226471;AQ=16;NS=343;AN=680;MAF=0.226471;AC=154;AC_Het=40;AC_Hom=108;AC_Hemi=6;HWE=3.06442e-29;ExcHet=1""",[]
chr1:10150,"[""C"",""T""]","""chr1_10150_C_T""",11,,"""AF=0.00948276;AQ=11;NS=580;AN=1160;MAF=0.00948276;AC=11;AC_Het=11;AC_Hom=0;AC_Hemi=0;HWE=1;ExcHet=0.953279""",[]


# Create MatrixTable from Table

In [21]:
mt_from_table = consumer.combine_hail_matrix_table_and_table(mt=mt, table=t)
mt_from_table.show()

locus,alleles
locus<GRCh38>,array<str>
chr1:1161907,"[""T"",""C""]"
chr1:1361836,"[""A"",""G""]"
chr1:1716247,"[""A"",""C""]"
chr1:1734301,"[""T"",""C""]"
chr1:2109497,"[""T"",""C""]"
chr1:4323996,"[""C"",""T""]"
chr1:4437070,"[""G"",""A""]"
chr1:4537795,"[""T"",""C""]"
chr1:4597062,"[""G"",""A""]"
chr1:4934881,"[""G"",""A""]"


In [22]:
mt_from_table.count()

(16028, 836)