#####################################################

#### To compile the Go module into a C shared library that Python can load, run:
#### go build -o main.so -buildmode=c-shared functions/main.go

#####################################################

In [None]:
import os
import sys

import hail as hl
import pyspark


# Number of local Spark worker threads; tune to the cores available on this machine.
SPARK_LOCAL_CORES = 20

# Path to the JAR bundled with the pip-installed Hail package. Resolving it
# from the imported module works for any interpreter/virtualenv layout, unlike
# slicing a fixed number of components off sys.executable.
hail_jars = os.path.join(
    os.path.dirname(hl.__file__),
    "backend",
    "hail-all-spark.jar",
)

# Spark settings Hail asks for when it is handed an existing SparkContext
# (see the "pip-installed Hail requires additional configuration" notice).
CONF = pyspark.SparkConf().setAll(
    [
        ("spark.master", f"local[{SPARK_LOCAL_CORES}]"),
        ("spark.app.name", "Hail_demonstration"),
        ("spark.jars", hail_jars),
        ("spark.driver.extraClassPath", hail_jars),
        ("spark.driver.memory", "30g"),
        ("spark.executor.extraClassPath", "./hail-all-spark.jar"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.kryo.registrator", "is.hail.kryo.HailKryoRegistrator"),
    ]
)

# getOrCreate reuses a live context, so re-running this cell does not fail with
# "Cannot run multiple SparkContexts at once". Note that CONF is only applied
# when the context is created here for the first time.
sc = pyspark.SparkContext.getOrCreate(conf=CONF)

# idempotent=True turns a repeated init into a no-op instead of an error.
hl.init(sc=sc, idempotent=True)

# Passing default_reference to hl.init is deprecated; set it after init instead.
hl.default_reference("GRCh38")


Using hl.init with a default_reference argument is deprecated. To set a default reference genome after initializing hail, call `hl.default_reference` with an argument to set the default reference genome.

pip-installed Hail requires additional configuration options in Spark referring
  to the path to the Hail Python module directory HAIL_DIR,
  e.g. /path/to/python/site-packages/hail:
    spark.jars=HAIL_DIR/backend/hail-all-spark.jar
    spark.driver.extraClassPath=HAIL_DIR/backend/hail-all-spark.jar
    spark.executor.extraClassPath=./hail-all-spark.jarRunning on Apache Spark version 3.5.6
SparkUI available at http://192.168.88.24:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.135-034ef3e08116
LOGGING: writing to /home/phil/GitHub/matrix_table_consumer/hail-20250822-2309-0.2.135-034ef3e08116.log


2025-08-22 23:09:56.256 Hail: WARN: You are trying to read ./data/test1.vcf with *ONE* core of parallelism. This will be very slow. If this file is block-gzipped (bgzip-ed), use force_bgz=True instead.
2025-08-22 23:09:58.360 Hail: INFO: scanning VCF for sortedness...
2025-08-22 23:10:00.400 Hail: INFO: Coerced sorted VCF - no additional import work to do
2025-08-22 23:10:02.577 Hail: WARN: You are trying to read ./data/test1.vcf with *ONE* core of parallelism. This will be very slow. If this file is block-gzipped (bgzip-ed), use force_bgz=True instead.
2025-08-22 23:10:07.322 Hail: WARN: You are trying to read ./data/test1.vcf with *ONE* core of parallelism. This will be very slow. If this file is block-gzipped (bgzip-ed), use force_bgz=True instead.
2025-08-22 23:10:37.552 Hail: INFO: wrote matrix table with 13 rows and 1 column in 1 partition to ./data/matrix_table.mt
2025-08-22 23:10:41.975 Hail: INFO: wrote matrix table with 13 rows and 1 column in 1 partition to ./data/matrix_tab

In [1]:
from matrix_table_consumer import MatrixTableConsumer, NUM_CPU

In [2]:
# Directory holding every input and output file used by this notebook.
DATA_DIR = "./data"

# 1000 Genomes phase 3 chr1 genotypes (GRCh37), downloaded from
# https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/
vcf_big_path = f"{DATA_DIR}/ALL.chr1.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz"
# Small test VCF (GRCh38).
vcf_mini_path = f"{DATA_DIR}/test1.vcf"

# Outputs written by the cells below.
matrix_table_path = f"{DATA_DIR}/matrix_table.mt"
json_path = f"{DATA_DIR}/matrix.json"

In [5]:
# Import the small GRCh38 VCF as a MatrixTable.
# `force=True` is omitted on purpose: it exists to load gzip-compressed
# .vcf.gz files serially, and this input is an uncompressed .vcf. With the flag
# set, Hail logged "trying to read ./data/test1.vcf with *ONE* core of
# parallelism" on every read of this file.
mt = hl.import_vcf(
    vcf_mini_path,
    reference_genome="GRCh38",
    array_elements_required=False,
)
mt.show()

Unnamed: 0_level_0,Unnamed: 1_level_0,'tumor'
locus,alleles,GT
locus<GRCh38>,array<str>,call
chr1:13100,"[""C"",""T""]",0/1
chr1:25734793,"[""C"",""T""]",0/1
chr1:37323930,"[""C"",""T""]",0/1
chr2:1234,"[""C"",""T""]",0/1
chr2:1235,"[""C"",""T""]",0/1
chr3:1234,"[""C"",""T""]",0/1
chr4:1234,"[""C"",""T""]",0/1
chr4:1235,"[""C"",""T""]",0/1
chr4:1236,"[""C"",""T""]",0/1
chr5:1234,"[""C"",""T""]",0/1


In [8]:
# Consumer bound to the small, uncompressed GRCh38 VCF.
consumer = MatrixTableConsumer(
    vcf_path=vcf_mini_path,
    is_gzip=False,
    reference_genome="GRCh38",
)

# Save MatrixTable

In [9]:
# Extract the metadata of `mt` for saving; `json_path` is passed as the
# destination (presumably the call writes the JSON there — verify against
# MatrixTableConsumer). The returned value is kept in `content` for inspection.
content = consumer.prepare_metadata_for_saving(json_path=json_path, mt=mt)

Extracting fields:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting fields: 100%|██████████| 2/2 [00:00<00:00, 34.10it/s]


# Load MatrixTable

In [None]:
# Rebuild a table object from the metadata stored at json_path
# (presumably a Hail MatrixTable, given the .write/.count calls below — confirm).
mt_new = consumer.prepare_metadata_for_loading(json_path=json_path)

# Persist it at matrix_table_path, replacing any existing table there.
mt_new.write(matrix_table_path, overwrite=True)
# Last expression: the count is shown as the cell output.
mt_new.count()

Prepare metadata for loading: 100%|██████████| 3/3 [00:00<00:00, 325.63it/s]


(13, 1)

# Collect rows

In [3]:
# Rebind `consumer` to the large gzip-compressed GRCh37 VCF; every cell below
# this point operates on that file.
consumer = MatrixTableConsumer(
    vcf_path=vcf_big_path,
    is_gzip=True,
    reference_genome="GRCh37",
)

In [10]:
# Collect up to 100,000 rows with a single worker.
rows = consumer.collect(num_rows=100_000, num_cpu=1)
# Preview only the first few rows; echoing the full list floods the notebook
# output (the other preview cells slice their results as well).
rows[:5]



[{'QUAL': 100,
  'POS': 6122681,
  'CHROM': '1',
  'ID': '.',
  'REF': 'A',
  'ALT': 'T',
  'FILTER': 'PASS',
  'INFO': 'AC=7;AF=0.00139776;AN=5008;NS=2504;DP=16853;EAS_AF=0.0069;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0;AA=A|||;VT=SNP'},
 {'QUAL': 100,
  'POS': 6122705,
  'CHROM': '1',
  'ID': '.',
  'REF': 'G',
  'ALT': 'T',
  'FILTER': 'PASS',
  'INFO': 'AC=3;AF=0.000599042;AN=5008;NS=2504;DP=16928;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0;EUR_AF=0.002;SAS_AF=0;AA=G|||;VT=SNP'},
 {'QUAL': 100,
  'POS': 6122757,
  'CHROM': '1',
  'ID': '.',
  'REF': 'C',
  'ALT': 'T',
  'FILTER': 'PASS',
  'INFO': 'AC=1;AF=0.000199681;AN=5008;NS=2504;DP=17669;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=C|||;VT=SNP'},
 {'QUAL': 100,
  'POS': 6122810,
  'CHROM': '1',
  'ID': '.',
  'REF': 'T',
  'ALT': 'C',
  'FILTER': 'PASS',
  'INFO': 'AC=1;AF=0.000199681;AN=5008;NS=2504;DP=19036;EAS_AF=0;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0.001;AA=T|||;VT=SNP'},
 {'QUAL': 100,
  'POS': 6122845,
  'CHROM': '1',
  'ID': '.',


In [5]:
# Total reported by the consumer for its VCF (presumably the number of
# records — confirm against MatrixTableConsumer.count).
rows_count = consumer.count()
# Last expression: displayed as the cell output.
rows_count

6468094

In [None]:
# Collect all rows, passing NUM_CPU (imported from matrix_table_consumer) as
# the worker count. NOTE(review): this rebinds `rows`, replacing the result of
# the earlier consumer.collect(...) cell.
rows = consumer.collect_all(num_cpu=NUM_CPU)
# Preview the first two rows only.
rows[:2]

[23-08-2025 17:03:06] - INFO - Collecting data
[23-08-2025 17:03:06] - INFO - 0 lines read
[23-08-2025 17:03:06] - INFO - 50000 lines read
[23-08-2025 17:03:07] - INFO - 100000 lines read
[23-08-2025 17:03:08] - INFO - 150000 lines read
[23-08-2025 17:03:09] - INFO - 200000 lines read
[23-08-2025 17:03:10] - INFO - 250000 lines read
[23-08-2025 17:03:11] - INFO - 300000 lines read
[23-08-2025 17:03:11] - INFO - 350000 lines read
[23-08-2025 17:03:12] - INFO - 400000 lines read
[23-08-2025 17:03:13] - INFO - 450000 lines read


In [None]:
# Number of collected rows; a bare last expression is shown as the cell output,
# so no print() wrapper is needed.
len(rows)

In [11]:
# Convert the collected row dicts into Hail values (the preview below shows
# Struct objects with locus, alleles, rsid, qual, filters, info and entries).
rows_hail = consumer.convert_rows_to_hail(rows=rows)
# Preview the first five converted rows.
rows_hail[:5]

Converting rows to hail:   0%|          | 0/200000 [00:00<?, ?it/s]

Converting rows to hail: 100%|██████████| 200000/200000 [00:01<00:00, 125950.95it/s]


[Struct(locus=Locus(contig=1, position=10235, reference_genome=GRCh37), alleles=['T', 'TA'], rsid='.', qual=100, filters='PASS', info=Struct(info='AC=6;AF=0.00119808;AN=5008;NS=2504;DP=78015;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0;EUR_AF=0;SAS_AF=0.0051;AA=|||unknown(NO_COVERAGE);VT=INDEL'), entries=[]),
 Struct(locus=Locus(contig=1, position=10352, reference_genome=GRCh37), alleles=['T', 'TA'], rsid='.', qual=100, filters='PASS', info=Struct(info='AC=2191;AF=0.4375;AN=5008;NS=2504;DP=88915;EAS_AF=0.4306;AMR_AF=0.4107;AFR_AF=0.4788;EUR_AF=0.4264;SAS_AF=0.4192;AA=|||unknown(NO_COVERAGE);VT=INDEL'), entries=[]),
 Struct(locus=Locus(contig=1, position=10506, reference_genome=GRCh37), alleles=['C', 'G'], rsid='.', qual=100, filters='PASS', info=Struct(info='AC=1;AF=0.000199681;AN=5008;NS=2504;DP=9676;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP'), entries=[]),
 Struct(locus=Locus(contig=1, position=10505, reference_genome=GRCh37), alleles=['A', 'T'], rsid='.', qual=100, filters

In [25]:
# Build a table from the converted rows (presumably a Hail Table — confirm
# against MatrixTableConsumer.create_hail_table).
t = consumer.create_hail_table(rows=rows_hail)
# Show a preview of the table.
t.show()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,info,Unnamed: 6_level_0
locus,alleles,rsid,qual,filters,info,entries
locus<GRCh37>,array<str>,str,int32,str,str,array<struct{}>
1:10235,"[""T"",""TA""]",""".""",100,"""PASS""","""AC=6;AF=0.00119808;AN=5008;NS=2504;DP=78015;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0;EUR_AF=0;SAS_AF=0.0051;AA=|||unknown(NO_COVERAGE);VT=INDEL""",[]
1:10352,"[""T"",""TA""]",""".""",100,"""PASS""","""AC=2191;AF=0.4375;AN=5008;NS=2504;DP=88915;EAS_AF=0.4306;AMR_AF=0.4107;AFR_AF=0.4788;EUR_AF=0.4264;SAS_AF=0.4192;AA=|||unknown(NO_COVERAGE);VT=INDEL""",[]
1:10505,"[""A"",""T""]",""".""",100,"""PASS""","""AC=1;AF=0.000199681;AN=5008;NS=2504;DP=9632;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP""",[]
1:10506,"[""C"",""G""]",""".""",100,"""PASS""","""AC=1;AF=0.000199681;AN=5008;NS=2504;DP=9676;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP""",[]
1:10511,"[""G"",""A""]",""".""",100,"""PASS""","""AC=1;AF=0.000199681;AN=5008;NS=2504;DP=9869;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP""",[]
1:10539,"[""C"",""A""]",""".""",100,"""PASS""","""AC=3;AF=0.000599042;AN=5008;NS=2504;DP=9203;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0;EUR_AF=0.001;SAS_AF=0.001;AA=.|||;VT=SNP""",[]
1:10542,"[""C"",""T""]",""".""",100,"""PASS""","""AC=1;AF=0.000199681;AN=5008;NS=2504;DP=9007;EAS_AF=0.001;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP""",[]
1:10579,"[""C"",""A""]",""".""",100,"""PASS""","""AC=1;AF=0.000199681;AN=5008;NS=2504;DP=5502;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP""",[]
1:10616,"[""CCGCCGTTGCAAAGGCGCGCCG"",""C""]",""".""",100,"""PASS""","""AC=4973;AF=0.993011;AN=5008;NS=2504;DP=2365;EAS_AF=0.9911;AMR_AF=0.9957;AFR_AF=0.9894;EUR_AF=0.994;SAS_AF=0.9969;VT=INDEL""",[]
1:10642,"[""G"",""A""]",""".""",100,"""PASS""","""AC=21;AF=0.00419329;AN=5008;NS=2504;DP=1360;EAS_AF=0.003;AMR_AF=0.0014;AFR_AF=0.0129;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP""",[]


# Create MatrixTable from Table

In [None]:
# Combine the MatrixTable `mt` with the table `t` built from the collected rows.
# NOTE(review): `mt` was imported with reference_genome="GRCh38" from the mini
# VCF, while `t` shows locus<GRCh37> from the big VCF — confirm that combining
# the two is intended.
mt_from_table = consumer.combine_hail_matrix_table_and_table(mt=mt, table=t)
# Show a preview of the combined result.
mt_from_table.show()

In [None]:
# Count of the combined result, shown as the cell output.
mt_from_table.count()