#####################################################

#### To build the Go module as a C shared library that Python can load, run:
#### go build -o main.so -buildmode=c-shared functions/main.go

#####################################################

In [None]:
import os
import sys

import hail as hl
import pyspark


# Tunables for the local Spark session started below.
SPARK_LOCAL_CORES = 20
SPARK_DRIVER_MEMORY = "30g"

# Locate the Hail fat JAR inside the installed `hail` package itself.
# Slicing a fixed number of components off `sys.executable` only works when the
# interpreter happens to live at one particular directory depth; the package's
# own location is correct for any virtualenv/conda/system layout.
hail_jars = os.path.join(
    os.path.dirname(hl.__file__),
    "backend",
    "hail-all-spark.jar",
)

# Spark settings: local master, Hail JAR on the driver/executor class paths,
# and Kryo serialization with Hail's registrator.
CONF = pyspark.SparkConf().setAll(
    [
        ("spark.master", f"local[{SPARK_LOCAL_CORES}]"),
        ("spark.app.name", "Hail_demonstration"),
        ("spark.jars", hail_jars),
        ("spark.driver.extraClassPath", hail_jars),
        ("spark.driver.memory", SPARK_DRIVER_MEMORY),
        ("spark.executor.extraClassPath", "./hail-all-spark.jar"),
        ("spark.serializer", "org.apache.spark.serializer.KryoSerializer"),
        ("spark.kryo.registrator", "is.hail.kryo.HailKryoRegistrator"),
    ]
)

sc = pyspark.SparkContext(conf=CONF)

# NOTE(review): the default reference here is GRCh38, while every later cell
# passes reference_genome="GRCh37" explicitly — confirm which default is intended.
hl.init(default_reference="GRCh38", sc=sc)

In [11]:
from matrix_table_consumer import MatrixTableConsumer, NUM_CPU

In [2]:
# Source VCF downloaded from
# https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502/
#
# The directories default to the locations used when this notebook was written.
# Set MTC_VCF_DIR / MTC_OUTPUT_DIR to run it on another machine without editing
# this cell.
vcf_dir = os.environ.get(
    "MTC_VCF_DIR", "/home/phil/GitHub/matrix_table_consumer/data"
)
output_dir = os.environ.get("MTC_OUTPUT_DIR", "/home/froschin/work/hail/data")

# The cells that consume this path import it with reference_genome="GRCh37".
vcf_big_path = os.path.join(
    vcf_dir,
    "ALL.chr1.phase3_shapeit2_mvncall_integrated_v5b.20130502.genotypes.vcf.gz",
)

# Outputs: the written MatrixTable and the JSON metadata file.
matrix_table_path = os.path.join(output_dir, "matrix_table.mt")
json_path = os.path.join(output_dir, "matrix.json")

In [None]:
# Read the VCF into a Hail MatrixTable and preview it.
# NOTE(review): per the Hail docs, force=True makes Hail read a .gz file
# serially; if this file is block-gzipped, force_bgz=True may be the better
# choice — confirm before changing.
mt = hl.import_vcf(
    vcf_big_path,
    force=True,
    array_elements_required=False,
    reference_genome="GRCh37",
)
mt.show()

In [3]:
# Build the consumer over the same gzip-compressed VCF, using the same
# reference genome that is passed to hl.import_vcf.
consumer = MatrixTableConsumer(
    reference_genome="GRCh37",
    is_gzip=True,
    vcf_path=vcf_big_path,
)

# Save MatrixTable

In [None]:
# Hand the MatrixTable and the target JSON path to the consumer.
# `content` holds whatever prepare_metadata_for_saving returns — presumably the
# metadata written to `json_path`; confirm against the consumer implementation.
content = consumer.prepare_metadata_for_saving(json_path=json_path, mt=mt)

# Load MatrixTable

In [None]:
# Rebuild a MatrixTable from the JSON metadata file.
mt_new = consumer.prepare_metadata_for_loading(
    json_path=json_path,
)

# Write it to disk (replacing any existing table at that path), then show
# its dimensions as the cell's result.
mt_new.write(
    matrix_table_path,
    overwrite=True,
)
mt_new.count()

# Collect rows

In [8]:
# Collect 10_000 rows through the consumer, passing NUM_CPU as num_cpu, and
# preview the first two. In the recorded output each row is a dict with the
# keys QUAL, POS, CHROM, ID, REF, ALT, FILTER and INFO.
rows = consumer.collect(
    num_cpu=NUM_CPU,
    num_rows=10_000,
)
rows[:2]

[22-08-2025 17:17:15] - INFO - Collecting data
[22-08-2025 17:17:15] - INFO - End


[{'QUAL': 100,
  'POS': 1638832,
  'CHROM': '1',
  'ID': '.',
  'REF': 'G',
  'ALT': 'A',
  'FILTER': 'PASS',
  'INFO': 'AC=1;AF=0.000199681;AN=5008;NS=2504;DP=21089;EAS_AF=0.001;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0;AA=-|||;VT=SNP;EX_TARGET'},
 {'QUAL': 100,
  'POS': 1638954,
  'CHROM': '1',
  'ID': '.',
  'REF': 'G',
  'ALT': 'A',
  'FILTER': 'PASS',
  'INFO': 'AC=2;AF=0.000399361;AN=5008;NS=2504;DP=17536;EAS_AF=0;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0.002;AA=-|||;VT=SNP;EX_TARGET'}]

In [None]:
# Ask the consumer for its count (presumably the number of rows in the VCF —
# confirm against the consumer implementation) and display it.
rows_count = consumer.count()
rows_count

In [None]:
# Collect via collect_all, passing NUM_CPU as num_cpu, and preview the first two.
# NOTE: this rebinds `rows`, replacing the result of the earlier
# consumer.collect(...) cell; the cells below use whichever ran last.
rows = consumer.collect_all(num_cpu=NUM_CPU)
rows[:2]

In [5]:
# Number of collected rows, shown as the cell's result rather than printed.
len(rows)

10000


In [6]:
# Convert the collected rows with convert_rows_to_hail and preview the first five.
# In the recorded output each result is a Struct with locus, alleles, rsid,
# qual, filters, info and entries fields. The recorded run also logged
# "Initializing Hail with default parameters..." during this call.
rows_hail = consumer.convert_rows_to_hail(rows=rows)
rows_hail[:5]

Converting rows to hail:   0%|          | 0/10001 [00:00<?, ?it/s]Initializing Hail with default parameters...


Running on Apache Spark version 3.5.6
SparkUI available at http://10.1.1.138:4040
Welcome to
     __  __     <>__
    / /_/ /__  __/ /
   / __  / _ `/ / /
  /_/ /_/\_,_/_/_/   version 0.2.135-034ef3e08116
LOGGING: writing to /home/phil/GitHub/matrix_table_consumer/hail-20250822-1257-0.2.135-034ef3e08116.log
Converting rows to hail: 100%|██████████| 10001/10001 [00:10<00:00, 997.99it/s]


[Struct(locus=Locus(contig=1, position=10235, reference_genome=GRCh37), alleles=['T', 'TA'], rsid='.', qual=100, filters='PASS', info=Struct(info='AC=6;AF=0.00119808;AN=5008;NS=2504;DP=78015;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0;EUR_AF=0;SAS_AF=0.0051;AA=|||unknown(NO_COVERAGE);VT=INDEL'), entries=[]),
 Struct(locus=Locus(contig=1, position=10505, reference_genome=GRCh37), alleles=['A', 'T'], rsid='.', qual=100, filters='PASS', info=Struct(info='AC=1;AF=0.000199681;AN=5008;NS=2504;DP=9632;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP'), entries=[]),
 Struct(locus=Locus(contig=1, position=10352, reference_genome=GRCh37), alleles=['T', 'TA'], rsid='.', qual=100, filters='PASS', info=Struct(info='AC=2191;AF=0.4375;AN=5008;NS=2504;DP=88915;EAS_AF=0.4306;AMR_AF=0.4107;AFR_AF=0.4788;EUR_AF=0.4264;SAS_AF=0.4192;AA=|||unknown(NO_COVERAGE);VT=INDEL'), entries=[]),
 Struct(locus=Locus(contig=1, position=10539, reference_genome=GRCh37), alleles=['C', 'A'], rsid='.', qual=100, filters

In [7]:
# Build a table from the converted rows and display it.
# The recorded output lists the fields locus, alleles, rsid, qual, filters,
# info and entries. `t` is reused by the combine cell further down.
t = consumer.create_hail_table(rows=rows_hail)
t.show()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,info,Unnamed: 6_level_0
locus,alleles,rsid,qual,filters,info,entries
locus<GRCh37>,array<str>,str,int32,str,str,array<struct{}>
1:10235,"[""T"",""TA""]",""".""",100,"""PASS""","""AC=6;AF=0.00119808;AN=5008;NS=2504;DP=78015;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0;EUR_AF=0;SAS_AF=0.0051;AA=|||unknown(NO_COVERAGE);VT=INDEL""",[]
1:10352,"[""T"",""TA""]",""".""",100,"""PASS""","""AC=2191;AF=0.4375;AN=5008;NS=2504;DP=88915;EAS_AF=0.4306;AMR_AF=0.4107;AFR_AF=0.4788;EUR_AF=0.4264;SAS_AF=0.4192;AA=|||unknown(NO_COVERAGE);VT=INDEL""",[]
1:10505,"[""A"",""T""]",""".""",100,"""PASS""","""AC=1;AF=0.000199681;AN=5008;NS=2504;DP=9632;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP""",[]
1:10506,"[""C"",""G""]",""".""",100,"""PASS""","""AC=1;AF=0.000199681;AN=5008;NS=2504;DP=9676;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP""",[]
1:10511,"[""G"",""A""]",""".""",100,"""PASS""","""AC=1;AF=0.000199681;AN=5008;NS=2504;DP=9869;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP""",[]
1:10539,"[""C"",""A""]",""".""",100,"""PASS""","""AC=3;AF=0.000599042;AN=5008;NS=2504;DP=9203;EAS_AF=0;AMR_AF=0.0014;AFR_AF=0;EUR_AF=0.001;SAS_AF=0.001;AA=.|||;VT=SNP""",[]
1:10542,"[""C"",""T""]",""".""",100,"""PASS""","""AC=1;AF=0.000199681;AN=5008;NS=2504;DP=9007;EAS_AF=0.001;AMR_AF=0;AFR_AF=0;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP""",[]
1:10579,"[""C"",""A""]",""".""",100,"""PASS""","""AC=1;AF=0.000199681;AN=5008;NS=2504;DP=5502;EAS_AF=0;AMR_AF=0;AFR_AF=0.0008;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP""",[]
1:10616,"[""CCGCCGTTGCAAAGGCGCGCCG"",""C""]",""".""",100,"""PASS""","""AC=4973;AF=0.993011;AN=5008;NS=2504;DP=2365;EAS_AF=0.9911;AMR_AF=0.9957;AFR_AF=0.9894;EUR_AF=0.994;SAS_AF=0.9969;VT=INDEL""",[]
1:10642,"[""G"",""A""]",""".""",100,"""PASS""","""AC=21;AF=0.00419329;AN=5008;NS=2504;DP=1360;EAS_AF=0.003;AMR_AF=0.0014;AFR_AF=0.0129;EUR_AF=0;SAS_AF=0;AA=.|||;VT=SNP""",[]


# Create MatrixTable from Table

In [None]:
# Combine the imported MatrixTable (`mt`) with the table built from the
# collected rows (`t`) and preview the result.
mt_from_table = consumer.combine_hail_matrix_table_and_table(
    table=t,
    mt=mt,
)
mt_from_table.show()

In [None]:
# Display the size of the combined result (presumably a Hail MatrixTable, whose
# count() returns (n_rows, n_cols) — confirm the return type of the combine step).
mt_from_table.count()