The purpose of this script is to extract some relevant counts to be cited in the manuscript.

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import pyspark.sql.types as T
import os

In [5]:
spark = SparkSession.builder.appName("manuscript_counts").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/11 14:30:04 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/06/11 14:30:20 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [8]:
chromosome="NONE"

if "which_chr" in os.environ:
    chromosome=os.environ['which_chr']

if chromosome=="NONE":
    print("error : did not find which chromosome we are supposed to crunch")
    exit(-1)
else:
    print("only crunching chromosome "+chromosome)

In [20]:
filename=f"/home/mcn26/varef/data/Malinois/gnomAD_variants/gnomad.genomes.v3.1.2.sites.{chromosome}.converted.vcf.gz"

#we are going to load the vcf file as a rdd first, so we can remove the comment lines 
variants_rdd = spark.sparkContext.textFile(filename)

In [21]:
#VCFs have metadata at the beginning of the file preceeded by `##`
#Followed by a header preceeded by `#`
#then the actual data. I can't just use the regular pysaprk comment filtering
#This, and the fact that pyspark only accepts single-characters as indicating
#comments lines, means I can't use spark's builtin comment filtering.


#Grab the first 100 lines. I'm certain the header will be in here.
initial_lines = variants_rdd.take(100)
#Filter out the metadata.
filtered_initial_lines = [line for line in initial_lines if not line.startswith('##')]

24/06/11 14:52:32 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 2 (TID 2): Attempting to kill Python Worker
                                                                                

In [22]:
#Extract the header line
header_line = next(line for line in filtered_initial_lines if line.startswith('#'))
#Extract a list version the header
header = header_line[1:].split('\t')

In [23]:
#Look up the position the data starts
start_line = initial_lines.index(header_line) + 1

In [24]:
#manual check:
print(initial_lines[start_line-1])
print(initial_lines[start_line])

#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
chr22	10510033	.	T	C	.	.	K562__ref=0.34182575;HepG2__ref=0.346653;SKNSH__ref=0.4794821;K562__alt=0.33857352;HepG2__alt=0.36103585;SKNSH__alt=0.49010602;K562__skew=-0.0032522278;HepG2__skew=0.014382835;SKNSH__skew=0.010623883


In [25]:
#split the variants text file on tabs
split_rdd = variants_rdd.zipWithIndex().filter(lambda x: x[1] >= start_line).map(lambda x: x[0].split('\t'))

In [28]:
#make the actual dataframe
df = spark.createDataFrame(split_rdd, schema=header)

24/06/11 14:54:31 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 4 (TID 4): Attempting to kill Python Worker
                                                                                

In [36]:
#Manual verification
df.limit(3).toPandas()#["INFO"].to_list()

24/06/11 14:58:36 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 10 (TID 10): Attempting to kill Python Worker
                                                                                

Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO
0,chr22,10510033,.,T,C,.,.,K562__ref=0.34182575;HepG2__ref=0.346653;SKNSH...
1,chr22,10510061,.,A,T,.,.,K562__ref=0.28704935;HepG2__ref=0.3354496;SKNS...
2,chr22,10510077,rs1290354662,C,A,.,.,K562__ref=0.31834492;HepG2__ref=0.3127984;SKNS...


Looks good. I'm going to say that the number of rows in this dataframe is the number of variants for which we have predictions. To make certain there are no nulls, I will check the length of strings in the INFO column, which here are just predictions. 

In [40]:
df = df.withColumn("info_length",F.length(df["INFO"]))

In [43]:
min_max_length = df.agg(
    F.min("info_length").alias("min_length"),
    F.max("info_length").alias("max_length")
)

In [44]:
min_max_length.show()

                                                                                

+----------+----------+
|min_length|max_length|
+----------+----------+
|       179|       221|
+----------+----------+



In [49]:
#This code grabs & displays the min (for chr22). By manual inspection,
#It's clear it's OK, and the variance down to ~179 characters is just due to
# differences in floating point representation

#rows_with_min_length = df.filter(df["info_length"] == 179)
#rows_with_min_length.limit(3).toPandas()["INFO"].to_list()

                                                                                

['K562__ref=3.6333477;HepG2__ref=2.6967316;SKNSH__ref=4.008317;K562__alt=4.633348;HepG2__alt=3.6102676;SKNSH__alt=5.087789;K562__skew=1.0;HepG2__skew=0.9135362;SKNSH__skew=1.0794718',
 'K562__ref=3.979684;HepG2__ref=5.8509064;SKNSH__ref=7.87134;K562__alt=4.1614814;HepG2__alt=6.564019;SKNSH__alt=7.62134;K562__skew=0.18179747;HepG2__skew=0.7131131;SKNSH__skew=-0.25']

In [45]:
print("COUNT")
print(df.count())

                                                                                

9813618