This notebook annotates all gnomAD variants with their corresponding Malinois predictions, the phyloP scores associated with their genomic locations, and flags for the genomic regions (promoter, proximal enhancer, distal enhancer) they fall within.

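(Run this notebook only after the phyloP wig file has been converted to a delimited text table, as described in the other file; that table is read below as `out_processed.tsv`.)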

Import relevant libraries

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
from pyspark.sql.functions import col

Create spark session

In [3]:
# Stop any session left over from a previous run of this cell, so that the
# configuration below is applied to a fresh session instead of being ignored
# by getOrCreate() returning the already-running one.
if 'spark' in locals() and spark is not None:
    spark.stop()

# Are we running the actual script, or just testing?
for_real = True

spark = None

if for_real:
    # Should run in 300GB memory & 24 cores;
    # give it more to be safe.
    spark = (
        SparkSession.builder
        .appName("ANNOTATE")
        .config("spark.executor.instances", "4")
        .config("spark.executor.cores", "5")
        .config("spark.executor.memory", "50g")
        .config("spark.driver.memory", "30g")
        .config("spark.driver.cores", "3")
        .config("spark.executor.memoryOverhead", "70g")
        .getOrCreate()
    )

    # Alternative configuration, assuming 3tb memory & 24 cores:
    #spark = SparkSession.builder \
    #    .appName("ANNOTATE") \
    #    .config("spark.executor.instances", "4") \
    #    .config("spark.executor.cores", "5") \
    #    .config("spark.executor.memory", "710g") \
    #    .config("spark.driver.memory", "30g") \
    #    .config("spark.driver.cores", "3") \
    #    .config("spark.executor.memoryOverhead", "70g") \
    #    .config("spark.sql.shuffle.partitions", "1000") \
    #    .config("spark.shuffle.manager", "sort") \
    #    .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC") \
    #    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC") \
    #    .getOrCreate()
else:
    # Small local session for quick tests.
    spark = (
        SparkSession.builder
        .appName("ANNOTATE_TEST")
        .config("spark.executor.memory", "4g")
        .config("spark.driver.memory", "4g")
        .config("spark.sql.shuffle.partitions", "10")
        .getOrCreate()
    )

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/04 15:12:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/04 15:12:09 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [4]:
# Print the address of this session's Spark web UI so jobs can be monitored.
print(spark.sparkContext.uiWebUrl)

http://r814u23n04.mccleary.ycrc.yale.edu:4041


In [5]:

# Schema of the processed phyloP table (tab-separated, no header row):
# chromosome, position, score.
phylop_schema = StructType([
    StructField("CHROM", StringType(), True),
    StructField("POS", IntegerType(), True),
    StructField("P_ANNO", FloatType(), True),
])

# Read the phyloP table; lines starting with "#" are skipped.
phylop_anno = spark.read \
    .option("comment", "#") \
    .option("delimiter", "\t") \
    .schema(phylop_schema) \
    .csv("/home/mcn26/varef/scripts/noon_data/1.annotate/out_processed.tsv", header=False)

# Schema for the eight columns read from each VCF line.
vcf_schema = StructType([
    StructField("CHROM", StringType(), True),
    StructField("POS", IntegerType(), True),
    StructField("ID", StringType(), True),
    StructField("REF", StringType(), True),
    StructField("ALT", StringType(), True),
    StructField("QUAL", StringType(), True),
    StructField("FILTER", StringType(), True),
    StructField("INFO", StringType(), True),
])

# Read every gzipped VCF in the merge output directory as plain tab-separated
# text; the "#" comment option drops the VCF header lines.
vcf = spark.read \
    .option("comment", "#") \
    .option("delimiter", "\t") \
    .schema(vcf_schema) \
    .csv("/home/mcn26/varef/scripts/noon_data/0.merge/out/*.vcf.gz", header=False)


# Schema for the genomic region annotations (BED files). Only the first three
# columns are declared. STOP is an integer like START, so the interval
# comparison against the integer VCF POS does not rely on implicit
# string-to-number coercion.
bed_schema = StructType([
    StructField("CHROM", StringType(), True),
    StructField("START", IntegerType(), True),
    StructField("STOP", IntegerType(), True),
])

# Directory holding the ENCODE SCREEN v4 cCRE BED files.
CRE_BASEPATH="/home/mcn26/varef/data/ENCODE/SCREEN_v4_cCREs_agnostic/"


def _read_cre_bed(filename):
    """Read one tab-separated BED file from CRE_BASEPATH using bed_schema."""
    return spark.read \
        .schema(bed_schema) \
        .option("delimiter", "\t") \
        .csv(CRE_BASEPATH+filename)


# Load the ENCODE region sets: promoters, proximal enhancers, distal enhancers.
promoters=_read_cre_bed("GRCh38-PLS.V4.bed.gz")
prox_enhancers=_read_cre_bed("GRCh38-pELS.V4.bed.gz")
distal_enhancers=_read_cre_bed("GRCh38-dELS.V4.bed.gz")

Just for testing: keep only the variants on chromosome 22.

In [None]:
# Testing only: restrict the variant table to rows whose CHROM is "chr22".
vcf = vcf.where(col("CHROM") == "chr22")

Add the genomic region annotations

In [6]:
def add_genomic_annotation(loci,regions,name):
    """Flag every locus with whether it falls inside an interval of ``regions``.

    Parameters
    ----------
    loci : DataFrame
        Must have ``CHROM`` and ``POS`` columns; ``POS`` is 1-based (VCF).
    regions : DataFrame
        Must have ``CHROM``, ``START`` and ``STOP`` columns in BED
        coordinates (0-based start).
    name : str
        Suffix of the new boolean column, which is named ``"is_in_" + name``.

    Returns
    -------
    DataFrame
        All columns of ``loci`` plus the boolean ``is_in_<name>`` column.

    Notes
    -----
    This is a left join, so a locus that matches more than one interval
    produces one output row per matching interval.
    """
    flag_column = "is_in_" + name

    # as usual, we have to worry about coordinate systems
    # VCFs are 1-based
    # BEDs are 0-based
    # Chr1        T   A   C   G   T
    #           | | | | | | | | | |
    # 1 based   | 1 | 2 | 3 | 4 | 5
    # 0 based   0   1   2   3   4
    #
    # Hence a 1-based POS lies in a BED interval when START < POS <= STOP.
    overlaps = (
        (loci["CHROM"] == regions["CHROM"])
        & (loci["POS"] > regions["START"])
        & (loci["POS"] <= regions["STOP"])
    )

    joined = loci.join(regions, overlaps, how='left')

    # A locus with no matching interval gets nulls for every `regions` column.
    # Reference START through `regions` so the lookup is neither case-dependent
    # nor ambiguous if `loci` ever carries a column with the same name.
    joined = joined.withColumn(flag_column, regions["START"].isNotNull())

    # Keep only the original locus columns and the new flag.
    return joined.select(loci["*"], flag_column)

In [7]:
# Chain the three region annotations onto the variant table; each pass adds one
# boolean "is_in_<label>" column.
genomic_regions_added = vcf
for region_label, region_table in (
    ("promoter", promoters),
    ("prox_enhancers", prox_enhancers),
    ("distal_enhancers", distal_enhancers),
):
    genomic_regions_added = add_genomic_annotation(
        loci=genomic_regions_added,
        regions=region_table,
        name=region_label,
    )

23/12/04 15:12:27 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


Add the phyloP annotations

In [8]:
# Left join on (CHROM, POS): every variant row is kept, and P_ANNO is null
# where the phyloP table has no entry for that position.
phyloP_annotated = genomic_regions_added.join(
    phylop_anno,
    ["CHROM", "POS"],
    "left",
)

In [8]:
# Write the annotated table as CSV with a header row, replacing any previous
# output at this path.
# NOTE(review): the path says "chr22_only" -- confirm the chr22 filter cell was
# actually run before this write, or rename the output for a full run.
(
    phyloP_annotated.write
    .mode("overwrite")
    .option("header", True)
    .csv("/home/mcn26/varef/scripts/noon_data/1.annotate/annotated_output_chr22_only.csv")
)

23/11/30 17:39:45 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                                                                                                                    