The goal of this analysis is to get an intuition for how genetic variant rarity changes with (predicted) genic consequence. 

First, this notebook will compute count tables of variant rarity category by ensembl predicted consequence. 

- Load in all gnomad variants (with no malinouis predictions)
- Discard low-quality variants. 
    - Those that don't pass gnomad's own filters
    - Those at loci queried in a low number of people
    - those with a minor allele frequency of 0
- Compute allele-frequency category
    - Compute "rare", "ultra-rare", "common", "singleton", etc...
- Extract Ensembl VEP score categories into their own columns
- Tally the number of alleles falling into each allele frequency category 
- write table to disc. 

## Import relevant libraries

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType
import pyspark.sql.functions as F

## create the spark session.

In [2]:
if 'spark' in locals() and spark!=None:
    spark.stop()

    #are we running the actual script, or just testing?
for_real=False

spark=None

if for_real:
    #assuming 1991GB memory & 24 cores
    #(ycga_bigmem maximum memory)
    spark = SparkSession.builder \
        .appName("ANNOTATE") \
        .config("spark.executor.instances", "4") \
        .config("spark.executor.cores", "5") \
        .config("spark.executor.memory", "472g") \
        .config("spark.driver.memory", "30g") \
        .config("spark.driver.cores", "3") \
        .config("spark.executor.memoryOverhead", "70g") \
        .config("spark.sql.shuffle.partitions", "1000") \
        .getOrCreate()

else:
    spark = SparkSession.builder \
        .appName("ANNOTATE_TEST") \
        .config("spark.executor.memory", "4g") \
        .config("spark.driver.memory", "4g") \
        .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/05 17:26:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load gnomad data

In [16]:
schema = StructType([
    StructField("CHROM", StringType(), True),
    StructField("POS", StringType(), True),
    StructField("ID", StringType(), True),
    StructField("REF", StringType(), True),
    StructField("ALT", StringType(), True),
    StructField("QUAL", StringType(), True),
    StructField("FILTER", StringType(), True),
    StructField("INFO", StringType(), True),
])

df = spark.read \
    .option("comment", "#") \
    .option("delimiter", "\t") \
    .option("header", "false") \
    .schema(schema) \
    .csv("/gpfs/gibbs/pi/reilly/VariantEffects/data/gnomAD/gnomAD_genomes_v3.1.2/*.vcf.gz", header=True)



## Extract relevant columns from the info field

In [17]:
keys_to_extract = [#NONE CAN BE SUBSTRINGS OF THE OTHERS OR THE REGEX WILL BE BORKED
    "AC", "AN", "AF", "vep",
]

# Apply the regexp_extract function to the DataFrame to create new columns for each key.
# The expression '([^;]*)' captures any sequence of characters that are not a semicolon,
# which is assumed to be the delimiter for the key-value pairs in the 'INFO' column.

for key in keys_to_extract:

    #df = df.withColumn(key, regexp_extract(col("INFO"), "{}=([^;]+);?".format(key), 1))
    #when we find something put it, whne we don't put None
    df = df.withColumn(key, 
                       F.when(
                           F.regexp_extract(F.col("INFO"), "{}=([^;]+);?".format(key), 1) != "",
                           F.regexp_extract(F.col("INFO"), "{}=([^;]+);?".format(key), 1)).otherwise(None))

## Filtering out low-quality variants

In [18]:
df = df.filter(
    #make sure we have the necessary population stats
    #AC is for allele count, AF is for allele frequency, and AN is for allele number.
    (F.col("AF").isNotNull()) &
    (F.col("AC").isNotNull()) &
    (F.col("AN").isNotNull()) &

    #check variant has been queried in a reasonably large number of people
    #approx 1/3 of pop size queried in this release of gnomad
    #a little less conservative than gnomad's own warning threshold
    #which is triggered when a vartiant is queried in < 1/2 population
    (F.col("AN").cast("int") > 25385) &
    
    (F.col("AF").cast("float") !=1.0) &
    (F.col("AF").cast("float") !=0.0) &
    
    #gnomad filters passed. See original gnomad vcf header for spec.
    (F.col("FILTER") == "PASS") 
    
    
    
    #(col("CHROM") == "chr22") &
)

## Count occurances of each consequence code in each vep string.

First, get a list of consequences for each variant

**Insert image here**

In [27]:
#split on commas, split on pipes, split on &, extract the consequence codes into a list


df_transformed = df.withColumn(
    "consequence_codes",
    F.transform(
        F.split(df["vep"], ","),
        lambda element: F.split(F.split(element, "\\|")[1], "&")
    )
)

#as a result of the multiple layers of delimiters, we end up with funky nested lists. 
#Let's smush them into normal lists

df_transformed = df_transformed.withColumn(
    "flattened_consequence_codes",
    F.flatten("consequence_codes")
)

df_transformed=df_transformed.drop("consequence_codes")

df_transformed=df_transformed.withColumnRenamed("flattened_consequence_codes","consequence_codes")

df_transformed = df_transformed.withColumn("num_conseq_codes", F.size(df_transformed["consequence_codes"]))


In [28]:
df_transformed.limit(5).toPandas()

23/12/05 18:11:36 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: chr2, 10185, rs561592428, T, C, ., AC0, AC=0;AN=78;AF=0;AC_oth=0;AN_oth=0;AC_ami=0;AN_ami=0;AC_sas=0;AN_sas=2;AF_sas=0;AC_fin=0;AN_fin=4;AF_fin=0;AC_eas=0;AN_eas=6;AF_eas=0;AC_amr=0;AN_amr=18;AF_amr=0;AC_afr=0;AN_afr=2;AF_afr=0;AC_mid=0;AN_mid=0;AC_asj=0;AN_asj=6;AF_asj=0;AC_nfe=0;AN_nfe=40;AF_nfe=0;cadd_raw_score=0.428063;cadd_phred=5.736;vep=C|intergenic_variant|MODIFIER|||Intergenic||||||||||||1|||SNV||||||||||||||||||||||||
 Schema: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
Expected: CHROM but found: chr2
CSV file: file:///gpfs/gibbs/pi/reilly/VariantEffects/data/gnomAD/gnomAD_genomes_v3.1.2/gnomad.genomes.v3.1.2.sites.chr2.subinfo.vcf.gz


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,AC,AN,AF,vep,consequence_codes,num_conseq_codes
0,chr2,10275,rs1213078649,CCCTAACCCTTTACCCTAACCCGAACCCCTAACCCCTAACCCCTAA...,C,.,PASS,AC=2;AN=25518;AF=7.8376e-05;AC_oth=0;AN_oth=32...,2,25518,7.8376e-05,-|intergenic_variant|MODIFIER|||Intergenic||||...,[intergenic_variant],1
1,chr2,10276,.,CCTAACCCTTTACCCTAACCCGAACCCCTAACCCCTAACCCCTAACCCT,C,.,PASS,AC=1;AN=27482;AF=3.63875e-05;AC_oth=0;AN_oth=3...,1,27482,3.63875e-05,-|intergenic_variant|MODIFIER|||Intergenic||||...,[intergenic_variant],1
2,chr2,10277,rs1558169985,C,CT,.,PASS,AC=1;AN=25498;AF=3.92188e-05;AC_oth=0;AN_oth=3...,1,25498,3.92188e-05,T|intergenic_variant|MODIFIER|||Intergenic||||...,[intergenic_variant],1
3,chr2,10277,rs574205379,C,G,.,PASS,AC=3;AN=25498;AF=0.000117656;AC_oth=0;AN_oth=3...,3,25498,0.000117656,G|intergenic_variant|MODIFIER|||Intergenic||||...,[intergenic_variant],1
4,chr2,10283,rs1247897532,C,G,.,PASS,AC=1;AN=27516;AF=3.63425e-05;AC_oth=0;AN_oth=3...,1,27516,3.63425e-05,G|intergenic_variant|MODIFIER|||Intergenic||||...,[intergenic_variant],1


Next, count how many times each consequence occurs for each variant.

In [29]:
df_with_count=df_transformed

all_consequence_codes = ["transcript_ablation", "splice_acceptor_variant", "splice_donor_variant", "stop_gained", "frameshift_variant", "stop_lost", "start_lost", "transcript_amplification","feature_elongation","feature_truncation","inframe_insertion","inframe_deletion","missense_variant","protein_altering_variant","splice_donor_5th_base_variant","splice_region_variant","splice_donor_region_variant","splice_polypyrimidine_tract_variant","incomplete_terminal_codon_variant","start_retained_variant","stop_retained_variant","synonymous_variant","coding_sequence_variant","mature_miRNA_variant","5_prime_UTR_variant","3_prime_UTR_variant","non_coding_transcript_exon_variant","intron_variant","NMD_transcript_variant","non_coding_transcript_variant","coding_transcript_variant","upstream_gene_variant","downstream_gene_variant","TFBS_ablation","TFBS_amplification","TF_binding_site_variant","regulatory_region_ablation","regulatory_region_amplification","regulatory_region_variant","intergenic_variant","sequence_variant"]
for value_to_count in all_consequence_codes:
    #for the current value we are counting, turn the array of consequence codes into an array of 1 or 0,
    #reporting whether the element matches the current value we are counting or not. 
    multi_hot = F.transform(
        F.col("consequence_codes"),
        #gets value == value_to_count, then casts that T/F to int (1/0)
        lambda x: (x == F.lit(value_to_count)).cast("int")
    )

    num_occurrences = F.aggregate(
        multi_hot,
        F.lit(0),
        lambda acc, x: acc + x
    )

    df_with_count = df_with_count.withColumn("num_"+value_to_count, num_occurrences)

In [30]:
sample=df_with_count.limit(4000)
sample=sample.withColumn("consequence_codes",F.col("consequence_codes").cast("string"))

In [31]:
sample.write.csv("/home/mcn26/varef/scripts/noon_data/delete-me.csv", header=True, mode="overwrite")

23/12/05 18:11:56 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
23/12/05 18:11:56 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: chr2, 10185, rs561592428, T, C, ., AC0, AC=0;AN=78;AF=0;AC_oth=0;AN_oth=0;AC_ami=0;AN_ami=0;AC_sas=0;AN_sas=2;AF_sas=0;AC_fin=0;AN_fin=4;AF_fin=0;AC_eas=0;AN_eas=6;AF_eas=0;AC_amr=0;AN_amr=18;AF_amr=0;AC_afr=0;AN_afr=2;AF_afr=0;AC_mid=0;AN_mid=0;AC_asj=0;AN_asj=6;AF_asj=0;AC_nfe=0;AN_nfe=40;AF_nfe=0;cadd_raw_score=0.428063;cadd_phred=5.736;vep=C|intergenic_variant|MODIFIER|||Intergenic||||||||||||1|||SNV||||||||||||||||||||||||
 Schema: CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO
Expected: CHROM but found: chr2
CSV file: file:///gpfs/gibbs/pi/reilly/VariantEffects/data/gnomAD/gnomAD_genomes_v3.1.2/gnomad.genomes.v3.1.2.sites.chr2.subinfo.vcf.gz
23/12/05 18:11:56 WARN CSVHeaderChecker: CSV header does not con