We will generate a number of count tables, described in sections below. 

## import 

In [1]:
### 
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions as F
import pyspark.sql.types as T
#StructType, StructField, IntegerType, StringType, FloatType

## create a spark session

In [2]:
# Application-level Spark configuration for this notebook.
conf = SparkConf().setAppName("Count")

# When this cell is re-run, stop the session left over from the previous run
# so that the new session starts clean.
if 'spark' in globals() and spark is not None:
    spark.stop()

# Create the session through the documented builder entry point; it builds the
# underlying SparkContext from `conf` (or reuses one that is already active).
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Keep `sc` defined, as before, for code that needs the SparkContext directly.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/15 10:35:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/12/15 10:36:10 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


## Load in gnomad variants annotated in the last script

In [15]:
# Explicit schema for the annotated gnomAD CSV written by the previous (annotate) step.
# Every field is nullable; the rows with missing values are dealt with further down.
schema = T.StructType([
    # Standard VCF-style columns
    T.StructField("CHROM", T.StringType(), True),
    T.StructField("POS", T.IntegerType(), True),
    T.StructField("ID", T.StringType(), True),
    T.StructField("REF", T.StringType(), True),
    T.StructField("ALT", T.StringType(), True),
    T.StructField("QUAL", T.StringType(), True),
    T.StructField("FILTER", T.StringType(), True),
    T.StructField("INFO", T.StringType(), True),
    # Per-cell-type reference / alternate / skew values
    T.StructField("K562__ref", T.FloatType(), True),
    T.StructField("HepG2__ref", T.FloatType(), True),
    T.StructField("SKNSH__ref", T.FloatType(), True),
    T.StructField("K562__alt", T.FloatType(), True),
    T.StructField("HepG2__alt", T.FloatType(), True),
    T.StructField("SKNSH__alt", T.FloatType(), True),
    T.StructField("K562__skew", T.FloatType(), True),
    T.StructField("HepG2__skew", T.FloatType(), True),
    T.StructField("SKNSH__skew", T.FloatType(), True),
    T.StructField("AC", T.IntegerType(), True),
    T.StructField("AN", T.IntegerType(), True),
    # AF is read as a float (like MAF). It was previously declared IntegerType; a
    # fractional value cannot be parsed as an integer and would be read back as null.
    # NOTE(review): assumes AF holds a fractional allele frequency — confirm against
    # the output of the annotate step.
    T.StructField("AF", T.FloatType(), True),
    T.StructField("cadd_phred", T.FloatType(), True),
    # CRE membership flags
    T.StructField("is_in_promoter", T.BooleanType(), True),
    T.StructField("is_in_prox_enhancers", T.BooleanType(), True),
    T.StructField("is_in_distal_enhancers", T.BooleanType(), True),
    T.StructField("P_ANNO", T.FloatType(), True),
    T.StructField("mean_ref", T.FloatType(), True),
    T.StructField("mean_skew", T.FloatType(), True),
    T.StructField("MAF", T.FloatType(), True),
    T.StructField("category", T.StringType(), True)
])

# Read every CSV part file of the chromosome 22 output with the schema above.
# Each part file carries a header row; "#" is configured as the comment character.
df = spark.read \
    .option("comment", "#") \
    .option("delimiter", ",") \
    .schema(schema) \
    .csv("/home/mcn26/varef/scripts/noon_data/1.annotate/annotated_output_CHR22.csv/*.csv", header=True)

                                                                                

## Drop rows with null values in relevant columns

This isn't strictly necessary. We could, for example, drop only those rows with a null Malinois skew when computing Malinois-skew-based metrics, drop only rows with no PhyloP score when computing PhyloP-based metrics, and so on. However, each graph would then summarize a different set of variants, which could introduce bias — for example, if PhyloP scores are annotated for a nonrandom subset of variants. Therefore I will drop rows with null data in any relevant column prior to subsequent analysis.

In [16]:
# Columns the count tables below depend on; a row with a null in any of them is
# removed so that every table is computed from the same set of variants.
required_columns = ["CHROM","POS","cadd_phred","P_ANNO","mean_ref","mean_skew","category"]
df_nonull = df.dropna(how="any", subset=required_columns)

## Create a subset table containing only variants in CREs


In [17]:
# One condition per CRE type; a variant is kept when at least one flag is true.
in_promoter = F.col("is_in_promoter") == True
in_prox_enhancer = F.col("is_in_prox_enhancers") == True
in_distal_enhancer = F.col("is_in_distal_enhancers") == True

df_cre = df_nonull.filter(in_promoter | in_prox_enhancer | in_distal_enhancer)

# compute count tables

All count tables will be broken down by each of the three CRE types. 

## PhyloP vs rarity
- add a boolean column `phylop_significant`: true when `P_ANNO` ≥ 2.27 (the significance threshold)
- count table of significance vs. category
- write to disk

In [18]:
# Output directory for the count tables written by this notebook.
data_base_path="/home/mcn26/varef/scripts/noon_data/2.count/"

# Flag every CRE variant whose P_ANNO value reaches the 2.27 threshold.
phylop_is_significant = F.col("P_ANNO") >= 2.27
df_phylop_significant = df_cre.withColumn("phylop_significant", phylop_is_significant)

# Count variants per (category, significance flag, CRE flags) combination.
phylop_group_columns = [
    "category",
    "phylop_significant",
    "is_in_promoter",
    "is_in_prox_enhancers",
    "is_in_distal_enhancers",
]
phylop_count_table = df_phylop_significant.groupBy(phylop_group_columns).count()

# Collapse to a single partition before writing, overwriting any previous output.
phylop_single_partition = phylop_count_table.coalesce(1)
phylop_single_partition.write.csv(
    data_base_path+"phylop_count_table",
    mode="overwrite",
    header=True,
)

                                                                                

## CADD vs rarity
Similar approach to PhyloP above.

Cutoffs are 
- All
- score≥10
- score≥20
- score≥30
- score≥40
- score≥50

In [19]:
# Add one boolean column per CADD cutoff ("CADD>=10" ... "CADD>=50"), in ascending order.
cadd_flag_names = []
df_cadd_cutoff = df_cre
for cadd_threshold in (10, 20, 30, 40, 50):
    flag_name = "CADD>=" + str(cadd_threshold)
    cadd_flag_names.append(flag_name)
    df_cadd_cutoff = df_cadd_cutoff.withColumn(flag_name, F.col("cadd_phred") >= cadd_threshold)

# Count variants per (category, CADD flags, CRE flags) combination.
cre_flag_names = ["is_in_promoter","is_in_prox_enhancers","is_in_distal_enhancers"]
cadd_count_table = df_cadd_cutoff.groupBy(["category"] + cadd_flag_names + cre_flag_names).count()

# Output directory for the count tables written by this notebook.
data_base_path="/home/mcn26/varef/scripts/noon_data/2.count/"

# Collapse to a single partition before writing, overwriting any previous output.
cadd_single_partition = cadd_count_table.coalesce(1)
cadd_single_partition.write.csv(
    data_base_path+"CADD_count_table",
    mode="overwrite",
    header=True,
)

                                                                                

## Malinois: reference activity & skew vs rarity

First, compute min and max of skew, reference activity.

While this does require aggregation of the entire dataset, min & max specifically ought to be fairly inexpensive to compute.

This will only be performed during initial testing, then values will be recorded & made into constants. 

In [20]:
# Show max and then min for mean_skew, followed by the same for mean_ref.
# Each statistic is displayed as its own one-row table.
for summary_column in ("mean_skew", "mean_ref"):
    for statistic in ("max", "min"):
        df_cre.agg({summary_column: statistic}).show()


                                                                                

+--------------+
|max(mean_skew)|
+--------------+
|      2.957448|
+--------------+



                                                                                

+--------------+
|min(mean_skew)|
+--------------+
|     -3.208347|
+--------------+



                                                                                

+-------------+
|max(mean_ref)|
+-------------+
|     8.470641|
+-------------+





+-------------+
|min(mean_ref)|
+-------------+
|   -1.2714466|
+-------------+



                                                                                

For chromosome 22:
- computed values are
    - max(mean_skew)=2.957448
    - min(mean_skew)=-3.208347
    - max(mean_ref)=8.470641
    - min(mean_ref) = -1.2714466
- Intervals of ~~0.5 skew~~ (seems small) 0.2 skew and 2 reference activity give us our cutoffs (the reference-activity bins below are built from `range(-2,9,2)`, i.e. a step of 2, not 1)

### helper functions

In [46]:
def cutoffs_to_tuples(cutoffs):
    """Pair each cutoff with the one that follows it.

    Given [a, b, c] this returns [(a, b), (b, c)]. A sequence with fewer than
    two cutoffs yields an empty list.
    """
    # Zipping the sequence with itself shifted by one gives the consecutive pairs.
    return list(zip(cutoffs, cutoffs[1:]))

def cutoffs_to_table(table,cutoffs,col_name):
    """Add one boolean bin-membership column per interval defined by `cutoffs`.

    Columns are added in this order:
      1. `col_name <= cutoffs[0]`
      2. `start < col_name <= end` for every pair of consecutive cutoffs
      3. `col_name > cutoffs[-1]`

    Each column is named after its condition, with every "." in the name
    replaced by "-".

    Returns a tuple `(table_with_bin_columns, column_names)`, where
    `column_names` lists the new columns in the order they were added.
    """
    value = F.col(col_name)

    # (label, condition) pairs, in output order.
    bins = [(col_name + "<=" + str(cutoffs[0]), value <= cutoffs[0])]
    bins.extend(
        (
            str(lower) + "<" + col_name + "<=" + str(upper),
            (value > lower) & (value <= upper),
        )
        for lower, upper in cutoffs_to_tuples(cutoffs)
    )
    bins.append((col_name + ">" + str(cutoffs[-1]), value > cutoffs[-1]))

    binned_table = table
    bin_names = []
    for label, condition in bins:
        # Presumably "." is replaced because Spark treats it specially in column names.
        safe_label = label.replace(".", "-")
        bin_names.append(safe_label)
        binned_table = binned_table.withColumn(safe_label, condition)

    return (binned_table, bin_names)
    

### skew

In [52]:
# Skew bin edges from -0.8 to 0.6 in steps of 0.2. Each edge is rounded to one decimal
# because i*0.2 is not exact in floating point (e.g. -3*0.2 == -0.6000000000000001);
# without rounding, that artifact ends up in the thresholds and in the generated
# column names (and therefore in the CSV header).
# NOTE(review): range(-4,4) stops at 0.6, so the edges are not symmetric around 0 — confirm this is intended.
skew_cutoffs = [round(i*0.2, 1) for i in range(-4,4)]
df_skew_cut,skew_thresh_names=cutoffs_to_table(table=df_cre,cutoffs=skew_cutoffs,col_name="mean_skew")

# Extra grouping keys used by the skew count tables: the three CRE flags and the category.
additional_categories=["is_in_promoter","is_in_prox_enhancers","is_in_distal_enhancers","category"]

In [53]:
# Count variants per (skew bin flags, CRE flags, category) combination.
skew_group_columns = skew_thresh_names + additional_categories
skew_table = df_skew_cut.groupBy(skew_group_columns).count()

# Collapse to a single partition before writing, overwriting any previous output.
skew_single_partition = skew_table.coalesce(1)
skew_single_partition.write.csv(
    data_base_path+"malinouis_skew",
    mode="overwrite",
    header=True,
)

                                                                                

### skew with activity

In [54]:
# Reference-activity bin edges: -2, 0, 2, 4, 6, 8. The bins are added on top of the
# table that already carries the skew bin columns.
activity_cutoffs = list(range(-2,9,2))
df_skew_and_threshold,activity_thresh_names=cutoffs_to_table(
    table=df_skew_cut,
    cutoffs=activity_cutoffs,
    col_name="mean_ref",
)

In [55]:
# Count variants per (skew bin flags, CRE flags, category, activity bin flags) combination.
skew_and_activity_group_columns = skew_thresh_names + additional_categories + activity_thresh_names
skew_and_activity_table = df_skew_and_threshold.groupBy(skew_and_activity_group_columns).count()

# Collapse to a single partition before writing, overwriting any previous output.
skew_and_activity_single_partition = skew_and_activity_table.coalesce(1)
skew_and_activity_single_partition.write.csv(
    data_base_path+"malinouis_skew_and_thresh",
    mode="overwrite",
    header=True,
)

                                                                                