We will generate a number of count tables, described in sections below. 

## import 

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions as F
import pyspark.sql.types as T
import pickle

## create a spark session

In [2]:
# Spark configuration for this notebook; only the application name is set.
conf = SparkConf().setAppName("Count")

# If this cell is re-run in a live kernel, stop the session left over from
# the previous run so the one created below starts from a clean state.
if globals().get("spark") is not None:
    spark.stop()

# Create the session through the builder, the documented entry point.
# Unlike constructing SparkContext directly, this attaches to an already
# running context (if one exists) instead of raising.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Keep the underlying SparkContext available under its usual name.
sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/21 10:52:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load in gnomad variants annotated in the last script

In [3]:
# Read every CSV part file written by the annotation step. The first line of
# each file is a header, fields are comma-delimited and '#' is the comment
# character.
df = (
    spark.read
    .options(comment="#", delimiter=",")
    .csv(
        "/gpfs/gibbs/pi/reilly/VariantEffects/scripts/noon_data/1.annotate/chr22_annotated_output.csv/*.csv",
        header=True,
    )
)

                                                                                

## Cast columns to the appropriate types & drop rows with null values

Dropping isn't strictly necessary. We could, for example, drop only the rows with a null Malinois skew when computing Malinois-skew-based metrics, only the rows with no PhyloP score when computing PhyloP-based metrics, and so on. However, each graph would then summarize a different set of variants, which could introduce bias, for example if PhyloP scores are annotated for a nonrandom set of variants. Therefore I will drop rows with null data in any relevant column prior to subsequent analysis. 

In [6]:
# Columns that are cast to integers.
int_columns = ["POS", "AC", "AN"]

# Columns that are cast to floats.
float_columns = [
    "AF",
    "K562__ref", "HepG2__ref", "SKNSH__ref",
    "K562__alt", "HepG2__alt", "SKNSH__alt",
    "K562__skew", "HepG2__skew", "SKNSH__skew",
    "cadd_phred",
    "P_ANNO",
    "mean_ref",
    "mean_skew",
    "MAF",
]

# Every column whose name starts with "is_in" is treated as a boolean CRE
# membership flag; the order follows the order of df.columns.
cre_bool_columns = [name for name in df.columns if name.startswith("is_in")]

In [7]:
# Drop every row that has a null in any column. A narrower alternative would
# restrict the check with
# subset=["CHROM","POS","cadd_phred","P_ANNO","mean_ref","mean_skew","category"]+cre_bool_columns
df = df.na.drop()

In [8]:

# Target Spark type for every column that needs a cast. The three lists are
# expected to be disjoint; if a name appeared in more than one, the last
# mapping below would win.
cast_types = {
    **{name: T.IntegerType() for name in int_columns},
    **{name: T.FloatType() for name in float_columns},
    **{name: T.BooleanType() for name in cre_bool_columns},
}

# Fail loudly if an expected column is absent instead of skipping its cast.
missing_columns = [name for name in cast_types if name not in df.columns]
if missing_columns:
    raise ValueError(f"columns to cast are missing from df: {missing_columns}")

# Apply all casts in ONE projection. Calling withColumn once per column in a
# loop stacks a projection per call and produces a very large query plan.
# Column order is preserved because the select follows df.columns.
df = df.select(
    [
        F.col(name).cast(cast_types[name]).alias(name) if name in cast_types else F.col(name)
        for name in df.columns
    ]
)

# A string that cannot be parsed as the target type becomes null after the
# cast, so drop nulls again on the cast columns; otherwise such rows would
# reach the count tables with null bin flags.
df = df.dropna(subset=list(cast_types))

df_cre = df

# compute count tables

All count tables will be broken down by each of the CRE types. 

## PhyloP vs rarity
- add column : "significant"/"not significant" : threshold is 2.27
- count table of significance VS category
- dump to disc

In [9]:
# Flag each variant whose PhyloP annotation (P_ANNO) is at or above 2.27.
df_phylop_significant = df_cre.withColumn(
    "phylop_significant",
    F.col("P_ANNO") >= 2.27,
)

# Count variants per (category, significance flag, CRE membership flags).
phylop_count_table = (
    df_phylop_significant
    .groupBy(["category", "phylop_significant", *cre_bool_columns])
    .count()
)

# Directory that receives the count tables.
data_base_path = "/home/mcn26/varef/scripts/noon_data/2.count/"

# Write a single CSV part file with a header, replacing any earlier output.
(
    phylop_count_table
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(data_base_path + "phylop_count_table")
)

23/12/21 10:55:38 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [10]:
# Collect the grouped counts to the driver as a pandas DataFrame so the
# notebook displays them as a table.
phylop_count_table.toPandas()

                                                                                

Unnamed: 0,category,phylop_significant,is_in_dELS,is_in_CA,is_in_pELS,is_in_CA-H3K4me3,is_in_CA-CTCF,is_in_PLS,is_in_TF,is_in_CA-TF,count
0,ULTRARARE,False,True,False,False,False,False,False,False,False,563773
1,RARE,False,True,False,False,False,False,False,False,False,98414
2,LOW_FREQ,True,False,False,True,False,False,False,False,False,606
3,SINGLETON,True,False,False,False,False,False,False,False,True,403
4,RARE,False,False,False,False,False,False,False,False,True,1365
...,...,...,...,...,...,...,...,...,...,...,...
93,MAF_OR_AC_IS_ZERO,False,False,True,False,False,False,False,False,False,3
94,MAF_OR_AC_IS_ZERO,False,False,False,False,False,True,False,False,False,2
95,MAF_OR_AC_IS_ZERO,False,False,False,False,False,False,False,True,False,6
96,MAF_OR_AC_IS_ZERO,False,False,False,False,False,False,True,False,False,3


## CADD vs rarity
Similar approach to PhyloP above.

Cutoffs are 
- All
- score≥10
- score≥20
- score≥30
- score≥40
- score≥50

In [11]:
# CADD phred cutoffs. Each cutoff yields one boolean column named
# "CADD>=<cutoff>" that is true when cadd_phred is at or above it.
cadd_thresholds = (10, 20, 30, 40, 50)
cadd_columns = ["CADD>=" + str(threshold) for threshold in cadd_thresholds]

df_cadd_cutoff = df_cre
for threshold, flag_name in zip(cadd_thresholds, cadd_columns):
    df_cadd_cutoff = df_cadd_cutoff.withColumn(flag_name, F.col("cadd_phred") >= threshold)

# Persist the flag column names in the working directory.
with open("cadd_columns.pkl", "wb") as handle:
    pickle.dump(cadd_columns, handle)

# Count variants per (category, CADD flags, CRE membership flags).
cadd_group_columns = ["category", *cadd_columns, *cre_bool_columns]
cadd_count_table = df_cadd_cutoff.groupBy(cadd_group_columns).count()

# Directory that receives the count tables.
data_base_path = "/home/mcn26/varef/scripts/noon_data/2.count/"

# Write a single CSV part file with a header, replacing any earlier output.
(
    cadd_count_table
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(data_base_path + "CADD_count_table")
)

                                                                                

## Malinois: reference activity & skew vs rarity

First, compute min and max of skew, reference activity.

While this does require aggregation of the entire dataset, min & max specifically ought to be fairly inexpensive to compute.

This will only be performed during initial testing, then values will be recorded & made into constants. 

In [12]:
# mean_alt: the average of the three per-cell-type alt columns
# (K562, HepG2, SKNSH).
k562_alt = F.col("K562__alt")
hepg2_alt = F.col("HepG2__alt")
sknsh_alt = F.col("SKNSH__alt")
df_cre = df_cre.withColumn("mean_alt", (k562_alt + hepg2_alt + sknsh_alt) / 3)

### helper functions

In [13]:
def get_column_names(var):
    """Return a list holding the first element of every entry in ``var``.

    ``var`` is an iterable of indexable entries such as ``[name, condition]``
    pairs; only position 0 of each entry is read.
    """
    return [entry[0] for entry in var]

def dump_cutoff_names_to_disc(var,name):
    """Pickle the column names found in ``var`` to the file ``<name>.pkl``.

    Saving the names means they do not have to be hard-coded in multiple
    files.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'wb') as handle:
        pickle.dump(get_column_names(var), handle)

def make_cutoff(name):
    """Build the five bins for column ``name``.

    The bins are (-Inf,1), [1,2), [2,4), [4,6) and [6,Inf); the first bin is
    the one we would call as not active.

    Returns a list of ``[bin_column_name, condition]`` pairs, where each bin
    column name is ``name`` followed by ``_`` and the interval, and each
    condition is a column expression on ``F.col(name)``.
    """
    value = F.col(name)

    bins = [[name + "_(-Inf,1)", value < 1]]
    for low, high in ((1, 2), (2, 4), (4, 6)):
        bins.append([name + f"_[{low},{high})", (value >= low) & (value < high)])
    bins.append([name + "_[6,Inf)", value >= 6])
    return bins

def apply_cutoffs(df,cutoffs):
    """Add one column per cutoff to ``df`` and return the resulting frame.

    ``cutoffs`` is an iterable of ``(column_name, condition)`` pairs; each
    pair is passed to ``withColumn`` in order. With no cutoffs, ``df`` itself
    is returned.
    """
    result = df
    for pair in cutoffs:
        column_name, condition = pair
        result = result.withColumn(column_name, condition)
    return result

In [14]:
# Bins for mean_skew, in steps of 0.5 between -1.5 and 1.5 with open-ended
# tails. Note: the middle two bins ([-0.5, 0) & [0, 0.5)) we would not call
# as emvars. The bin names are pickled below, so they are kept exactly as-is.
skew_col = F.col("mean_skew")
skew_cutoffs = [
    ["mean_skew_(-Inf, -1.5)", skew_col < -1.5],
    ["mean_skew_[-1.5, -1)", (skew_col >= -1.5) & (skew_col < -1)],
    ["mean_skew_[-1,-0.5)", (skew_col >= -1) & (skew_col < -0.5)],
    ["mean_skew_[-0.5,0)", (skew_col >= -0.5) & (skew_col < 0)],
    ["mean_skew_[0,0.5)", (skew_col >= 0) & (skew_col < 0.5)],
    ["mean_skew_[0.5,1)", (skew_col >= 0.5) & (skew_col < 1)],
    ["mean_skew_[1,1.5)", (skew_col >= 1) & (skew_col < 1.5)],
    ["mean_skew_[1.5,Inf)", skew_col >= 1.5],
]

# Save the bin names to skew_cutoffs.pkl.
dump_cutoff_names_to_disc(skew_cutoffs, "skew_cutoffs")

In [15]:
# Build the bins for the mean reference and mean alternate columns.
mean_ref_cutoffs = make_cutoff("mean_ref")
mean_alt_cutoffs = make_cutoff("mean_alt")

# Save each set of bin names to its own pickle (reference first, then alt).
dump_cutoff_names_to_disc(mean_ref_cutoffs, "mean_ref_cutoffs")
dump_cutoff_names_to_disc(mean_alt_cutoffs, "mean_alt_cutoffs")

In [16]:
# Disabled step, kept for reference: it would round mean_skew to two decimals.
##df_cre = df_cre.withColumn("mean_skew", F.round(df["mean_skew"], 2))

In [17]:
# Add the bin flag columns to df_cre: skew bins first, then mean_alt bins,
# then mean_ref bins (this fixes the order of the new columns).
for cutoff_set in (skew_cutoffs, mean_alt_cutoffs, mean_ref_cutoffs):
    df_cre = apply_cutoffs(df_cre, cutoff_set)

In [18]:
# Keep a second reference to the frame as it is at this point, with all bin
# flag columns added under their original names.
df_cre_backup=df_cre

In [19]:
# Show the DataFrame's repr, i.e. its column names and types.
df_cre

DataFrame[CHROM: string, POS: int, ID: string, REF: string, ALT: string, QUAL: string, FILTER: string, INFO: string, K562__ref: float, HepG2__ref: float, SKNSH__ref: float, K562__alt: float, HepG2__alt: float, SKNSH__alt: float, K562__skew: float, HepG2__skew: float, SKNSH__skew: float, AC: int, AN: int, AF: float, cadd_phred: float, is_in_dELS: boolean, is_in_CA: boolean, is_in_pELS: boolean, is_in_CA-H3K4me3: boolean, is_in_CA-CTCF: boolean, is_in_PLS: boolean, is_in_TF: boolean, is_in_CA-TF: boolean, P_ANNO: float, mean_ref: float, mean_skew: float, MAF: float, category: string, mean_alt: double, mean_skew_(-Inf, -1.5): boolean, mean_skew_[-1.5, -1): boolean, mean_skew_[-1,-0.5): boolean, mean_skew_[-0.5,0): boolean, mean_skew_[0,0.5): boolean, mean_skew_[0.5,1): boolean, mean_skew_[1,1.5): boolean, mean_skew_[1.5,Inf): boolean, mean_alt_(-Inf,1): boolean, mean_alt_[1,2): boolean, mean_alt_[2,4): boolean, mean_alt_[4,6): boolean, mean_alt_[6,Inf): boolean, mean_ref_(-Inf,1): boolean

In [20]:
# Grouping columns for the skew/activity table: rarity category, CRE
# membership flags, then the skew, mean_ref and mean_alt bin flags.
to_group_by = [
    "category",
    *cre_bool_columns,
    *get_column_names(skew_cutoffs),
    *get_column_names(mean_ref_cutoffs),
    *get_column_names(mean_alt_cutoffs),
]

In [21]:
# Some of the column names have commas, which can cause a problem. Replace
# every ',' with '^' and every '.' with '&' in the grouping column names.
name_fixups = str.maketrans({",": "^", ".": "&"})

renamed_column_map = {}
for original_name in to_group_by:
    renamed_column_map[original_name] = original_name.translate(name_fixups)

# Rename each grouping column on df_cre to its substituted name.
for original_name in renamed_column_map:
    df_cre = df_cre.withColumnRenamed(original_name, renamed_column_map[original_name])

In [22]:
# Count variants per combination of the renamed grouping columns.
grouped_by_bins = df_cre.groupBy(*renamed_column_map.values())
skew_and_activity_table = grouped_by_bins.count()

In [23]:
# Show the count table's repr, i.e. its column names and types.
skew_and_activity_table

DataFrame[category: string, is_in_dELS: boolean, is_in_CA: boolean, is_in_pELS: boolean, is_in_CA-H3K4me3: boolean, is_in_CA-CTCF: boolean, is_in_PLS: boolean, is_in_TF: boolean, is_in_CA-TF: boolean, mean_skew_(-Inf^ -1&5): boolean, mean_skew_[-1&5^ -1): boolean, mean_skew_[-1^-0&5): boolean, mean_skew_[-0&5^0): boolean, mean_skew_[0^0&5): boolean, mean_skew_[0&5^1): boolean, mean_skew_[1^1&5): boolean, mean_skew_[1&5^Inf): boolean, mean_ref_(-Inf^1): boolean, mean_ref_[1^2): boolean, mean_ref_[2^4): boolean, mean_ref_[4^6): boolean, mean_ref_[6^Inf): boolean, mean_alt_(-Inf^1): boolean, mean_alt_[1^2): boolean, mean_alt_[2^4): boolean, mean_alt_[4^6): boolean, mean_alt_[6^Inf): boolean, count: bigint]

In [24]:
# Write the table as a single CSV part file with a header, replacing any
# earlier output under the same path.
(
    skew_and_activity_table
    .coalesce(1)
    .write
    .mode("overwrite")
    .option("header", True)
    .csv(data_base_path + "malinouis_skew_and_thresh")
)

                                                                                

In [23]:
from pyspark.sql.functions import col
from functools import reduce
from operator import or_

# One "is null" test per column of df_cre, OR-ed together: the combined
# condition is true for a row with a null in at least one column.
null_checks = [col(name).isNull() for name in df_cre.columns]
condition = reduce(or_, null_checks)

# Keep only the rows that match and collect them to the driver as pandas.
null_rows = df_cre.filter(condition)
z = null_rows.toPandas()

#df_cre["mean_skew_[-1&5^ -1)"]

                                                                                

In [24]:
import pandas as pd
# Lift pandas' column display limit for this one display call so every
# column of z is shown.
with pd.option_context('display.max_columns', None):
    display(z)


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,K562__ref,HepG2__ref,SKNSH__ref,K562__alt,HepG2__alt,SKNSH__alt,K562__skew,HepG2__skew,SKNSH__skew,AC,AN,AF,cadd_phred,is_in_dELS,is_in_CA,is_in_pELS,is_in_CA-H3K4me3,is_in_CA-CTCF,is_in_PLS,is_in_TF,is_in_CA-TF,P_ANNO,mean_ref,mean_skew,MAF,category,mean_alt,mean_skew_(-Inf^ -1&5),mean_skew_[-1&5^ -1),mean_skew_[-1^-0&5),mean_skew_[-0&5^0),mean_skew_[0^0&5),mean_skew_[0&5^1),mean_skew_[1^1&5),mean_skew_[1&5^Inf),mean_alt_(-Inf^1),mean_alt_[1^2),mean_alt_[2^4),mean_alt_[4^6),mean_alt_[6^Inf),mean_ref_(-Inf^1),mean_ref_[1^2),mean_ref_[2^4),mean_ref_[4^6),mean_ref_[6^Inf)
