We will generate a number of count tables, described in sections below. 

## import 

In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql import functions as F
import pyspark.sql.types as T
import pickle
import pandas as pd

## create a spark session

In [2]:
# Spark configuration for this notebook; only the application name is set.
# Written on one line: the previous form ended in a dangling "\" continuation,
# which would silently absorb any statement placed directly beneath it.
conf = SparkConf().setAppName("Count")

# If this cell is re-run in a live kernel, stop the session left over from the
# previous run before creating a new context.
if globals().get("spark") is not None:
    spark.stop()

# Create a SparkContext with the specified configurations
sc = SparkContext(conf=conf)

# Create a SparkSession from the SparkContext
spark = SparkSession(sc)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/22 21:57:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Load the gnomAD variants annotated in the previous script

In [3]:
#loading in all autosomes
#Skipping sex chromosomes, see readme
# The glob matches every "*.csv" entry under 2.filter and every "*.csv" file inside each of them.
filtered_variants_glob = "/gpfs/gibbs/pi/reilly/VariantEffects/scripts/noon_data/2.filter/*.csv/*.csv"

# Lines starting with "#" are treated as comments; fields are comma-separated.
csv_reader = spark.read.option("comment", "#").option("delimiter", ",")
df = csv_reader.csv(filtered_variants_glob, header=True)

24/01/22 21:57:34 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

## Cast columns to the appropriate types & drop rows with null values

Dropping isn't strictly necessary. We could, for example, drop only the rows with a null Malinois skew when computing Malinois-skew-based metrics, drop only the rows with no PhyloP score when computing PhyloP-based metrics, and so on. However, each graph would then summarize a different set of variants, which could introduce bias — for example, if PhyloP scores are annotated for a non-random set of variants. Therefore I will drop rows with null data in any relevant columns prior to subsequent analysis.

In [4]:
# Columns that are cast to integers further down.
int_columns = ["POS", "AC", "AN"]

# Columns that are cast to floats further down.
float_columns = [
    "AF",
    "K562__ref", "HepG2__ref", "SKNSH__ref",
    "K562__alt", "HepG2__alt", "SKNSH__alt",
    "K562__skew", "HepG2__skew", "SKNSH__skew",
    "cadd_phred", "P_ANNO", "mean_ref", "mean_skew", "MAF",
]

# Every loaded column whose name starts with "is_in" is treated as a boolean flag.
cre_bool_columns = [column for column in df.columns if column.startswith("is_in")]

In [5]:
# Drop every row that has a null in ANY column: no subset is passed, so all columns are checked.
# A narrower alternative that was considered and left disabled:
#   subset=["CHROM","POS","cadd_phred","P_ANNO","mean_ref","mean_skew","category"]+cre_bool_columns
df = df.na.drop()

In [6]:

# Cast each group of columns to its target Spark type. Groups are processed in
# the same order as before: integers, then floats, then booleans.
cast_plan = [
    (int_columns, T.IntegerType()),
    (float_columns, T.FloatType()),
    (cre_bool_columns, T.BooleanType()),
]

for column_group, spark_type in cast_plan:
    for column in column_group:
        df = df.withColumn(column, F.col(column).cast(spark_type))

# From here on the typed frame is referred to as df_cre.
df_cre = df

### Compute pleiotropy

"Pleiotropy" here refers to a variant which is an emVar in multiple cell types. We call a variant an emVar when abs(skew) >= 0.5 and max(alt activity, ref activity) >= 1.

In [7]:
# Step 1: flag, per cell type, whether the variant is an emVar there.
# A variant is an emVar when |skew| >= 0.5 and the larger of its ref/alt activities is >= 1.0.
for cell_type in ["K562", "SKNSH", "HepG2"]:
    has_skew = F.abs(F.col(f"{cell_type}__skew")) >= 0.5
    is_active = F.greatest(F.col(f"{cell_type}__ref"), F.col(f"{cell_type}__alt")) >= 1.0
    df_cre = df_cre.withColumn(f"emVar_{cell_type}", has_skew & is_active)

# Step 2: pleiotropy is the number of cell types in which the variant is an emVar,
# obtained by adding up the three boolean flags cast to integers.
emvar_k562 = F.col("emVar_K562").cast("int")
emvar_sknsh = F.col("emVar_SKNSH").cast("int")
emvar_hepg2 = F.col("emVar_HepG2").cast("int")
df_cre = df_cre.withColumn("pleio", emvar_k562 + emvar_sknsh + emvar_hepg2)

# compute count tables

All count tables will be broken down by each of the CRE types. 

## PhyloP vs rarity
- add column : "significant"/"not significant" : threshold is 2.27
- count table of significance VS category
- dump to disk

In [8]:
# A variant is flagged as significantly conserved when its P_ANNO score is at least 2.27.
is_phylop_significant = F.col("P_ANNO") >= 2.27
df_phylop_significant = df_cre.withColumn("phylop_significant", is_phylop_significant)

In [None]:
# Count variants per (category, significance flag, CRE membership flags) combination.
phylop_group_columns = ["category", "phylop_significant"] + cre_bool_columns
phylop_count_table = df_phylop_significant.groupBy(phylop_group_columns).count()

In [8]:
# Output directory for every count table written by this notebook. Table names are
# appended with plain string concatenation, so the trailing "/" is required.
data_base_path="/home/mcn26/varef/scripts/noon_data/3.count/"

In [12]:
#ACTION
# Collapse to a single partition so one CSV part file is produced, then overwrite any previous output.
phylop_count_output = data_base_path + "phylop_count_table"
phylop_count_table.coalesce(1).write.mode("overwrite").option("header", True).csv(phylop_count_output)

                                                                                

## PhyloP VS pleiotropy
Are variants which cause skew in different numbers of cell-types conserved more or less?

In [10]:
# Count variants per (pleiotropy level, significance flag, CRE membership flags) combination.
phylop_pleio_group_columns = ["pleio", "phylop_significant"] + cre_bool_columns
phylop_pleio = df_phylop_significant.groupBy(phylop_pleio_group_columns).count()

In [11]:
#ACTION
# Collapse to a single partition so one CSV part file is produced, then overwrite any previous output.
phylop_pleio_output = data_base_path + "phylop_pleio"
phylop_pleio.coalesce(1).write.mode("overwrite").option("header", True).csv(phylop_pleio_output)

24/01/22 18:08:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

## CADD vs rarity
This uses a similar approach to the PhyloP analysis above.

Cutoffs are 
- All
- score≥10
- score≥20
- score≥30
- score≥40
- score≥50

In [13]:
# Add one boolean column per CADD phred threshold ("CADD>=10" ... "CADD>=50"),
# collecting the column names in the order they are created.
cadd_columns = []
df_cadd_cutoff = df_cre
for cadd_threshold in (10, 20, 30, 40, 50):
    cadd_flag_name = f"CADD>={cadd_threshold}"
    df_cadd_cutoff = df_cadd_cutoff.withColumn(cadd_flag_name, F.col("cadd_phred") >= cadd_threshold)
    cadd_columns.append(cadd_flag_name)

# Persist the column names next to the notebook (path is relative to the working directory).
with open("cadd_columns.pkl", 'wb') as file:
    pickle.dump(cadd_columns, file)

# Count variants per (category, CADD threshold flags, CRE membership flags) combination.
cadd_group_columns = ["category"] + cadd_columns + cre_bool_columns
cadd_count_table = df_cadd_cutoff.groupBy(cadd_group_columns).count()


In [22]:
#ACTION
# Collapse to a single partition so one CSV part file is produced, then overwrite any previous output.
cadd_count_output = data_base_path + "CADD_count_table"
cadd_count_table.coalesce(1).write.mode("overwrite").option("header", True).csv(cadd_count_output)

ERROR:root:KeyboardInterrupt while sending command.         (1726 + 10) / 10743]
Traceback (most recent call last):
  File "/home/mcn26/.conda/envs/mcn_vareff/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/mcn26/.conda/envs/mcn_vareff/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/home/mcn26/.conda/envs/mcn_vareff/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

KeyboardInterrupt: 



## CADD VS pleiotropy

In [None]:
# Count variants per (pleiotropy level, CADD threshold flags, CRE membership flags) combination.
cadd_pleio_group_columns = ["pleio"] + cadd_columns + cre_bool_columns
cadd_pleio_table = df_cadd_cutoff.groupBy(cadd_pleio_group_columns).count()

In [None]:
#ACTION
# Collapse to a single partition so one CSV part file is produced, then overwrite any previous output.
cadd_pleio_output = data_base_path + "CADD_pleio"
cadd_pleio_table.coalesce(1).write.mode("overwrite").option("header", True).csv(cadd_pleio_output)

## Malinois: reference activity & skew vs rarity


In [9]:
## add a mean_alt column
# mean_alt is the unweighted average of the alt-allele activity over the three cell types.
alt_activity_total = F.col("K562__alt") + F.col("HepG2__alt") + F.col("SKNSH__alt")
df_cre = df_cre.withColumn("mean_alt", alt_activity_total / 3)

### helper functions

In [10]:
def get_column_names(var):
    """Return the first element of every entry in `var`, in order.

    `var` is an iterable of indexable entries such as the [name, condition]
    pairs built for the cutoff columns; only the name at index 0 is kept.
    """
    return [entry[0] for entry in var]

def dump_cutoff_names_to_disc(var,name):
    """Pickle the column names of the cutoff list `var` to the file `<name>.pkl`.

    Writing the names out means other files can load them instead of
    hard-coding the same names again.
    """
    pickle_path = name + '.pkl'
    with open(pickle_path, 'wb') as handle:
        pickle.dump(get_column_names(var), handle)

# Bins produced: (-Inf,-6), [-6,-5), [-5,-4), ..., [5,6), [6,Inf)
def make_cutoff(name):
    """Build [label, condition] pairs that bin column `name` into unit-wide intervals.

    The result has 14 entries: one open-ended bin below -6, twelve half-open
    bins [i, i+1) for i from -6 to 5, and one open-ended bin from 6 upwards.
    Each label is "<name>_<interval>" and each condition is a boolean Column
    expression on `name`.
    """
    column = F.col(name)

    cutoffs = [[f"{name}_(-Inf,-6)", (column < -6)]]
    for lower in range(-6, 6):
        upper = lower + 1
        cutoffs.append([f"{name}_[{lower},{upper})", (column >= lower) & (column < upper)])
    cutoffs.append([f"{name}_[6,Inf)", (column >= 6)])
    return cutoffs

def apply_cutoffs(df,cutoffs):
    """Return `df` with one extra column per (name, condition) pair in `cutoffs`.

    Pairs are applied in order through `withColumn`; the frame passed in is
    not rebound, and it is returned unchanged when `cutoffs` is empty.
    """
    result = df
    for pair in cutoffs:
        column_name, condition = pair
        result = result.withColumn(column_name, condition)
    return result

In [11]:
# Sanity check: show the labels and conditions generated for a dummy column name.
make_cutoff("test")

[['test_(-Inf,-6)', Column<'(test < -6)'>],
 ['test_[-6,-5)', Column<'((test >= -6) AND (test < -5))'>],
 ['test_[-5,-4)', Column<'((test >= -5) AND (test < -4))'>],
 ['test_[-4,-3)', Column<'((test >= -4) AND (test < -3))'>],
 ['test_[-3,-2)', Column<'((test >= -3) AND (test < -2))'>],
 ['test_[-2,-1)', Column<'((test >= -2) AND (test < -1))'>],
 ['test_[-1,0)', Column<'((test >= -1) AND (test < 0))'>],
 ['test_[0,1)', Column<'((test >= 0) AND (test < 1))'>],
 ['test_[1,2)', Column<'((test >= 1) AND (test < 2))'>],
 ['test_[2,3)', Column<'((test >= 2) AND (test < 3))'>],
 ['test_[3,4)', Column<'((test >= 3) AND (test < 4))'>],
 ['test_[4,5)', Column<'((test >= 4) AND (test < 5))'>],
 ['test_[5,6)', Column<'((test >= 5) AND (test < 6))'>],
 ['test_[6,Inf)', Column<'(test >= 6)'>]]

In [33]:


# Bin edges are expressed as integer indices; multiplying an index by 0.5 gives the edge value.
start_int = -9   # first index; it produces the open-ended (-Inf, -4.0) bin
end_int = 9      # exclusive upper index; the last index (end_int - step_int) produces the open-ended top bin
step_int = 1     # one index step is one 0.5-wide bin

skew_cutoffs = []
for bin_index in range(start_int, end_int, step_int):
    if bin_index == start_int:
        # Everything below -4.0.
        skew_entry = ["mean_skew_(-Inf, -4.0)", (F.col("mean_skew") < -4.0)]
    elif bin_index == end_int - step_int:
        # Everything from 4.0 upwards. NOTE(review): the label opens with "(" although the
        # bound is inclusive (>=); the label is kept unchanged because it becomes a column name.
        skew_entry = ["mean_skew_(4.0, Inf)", (F.col("mean_skew") >= 4.0)]
    else:
        # Half-open interior bin [lower, upper).
        lower_edge = bin_index * 0.5
        upper_edge = (bin_index + step_int) * 0.5
        skew_entry = [
            f"mean_skew_[{lower_edge:.1f}, {upper_edge:.1f})",
            (F.col("mean_skew") >= lower_edge) & (F.col("mean_skew") < upper_edge),
        ]
    skew_cutoffs.append(skew_entry)

# Save the bin column names so other files can load them.
dump_cutoff_names_to_disc(skew_cutoffs,"skew_cutoffs")

In [23]:
# Print each [label, condition] pair on its own line to eyeball the generated bins.
for cutoff_entry in skew_cutoffs:
    print(cutoff_entry)

['mean_skew_(-Inf, -4.0)', Column<'(mean_skew < -4.0)'>]
['mean_skew_[-4.0, -3.5)', Column<'((mean_skew >= -4.0) AND (mean_skew < -3.5))'>]
['mean_skew_[-3.5, -3.0)', Column<'((mean_skew >= -3.5) AND (mean_skew < -3.0))'>]
['mean_skew_[-3.0, -2.5)', Column<'((mean_skew >= -3.0) AND (mean_skew < -2.5))'>]
['mean_skew_[-2.5, -2.0)', Column<'((mean_skew >= -2.5) AND (mean_skew < -2.0))'>]
['mean_skew_[-2.0, -1.5)', Column<'((mean_skew >= -2.0) AND (mean_skew < -1.5))'>]
['mean_skew_[-1.5, -1.0)', Column<'((mean_skew >= -1.5) AND (mean_skew < -1.0))'>]
['mean_skew_[-1.0, -0.5)', Column<'((mean_skew >= -1.0) AND (mean_skew < -0.5))'>]
['mean_skew_[-0.5, 0.0)', Column<'((mean_skew >= -0.5) AND (mean_skew < 0.0))'>]
['mean_skew_[0.0, 0.5)', Column<'((mean_skew >= 0.0) AND (mean_skew < 0.5))'>]
['mean_skew_[0.5, 1.0)', Column<'((mean_skew >= 0.5) AND (mean_skew < 1.0))'>]
['mean_skew_[1.0, 1.5)', Column<'((mean_skew >= 1.0) AND (mean_skew < 1.5))'>]
['mean_skew_[1.5, 2.0)', Column<'((mean_skew

In [24]:
# Build the unit-wide bins for the mean reference and mean alternate activity columns.
mean_ref_cutoffs = make_cutoff("mean_ref")
mean_alt_cutoffs = make_cutoff("mean_alt")

# Save each set of bin column names under a pickle named after the variable.
for cutoff_list, pickle_stem in (
    (mean_ref_cutoffs, "mean_ref_cutoffs"),
    (mean_alt_cutoffs, "mean_alt_cutoffs"),
):
    dump_cutoff_names_to_disc(cutoff_list, pickle_stem)

In [12]:
##df_cre = df_cre.withColumn("mean_skew", F.round(df["mean_skew"], 2))

In [25]:
# Add the boolean bin columns to df_cre: skew bins first, then mean_alt, then mean_ref.
for cutoff_list in (skew_cutoffs, mean_alt_cutoffs, mean_ref_cutoffs):
    df_cre = apply_cutoffs(df_cre, cutoff_list)

In [26]:
# Keep a second reference to the frame as it is before the grouping columns are renamed below.
df_cre_backup=df_cre

In [27]:
# Bare expression: the notebook displays the DataFrame's repr (column names and types).
df_cre

DataFrame[CHROM: string, POS: int, ID: string, REF: string, ALT: string, QUAL: string, FILTER: string, INFO: string, K562__ref: float, HepG2__ref: float, SKNSH__ref: float, K562__alt: float, HepG2__alt: float, SKNSH__alt: float, K562__skew: float, HepG2__skew: float, SKNSH__skew: float, AC: int, AN: int, AF: float, cadd_phred: float, is_in_dELS: boolean, is_in_CA: boolean, is_in_pELS: boolean, is_in_CA-H3K4me3: boolean, is_in_CA-CTCF: boolean, is_in_PLS: boolean, is_in_TF: boolean, is_in_CA-TF: boolean, P_ANNO: float, mean_ref: float, mean_skew: float, MAF: float, category: string, emVar_K562: boolean, emVar_SKNSH: boolean, emVar_HepG2: boolean, pleio: int, mean_alt: double, mean_skew_(-Inf, -4.0): boolean, mean_skew_[-4.0, -3.5): boolean, mean_skew_[-3.5, -3.0): boolean, mean_skew_[-3.0, -2.5): boolean, mean_skew_[-2.5, -2.0): boolean, mean_skew_[-2.0, -1.5): boolean, mean_skew_[-1.5, -1.0): boolean, mean_skew_[-1.0, -0.5): boolean, mean_skew_[-0.5, 0.0): boolean, mean_skew_[0.0, 0.5)

In [28]:
# Grouping keys: category, the CRE membership flags, then the skew, mean_ref and mean_alt bin flags.
cutoff_flag_columns = (
    get_column_names(skew_cutoffs)
    + get_column_names(mean_ref_cutoffs)
    + get_column_names(mean_alt_cutoffs)
)
to_group_by = ["category"] + cre_bool_columns + cutoff_flag_columns

In [29]:
# Some of the column names contain commas and periods, which can cause a problem.
# Replace every "," with "^" and every "." with "&".
renamed_column_map = {}
for original_name in to_group_by:
    renamed_column_map[original_name] = original_name.replace(',', '^').replace('.', '&')

# Apply the renames one column at a time.
for old_name in renamed_column_map:
    df_cre = df_cre.withColumnRenamed(old_name, renamed_column_map[old_name])

In [30]:
# Count variants per combination of the renamed grouping columns.
renamed_group_columns = list(renamed_column_map.values())
skew_and_activity_table = df_cre.groupBy(renamed_group_columns).count()

In [31]:
# Bare expression: the notebook displays the DataFrame's repr (column names and types).
skew_and_activity_table

DataFrame[category: string, is_in_dELS: boolean, is_in_CA: boolean, is_in_pELS: boolean, is_in_CA-H3K4me3: boolean, is_in_CA-CTCF: boolean, is_in_PLS: boolean, is_in_TF: boolean, is_in_CA-TF: boolean, mean_skew_(-Inf^ -4&0): boolean, mean_skew_[-4&0^ -3&5): boolean, mean_skew_[-3&5^ -3&0): boolean, mean_skew_[-3&0^ -2&5): boolean, mean_skew_[-2&5^ -2&0): boolean, mean_skew_[-2&0^ -1&5): boolean, mean_skew_[-1&5^ -1&0): boolean, mean_skew_[-1&0^ -0&5): boolean, mean_skew_[-0&5^ 0&0): boolean, mean_skew_[0&0^ 0&5): boolean, mean_skew_[0&5^ 1&0): boolean, mean_skew_[1&0^ 1&5): boolean, mean_skew_[1&5^ 2&0): boolean, mean_skew_[2&0^ 2&5): boolean, mean_skew_[2&5^ 3&0): boolean, mean_skew_[3&0^ 3&5): boolean, mean_skew_[3&5^ 4&0): boolean, mean_skew_(4&0^ Inf): boolean, mean_ref_(-Inf^-6): boolean, mean_ref_[-6^-5): boolean, mean_ref_[-5^-4): boolean, mean_ref_[-4^-3): boolean, mean_ref_[-3^-2): boolean, mean_ref_[-2^-1): boolean, mean_ref_[-1^0): boolean, mean_ref_[0^1): boolean, mean_ref_

In [32]:
# Collapse to a single partition so one CSV part file is produced, then overwrite any previous output.
skew_and_activity_output = data_base_path + "malinouis_skew_and_thresh"
skew_and_activity_table.coalesce(1).write.mode("overwrite").option("header", True).csv(skew_and_activity_output)

24/01/22 22:15:54 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [48]:
from pyspark.sql.functions import col
from functools import reduce
from operator import or_

# Build one "is null" test per column and OR them together, so the combined
# condition is true for any row that has a null in at least one column.
null_checks = [col(column_name).isNull() for column_name in df_cre.columns]
condition = reduce(or_, null_checks)

# Applying the filter and pulling the matching rows into a pandas frame on the driver.
null_rows = df_cre.filter(condition)
z = null_rows.toPandas()

#df_cre["mean_skew_[-1&5^ -1)"]

                                                                                

In [49]:
import pandas as pd

# Lift the column display limit for this one display call so every column of z is shown.
show_all_columns = pd.option_context('display.max_columns', None)
with show_all_columns:
    display(z)


Unnamed: 0,CHROM,POS,ID,REF,ALT,QUAL,FILTER,INFO,K562__ref,HepG2__ref,SKNSH__ref,K562__alt,HepG2__alt,SKNSH__alt,K562__skew,HepG2__skew,SKNSH__skew,AC,AN,AF,cadd_phred,is_in_dELS,is_in_CA,is_in_pELS,is_in_CA-H3K4me3,is_in_CA-CTCF,is_in_PLS,is_in_TF,is_in_CA-TF,P_ANNO,mean_ref,mean_skew,MAF,category,mean_alt,mean_skew_(-Inf^ -1&4),mean_skew_[-1&20^ -1&00),mean_skew_[-1&00^ -0&80),mean_skew_[-0&80^ -0&60),mean_skew_[-0&60^ -0&40),mean_skew_[-0&40^ -0&20),mean_skew_[-0&20^ 0&00),mean_skew_[0&00^ 0&20),mean_skew_[0&20^ 0&40),mean_skew_[0&40^ 0&60),mean_skew_[0&60^ 0&80),mean_skew_[0&80^ 1&00),mean_skew_[1&00^ 1&20),mean_skew_[1&20^ 1&40),mean_skew_(1&4^ Inf),mean_alt_(-Inf^1),mean_alt_[1^2),mean_alt_[2^3),mean_alt_[3^4),mean_alt_[4^5),mean_alt_[5^6),mean_alt_[6^Inf),mean_ref_(-Inf^1),mean_ref_[1^2),mean_ref_[2^3),mean_ref_[3^4),mean_ref_[4^5),mean_ref_[5^6),mean_ref_[6^Inf)
