In [1]:
%%configure -f
{
    "driverMemory": "16G", "driverCores": 8,
    "executorMemory": "8G", "executorCores": 6, "numExecutors": 3,
    "conf": {
        "spark.jars.packages": "org.mongodb.spark:mongo-spark-connector_2.12:10.4.0,tools.kot.nk2:connector:1.3.7"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
6,,pyspark,idle,,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
6,,pyspark,idle,,,,✔


In [2]:
from typing import List

from pyspark import SparkFiles
from subprocess import call
import sys


def install_deps(deps: List[str]) -> None:
    """Install Python packages with pip into the SparkFiles root directory.

    Runs ``<current interpreter> -m pip install -q -t <SparkFiles root> <deps...>``
    in a subprocess.

    Args:
        deps: pip requirement specifiers to install. An empty list is a no-op.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    # Local import: the file-level import only provides `call`, which returns
    # the exit status instead of raising, so a failed install went unnoticed
    # until a later `import` broke.
    from subprocess import check_call

    if not deps:
        # pip rejects an install command without requirements; nothing to do.
        return

    target_dir = SparkFiles.getRootDirectory()
    check_call([sys.executable, '-m', 'pip', 'install', '-q', '-t', target_dir, *deps])


# Install the third-party packages used by the cells below.
# Versions are not pinned, so the resolved versions may differ between sessions.
install_deps([
    'numpy',
    'matplotlib',
    'pandas',
    'scipy',
    'seaborn',
    'statsmodels',
    'pyarrow',
    'pymongo',
])

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [6]:
from pyspark.sql import functions as F, types as T
import numpy as np
from scipy import stats
from statsmodels.sandbox.stats.multicomp import multipletests

@F.udf(T.ArrayType(T.DoubleType()))
def diff(A, B):
    """Spark UDF: element-wise absolute difference |A - B|, returned as a plain list."""
    first = np.array(A)
    second = np.array(B)
    return np.abs(first - second).tolist()

@F.udf(T.DoubleType())
def var(A):
    """Spark UDF: variance of A as computed by `np.var` with its default arguments."""
    variance = np.var(A)
    return float(variance)

@F.udf(T.DoubleType())
def avg(A):
    """Spark UDF: arithmetic mean of A (via `np.mean`) as a Python float."""
    mean_value = np.mean(A)
    return float(mean_value)

@F.udf(T.DoubleType())
def mannwhiteneyu(ref, mod):
    """Spark UDF: p-value of the two-sided Mann-Whitney U test between `ref` and `mod`."""
    ref_sample = np.array(ref)
    mod_sample = np.array(mod)
    test = stats.mannwhitneyu(ref_sample, mod_sample, alternative='two-sided')
    return float(test.pvalue)

@F.udf(T.DoubleType())
def bonferroni_correction(pvalues, alpha=0.05):
    """Spark UDF: mean of the Bonferroni-corrected p-values.

    `multipletests` returns a tuple whose second item holds the corrected
    p-values; only that item is used, and its mean is returned as a float.
    """
    corrected = multipletests(pvalues, alpha=alpha, method='bonferroni')[1]
    return float(np.mean(corrected))


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

sc: SparkContext
spark: SparkSession

# Read the `project_configuration` collection of the `enhancer3d` database
# through the "mongodb" Spark data source.
mongo_reader = spark.read.format("mongodb").options(
    database="enhancer3d",
    collection="project_configuration",
)
project_configuration_df = mongo_reader.load()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
# One row per (project_id, ensemble_id): the list of ensemble ids found under
# `datasets` is exploded directly inside the select.
ensembles_list_by_project_df = project_configuration_df.select(
    F.col('_id.project_id').alias('project_id'),
    F.explode(F.col('datasets.ensemble_id')).alias('ensemble_id'),
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
%%pretty
# Preview the first five rows without truncating the long ensemble ids.
ensembles_list_by_project_df.show(5, truncate=False)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

project_id,ensemble_id
8k_models_project_GM12878,models3D_GM12878_Deni_models3D_GM12878_Deni_mod_results_GM12878_Deni_chr7_54723172_57700542
8k_models_project_GM12878,models3D_GM12878_Deni_models3D_GM12878_Deni_ref2_results_GM12878_Deni_chr7_0_1069141
8k_models_project_GM12878,models3D_GM12878_Deni_models3D_GM12878_Deni_results_GM12878_Deni_chr12_5480623_8570102
8k_models_project_GM12878,models3D_GM12878_Nean_models3D_GM12878_Nean_mod_results_GM12878_Nean_chr3_127434292_130363324
8k_models_project_GM12878,models3D_GM12878_Deni_models3D_GM12878_Deni_mod_results_GM12878_Deni_chr21_12254928_14481319


In [9]:
# Cell lines whose projects are searched for `..._results` ensembles.
# NOTE(review): GM12878 used to be listed twice in each filter (a no-op under
# OR); if a fourth cell line was meant instead, add it here.
REFERENCE_CELL_LINES = ('GM12878', 'HFFC6', 'H1ESC')


def _results_ensembles(ensembles_df, tag, cell_lines=REFERENCE_CELL_LINES):
    """Keep the `<tag>_results` ensembles of each cell line's own project.

    A row is kept when, for some cell line, `project_id` equals
    `8k_models_project_<cell_line>` and `ensemble_id` starts with
    `models3D_<cell_line>_<tag>_models3D_<cell_line>_<tag>_results`.

    Args:
        ensembles_df: DataFrame with `project_id` and `ensemble_id` columns.
        tag: token embedded in the ensemble ids ('Nean' or 'Deni' in this notebook).
        cell_lines: cell line names to include.

    Returns:
        The filtered DataFrame.

    Raises:
        ValueError: if `cell_lines` is empty (there would be nothing to filter on).
    """
    if not cell_lines:
        raise ValueError("cell_lines must contain at least one cell line")

    condition = None
    for cell_line in cell_lines:
        prefix = f'models3D_{cell_line}_{tag}_models3D_{cell_line}_{tag}_results'
        clause = (
            (F.col('project_id') == f'8k_models_project_{cell_line}')
            # startswith compares the prefix literally; in a LIKE pattern every
            # `_` is a single-character wildcard, which is looser than intended.
            & F.col('ensemble_id').startswith(prefix)
        )
        condition = clause if condition is None else (condition | clause)
    return ensembles_df.where(condition)


all_relevant_ensembles_df_nean = _results_ensembles(ensembles_list_by_project_df, 'Nean')

all_relevant_ensembles_df_deni = _results_ensembles(ensembles_list_by_project_df, 'Deni')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [10]:
import os
from pyspark.sql import Row
from typing import List, Optional
import json

configurations = {}

def register_mongo_connector_configuration(
    configuration_id: str,
    structure: T.StructType,
    mongo_uri: Optional[str] = None,
    database: Optional[str] = None,
    collection: Optional[str] = None,
    root_condition: str = 'or',
    projection: Optional[List[str]] = None,
):
    """Store a connector configuration and register its Java UDF with Spark.

    The configuration is saved in the module-level `configurations` dict under
    `configuration_id`, and the Java class
    `tools.kot.nk2.connector.MongoConnector` is registered as the SQL function
    `loadMongoBatch<configuration_id>` returning `array<structure>`.

    Args:
        configuration_id: suffix of the registered function name; must consist
            of ASCII letters, digits and underscores because it becomes part of
            a SQL function name.
        structure: schema of one returned document.
        mongo_uri: connection URI; defaults to the `MONGO_URI` environment variable.
        database: database name; defaults to `MONGO_DATABASE`, then "enhancer3d".
        collection: collection name; defaults to `MONGO_COLLECTION`, then
            "distance_calculation".
        root_condition: stored as 'rootCondition'; interpreted by the Java
            connector (its semantics are not visible from this notebook).
        projection: field paths stored as 'projection'; `None` becomes `[]`.

    Raises:
        ValueError: if `configuration_id` is None or is not a valid
            function-name suffix.
    """
    import re

    if configuration_id is None:
        raise ValueError("configuration_id must be provided")

    # The id is interpolated into a function name and later into a SQL
    # expression, so reject anything that could not be part of an identifier.
    if not re.fullmatch(r'\w+', str(configuration_id), re.ASCII):
        raise ValueError(
            f"configuration_id must contain only letters, digits and underscores, "
            f"got {configuration_id!r}"
        )

    if mongo_uri is None:
        # SECURITY(review): the fallback URI embeds a username and password in
        # the notebook source. Kept so existing runs still work, but it should
        # be removed and the credential rotated; supply MONGO_URI instead.
        mongo_uri = os.environ.get("MONGO_URI", "mongodb://mongo:Flkj234KJFsdzipArch@mongo:27017")

    if database is None:
        database = os.environ.get("MONGO_DATABASE", "enhancer3d")

    if collection is None:
        collection = os.environ.get("MONGO_COLLECTION", "distance_calculation")

    configurations[configuration_id] = {
        'connectionUri': mongo_uri,
        'databaseName': database,
        'collectionName': collection,
        'rootCondition': root_condition,
        'projection': projection or [],
        'structure': structure.jsonValue()
    }

    spark.udf.registerJavaFunction(
        f"loadMongoBatch{configuration_id}",
        "tools.kot.nk2.connector.MongoConnector",
        T.ArrayType(structure)
    )

def load_mongo_batch(configuration_id: str, column_name: str = 'criteria'):
    """Build the Column that calls the registered `loadMongoBatch<id>` function.

    The stored configuration is serialised to JSON and passed as the first
    argument (a SQL string literal); `column_name` is spliced in as the second
    argument.

    Args:
        configuration_id: id previously passed to
            `register_mongo_connector_configuration`.
        column_name: name of the column (or SQL expression) holding the criteria.

    Returns:
        A Column expression built with `F.expr`.

    Raises:
        KeyError: if no configuration was registered under `configuration_id`
            (previously this silently sent the JSON literal `null`).
    """
    if configuration_id not in configurations:
        raise KeyError(f"No Mongo connector configuration registered as {configuration_id!r}")

    configuration_json = json.dumps(configurations[configuration_id])
    # The JSON sits inside a single-quoted SQL literal. With Spark's default
    # parser settings, backslashes are escape characters and a quote would end
    # the literal, so escape both (backslashes first).
    sql_literal = configuration_json.replace('\\', '\\\\').replace("'", "\\'")
    return F.expr(f"loadMongoBatch{configuration_id}('{sql_literal}', {column_name})")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [11]:
# Schema of the compound `_id` sub-document: five nullable string keys.
distances_id_struct = T.StructType([
    T.StructField(key, T.StringType(), True)
    for key in ('project_id', 'ensemble_id', 'region_id', 'gene_id', 'enh_id')
])

# Schema of one document returned by the 'DistancesQuery' configuration.
distances_struct = T.StructType([
    T.StructField('_id', distances_id_struct),
    T.StructField('gene_type', T.StringType(), True),
    T.StructField('avg_dist', T.DoubleType(), True),
    T.StructField('enh_tSS_distance', T.DoubleType(), True),
    T.StructField('project_cell_lines', T.ArrayType(T.StringType()), True),
])

# Field paths requested from the collection; they mirror the schema above.
distances_projection = [
    '_id.project_id',
    '_id.ensemble_id',
    '_id.region_id',
    '_id.gene_id',
    '_id.enh_id',
    'gene_type',
    'avg_dist',
    'enh_tSS_distance',
    'project_cell_lines',
]

register_mongo_connector_configuration(
    configuration_id='DistancesQuery',
    structure=distances_struct,
    projection=distances_projection,
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [12]:
# Number of batches the ensemble list is split into; every batch becomes one
# row and therefore one call of the connector function.
# NOTE(review): presumably 18 = 3 executors x 6 cores from the session config — confirm.
MONGO_BATCH_COUNT = 18


def _load_distances(ensembles_df, batch_count=MONGO_BATCH_COUNT):
    """Fetch the 'DistancesQuery' documents for the given ensembles.

    Args:
        ensembles_df: DataFrame with `project_id` and `ensemble_id` columns.
        batch_count: number of batches the rows are distributed over.

    Returns:
        DataFrame with columns `_id`, `gene_type`, `avg_dist`,
        `enh_tSS_distance` and `project_cell_lines`, one row per returned
        document.
    """
    criteria_df = (
        ensembles_df
        # Ids are unique but not consecutive, so batch sizes are only roughly even.
        .withColumn('batch_id', F.monotonically_increasing_id() % batch_count)
        .groupBy('batch_id')
        .agg(
            F.collect_list(
                F.struct(
                    F.col('project_id').alias('_id.project_id'),
                    F.col('ensemble_id').alias('_id.ensemble_id'),
                )
            ).alias('criteria')
        )
    )
    return (
        criteria_df
        # Load full data: one array of documents per batch
        .select(
            load_mongo_batch(configuration_id='DistancesQuery', column_name='criteria').alias('data')
        )
        # Explode the data: one row per document
        .select(
            F.explode(F.col('data')).alias('data')
        )
        .select(
            F.col('data._id').alias('_id'),
            F.col('data.gene_type').alias('gene_type'),
            F.col('data.avg_dist').alias('avg_dist'),
            F.col('data.enh_tSS_distance').alias('enh_tSS_distance'),
            F.col('data.project_cell_lines').alias('project_cell_lines')
        )
    )


distances_query_nean_df = _load_distances(all_relevant_ensembles_df_nean)

distances_query_deni_df = _load_distances(all_relevant_ensembles_df_deni)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [15]:
def _protein_coding_distances(distances_df, alias):
    """Flatten the protein-coding distance rows and tag the frame with `alias`.

    Args:
        distances_df: DataFrame with `_id` (struct), `gene_type`,
            `project_cell_lines`, `avg_dist` and `enh_tSS_distance` columns.
        alias: DataFrame alias used to qualify columns in later joins.

    Returns:
        DataFrame with columns `project_id`, `ensemble_id`, `region_id`,
        `gene_id`, `enh_id`, `cell_line`, `avg_dist`, `enh_tSS_distance`.
    """
    return (
        distances_df
        .where(F.col('gene_type') == 'protein_coding')
        .select(
            F.col('_id.project_id').alias('project_id'),
            F.col('_id.ensemble_id').alias('ensemble_id'),
            F.col('_id.region_id').alias('region_id'),
            F.col('_id.gene_id').alias('gene_id'),
            F.col('_id.enh_id').alias('enh_id'),
            # element_at is 1-based: take the first listed cell line
            F.element_at(F.col('project_cell_lines'), 1).alias('cell_line'),
            'avg_dist',
            'enh_tSS_distance'
        )
        # Keep only the part of gene_id before the first dot (XXX.N -> XXX).
        # Raw string: '\.' in a normal string literal is an invalid escape sequence.
        .withColumn('gene_id', F.split(F.col('gene_id'), r'\.')[0])
        .alias(alias)
    )


# Each alias must match the data it labels: the join cell qualifies columns as
# `distances_nean.*` / `distances_deni.*`, so swapped aliases would mislabel
# the `*_nean` and `*_deni` output columns.
distances_nean_df = _protein_coding_distances(distances_query_nean_df, 'distances_nean')

distances_deni_df = _protein_coding_distances(distances_query_deni_df, 'distances_deni')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…



In [17]:
# Pair rows of the two frames on cell line, gene and enhancer.
nean_deni_join_condition = (
    (F.col('distances_nean.cell_line') == F.col('distances_deni.cell_line'))
    & (F.col('distances_nean.gene_id') == F.col('distances_deni.gene_id'))
    & (F.col('distances_nean.enh_id') == F.col('distances_deni.enh_id'))
)

# Count the matched pairs whose region ids differ between the two frames.
(
    distances_nean_df
    .join(distances_deni_df, on=nean_deni_join_condition, how='inner')
    .select(
        F.col('distances_nean.cell_line').alias('cell_line'),
        F.col('distances_nean.gene_id').alias('gene_id'),
        F.col('distances_nean.enh_id').alias('enh_id'),
        F.col('distances_nean.avg_dist').alias('avg_dist_nean'),
        F.col('distances_deni.avg_dist').alias('avg_dist_deni'),
        F.col('distances_nean.region_id').alias('region_id_nean'),
        F.col('distances_deni.region_id').alias('region_id_deni'),
    )
    # the same two columns as before, addressed by their selected names
    .where(F.col('region_id_nean') != F.col('region_id_deni'))
    .count()
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

646715