In [1]:
%%configure -f
{"driverMemory": "4G", "driverCores": 2, "executorMemory": "12G", "executorCores": 6, "numExecutors": 3}

ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
0,,pyspark,idle,,,,


In [2]:
from typing import List

from pyspark import SparkFiles
from subprocess import call
import sys


def install_deps(deps: List[str]) -> None:
    """Install ``deps`` with pip into the directory reported by SparkFiles.

    Runs ``<current interpreter> -m pip install -q -t <dir> <deps...>`` where
    ``<dir>`` is ``SparkFiles.getRootDirectory()``.

    Args:
        deps: pip requirement specifiers. An empty list is a no-op.

    Raises:
        RuntimeError: if pip exits with a non-zero status, so a failed install
            is reported here rather than as an ImportError in a later cell.
    """
    if not deps:
        # Nothing to install; avoid spawning pip with no requirement at all.
        return

    command = [
        sys.executable, '-m', 'pip', 'install', '-q',
        '-t', SparkFiles.getRootDirectory(),
        *deps,
    ]
    exit_code = call(command)
    if exit_code != 0:
        raise RuntimeError(
            f"pip install exited with status {exit_code} for: {', '.join(deps)}"
        )


# Install the third-party packages for this session (no version pins are given).
install_deps(
    [
        'numpy',
        'matplotlib',
        'pandas',
        'scipy',
        'seaborn',
    ]
)

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1,,pyspark,idle,,,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [4]:
from pyspark.sql import SparkSession
spark: SparkSession

# DataFrame over the `distance_calculation` collection of the `enhancer3d`
# database, read through the "mongodb" data source.
distance_calculation_df = spark.read.format("mongodb").options(
    database="enhancer3d",
    collection="distance_calculation",
).load()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [5]:
from pyspark.sql import functions as F, types as T
import numpy as np

@F.udf(T.ArrayType(T.DoubleType()))
def diff(A, B):
    """Spark UDF: element-wise absolute difference ``|A - B|`` of two arrays.

    Args:
        A: array column value (a Python sequence of numbers), or None.
        B: array column value (a Python sequence of numbers), or None.

    Returns:
        A list of Python floats, or None when either input is null.
    """
    if A is None or B is None:
        # Propagate SQL NULL instead of failing the whole task on a null row.
        return None

    # Force a float dtype so the result always holds Python floats, matching
    # the declared ArrayType(DoubleType()) even for integer inputs. Under a
    # float dtype NumPy maps null elements inside the arrays to NaN.
    left = np.asarray(A, dtype=float)
    right = np.asarray(B, dtype=float)
    return np.abs(left - right).tolist()

@F.udf(T.DoubleType())
def var(A):
    """Spark UDF: variance of the values in an array column, via ``np.var``.

    Args:
        A: array column value (a Python sequence of numbers), or None.

    Returns:
        The variance as a Python float, or None when the input is null.
    """
    if A is None:
        # Propagate SQL NULL instead of failing the whole task on a null row.
        return None
    return float(np.var(A))

@F.udf(T.DoubleType())
def avg(A):
    """Spark UDF: arithmetic mean of the values in an array column, via ``np.mean``.

    Args:
        A: array column value (a Python sequence of numbers), or None.

    Returns:
        The mean as a Python float, or None when the input is null.
    """
    if A is None:
        # Propagate SQL NULL instead of failing the whole task on a null row.
        return None
    return float(np.mean(A))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [8]:
def _protein_coding_ref(cell_line, species_tag):
    """Filter `distance_calculation_df` to one cell line / species ensemble.

    Keeps rows where
      * `_id.project_id` equals ``8k_models_project_<cell_line>``,
      * `_id.ensemble_id` starts with
        ``models3D_<cell_line>_<species_tag>_models3D_<cell_line>_<species_tag>_results``,
      * `gene_type` equals ``protein_coding``.

    Example: ``_protein_coding_ref('GM12878', 'Nean')`` matches project
    ``8k_models_project_GM12878`` and the ensemble prefix
    ``models3D_GM12878_Nean_models3D_GM12878_Nean_results``.

    Args:
        cell_line: cell line token as it appears in the ids, e.g. 'GM12878'.
        species_tag: species token as it appears in the ids, e.g. 'Nean'.

    Returns:
        The filtered (lazy) Spark DataFrame.
    """
    project_id = f'8k_models_project_{cell_line}'
    ensemble_token = f'models3D_{cell_line}_{species_tag}'
    ensemble_prefix = f'{ensemble_token}_{ensemble_token}_results'
    return distance_calculation_df.where(
        (F.col('_id.project_id') == project_id)
        & (F.col('_id.ensemble_id').startswith(ensemble_prefix))
        & (F.col('gene_type') == 'protein_coding')
    )


gm12878_neanderthal_df_ref = _protein_coding_ref('GM12878', 'Nean')
gm12878_denisovan_df_ref = _protein_coding_ref('GM12878', 'Deni')
hffc6_neanderthal_df_ref = _protein_coding_ref('HFFC6', 'Nean')
hffc6_denisovan_df_ref = _protein_coding_ref('HFFC6', 'Deni')

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [9]:
%%pretty
# Preview a single row of the filtered frame to inspect its columns.
gm12878_neanderthal_df_ref.show(1)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

_id,avg_dist,dist,enh_center_pos,enh_center_position,enh_chr,enh_end,enh_loci,enh_model_coloring_end,enh_model_coloring_start,enh_model_position,enh_score,enh_start,enh_tSS_distance,gene_TSS_pos,gene_chr,gene_end,gene_model_coloring_end,gene_model_coloring_start,gene_model_position,gene_start,gene_strand,gene_type,number_bins,project_author,project_authors,project_cell_line,project_cell_lines,project_executed_at,project_species,region_chr,region_end,region_start,var_dist
{8k_models_projec...,10.02947235107422,[10.3883590698242...,10855122,10855122,chr10,10855497,chr10:10854747-10...,178,178,178,8.12594160720854,10854747,56726,10798396,chr10,11336675,660,121,121,10798396,+,protein_coding,100,,"[Nikita Kozlov, M...",,[GM12878],2025-04-10 10:41:...,"[Denisovans, Nean...",chr10,12302793,10300866,0.1118004098534584


In [10]:
def _average_distance_per_gene(ref_df, alias):
    """Average `avg_dist` per gene for one reference frame.

    Args:
        ref_df: Spark DataFrame with a struct column `_id` (containing
            `gene_id`) and a numeric column `avg_dist`.
        alias: DataFrame alias used later to qualify columns after a join.

    Returns:
        Spark DataFrame with columns `gene_id` and `avg_dist_per_gene`,
        ordered by `avg_dist_per_gene` descending and aliased as ``alias``.
    """
    return (
        ref_df
        # Only the grouping key and the aggregated value are needed.
        .select(F.col('_id.gene_id').alias('gene_id'), 'avg_dist')
        .groupBy('gene_id')
        .agg(F.avg('avg_dist').alias('avg_dist_per_gene'))
        # Kept so the intermediate frame stays sorted when shown on its own;
        # the join that consumes it does not depend on this ordering.
        .orderBy(F.desc('avg_dist_per_gene'))
        .alias(alias)
    )


def _distance_difference_between_cell_lines(gm12878_df, hffc6_df):
    """Join per-gene averages of both cell lines and rank genes by their gap.

    Args:
        gm12878_df: per-gene averages aliased as "gm12878".
        hffc6_df: per-gene averages aliased as "hffc6".

    Returns:
        Spark DataFrame with columns `gene_id`, `dist_diff_avg_gm12878`,
        `dist_diff_avg_hffc6` and `dist_diff_avg`, ordered by `dist_diff_avg`
        descending. Despite their names, the two per-cell-line columns hold
        the per-gene average distance, not a difference; the names are kept
        so downstream consumers of these columns keep working.
    """
    gm12878_avg = F.col('gm12878.avg_dist_per_gene')
    hffc6_avg = F.col('hffc6.avg_dist_per_gene')
    return (
        gm12878_df
        # Inner join: only genes present in both cell lines are compared.
        .join(hffc6_df, on=['gene_id'], how='inner')
        .select(
            'gene_id',
            gm12878_avg.alias('dist_diff_avg_gm12878'),
            hffc6_avg.alias('dist_diff_avg_hffc6'),
            F.abs(gm12878_avg - hffc6_avg).alias('dist_diff_avg'),
        )
        .orderBy(F.desc('dist_diff_avg'))
    )


# --- Neanderthal ---
gm12878_neanderthal_genes_with_average_distance_df = _average_distance_per_gene(
    gm12878_neanderthal_df_ref, "gm12878"
)
hffc6_neanderthal_genes_with_average_distance_df = _average_distance_per_gene(
    hffc6_neanderthal_df_ref, "hffc6"
)
gm12878_hffc6_neanderthal_genes_with_biggest_distance_difference_df = (
    _distance_difference_between_cell_lines(
        gm12878_neanderthal_genes_with_average_distance_df,
        hffc6_neanderthal_genes_with_average_distance_df,
    )
)

# --- Denisovan ---
gm12878_denisovan_genes_with_average_distance_df = _average_distance_per_gene(
    gm12878_denisovan_df_ref, "gm12878"
)
hffc6_denisovan_genes_with_average_distance_df = _average_distance_per_gene(
    hffc6_denisovan_df_ref, "hffc6"
)
gm12878_hffc6_denisovan_genes_with_biggest_distance_difference_df = (
    _distance_difference_between_cell_lines(
        gm12878_denisovan_genes_with_average_distance_df,
        hffc6_denisovan_genes_with_average_distance_df,
    )
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [26]:
%%pretty
# Show the five genes with the largest GM12878 vs HFFC6 average-distance gap.
gm12878_hffc6_neanderthal_genes_with_biggest_distance_difference_df.show(5)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

gene_id,dist_diff_avg_gm12878,dist_diff_avg_hffc6,dist_diff_avg
ENSG00000227445.1,4.844081688900383,38.90395504915261,34.05987336025223
ENSG00000198703.2,4.274681260391158,37.540769070009645,33.26608780961849
ENSG00000236656.1,4.537461128770088,37.77818047245847,33.240719343688376
ENSG00000130558.20,6.204862315576155,39.31081470366447,33.10595238808831
ENSG00000169717.7,6.218723866427056,39.23214709069118,33.01342322426412


In [11]:
# Collect the ranked per-gene differences to pandas and save them as CSV files.
import os

import pandas as pd

# Directory the CSV files are written to, on the machine that executes this cell.
OUTPUT_DIR = '/work/playground'


def _collect_and_save_csv(spark_df, file_name, output_dir=OUTPUT_DIR):
    """Collect a Spark DataFrame to pandas and write it to ``output_dir/file_name``.

    Args:
        spark_df: Spark DataFrame to collect; must be small enough for
            ``toPandas()`` to hold it in memory.
        file_name: name of the CSV file to create inside ``output_dir``.
        output_dir: target directory; created if it does not exist.

    Returns:
        The collected pandas DataFrame.
    """
    # Create the directory up front so a missing folder does not fail the
    # export after the Spark job has already been run.
    os.makedirs(output_dir, exist_ok=True)
    pandas_df = spark_df.toPandas()
    pandas_df.to_csv(os.path.join(output_dir, file_name), index=False)
    return pandas_df


gm12878_hffc6_neanderthal_genes_with_biggest_distance_difference_df_pd = _collect_and_save_csv(
    gm12878_hffc6_neanderthal_genes_with_biggest_distance_difference_df,
    'gm12878_hffc6_neanderthal_genes_with_biggest_distance_difference_df.csv',
)

gm12878_hffc6_denisovan_genes_with_biggest_distance_difference_df_pd = _collect_and_save_csv(
    gm12878_hffc6_denisovan_genes_with_biggest_distance_difference_df,
    'gm12878_hffc6_denisovan_genes_with_biggest_distance_difference_df.csv',
)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…