# Most of this notebook is inspired by Ruth's notion:
https://www.notion.so/LD-Score-Regression-96461063fe5b48daa8b5174998188825

In [1]:
# Load packages
pacman::p_load(dplyr, stringr, data.table, tidyr, data.table, Matrix,
               hdf5r, Seurat, Signac,harmony, knitr, SoupX, cluster, glmgampoi,
               EnsDb.Hsapiens.v86, 
               logr, parallel, future,
               fpc,
               ggplot2, ggpubr, ggrepel, ggbreak, gridExtra, patchwork, grid, ggh4x)

“package ‘glmgampoi’ is not available for this version of R

A version of this package for your version of R might be available elsewhere,
see the ideas at
https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages”
“”
“there is no package called ‘glmgampoi’”
"Failed to install/load:
glmgampoi"


# Split bed by emission state

In [35]:
# Set directories
peaks.dir = '/nfs/lab/tscc/luca/MEGA_Heart/peaks_LDSC_annotate/'
peaks.split.dir = '/nfs/lab/tscc/luca/MEGA_Heart/peaks_LDSC_annotate/splitByES/'

dir.create(peaks.split.dir)

In [31]:
files = list.files(paste0(peaks.dir))
files

In [32]:
states = c("E1", "E2", "E3", "E4", "E5")

In [38]:
for (i in seq_along(files)){
    file.use = files[i]
    message("processing: ", file.use)
    celltype.use = gsub(pattern = "_annotate.bed", replacement = "", x = file.use)

    # Load ES data
    data <- read.table(paste0(peaks.dir, file.use), sep = '\t', header = FALSE) %>%
        dplyr::mutate(ID = paste(V1, V2, V3, sep = "-")) %>%
        distinct(ID, .keep_all = TRUE) %>%
        dplyr::select(V1, V2, V3, ID,
                      ES = V4) %>%
        dplyr::filter(ES != ".")

    for (state.use in states){
        data.write = data %>%
            dplyr::filter(ES == state.use) 

        message("  - State subset check: ", unique(data.write$ES))

        data.write = data.write %>%
            dplyr::select(V1, V2, V3)

        # Save narrow peak file
        write.table(data.write, paste0(peaks.split.dir,
                                       celltype.use, "--", state.use,
                                       "_annotated_peaks.bed"), 
                    sep = '\t', row.names = FALSE, col.names = FALSE, quote=FALSE)
    }
}

processing: aCM_annotate.bed

  - State subset check: E1

  - State subset check: E2

  - State subset check: E3

  - State subset check: E4

  - State subset check: E5

processing: Adipocyte_annotate.bed

  - State subset check: E1

  - State subset check: E2

  - State subset check: E3

  - State subset check: E4

  - State subset check: E5

processing: Endocardial_annotate.bed

  - State subset check: E1

  - State subset check: E2

  - State subset check: E3

  - State subset check: E4

  - State subset check: E5

processing: Endothelial_annotate.bed

  - State subset check: E1

  - State subset check: E2

  - State subset check: E3

  - State subset check: E4

  - State subset check: E5

processing: Epicardial_annotate.bed

  - State subset check: E1

  - State subset check: E2

  - State subset check: E3

  - State subset check: E4

  - State subset check: E5

processing: Fibroblast_annotate.bed

  - State subset check: E1

  - State subset check: E2

  - State subset check: E3



# Convert to Hg19

In [None]:
# Installation found here:
'https://github.com/bulik/ldsc'

# Create background peak set 

In [39]:
"Just using the background for all CREs"
"/nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/1_preprocessing/PeakCalling/2_PeaksMap_Peakcalls/MergedPeaks_ConsensuspeaksHg19.bed"

# Make annotations

In [None]:
# For celltypes

N=22  # Number of parallel jobs allowed
job_counter=0   # Initialize job counter

# Loop over annotations (cell types)
for annot in $(cat /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC_ES/celltypes.txt); do

    # Create a directory for each annotation if it doesn't exist
    mkdir -p /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC_ES/Annotations/Annotations/hg19/${annot}_hg19

    # Loop over chromosomes 1 to 22
    for i in {1..22}; do
        ((job_counter=job_counter%N))  # Control parallel job count
        ((job_counter++==0)) && wait   # Wait if max jobs reached

        # Run the make_annot.py script for each chromosome in parallel
        python /nfs/lab/Luca/Scripts/ldsc/make_annot.py \
        --bed-file /nfs/lab/tscc/luca/MEGA_Heart/peaks_LDSC_annotate/splitByES/Hg19/${annot}_Hg19_annotated_peaks.bed \
        --bimfile /nfs/lab/ysun/LDSC/1000G_EUR_Phase3_plink/1000G.EUR.QC.${i}.bim \
        --annot-file /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC_ES/Annotations/Annotations/hg19/${annot}_hg19/${annot}_hg19.${i}.annot.gz &
    done
done

# Wait for all background jobs to finish before exiting
wait
exit 0

# LD score regression

In [None]:
# For celltypes
for annot in $(cat /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC_ES/celltypes.txt); do
    for i in {1..22}; do

        python /nfs/lab/Luca/Scripts/ldsc/ldsc.py \
        --print-snps /nfs/lab/ysun/LDSC/1000G_EUR_Phase3_baseline_snps/hm.${i}.snp \
        --ld-wind-cm 1.0 \
        --out /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC_ES/Annotations/Annotations/hg19/${annot}_hg19/${annot}_hg19.${i} \
        --bfile /nfs/lab/ysun/LDSC/1000G_EUR_Phase3_plink/1000G.EUR.QC.${i} \
        --thin-annot \
        --annot /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC_ES/Annotations/Annotations/hg19/${annot}_hg19/${annot}_hg19.${i}.annot.gz \
        --l2 

    done
done

In [None]:
# For the future: Just add the background as one more celltype

# Run partitioned heritability

In [None]:
# RUNNING ON Ophelia

In [None]:
# make a list of files one, per trait up to: ".ldsc.sumstats.gz"
## GCST90162626_buildGRCh37.tsv.ldsc.sumstats.gz -> GCST90162626_buildGRCh37.tsv

In [None]:
N=50  # Number of parallel jobs allowed
i=0   # Initialize job counter

# Loop over traits
for trait in $(cat /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Traits.txt); do
    ((i=i%N)); ((i++==0)) && wait

    # Extract the base name of the trait 
    trait_basename=$(basename ${trait})
    trait_name=${trait_basename%.ldsc.sumstats.gz}

    (
    # Loop over annotations
    for annot in $(cat /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC_ES/celltypes.txt); do
        python /nfs/lab/Luca/Scripts/ldsc/ldsc.py \
        --h2 ${trait} \
        --ref-ld-chr /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC_ES/Annotations/Annotations/hg19/${annot}_hg19/${annot}_hg19.,/nfs/lab/ysun/LDSC/1000G_EUR_Phase3_baseline/baseline.,/nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/Background_hg19/Background_hg19.\
        --out /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC_ES/Results/${trait_name}_hg19.${annot} \
        --overlap-annot  \
        --frqfile-chr /nfs/lab/ysun/LDSC/1000G_Phase3_frq/1000G.EUR.QC. \
        --w-ld-chr /nfs/lab/ysun/LDSC/weights_hm3_no_hla/weights. \
        --print-coefficients
    done
    ) &

done
exit 0

# Run on the latest 3 papers Ruth munged

In [None]:
N=20  # Number of parallel jobs allowed
i=0   # Initialize job counter

# Loop over traits
for trait in $(cat /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Traits_3Papers.txt); do
    ((i=i%N)); ((i++==0)) && wait

    # Extract the base name of the trait 
    trait_basename=$(basename ${trait})
    trait_name=${trait_basename%.ldsc.sumstats.gz}

    (
    # Loop over annotations
    for annot in $(cat /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC_ES/celltypes.txt); do
        python /nfs/lab/Luca/Scripts/ldsc/ldsc.py \
        --h2 ${trait} \
        --ref-ld-chr /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC_ES/Annotations/Annotations/hg19/${annot}_hg19/${annot}_hg19.,/nfs/lab/ysun/LDSC/1000G_EUR_Phase3_baseline/baseline.,/nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/Background_hg19/Background_hg19.\
        --out /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC_ES/Results/${trait_name}_hg19.${annot} \
        --overlap-annot  \
        --frqfile-chr /nfs/lab/ysun/LDSC/1000G_Phase3_frq/1000G.EUR.QC. \
        --w-ld-chr /nfs/lab/ysun/LDSC/weights_hm3_no_hla/weights. \
        --print-coefficients
    done
    ) &

done
exit 0