# Most of this notebook is inspired by Ruth's notion:
https://www.notion.so/LD-Score-Regression-96461063fe5b48daa8b5174998188825

# Convert peaks to bed

In [2]:
# Load packages
pacman::p_load(dplyr, stringr, data.table, tidyr, data.table, Matrix,
               hdf5r, Seurat, Signac,harmony, knitr, SoupX, cluster, glmgampoi,
               EnsDb.Hsapiens.v86, 
               logr, parallel, future,
               fpc,
               ggplot2, ggpubr, ggrepel, ggbreak, gridExtra, patchwork, grid, ggh4x)

“package ‘glmgampoi’ is not available for this version of R

A version of this package for your version of R might be available elsewhere,
see the ideas at
https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages”
“”
“there is no package called ‘glmgampoi’”
"Failed to install/load:
glmgampoi"


In [3]:
peaks.dir = "/nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/1_preprocessing/PeakCalling/2_PeaksMap_Peakcalls/"

files = list.files(paste0(peaks.dir), pattern = "peaks.xls")

In [3]:
message(paste0("Converting peaks to bed"))
for (i in seq_along(files)){
    peaks.use = read.table(paste0(peaks.dir, files[i]),
           header = TRUE)
    celltype = str_split(peaks.use$name[1], "_peak_")[[1]][1]
    
    # Save bed file formatted for bedops
    peaks.use = paste0(peaks.use$chr, ":", peaks.use$start, "-", peaks.use$end)
    
    # remove peaks on nonstandard chromosomes and in genomic blacklist regions
    peaks.use = StringToGRanges(peaks.use, sep = c(":", "-"))
    peaks.use <- keepStandardChromosomes(peaks.use, pruning.mode = "coarse")
    message("  - Only on standard Chromosomes: ", length(peaks.use))
    peaks.use <- subsetByOverlaps(x = peaks.use, ranges = blacklist_hg38_unified, invert = TRUE)
    message("  - After blacklist exclusion: ", length(peaks.use))
    
    peaks.use = as.data.frame(peaks.use)
    # Save bed file formatted for bedops
    peaks.use = peaks.use %>%
        dplyr::select(seqnames, start, end)
    
    # Save narrow peak file
    write.table(peaks.use, paste0(peaks.dir, celltype, "_peaks.bed"), 
                sep = '\t', row.names = FALSE, col.names = FALSE, quote=FALSE)
    message("Saved peaks for: ", celltype)    
    
}
message(paste0("Done"))

Converting peaks to bed

  - Only on standard Chromosomes: 175337

  - After blacklist exclusion: 175041

Saved peaks for: aCM

  - Only on standard Chromosomes: 65539

  - After blacklist exclusion: 65394

Saved peaks for: Adipocyte

  - Only on standard Chromosomes: 96442

  - After blacklist exclusion: 96220

Saved peaks for: Endocardial

  - Only on standard Chromosomes: 131077

  - After blacklist exclusion: 130803

Saved peaks for: Endothelial

  - Only on standard Chromosomes: 94845

  - After blacklist exclusion: 94669

Saved peaks for: Epicardial

  - Only on standard Chromosomes: 174196

  - After blacklist exclusion: 173900

Saved peaks for: Fibroblast

  - Only on standard Chromosomes: 80459

  - After blacklist exclusion: 80228

Saved peaks for: Lymphoid

  - Only on standard Chromosomes: 143363

  - After blacklist exclusion: 143065

Saved peaks for: Myeloid

  - Only on standard Chromosomes: 52864

  - After blacklist exclusion: 52687

Saved peaks for: Neuronal

  - Only

# Convert to Hg19

In [None]:
# Installation found here:
'https://github.com/bulik/ldsc'

# Create background peak set 

In [6]:
files <- list.files("/nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/1_preprocessing/PeakCalling/2_PeaksMap_Peakcalls", 
                    pattern = "_peaksHg19.bed",
                    full.names = TRUE)
files

In [15]:
# Read the peaks and combine them
all_peaks <- lapply(files, function(file) {
  read.delim(file, header = FALSE, col.names = c("chr", "start", "end"))
})

# Combine all peak data into a single data frame
all_peaks_df <- do.call(rbind, all_peaks)

# Create a GRanges object
granges_peaks <- GRanges(seqnames = all_peaks_df$chr,
                         ranges = IRanges(start = all_peaks_df$start, end = all_peaks_df$end))
# Sort the GRanges object
granges_peaks <- sort(granges_peaks)

# Merge overlapping ranges
merged_peaks <- reduce(granges_peaks)

# Save bed file 
merged_peaks = as.data.frame(merged_peaks)
merged_peaks = merged_peaks %>%
    dplyr::select(seqnames, start, end)

# Save narrow peak file
write.table(merged_peaks,
            "/nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/1_preprocessing/PeakCalling/2_PeaksMap_Peakcalls/MergedPeaks_ConsensuspeaksHg19.bed", 
            sep = '\t', row.names = FALSE, col.names = FALSE, quote=FALSE)
message("Saved consensus peak list")    

Saved consensus peak list



# Make annotations

In [None]:
# For celltypes

N=22  # Number of parallel jobs allowed
job_counter=0   # Initialize job counter

# Loop over annotations (cell types)
for annot in $(cat /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/celltypes.txt); do

    # Create a directory for each annotation if it doesn't exist
    mkdir -p /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/${annot}_hg19

    # Loop over chromosomes 1 to 22
    for i in {1..22}; do
        ((job_counter=job_counter%N))  # Control parallel job count
        ((job_counter++==0)) && wait   # Wait if max jobs reached

        # Run the make_annot.py script for each chromosome in parallel
        python /nfs/lab/Luca/Scripts/ldsc/make_annot.py \
        --bed-file /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/1_preprocessing/PeakCalling/2_PeaksMap_Peakcalls/${annot}_peaksHg19.bed \
        --bimfile /nfs/lab/ysun/LDSC/1000G_EUR_Phase3_plink/1000G.EUR.QC.${i}.bim \
        --annot-file /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/${annot}_hg19/${annot}_hg19.${i}.annot.gz &
    done
done

# Wait for all background jobs to finish before exiting
wait
exit 0

In [None]:
# For background

N=22  # Number of parallel jobs allowed
job_counter=0   # Initialize job counter

annot="Background"

    # Create a directory for each annotation if it doesn't exist
    mkdir -p /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/${annot}_hg19

    # Loop over chromosomes 1 to 22
    for i in {1..22}; do
        ((job_counter=job_counter%N))  # Control parallel job count
        ((job_counter++==0)) && wait   # Wait if max jobs reached

        # Run the make_annot.py script for each chromosome in parallel
        python /nfs/lab/Luca/Scripts/ldsc/make_annot.py \
        --bed-file /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/1_preprocessing/PeakCalling/2_PeaksMap_Peakcalls/MergedPeaks_ConsensuspeaksHg19.bed \
        --bimfile /nfs/lab/ysun/LDSC/1000G_EUR_Phase3_plink/1000G.EUR.QC.${i}.bim \
        --annot-file /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/${annot}_hg19/${annot}_hg19.${i}.annot.gz &
    done

# Wait for all background jobs to finish before exiting
wait
exit 0

# LD score regression

In [None]:
# For celltypes
for annot in $(cat /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/celltypes.txt); do
    for i in {1..22}; do

        python /nfs/lab/Luca/Scripts/ldsc/ldsc.py \
        --print-snps /nfs/lab/ysun/LDSC/1000G_EUR_Phase3_baseline_snps/hm.${i}.snp \
        --ld-wind-cm 1.0 \
        --out /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/${annot}_hg19/${annot}_hg19.${i} \
        --bfile /nfs/lab/ysun/LDSC/1000G_EUR_Phase3_plink/1000G.EUR.QC.${i} \
        --thin-annot \
        --annot /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/${annot}_hg19/${annot}_hg19.${i}.annot.gz \
        --l2 

    done
done

In [None]:
# For bg
annot="Background"

    for i in {1..22}; do

        python /nfs/lab/Luca/Scripts/ldsc/ldsc.py \
        --print-snps /nfs/lab/ysun/LDSC/1000G_EUR_Phase3_baseline_snps/hm.${i}.snp \
        --ld-wind-cm 1.0 \
        --out /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/${annot}_hg19/${annot}_hg19.${i} \
        --bfile /nfs/lab/ysun/LDSC/1000G_EUR_Phase3_plink/1000G.EUR.QC.${i} \
        --thin-annot \
        --annot /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/${annot}_hg19/${annot}_hg19.${i}.annot.gz \
        --l2 
    done

In [None]:
# For the future: Just add the background as one more celltype

# Run partitioned heritability

In [None]:
# RUNNING ON Ophelia

In [None]:
# make a list of files one, per trait up to: ".ldsc.sumstats.gz"
## GCST90162626_buildGRCh37.tsv.ldsc.sumstats.gz -> GCST90162626_buildGRCh37.tsv

In [None]:
N=50  # Number of parallel jobs allowed
i=0   # Initialize job counter

# Loop over traits
for trait in $(cat /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Traits.txt); do
    ((i=i%N)); ((i++==0)) && wait

    # Extract the base name of the trait 
    trait_basename=$(basename ${trait})
    trait_name=${trait_basename%.ldsc.sumstats.gz}

    (
    # Loop over annotations
    for annot in $(cat /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/celltypes.txt); do
        python /nfs/lab/Luca/Scripts/ldsc/ldsc.py \
        --h2 ${trait} \
        --ref-ld-chr /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/${annot}_hg19/${annot}_hg19.,/nfs/lab/ysun/LDSC/1000G_EUR_Phase3_baseline/baseline.,/nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/Background_hg19/Background_hg19.\
        --out /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Results/${trait_name}_hg19.${annot} \
        --overlap-annot  \
        --frqfile-chr /nfs/lab/ysun/LDSC/1000G_Phase3_frq/1000G.EUR.QC. \
        --w-ld-chr /nfs/lab/ysun/LDSC/weights_hm3_no_hla/weights. \
        --print-coefficients
    done
    ) &

done
exit 0

# Run on the latest 3 papers Ruth munged

In [None]:
N=20  # Number of parallel jobs allowed
i=0   # Initialize job counter

# Loop over traits
for trait in $(cat /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Traits_3Papers.txt); do
    ((i=i%N)); ((i++==0)) && wait

    # Extract the base name of the trait 
    trait_basename=$(basename ${trait})
    trait_name=${trait_basename%.ldsc.sumstats.gz}

    (
    # Loop over annotations
    for annot in $(cat /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/celltypes.txt); do
        python /nfs/lab/Luca/Scripts/ldsc/ldsc.py \
        --h2 ${trait} \
        --ref-ld-chr /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/${annot}_hg19/${annot}_hg19.,/nfs/lab/ysun/LDSC/1000G_EUR_Phase3_baseline/baseline.,/nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Annotations/hg19/Background_hg19/Background_hg19.\
        --out /nfs/lab/projects/mega_heart/FNIH/multiome/Analysis/3_downstream/Major_celltypes/LDSC/Results/${trait_name}_hg19.${annot} \
        --overlap-annot  \
        --frqfile-chr /nfs/lab/ysun/LDSC/1000G_Phase3_frq/1000G.EUR.QC. \
        --w-ld-chr /nfs/lab/ysun/LDSC/weights_hm3_no_hla/weights. \
        --print-coefficients
    done
    ) &

done
exit 0