# Find a match for each of the variants in the credible sets:
- Condition 1: same GC content around the variant
- Condition 2: match coding and non-coding

In [1]:
# Work from the project root so that the relative "script/..." and
# "pipeline/..." paths used in the rest of the notebook resolve.
setwd(dir = "/frazer01/projects/CARDIPS/analysis/cardiac_gwas_coloc")

# Load the project's shared helpers.
# NOTE(review): fread(), data.table(), setkey(), fwrite() and add_rownames()
# are used below without any library() call in this file, so they presumably
# become available through this script -- confirm.
source(file = "script/functions.R")


In [2]:
# Create the output folders used by this notebook (parent first, then the
# "input" sub-folder). showWarnings = FALSE keeps re-runs quiet when the
# folders already exist.
invisible(lapply(
    c("pipeline/6.2.footprints_finemap/random",
      "pipeline/6.2.footprints_finemap/random/input"),
    dir.create,
    showWarnings = FALSE
))


In [4]:
# Input tables. Every file is tab-separated with a header row and is read as a
# plain data.frame (data.table = FALSE). `loci` and `manifest` are additionally
# passed through add_rownames(), a helper that is not defined in this file.
loci <- add_rownames(fread(
    "pipeline/1.2.genomewide_significant_loci/loci.txt",
    sep = "\t", header = TRUE, data.table = FALSE
))
manifest <- add_rownames(fread(
    "pipeline/1.1.sumstats/manifest.txt",
    sep = "\t", header = TRUE, data.table = FALSE
))
moloc_df <- fread(
    "pipeline/2.2.moloc/moloc.txt",
    sep = "\t", header = TRUE, data.table = FALSE
)
locus2coloc <- fread(
    "pipeline/5.4.analyze_coloc_qtl_som_maps/signal2coloc.txt",
    sep = "\t", header = TRUE, data.table = FALSE
)
cs_annot <- fread(
    "pipeline/6.2.footprints_finemap/credible_sets_50_annotated.txt",
    sep = "\t", header = TRUE, data.table = FALSE
)

# Fine-mapped signals whose `credible_set` value is at most 50.
cs50 <- locus2coloc[
    locus2coloc$finemapped == TRUE & locus2coloc$credible_set <= 50,
]


In [121]:
# Read the UK Biobank variant table, decompressing it on the fly with zcat.
snps <- fread(
    cmd        = "zcat /frazer01/reference/public/ukbb/variants.tsv.bgz",
    sep        = "\t",
    header     = TRUE,
    data.table = FALSE
)

# Keep chromosomes 1-22 only, then derive the columns used downstream:
#   chrom - numeric chromosome
#   id    - "VAR_<chrom>_<pos>_<ref>_<alt>"
#   maf   - copy of minor_AF
snps       <- snps[is.element(snps$chr, as.character(seq_len(22))), ]
snps$chrom <- as.numeric(snps$chr)
snps$id    <- paste("VAR", snps$chrom, snps$pos, snps$ref, snps$alt, sep = "_")
snps$maf   <- snps$minor_AF

In [123]:
# Destination of the BED-like export of every retained UKBB variant.
snps_bed <- file.path(getwd(), "pipeline/6.2.footprints_finemap/random/input", "ukbb_snps.bed")

# One row per variant with the features used for matching.
snps2bed       <- snps[, c("chrom", "pos", "id", "maf", "consequence_category")]
snps2bed$chrom <- paste0("chr", snps2bed$chrom)

# 1 when the variant ID is listed in cs_annot, 0 otherwise.
snps2bed$credible_set <- as.numeric(snps2bed$id %in% cs_annot$id)

# Express MAF in percent and snap it to the nearest multiple of 4
# (0, 4, 8, ..., 48) so that variants can be matched on a coarse MAF bin.
snps2bed$maf <- 4 * round(snps2bed$maf * 100 / 4)

# Collapse every consequence category other than "non_coding" into "coding".
snps2bed[snps2bed$consequence_category != "non_coding", "consequence_category"] <- "coding"

# Write without a header: chrom, pos, pos, id, maf, consequence, credible_set.
# NOTE(review): start and end are both `pos`, i.e. zero-length intervals;
# confirm this is how bedtools should see single-base variants (BED is
# usually written as start = pos - 1, end = pos).
fwrite(
    snps2bed[, c("chrom", "pos", "pos", "id", "maf", "consequence_category", "credible_set")],
    snps_bed,
    sep = "\t", col.names = FALSE, row.names = FALSE
)

In [125]:
# Sanity check: number of variants in each MAF bin.
table(snps2bed[["maf"]])


      0       4       8      12      16      20      24      28      32      36 
5103145 1845031 1011039  789160  663249  594517  545421  509474  487906  468675 
     40      44      48 
 458125  445943  442618 

# Step 1: divide genome into bins of 500 bp

In [100]:
# hg19 reference files handed to bedtools below:
#   chrom_sizes - genome file passed to `bedtools makewindows -g`
#   hg19_fasta  - FASTA passed to `bedtools nuc -fi`
chrom_sizes <- "/frazer01/reference/public/hg19/hg19.size.txt"
hg19_fasta  <- "/frazer01/reference/public/hg19/hg19.fa"


In [101]:
# Tile the genome into fixed-width windows (-w 500) and store them as BED.
windows_bed <- file.path(getwd(), "pipeline/6.2.footprints_finemap/random/input", "windows.bed")

command <- paste(
    "bedtools makewindows",
    "-g", chrom_sizes,
    "-w", 500,
    ">", windows_bed
)

system(command)


In [126]:
# Assign variants to windows with a left outer join (-loj): windows.bed is the
# -a file and the variant export is the -b file.
windows_snps_bed <- file.path(getwd(), "pipeline/6.2.footprints_finemap/random/input", "windows_snps.bed")

command <- paste(
    "bedtools intersect -loj",
    "-a", windows_bed,
    "-b", snps_bed,
    ">", windows_snps_bed
)

system(command)

# Reload the join, drop the rows whose 4th column is "." (the first column
# coming from the -b file), and label every remaining row with its window as
# "<chrom>_<start>_<end>".
windows_snps     <- fread(windows_snps_bed, sep = "\t", header = FALSE, data.table = FALSE)
windows_snps     <- windows_snps[windows_snps[[4]] != ".", ]
windows_snps$bin <- paste(windows_snps[[1]], windows_snps[[2]], windows_snps[[3]], sep = "_")

# Overwrite the same file with: window coordinates (V1-V3), bin label, and the
# last four columns of the variant export (V7-V10).
fwrite(
    windows_snps[, c("V1", "V2", "V3", "bin", "V7", "V8", "V9", "V10")],
    windows_snps_bed,
    sep = "\t", col.names = FALSE, row.names = FALSE
)

# Step 2: calculate GC content in each bin

In [138]:
# Run `bedtools nuc` on the window/variant file against the hg19 FASTA.
# The output is trimmed in the shell before it is stored:
#   cut -f1,4-8,10  keeps seven fields (named chrom, bin, id, maf,
#                   consequence, cs and gc when the file is read back)
#   tail -n +2      drops the first line (presumably the header -- confirm)
windows_nuc_txt <- file.path(getwd(), "pipeline/6.2.footprints_finemap/random/input", "windows_nuc.txt")

command <- paste(
    "bedtools nuc",
    "-fi", hg19_fasta,
    "-bed", windows_snps_bed,
    "|", "cut -f1,4-8,10",
    "|", "tail -n +2",
    ">", windows_nuc_txt
)

system(command)


# Step 3: For each variant in the credible sets, find 100 random variants with the same features (coding/noncoding; same GC content, same MAF, same chromosome)

In [139]:
# Load the per-variant table produced in Step 2 and name its seven columns.
gc        <- fread(windows_nuc_txt, sep = "\t", header = FALSE, data.table = FALSE)
names(gc) <- c("chrom", "bin", "id", "maf", "consequence", "cs", "gc")

# Scale the gc column by 100 and round it to a whole number.
gc$gc <- round(gc$gc * 100)

In [153]:
# Matching key: chromosome, MAF bin, GC value and coding status joined by
# spaces. Two variants are considered equivalent when this string is equal.
gc$combined <- paste(gc[["chrom"]], gc[["maf"]], gc[["gc"]], gc[["consequence"]])

# Keyed data.table copy so that all variants sharing a key can be looked up
# by passing the key string.
gc_dt <- data.table(gc)
setkey(gc_dt, combined)

In [154]:
# Credible-set variants (cs == 1) with their matching features.
to_select0 <- unique(gc[gc$cs == 1, c("id", "chrom", "consequence", "maf", "gc")])

# A variant can show up with more than one gc / maf value; keep the first
# value of each per (id, chrom, consequence) and join the two back together.
to_select1 <- aggregate(gc  ~ id + chrom + consequence, data = to_select0, FUN = function(values) values[[1]])
to_select2 <- aggregate(maf ~ id + chrom + consequence, data = to_select0, FUN = function(values) values[[1]])
to_select  <- merge(to_select1, to_select2, by = c("id", "chrom", "consequence"))

# Same key layout as gc$combined; rows are addressable by variant ID.
to_select$combined   <- paste(to_select[["chrom"]], to_select[["maf"]], to_select[["gc"]], to_select[["consequence"]])
row.names(to_select) <- to_select$id

In [190]:
# Draw matched background variants for one credible-set variant.
#
# The matching key of `id` is read from `to_select`, every variant in `gc_dt`
# carrying the same key forms the candidate pool, and `n` IDs are sampled
# from that pool.
#
# Arguments:
#   id         Variant ID; must be a row name of `to_select`.
#   to_select  data.frame with variant IDs as row names and a `combined`
#              column holding the matching key.
#   gc_dt      Table keyed on `combined` (so that `gc_dt[key]` returns the
#              rows with that key) with an `id` column.
#   seed       RNG seed set right before sampling (default 1), so repeated
#              calls with the same inputs return the same draw.
#   n          Number of variants to draw (default 100).
#
# Returns:
#   Vector of `n` variant IDs, sampled without replacement when the pool
#   holds at least `n` candidates and with replacement otherwise.
#
# Errors:
#   Stops when no candidate is found for `id` (unknown ID or unmatched key)
#   instead of silently returning `n` missing values.
select_random <- function(id, to_select, gc_dt, seed = 1, n = 100)
{
    combined  <- to_select[id, "combined"]
    selection <- as.data.frame(gc_dt[combined])[, "id"]

    # A keyed lookup without a match yields a row of NAs rather than no row.
    selection <- selection[!is.na(selection)]

    if (length(selection) == 0)
    {
        stop("No candidate variants found for '", id, "'", call. = FALSE)
    }

    set.seed(seed)

    to_replace <- length(selection) < n

    # Index through sample.int() instead of calling sample(selection, ...):
    # sample() interprets a single number x as 1:x, which would invent IDs
    # when the pool has exactly one numeric entry. For every other input the
    # draw is the same as sample(selection, n, replace = to_replace).
    selection[sample.int(length(selection), n, replace = to_replace)]
}

# Draw the matched background set for every credible-set variant and keep the
# result as a list named by variant ID.
random_vars <- lapply(to_select$id, select_random, to_select = to_select, gc_dt = gc_dt)
random_vars <- setNames(random_vars, to_select$id)

# Persist the selections for the downstream steps.
saveRDS(random_vars, file = "pipeline/6.2.footprints_finemap/random/selected_by_variant.rds")