In [None]:
libs <- c(
    'dplyr',
    'tidyr',
    'stringr',
    'ggplot2',
    'GenomicRanges',
    'RPostgreSQL',
    'doMC',
    'numbers',
    'doParallel'
)

# Attach every package listed in `libs`. If a package cannot be attached it is
# installed from the CRAN mirror and then attached explicitly: `require()`
# only reports failure, so without the follow-up `library()` call a freshly
# installed package would never be on the search path for the rest of the
# session.
# NOTE(review): some entries (e.g. GenomicRanges) look like Bioconductor
# packages, which a CRAN mirror may not serve — confirm how those should be
# installed.
for (lib in libs) {
    if (!require(lib, character.only = TRUE, quietly = TRUE)) {
        install.packages(lib, repos='http://cran.us.r-project.org')
        # Fails loudly here if the installation did not succeed.
        library(lib, character.only = TRUE)
    }
}

library(BSgenome.Hsapiens.UCSC.hg38)
hg38 = BSgenome.Hsapiens.UCSC.hg38

source("utility_functions.R")
source("stat_functions.R")
source("plot_functions.R")
source("~/git-repos/BDDS/trenadb/src/utils.R")
source("~/git-repos/BDDS/footprints/testdb/src/dbFunctions.R")

In [None]:
# Restore the objects stored in this .RData file into the current environment.
# NOTE(review): presumably this defines `all.TF.df`, which the chromosome loop
# further down passes to the merge function — confirm against the file.
load("/local/rory/all.TF.fimo.samples.ratio.49.df.RData")

In [None]:
# Open a dplyr PostgreSQL source for one lymphoblast footprint database.
# Both databases use the same host, port and account, so the database name is
# the only thing that varies between the two connections.
connect_lymphoblast <- function(db_name) {
    src_postgres(drv = dbDriver("PostgreSQL"),
                 user = "trena",
                 password = "trena",
                 dbname = db_name,
                 host = "whovian",
                 port = "5432")
}

db_lymph_hint <- connect_lymphoblast("lymphoblast_hint")
db_lymph_well <- connect_lymphoblast("lymphoblast_wellington")

# Handles to the `regions` and `hits` tables of the HINT database ...
hint_regions <- tbl(db_lymph_hint, "regions")
hint_hits    <- tbl(db_lymph_hint, "hits")

# ... and of the Wellington database.
well_regions <- tbl(db_lymph_well, "regions")
well_hits    <- tbl(db_lymph_well, "hits")

## Function to merge FIMO, HINT and Wellington data for one chromosome

In [None]:
#' Merge HINT and Wellington footprint scores into the FIMO table for one
#' chromosome
#'
#' Rows of `fimo_tbl` for the requested chromosome are left-joined with
#' per-motif summaries of the HINT and Wellington footprint tables, matching
#' on `start`, `endpos`, `strand` and motif name (`motifname` in `fimo_tbl`,
#' `name` in the footprint tables).
#'
#' @param chrom_str Chromosome label without the "chr" prefix (e.g. "22"),
#'   as compared against `fimo_tbl$chrom`. The footprint region tables are
#'   filtered with the prefixed form ("chr22").
#' @param fimo_tbl Table with at least the columns `chrom`, `empty`, `start`,
#'   `endpos`, `strand` and `motifname`; `empty` is dropped.
#' @param hint_regions_tbl,hint_hits_tbl HINT `regions` and `hits` tables.
#'   They are joined on `loc`; the joined result must provide `start`,
#'   `endpos`, `strand`, `name` and `score1`.
#' @param well_regions_tbl,well_hits_tbl Wellington `regions` and `hits`
#'   tables with the same layout as the HINT pair.
#' @return The rows of `fimo_tbl` for this chromosome with four added
#'   columns: `h_count` / `w_count` (number of footprint rows sharing the
#'   join key) and `h_max_score` / `w_min_score` (largest HINT score and
#'   smallest Wellington score for that key). Rows with no matching
#'   footprint get 0 in all four columns.
merge_fimo_hint_wellington_one_chrom <- function(chrom_str,
                                                 fimo_tbl,
                                                 hint_regions_tbl,
                                                 hint_hits_tbl,
                                                 well_regions_tbl,
                                                 well_hits_tbl
                                                ) {

    # some tables use chr22 and some just use 22
    chrom_long_str <- paste0("chr", chrom_str)

    # select one chromosome from the FIMO table
    chrom_all_tf_df <- fimo_tbl %>%
        filter(chrom == chrom_str) %>%
        select(-empty)

    # select one chromosome from hint and pull it into a local tibble.
    # rename() takes new_name = old_name, so the raw `score1` column becomes
    # `h_score`, which the summary step below relies on.
    chrom_hint_all_tbl <- hint_regions_tbl %>%
        filter(chrom == chrom_long_str) %>%
        left_join(hint_hits_tbl, by = "loc") %>%
        collect() %>%
        select(start, endpos, strand, name, score1) %>%
        rename(h_score = score1)

    # select one chromosome from wellington, same treatment as hint
    chrom_well_all_tbl <- well_regions_tbl %>%
        filter(chrom == chrom_long_str) %>%
        left_join(well_hits_tbl, by = "loc") %>%
        collect() %>%
        select(start, endpos, strand, name, score1) %>%
        rename(w_score = score1)

    # one row per (start, endpos, name, strand): keep only the max hint score
    # and record how many rows shared that key
    chrom_hint_unique_tbl <- chrom_hint_all_tbl %>%
        group_by(start, endpos, name, strand) %>%
        mutate(h_count = n(),
               h_max_score = max(h_score)) %>%
        distinct(start, endpos, name, strand, .keep_all = TRUE) %>%
        ungroup() %>%
        select(-h_score)

    # one row per (start, endpos, name, strand): keep only the min wellington
    # score and record how many rows shared that key
    chrom_well_unique_tbl <- chrom_well_all_tbl %>%
        group_by(start, endpos, name, strand) %>%
        mutate(w_count = n(),
               w_min_score = min(w_score)) %>%
        distinct(start, endpos, name, strand, .keep_all = TRUE) %>%
        ungroup() %>%
        select(-w_score)

    # merge hint and wellington into the FIMO table; motifs without a
    # footprint get 0 for the counts and scores instead of NA
    footprint_keys <- c("start", "endpos", "strand", "motifname" = "name")

    chrom_all_tf_df_merged <- chrom_all_tf_df %>%
        left_join(chrom_hint_unique_tbl, by = footprint_keys) %>%
        left_join(chrom_well_unique_tbl, by = footprint_keys) %>%
        replace_na(list(h_count = 0, w_count = 0,
                        h_max_score = 0, w_min_score = 0))

    chrom_all_tf_df_merged
}

## Loop over chromosomes 1–22 and combine the results

In [None]:
# Merge the footprint data chromosome by chromosome (1 to 22) and stack the
# per-chromosome tables once at the end. Collecting the pieces in a list and
# binding them in a single call avoids re-copying every accumulated row on
# each iteration, which is what growing a data frame with rbind() inside the
# loop does.
per_chrom_dfs <- lapply(as.character(1:22), function(chr_str) {
    message(paste("working on chromosome", chr_str))

    merge_fimo_hint_wellington_one_chrom(chr_str,
                                         all.TF.df,
                                         hint_regions,
                                         hint_hits,
                                         well_regions,
                                         well_hits)
})

big_df <- bind_rows(per_chrom_dfs)

In [None]:
# Print a compact summary of the combined table's structure as a sanity check
# before it is saved.
str(big_df)

## Rename and save

In [None]:
# Give the merged table its final name so that it is stored under that name
# in the .Rdata file, then write it to disk.
all.TF.df.fimo.hint.well <- big_df
save(list = "all.TF.df.fimo.hint.well",
     file = "/local/rory/all.TF.df.fimo.hint.well.49.Rdata")

## Explore data

In [None]:
# Density-scaled frequency polygon of the asinh-transformed minimum
# Wellington score, one line per value of cs_hit.
ggplot(all.TF.df.fimo.hint.well) +
    geom_freqpoly(aes(x = asinh(w_min_score),
                      y = ..density..,
                      color = factor(cs_hit))) +
    theme_minimal()

# Same view for the asinh-transformed maximum HINT score.
ggplot(all.TF.df.fimo.hint.well) +
    geom_freqpoly(aes(x = asinh(h_max_score),
                      y = ..density..,
                      color = factor(cs_hit))) +
    theme_minimal()

# HINT score against Wellington score (both asinh-transformed), coloured by
# cs_hit; points are drawn nearly transparent (alpha = 0.05).
ggplot(all.TF.df.fimo.hint.well) +
    geom_point(
        aes(x = asinh(h_max_score),
            y = asinh(w_min_score),
            color = factor(cs_hit)),
        alpha = 0.05) +
    theme_minimal()