In [3]:
# Packages this notebook attaches, in attach order. The order matters:
# packages attached later mask same-named functions from earlier ones.
libs <- c(
  "tidyverse",
  "stringr",
  "GenomicRanges",
  "RPostgreSQL",
  "doMC",
  "numbers",
  "doParallel",
  "Matrix"
)

# Attach every package, installing it from CRAN first when it is missing.
# NOTE(review): GenomicRanges is distributed through Bioconductor, so the CRAN
# fallback is not expected to be able to install it -- confirm and install it
# via Bioconductor beforehand if needed.
for (lib in libs) {
  if (!require(lib, character.only = TRUE, quietly = TRUE)) {
    install.packages(lib, repos = "http://cran.us.r-project.org")
    # require() returned FALSE above, so the package is still not attached.
    # library() attaches it now and stops with an error if the install failed,
    # instead of letting later cells fail on missing functions.
    library(lib, character.only = TRUE)
  }
}

# Attach the hg38 BSgenome package and keep a short alias for its genome object.
library(BSgenome.Hsapiens.UCSC.hg38)
hg38 <- BSgenome.Hsapiens.UCSC.hg38

# Source the helper scripts one after another, in this fixed order, into the
# global environment (source()'s default).
invisible(lapply(
  c(
    "my_R_functions/utility_functions.R",
    "my_R_functions/stat_functions.R",
    "my_R_functions/plot_functions.R",
    "~/git-repos/BDDS/trenadb/src/utils.R",
    "~/git-repos/BDDS/footprints/testdb/src/dbFunctions.R"
  ),
  source
))

In [4]:
# Restore the saved TF/motif mapping objects into the workspace.
# NOTE(review): `TFs.to.motifs`, used in the next cell, presumably comes from
# this file -- confirm.
load(file = "Rdata_files/Tfmotifmap.Rdata")

In [5]:
# Every motif name that appears anywhere in the TF -> motif mapping.
relevant_motifs <- unique(unlist(TFs.to.motifs))

# Build an indicator table with one row per relevant motif and one 0/1 column
# per TF class. A motif whose class string lists several classes separated by
# "::" gets a 1 in each of those class columns.
motif_class_hot <-
  read_delim("text_data_files/motif_class_fam", delim = "\t") %>%
  # The file's key column is called ID; the rest of the notebook joins on
  # `motifname`. dplyr::rename is called explicitly (new = old) because the
  # unqualified rename() only worked with old = new ordering, i.e. while a
  # later-attached package's rename() masks dplyr's.
  dplyr::rename(motifname = ID) %>%
  select(motifname, class) %>%
  # strip stray whitespace from both columns, then keep only relevant motifs
  mutate_all(str_trim) %>%
  filter(motifname %in% relevant_motifs) %>%
  # split multi-class entries into one row per (motif, class) pair
  mutate(class = str_split(class, "::")) %>%
  unnest(class) %>%
  # mark each pair with 1, drop duplicate pairs, and spread the classes into
  # columns; absent (motif, class) combinations are filled with 0
  mutate(dummy_yesno = 1) %>%
  distinct() %>%
  spread(class, dummy_yesno, fill = 0)

Parsed with column specification:
cols(
  ID = col_character(),
  class = col_character(),
  family = col_character()
)


In [6]:
# Restore the saved footprint objects into the workspace.
# NOTE(review): `all.TF.df.fimo.hint.well`, used in the join below, presumably
# comes from this file -- confirm.
load(file = "/local/rory/all.TF.df.fimo.hint.well.9.Rdata")

## merge in TF classes

In [7]:
# Attach the TF-class indicator columns to every footprint row. The key is
# pinned to `motifname` so that a column the two tables might share in the
# future cannot silently become part of the join key. Rows whose motif has no
# entry in motif_class_hot get NA in the class columns (left join).
all.TF.df.fimo.hint.well.annotated <- left_join(
  all.TF.df.fimo.hint.well,
  motif_class_hot,
  by = "motifname"
)

Joining, by = "motifname"


In [None]:
# Show the dimensions of the joined table and of the original table so the
# effect of the join on row and column counts can be compared by eye.
dim(all.TF.df.fimo.hint.well.annotated)
dim(all.TF.df.fimo.hint.well)

## merge in gc content

In [None]:
# TODO: runs out of memory on pnr 49 dataset -- make more efficient?

In [None]:
# Fraction of C/G bases in a fixed-width window centred on each interval.
#
# For every element, the window spans `b` bases on either side of the rounded
# midpoint of (start_col, end_col), i.e. 2 * b + 1 positions, on sequence
# paste0("chr", chrom_col) of the global genome object `hg38`.
#
# Arguments:
#   start_col, end_col  numeric vectors of interval start / end coordinates
#   chrom_col           sequence names without the "chr" prefix
#   b                   half-width of the window in bases (default 100)
#   chunk_size          number of windows extracted per getSeq() call; bounds
#                       the size of the intermediate sequence set and count
#                       matrix instead of materialising all windows at once
#
# Returns a numeric vector with one value per interval: (#C + #G) divided by
# the window width 2 * b + 1.
#
# Relies on getSeq() and alphabetFrequency() being available on the search
# path and on `hg38` existing in the calling environment chain.
get_gc_content <- function(start_col, end_col, chrom_col, b = 100,
                           chunk_size = 1e6) {
  window_center <- round((start_col + end_col) / 2)
  n_windows <- length(window_center)
  if (n_windows == 0L) {
    return(numeric(0))
  }

  seq_names <- paste0("chr", chrom_col)
  if (length(seq_names) != n_windows) {
    # keep the chunk indexing below valid when a single chromosome is given
    seq_names <- rep_len(seq_names, n_windows)
  }
  window_width <- 2 * b + 1

  gc_content <- numeric(n_windows)
  for (chunk_first in seq(1, n_windows, by = chunk_size)) {
    idx <- chunk_first:min(chunk_first + chunk_size - 1, n_windows)
    windows <- getSeq(
      hg38,
      seq_names[idx],
      window_center[idx] - b,
      window_center[idx] + b
    )
    alph_freq <- alphabetFrequency(windows)
    # drop = FALSE keeps a one-row chunk as a matrix so rowSums() still works
    gc_content[idx] <-
      rowSums(alph_freq[, c("C", "G"), drop = FALSE]) / window_width
  }

  gc_content
}

In [None]:
# Add a `gc_content` column computed from each row's start, endpos and chrom.
all.TF.df.fimo.hint.well.annotated <- mutate(
  all.TF.df.fimo.hint.well.annotated,
  gc_content = get_gc_content(start, endpos, chrom)
)

## merge in tss distance

In [None]:
# Fetch all protein-coding gene records from the `gtf` database and keep the
# columns needed to derive transcription start sites.
# NOTE(review): the database credentials are hard-coded in the notebook;
# consider supplying them from outside the source.
db_gtf <- dbConnect(
  PostgreSQL(),
  user = "trena",
  password = "trena",
  dbname = "gtf",
  host = "whovian"
)
query <- "select * from hg38human where moleculetype='gene' and gene_biotype='protein_coding'"
# Close the connection whether or not the query succeeds, so repeated runs of
# this cell do not accumulate open connections.
tss_raw_table <- tryCatch(
  dbGetQuery(db_gtf, query)[, c("chr", "gene_name", "start", "endpos", "strand")],
  finally = dbDisconnect(db_gtf)
)

In [None]:
# Reduce the gene table to one reference coordinate per gene: `start` for
# genes on the '+' strand, `endpos` otherwise.
tss_tbl <- tss_raw_table %>%
  mutate(ref = ifelse(strand == "+", start, endpos)) %>%
  select(chr, ref) %>%
  # dplyr::rename is called explicitly (new = old); the unqualified
  # rename(c(old = new)) form only works while another attached package's
  # rename() masks dplyr's.
  dplyr::rename(chrom = chr, ts_start = ref) %>%
  # drop the listed non-autosomal sequences
  filter(!(chrom %in% c("chrMT", "chrX", "chrY"))) %>%
  # keep everything from the 4th character on, i.e. strip a 3-character
  # prefix such as "chr"
  mutate(chrom = str_sub(chrom, start = 4))

In [None]:
# Distance from every motif interval to its nearest TSS position.
motif_gr <- makeGRangesFromDataFrame(
  all.TF.df.fimo.hint.well.annotated,
  start.field = "start",
  end.field = "endpos"
)
# each TSS is a width-1 range (start and end are the same column)
tss_gr <- makeGRangesFromDataFrame(
  tss_tbl,
  start.field = "ts_start",
  end.field = "ts_start"
)
dist_to_nearest_tss <- distanceToNearest(motif_gr, tss_gr, select = "arbitrary")

# distanceToNearest() only returns hits for queries that have a nearest
# subject; motifs on a sequence with no TSS in tss_gr are absent from the
# result. Place each distance at its query's row index so the vector always
# has one entry per motif row (NA where no TSS was found) and lines up with
# the data frame it is added to.
tss_dists <- rep(NA_integer_, length(motif_gr))
tss_dists[queryHits(dist_to_nearest_tss)] <- mcols(dist_to_nearest_tss)[, 1]

In [None]:
# Add the inverse-hyperbolic-sine transform of the TSS distances as a column.
all.TF.df.fimo.hint.well.annotated <- mutate(
  all.TF.df.fimo.hint.well.annotated,
  asinh_tss_dist = asinh(tss_dists)
)

## Save data

In [8]:
# load("/local/rory/all.TF.df.fimo.hint.well.annotated.9.Rdata")

In [9]:
# Express the hint (h_count) and wellington (w_count) counts as fractions of
# their respective column maxima, drop the raw count columns, and reorder the
# columns so that everything not named explicitly (the TF class columns) comes
# last.
all.TF.df.fimo.hint.well.annotated <-
  all.TF.df.fimo.hint.well.annotated %>%
  mutate(
    h_frac = h_count / max(h_count),
    w_frac = w_count / max(w_count)
  ) %>%
  select(-one_of("h_count", "w_count")) %>%
  select(
    motifname:w_min_score,
    h_frac,
    w_frac,
    gc_content,
    asinh_tss_dist,
    everything()
  )

In [10]:
# Write the annotated table to disk under its own object name.
save(
  list = "all.TF.df.fimo.hint.well.annotated",
  file = "/local/rory/all.TF.df.fimo.hint.well.annotated.9.Rdata"
)

## Explore some annotations

In [None]:
# Density-scaled frequency polygons of asinh TSS distance (100 bins), one line
# per value of cs_hit.
ggplot(
  all.TF.df.fimo.hint.well.annotated,
  aes(x = asinh_tss_dist, y = ..density.., color = factor(cs_hit))
) +
  geom_freqpoly(bins = 100) +
  theme_minimal()

In [None]:
# Density-scaled frequency polygons of GC content (101 bins), one line per
# value of cs_hit.
ggplot(
  all.TF.df.fimo.hint.well.annotated,
  aes(x = gc_content, y = ..density.., color = factor(cs_hit))
) +
  geom_freqpoly(bins = 101) +
  theme_minimal()

In [None]:
# Scatter of GC content against asinh TSS distance for a random sample of
# 100000 rows, coloured by cs_hit; points are drawn nearly transparent
# (alpha = 0.05).
ggplot(
  sample_n(all.TF.df.fimo.hint.well.annotated, 100000),
  aes(x = asinh_tss_dist, y = gc_content, color = factor(cs_hit))
) +
  geom_point(alpha = 0.05) +
  theme_minimal()