In [None]:
# Notebook setup: run the shared init script, load R helpers, import plotting
# libraries and point `db` at the pipeline's results database.
# NOTE(review): `sys`, `pd`, `DB` and the `%R`/`%%R` magics are used in this
# notebook but never imported here; presumably init.ipy provides them - confirm.
%run ~/devel/my_scripts/Jupyter/init.ipy

# load R libraries & functions
# NOTE(review): pca.R presumably defines ggplot_prcomp (used in the PCA cell) - confirm.
%R library(RColorBrewer)
%R source("/gfs/devel/tkhoyratty/my_scripts/R/pca.R")

# load python functions
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# make the local helper module importable before importing from it
sys.path.insert(0, "/gfs/devel/tkhoyratty/my_scripts/python/")
from upperQuantileNorm import upperQuantileNorm

# path handed to DB.fetch_DataFrame by every query in this notebook
db = "/gfs/work/tkhoyratty/AirPouch_ATAC/analysis/atac_pipeline_trim/csvdb"

# ATAC Pipeline Report

In [None]:
# import counts & upper quantile normalise
def get_counts(statement, filt="all_fragments"):
    '''Fetch normalised counts for one fragment-size filter.

    statement : SQL returning the columns sample_id, peak_id and RPM. Any
                "{size_filt}" placeholder in it is replaced by `filt`, so the
                rows fetched really belong to the requested filter. A
                statement without the placeholder is run unchanged.
    filt      : fragment-size filter; substituted into the query and stored
                in the "size_filt" column of the result.

    Returns a DataFrame with one row per peak_id, one column per sample_id
    (upper-quantile normalised by upperQuantileNorm) plus a "size_filt" column.
    '''
    # Previously the query always selected "all_fragments", so a call with
    # filt="<150bp" returned the same rows under a different label.
    query = statement.replace("{size_filt}", filt)
    df = DB.fetch_DataFrame(query, db)

    # long -> wide (samples x peaks), then flip so peaks are rows
    df = df.pivot(index="sample_id", columns="peak_id", values="RPM").transpose()
    df.index.name = None

    df = upperQuantileNorm(df) # normalise to upper quantiles for between sample comparison
    df["size_filt"] = filt

    return df

# "{size_filt}" is filled in by get_counts
# NOTE(review): assumes all_norm_counts.size_filt stores the literal "<150bp"
# for the size-filtered counts - confirm against the database.
statement = '''select sample_id, peak_id, RPM_width_norm *1000 as RPM 
                                   from all_norm_counts where size_filt == "{size_filt}" '''
counts = pd.concat([get_counts(statement),
                    get_counts(statement, filt="<150bp")])

In [None]:
# Sample annotations, indexed by sample_id (the sample_id column is kept as well)
sample_info = DB.fetch_DataFrame('''select * from sample_info''', db)
# replicate is stored as a string
sample_info["replicate"] = sample_info["replicate"].map(str)
sample_info = sample_info.set_index("sample_id", drop=False)
# unnamed index, so "sample_id" refers only to the column
sample_info.index.name = None

## Mapping QC

In [None]:
def mapping_stats(paired=True, db=db, sample_info=sample_info):
    '''Collect all mapping stats & return df for plotting.

    paired      : only paired-end data is supported; a false value raises
                  NotImplementedError.
    db          : database path passed to DB.fetch_DataFrame.
    sample_info : sample annotation DataFrame with a "sample_id" column.

    Returns the picardAlignmentSummary "PAIR" rows (annotated with
    sample_info and a "Filter" label parsed from the sample id) outer-merged
    with the percentage of reads mapping to chrM, which is attached under
    Filter == "genome".
    '''
    if not paired:
        # The old code printed this message and then failed with a NameError
        # on `reads`; fail with the actual reason instead.
        raise NotImplementedError("Update function for non-paired data")

    reads = DB.fetch_DataFrame('''select READS_ALIGNED_IN_PAIRS/2 as MAPPED_PAIRS, PCT_READS_ALIGNED_IN_PAIRS, 
                                      TOTAL_READS, PCT_ADAPTER, sample_id from picardAlignmentSummary 
                                      where CATEGORY = "PAIR" ''', db)

    # Format mapping qc df
    # Filter = last "_"-separated token of the id, except size-filtered preps
    reads["Filter"] = reads["sample_id"].apply(lambda x: x.split("_")[-1] if "size_filt_prep" not in x else "prep<150bp")
    # sample name = id without its last "_" token and without anything after the first "."
    reads["sample_id"] = reads["sample_id"].apply(lambda x: '_'.join(x.split("_")[0:-1]))
    reads["sample_id"] = reads["sample_id"].apply(lambda x: x.split(".")[0])

    if len(sample_info)==0:
        print("Provide sample_info df with sample annotations")

    reads = pd.merge(reads, sample_info, on="sample_id", how="inner")

    # get no. reads mapping to chrM
    chrm = DB.fetch_DataFrame('''select * from allContig''', db)

    # reformat df: one row per sample, one column per contig
    chrm = chrm.pivot(index="sample_id", columns="contig", values="mapped_reads")
    chrm["total_mapped_reads"] = chrm.sum(axis=1)
    chrm["sample_id"] = chrm.index.values
    chrm.index.name = None
    # copy so the column added below is not written to a slice of the pivot
    chrm = chrm[["chrM", "total_mapped_reads", "sample_id"]].copy()
    chrm["pct_chrM"] = chrm["chrM"] / chrm["total_mapped_reads"] *100 # % reads mapping to chrM

    # annotate df
    chrm["sample_id"] = chrm["sample_id"].apply(lambda x: str(x).split(".")[0])
    chrm = pd.merge(chrm, sample_info, on="sample_id", how="inner")
    chrm["Filter"] = "genome" # chrM only in genomic reads as filtered out after, others not tested

    df = pd.merge(reads, chrm[["pct_chrM", "sample_id", "Filter"]], how="outer", on=["sample_id", "Filter"])

    return df

mapping_qc = mapping_stats()

In [None]:
%%R -i mapping_qc -w 800 -h 600

# Mapping QC: 2x2 grid (mapped pairs, % reads in pairs, % adaptor, % chrM reads)
# with one shared legend drawn underneath.

Palette <- c("#E69F00", "#0072B2", "#D55E00", "#009E73",  "#56B4E9",  "#999999", "#F0E442")

# Return the legend grob (the grob named "guide-box") of a ggplot so it can be
# drawn once for the whole grid.
get_legend <- function(a.gplot){ 
  tmp <- ggplot_gtable(ggplot_build(a.gplot)) 
  leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box") 
  legend <- tmp$grobs[[leg]] 
  return(legend)} 

# (the unused `test` plot that duplicated the first layers of `a` was removed)

# mapped read pairs per sample; dashed reference line at 25M pairs
a <- ggplot(mapping_qc, aes(y=MAPPED_PAIRS, x=sample_id, colour=condition, shape=Filter)) + 
        geom_point(size=6) + 
        theme_Publication() + 
        theme(axis.text.x=element_blank()) +
        scale_y_continuous(limits=c(0, 65000000)) +
        scale_colour_manual(values=Palette) +
        labs(x="", y="Mapped Pairs") +
        geom_hline(yintercept=25000000, lty="dashed", col="black")

# % reads aligned in pairs, genome-level rows only
b <- ggplot(subset(mapping_qc, Filter=="genome"), 
            aes(y=PCT_READS_ALIGNED_IN_PAIRS*100, x=sample_id, colour=condition)) + 
        geom_point(size=6, shape=17) + 
        theme_Publication() + 
        theme(axis.text.x=element_blank()) +
        labs(y="% Reads in Pairs", x="") +
        scale_y_continuous(limits=c(0, 100)) +
        scale_colour_manual(values=Palette)

# adaptor content
c <- ggplot(mapping_qc, aes(y=PCT_ADAPTER, x=sample_id, colour=condition, shape=Filter)) + 
        geom_point(size=6) + 
        theme_Publication() + 
        theme(axis.text.x=element_blank()) +
        scale_colour_manual(values=Palette) +
        labs(x="", y="% Adaptor")

# % of mapped reads on chrM (only present on Filter == "genome" rows)
d <- ggplot(mapping_qc, aes(y=pct_chrM, x=sample_id, colour=condition)) + 
        geom_point(size=6, shape=17) +
        theme_Publication() +
        labs(x="", y= "% chrM Reads") +
        scale_y_continuous(limits=c(0,100)) +
        theme(axis.text.x=element_blank()) +
        scale_colour_manual(values=Palette)

# take the legend from `a` before the per-panel legends are switched off
legend <- get_legend(a)

grid.arrange(a + theme(legend.position="none"), b + theme(legend.position="none"), 
             c + theme(legend.position="none"), d + theme(legend.position="none"), 
             ncol=2, nrow=2, bottom=legend)

## Insert sizes

In [None]:
def fragment_stats(db=db, sample_info=sample_info):
    '''Collect insert size stats & format df for plotting.

    db          : database path passed to DB.fetch_DataFrame.
    sample_info : sample annotation DataFrame with a "sample_id" column.

    Returns [size_stats, insert_sizes]: the picardInsertSizeMetrics and
    picardInsertSizeHistogram rows whose sample_id ends in "prep", each with
    a "Filter" label, a cleaned sample_id and the sample_info annotations.
    '''
    suffix = "_prep"

    def clean(df):
        # Filter = last "_"-separated token of the id, except size-filtered preps
        df["Filter"] = df["sample_id"].apply(lambda x: x.split("_")[-1] if "size_filt_prep" not in x else "prep<150bp")
        # Drop only a literal trailing "_prep". str.rstrip("_prep") strips any
        # run of the characters "_", "p", "r", "e" and could eat the end of
        # the sample name itself (e.g. "..._rep_prep").
        df["sample_id"] = df["sample_id"].apply(lambda x: x[:-len(suffix)] if x.endswith(suffix) else x)
        # keep the part before the first "."
        df["sample_id"] = df["sample_id"].apply(lambda x: x.split(".")[0])
        df = pd.merge(df, sample_info, on="sample_id", how="inner")
        return df

    insert_sizes = DB.fetch_DataFrame('''select * from picardInsertSizeHistogram where sample_id like "%prep"''', db)
    size_stats = DB.fetch_DataFrame('''select * from picardInsertSizeMetrics where sample_id like "%prep"''', db)

    insert_sizes = clean(insert_sizes)
    size_stats = clean(size_stats)

    return [size_stats, insert_sizes]

(size_stats, insert_sizes) = fragment_stats()

In [None]:
%%R -i insert_sizes,size_stats -w 1200 -h 1200

# Insert size QC: fragment-length histogram (top) and per-sample boxplots
# (bottom), both facetted by read filter. Uses `Palette` defined in the
# Mapping QC cell.

# histogram of fragment lengths; no fill aesthetic is mapped, so the fill
# scale that used to be added here had no effect and was dropped
a <- ggplot(insert_sizes, aes(y=All_Reads_fr_count, x=insert_size)) + 
        geom_bar(stat="identity") +
        scale_x_continuous(limits=c(0, 800)) + 
        labs(y="No. Fragments", x="Fragment length (b.p.)") +
        theme_Publication() + facet_wrap(~ Filter)

# NOTE(review): this boxplot summarises the insert_size values of the
# histogram rows and does not use All_Reads_fr_count as a weight - confirm
# that an unweighted summary is what is intended.
c <- ggplot(insert_sizes, 
            aes(x=category, y=insert_size, fill=condition, alpha=factor(replicate))) + 
        geom_boxplot() +
        scale_fill_manual(values=Palette) +
        coord_cartesian(ylim=c(0, 1000)) +   # spelled out; `y=` relied on partial argument matching
        labs(y="Fragment length (b.p.)", x="Sample") +
        theme_Publication() +
        scale_alpha_discrete(range=c(0.5,1), name="replicate") +
        theme(axis.text.x=element_blank())  + facet_wrap(~ Filter)

grid.arrange(a, c, ncol=1, nrow=2)

## Peakcalling QC
### All peaks
* All detected peaks, with and without filtering reads by size, and the fraction of reads in peaks (FRIP)

In [None]:
# Peak counts joined to FRIP per sample and size filter, then annotated with sample_info
peak_stats = DB.fetch_DataFrame(
    '''select a.no_peaks, a.size_filt, b.FRIP, b.sample_id from no_peaks a, 
                                frip_table b where a.sample_id=b.sample_id and a.size_filt=b.size_filt ''',
    db
).merge(sample_info, on="sample_id", how="inner")

In [None]:
%%R -i peak_stats  -w 800 -h 350

# Peak-calling QC: number of peaks (left) and FRIP (right) per sample, with
# transparency encoding the size filter. Uses `Palette` and `get_legend`
# defined in the Mapping QC cell.

# labs() was previously given `y` twice (y="No. Peaks", y=""); only the
# intended label is kept
no_peaks <- ggplot(peak_stats, 
                   aes(y=no_peaks, x=category, colour=condition, 
                       shape=factor(replicate), alpha=size_filt)) + 
                geom_point(size=6) + theme_Publication() +
                theme(axis.text.x=element_blank()) + 
                scale_alpha_discrete(range=c(0.4, 1)) +
                labs(y="No. Peaks", x="") +
                scale_y_continuous(limits=c(0, 40000)) +
                scale_colour_manual(values=Palette) #+ facet_wrap(~ size_filt)

# dashed reference line at FRIP = 0.2
frip_plot <- ggplot(peak_stats, 
                    aes(y=FRIP, x=category, colour=condition, shape=factor(replicate), 
                        alpha=size_filt)) + 
                geom_point(size=6) + theme_Publication() +
                theme(axis.text.x=element_blank()) + 
                scale_alpha_discrete(range=c(0.4, 1)) +
                labs(y="FRIP", x="") +
                scale_y_continuous(limits=c(0,1)) + 
                scale_colour_manual(values=Palette) +
                guides(colour=guide_legend(override.aes=list(size=6)), 
                       shape=guide_legend(override.aes=list(size=6))) +
                geom_hline(yintercept=0.2, lty="dashed", colour="black")  #+ facet_wrap(~ size_filt)

# both panels map the same aesthetics, so one legend serves both
key <- get_legend(frip_plot)

grid.arrange(no_peaks + theme(legend.position="none"), frip_plot + theme(legend.position="none"),
             ncol=2, nrow=1, bottom=key)

### High confidence peaks
* Peaks that are consistent between biological replicates, from both size-filtered and non-size-filtered peak sets

In [None]:
# Rows of no_peaks whose `merged` value ends in "merged"
merged_peaks = DB.fetch_DataFrame(
    '''select * from no_peaks where merged like "%merged" ''',
    db)

In [None]:
%%R -i merged_peaks -w 600 -h 400

# Number of peaks per merged peak set; shape encodes the size filter.

# same palette as the Mapping QC cell, repeated so this cell runs on its own
Palette <- c("#E69F00", "#0072B2", "#D55E00", "#009E73",  "#56B4E9",  "#999999", "#F0E442")

# colour duplicates the x axis, so its legend is suppressed;
# guide="none" replaces the deprecated guide=FALSE
ggplot(merged_peaks, aes(y=no_peaks, x=sample_id, shape=size_filt, colour=sample_id)) +
    geom_point(size=6) +
    scale_colour_manual(values=Palette, guide="none") +
    theme_Publication() +
    labs(x="")

## Data Exploration 
* based on counts over merged peakset
* merged peakset consists of all detected peaks (not only high confidence)
* counts are normalised for sequencing depth, peak width, and upper quantile normalised for between sample comparison

In [None]:
# subset counts: keep the unfiltered fragments and drop the label column
# (no inplace drop on a .loc slice, which triggers SettingWithCopyWarning)
all_counts = counts.loc[counts.size_filt == "all_fragments"].drop("size_filt", axis=1)

# Copy with columns relabelled by sample category. Labels are looked up by
# sample_id (sample_info is indexed by sample_id) rather than assigned by
# position, so the result does not depend on the row order of sample_info
# matching the column order of the counts table.
all_counts_anno = all_counts.copy(deep=True)
categories = sample_info["category"].reindex(all_counts_anno.columns)
if categories.isnull().any():
    raise ValueError("samples missing from sample_info: %s"
                     % list(categories.index[categories.isnull()]))
all_counts_anno.columns = pd.Index(categories.values, name="category")

### Pearson correlation between normalised counts in consensus peakset
* clustering from Ward method & Manhattan distances

In [None]:
%%R -i all_counts_anno

# Sample-sample Pearson correlation heatmap of log2(count + 1), clustered on
# Manhattan distances with Ward (ward.D2) linkage.

library(ComplexHeatmap)
library(circlize)
library(dendextend)

cm <- data.matrix(log2(all_counts_anno +1))
# "all.obs" spelled out; use="all" relied on partial matching
m <- cor(cm, method="pearson", use="all.obs")

# row dendrogram, leaves sorted by label
distr <- dist(m, method="manhattan")
clustr <- hclust(distr, method="ward.D2")
dendr <- as.dendrogram(clustr)
dendr <- dendr %>% sort(type="labels")

# column dendrogram, reversed and then sorted by label
distc <- dist(t(m), method="manhattan")
clustc <- hclust(distc, method="ward.D2")
dendc <- as.dendrogram(clustc)
dendc <- rev(dendc) %>% sort(type="labels")

# the trailing empty argument that followed heatmap_legend_param was removed
p2 <- Heatmap(m,
       col = colorRamp2(c(min(m), max(m)), c("white", "red")),
       cluster_rows=dendr,
       cluster_columns=dendc,
       column_dend_reorder = FALSE,
       column_dend_height = unit(2, "cm"),
       row_dend_width = unit(2, "cm"),
       row_names_gp=gpar(fontsize=16),
       column_names_gp=gpar(fontsize=16),
       name="Pearson Correlation:",
       heatmap_legend_param=list(legend_direction="horizontal", 
                                  at=c(0.9, 1), 
                                  color_bar = "continuous",
                                  legend_width = unit(5, "cm"), 
                                  title_position = "lefttop",
                                  title_gp=gpar(fontsize=18),
                                  labels_gp=gpar(fontsize=14)))

draw(p2, heatmap_legend_side = "bottom")

### Dimensionality Reduction

In [None]:
%%R -w 1200 -h 500 -i sample_info,all_counts

# PCA of log2(count + 1): PC1/PC2, PC3/PC4 and scree plot side by side, with
# one shared legend underneath. Uses `Palette` and `get_legend` defined in the
# Mapping QC cell (the identical second definition of get_legend that used to
# live here was removed).

df <- as.data.frame(log2(all_counts+1))
# NOTE(review): prcomp is run on the peaks x samples table as-is; a transpose
# was previously left commented out here. Confirm which orientation
# ggplot_prcomp expects.
pca <- prcomp(df, scale=F)

head(pca$x)

pca_plots <- ggplot_prcomp(pca, 
             plots=list("A"=c("PC1","PC2"), "B"=c("PC3", "PC4")),
             sample_information=sample_info, 
             shape="replicate", 
             color="condition",
             size=7,
             nudge_scale_factor=30) 


a <- pca_plots$A + theme_Publication() + 
        scale_colour_manual(values=Palette, name="condition:") + 
        scale_shape_manual(values=c(16,17), name="replicate:") +
        guides(colour=guide_legend(override.aes=list(size=8)), 
               shape=guide_legend(override.aes=list(size=6)))
b <- pca_plots$B + theme_Publication() + 
        scale_colour_manual(values=Palette, name="condition:") + 
        scale_shape_manual(values=c(16,17), name="replicate:")
c <- pca_plots$scree + theme_Publication()  + 
        scale_colour_manual(values=Palette, name="condition:") + 
        scale_shape_manual(values=c(16,17), name="replicate:")

# take the legend from `a` before the per-panel legends are switched off
legend <- get_legend(a)

# hide the per-panel legends the same way the other cells do, instead of
# pushing them off-canvas with legend.position=c(10, 10)
a <- a + theme(legend.position="none")
b <- b + theme(legend.position="none")

# three equal-width panels over four rows; the legend sits in a fifth row
lay <- rbind(c(1,1,1,2,2,2,3,3,3), 
             c(1,1,1,2,2,2,3,3,3),
             c(1,1,1,2,2,2,3,3,3), 
             c(1,1,1,2,2,2,3,3,3), 
             c(NA,4,4,4,NA,NA,NA,NA,NA))  

grid.arrange(top=textGrob("PCA of Normalised Read Counts Over All Peaks", gp=gpar(fontfamily="Helvetica", fontface="bold", fontsize=23)),
             a, b, c, legend, layout_matrix=lay)

In [None]:
%%R -h 500 -w 500

# t-SNE of samples on log2(count + 1). `all_counts` and `sample_info` are the
# objects passed into R by the PCA cell.

library(Rtsne)   # library() fails loudly if the package is missing; require() only warns

# t-SNE is stochastic: fix the seed so the embedding is reproducible on re-run
set.seed(42)

log2counts <- as.data.frame(log2(all_counts+1))

# samples are columns of log2counts, so transpose to get one row per sample
tsne_out <- Rtsne(t(log2counts), pca=T, perplexity=2)

tsne_df <- as.data.frame(tsne_out$Y)
rownames(tsne_df) <- colnames(log2counts)
colnames(tsne_df) <- c("tSNE1", "tSNE2")
tsne_df$sample_id <- rownames(tsne_df)

# attach condition / replicate annotations
tsne_df <- merge(tsne_df, sample_info, by="sample_id")
head(tsne_df)

p <- ggplot(tsne_df, aes(y=tSNE1, x=tSNE2, shape=replicate, colour=condition)) + 
        geom_point(size=7) + 
        theme_Publication()  + 
        scale_colour_manual(values=Palette, name="condition:") + 
        scale_shape_manual(values=c(16,17), name="replicate:")

grid.arrange(p, ncol=1, nrow=1)

### Replicate correlation

In [None]:
# use sample information to get no. replicates & conditions
# (rows = replicate, columns = condition, values = category label)
rep_pairs = sample_info.pivot(index="condition", columns="replicate", values="category").transpose()
rep_pairs.columns.name = None
rep_pairs.index.name = None

# report replicates to dict: condition -> [category of 1st replicate, category of 2nd replicate]
# (only the first two replicates of each condition are used)
reps = {}
for col in rep_pairs.columns:
    reps[col]=[rep_pairs[col].iloc[0], rep_pairs[col].iloc[1]]
    
sns.set(style="whitegrid", palette="muted")# set seaborn theme

# use dict to subset df of normalised counts & plot rep correlations
# (sorted so the plots always appear in the same order)
for key in sorted(reps):
    # copy so renaming the columns does not write to a slice of all_counts_anno
    df = all_counts_anno[reps[key]].copy()
    df.columns = ["Rep1", "Rep2"]
    # NOTE(review): seaborn >= 0.9 names this argument `height`; `size` is kept
    # to match the seaborn version this notebook was written for - confirm.
    p = sns.jointplot(data=df, y="Rep1", x="Rep2", kind="reg", size=7, color="g")
    plt.subplots_adjust(top=0.9)
    p.fig.suptitle(key) # add title
    plt.show()