In [None]:
#load rpy2 magic
%load_ext rpy2.ipython

# to switch off warning messages
import warnings
warnings.filterwarnings("ignore")

# make default cell width 85% of available screen
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:85% !important; }</style>"))

# show multiple tables in python shells
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# load R libraries & functions
%R options(warn=-1)
%R library(RColorBrewer)
%R library(ggplot2)
%R library(gplots)
%R library(gridExtra)
%R library(ggrepel)
    
# load python modules
import glob
import re
import sys
import os
import rpy2.robjects as robjects
import CGAT.Database as DB
import sqlite3
import seaborn as sns
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

# SQLite database file that the cells below read from and write to
db = "./csvdb"

# **MEME-ChIP Report**
***
## **DREME** 
- Searches for **short**, ungapped motifs, relatively enriched vs background (flanking regions)
- TOMTOM compares discovered motifs against database(s) of known motifs

In [None]:
%%R

# R functions
# Notebook-wide ggplot2 theme: theme_minimal() with larger, bolder text,
# black axis lines and a compact legend.
#
# Args:
#   base_size:   base font size handed to theme_minimal() (default 18).
#   base_family: accepted for backward compatibility; it is not applied to
#                the theme (it never was).
#
# Returns:
#   A complete ggplot2 theme object. The function has no side effects; pass
#   the result to theme_set() or add it to a single plot.
theme_notebook <- function(base_size=18, base_family="helvetica") {
    # Build directly on theme_minimal(). Wrapping it in theme_set() here would
    # return the *previously active* theme (theme_set() hands back the old
    # theme), so the tweaks below would be layered on whatever theme happened
    # to be set before, and the global theme would change as a side effect.
    theme_minimal(base_size=base_size) +
        theme(plot.title = element_text(face="bold", size=20, hjust=0.5),
              text = element_text(),
              axis.title = element_text(face="bold", size = rel(1)),
              axis.title.y = element_text(angle=90, vjust=2, size=20),
              axis.title.x = element_text(vjust=-0.2, size=20),
              axis.text = element_text(size=20),
              axis.line = element_line(colour="black"),
              axis.ticks = element_line(),
              legend.key = element_rect(colour = NA),
              legend.key.size = unit(0.5, "cm"),
              # NOTE(review): newer ggplot2 releases expect margin() here and
              # treat a bare unit as legend.spacing - confirm against the
              # installed version before changing.
              legend.margin = unit(0.5, "cm"),
              legend.text = element_text(size=14),
              legend.title = element_text(size=16),
              strip.text = element_text(size=18)
              )
}

# Set ggplot theme
# (installs the notebook theme as the default for every ggplot drawn later in this R session)
theme_set(theme_notebook(base_size=18))
# Seven colours handed to scale_fill_manual()/scale_colour_manual() in the plotting cells
Palette <- c("#E69F00", "#0072B2", "#D55E00", "#009E73", "#56B4E9",  "#999999", "#F0E442")

In [None]:
def _motif_db_paths(ini_path="./pipeline.ini"):
    """Return the file paths listed on the ``motif_db=`` line of the ini file.

    The value is split on commas and surrounding whitespace is removed. If
    the file holds several ``motif_db=`` lines the last one wins.

    Raises:
        ValueError: if the file contains no ``motif_db=`` line.
    """
    prefix = "motif_db="
    paths = None
    with open(ini_path, "r") as ini:
        for line in ini:
            if line.startswith(prefix):
                # Slice the prefix off. str.lstrip("motif_db=") strips a *set
                # of characters* and would also eat the start of paths such
                # as "transfac.meme", "db/..." or "motifs/...".
                value = line[len(prefix):]
                paths = [p.strip() for p in value.split(",") if p.strip()]
    if paths is None:
        raise ValueError("no 'motif_db=' line found in %s" % ini_path)
    return paths


def _id_and_alt_name(fields):
    """``MOTIF <id> <name>``: use the alternate name, or the id if there is none."""
    return fields[1], (fields[2] if len(fields) > 2 else fields[1])


def _hocomoco_id_and_name(fields):
    """``MOTIF <NAME>.<suffix...>``: the id is the full token, the name its first dot-separated part."""
    return fields[1], fields[1].split(".")[0]


def _id_as_name(fields):
    """``MOTIF <id>``: the id doubles as the name."""
    return fields[1], fields[1]


def _read_meme_motifs(meme_path, parse_fields):
    """Return ``{motif_name: motif_id}`` for every ``MOTIF`` line of a MEME file."""
    name_to_id = {}
    with open(meme_path, "r") as meme:
        for line in meme:
            if not line.startswith("MOTIF"):
                continue
            fields = line.split()
            if len(fields) < 2:
                # a bare "MOTIF" line carries no identifier
                continue
            motif_id, motif_name = parse_fields(fields)
            name_to_id[motif_name] = motif_id
    return name_to_id


def motif_lookup(db):
    """Build a motif id -> name lookup from the configured motif databases.

    The MEME-format files named on the ``motif_db=`` line of
    ``./pipeline.ini`` are assigned to a database by a substring of their
    path ("transfac", "JASPAR", "uniprobe", "HOCOMOCO", "chen"). Databases
    that are not configured are skipped. Within one database a motif name
    maps to a single id (the last one read), as before.

    Args:
        db: path of the SQLite database; the lookup is written to its
            ``motif_table`` table, replacing any existing table.

    Returns:
        pandas.DataFrame with columns ``motif_id``, ``motif_name`` and
        ``database`` (empty if no recognised database is configured).

    Raises:
        ValueError: if ``./pipeline.ini`` has no ``motif_db=`` line.
    """
    columns = ["motif_id", "motif_name", "database"]

    # (substring identifying the database in a path, label stored in the
    #  table, parser for the fields of a MOTIF line)
    known_databases = [
        ("transfac", "transfac", _id_and_alt_name),
        ("JASPAR", "jaspar", _id_and_alt_name),
        ("uniprobe", "uniprobe", _id_and_alt_name),
        ("HOCOMOCO", "hocomoco", _hocomoco_id_and_name),
        ("chen", "chen", _id_as_name),
    ]

    paths = _motif_db_paths()

    frames = []
    for key, label, parse_fields in known_databases:
        # start from an empty mapping for every database so motifs of one
        # database are never re-labelled as belonging to the next one
        name_to_id = {}
        for path in paths:
            if key in path:
                name_to_id.update(_read_meme_motifs(path, parse_fields))
        if not name_to_id:
            continue

        names = list(name_to_id)
        frame = pd.DataFrame({"motif_id": [name_to_id[n] for n in names],
                              "motif_name": names})
        frame["database"] = label
        frames.append(frame[columns])

    # merge all dfs
    if frames:
        df = pd.concat(frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=columns)

    # add table to db; always release the connection
    connect = sqlite3.connect(db)
    try:
        df.to_sql("motif_table", connect, if_exists="replace", index=False)
    finally:
        connect.close()

    return df

# build the motif id/name lookup and store it in the "motif_table" table of db
df = motif_lookup(db)

In [None]:
def dreme_summary(db, conditions="", widths="", no_peaks="", samples=""):
    """Collect the motifs reported in each run's ``dreme.txt`` into one table.

    For every combination of sample, condition, number of peaks and width
    the file ``meme.chip.dir/<sample>_<condition>.<peaks>.<width>/dreme_out/dreme.txt``
    is parsed; combinations without such a file are skipped.

    Args:
        db: path of the SQLite database; the result is written to its
            ``dreme_motifs`` table, replacing any existing table.
        conditions: iterable of condition names (e.g. ``["increase", "decrease"]``).
        widths: iterable of peak widths; when empty, read from the
            ``widths=`` line of ``./pipeline.ini``.
        no_peaks: iterable of peak numbers; when empty, read from the
            ``npeaks=`` line of ``./pipeline.ini``.
        samples: iterable of sample names; when empty, derived from the part
            before the first ``_`` of each ``data.dir/*_meme.bed`` file name.

    Returns:
        pandas.DataFrame with columns ``motif``, ``motif_rc``, ``pos``,
        ``neg``, ``dreme_evalue`` and ``run`` (all values are strings).
    """
    def ini_values(line, key):
        # Comma-separated values of a "key=..." line, or None for other lines.
        # The prefix is sliced off; str.lstrip() would strip a character set.
        prefix = key + "="
        if not line.startswith(prefix):
            return None
        return [v.strip() for v in line[len(prefix):].split(",") if v.strip()]

    if len(samples) == 0:
        names = [os.path.basename(bed).split("_")[0]
                 for bed in glob.glob("data.dir/*_meme.bed")]
    else:
        names = samples

    # get meme-chip run params from pipeline.ini (first matching line wins)
    if len(widths) == 0 or len(no_peaks) == 0:
        with open("./pipeline.ini", "r") as ini:
            for line in ini:
                if len(widths) == 0:
                    widths = ini_values(line, "widths") or widths
                if len(no_peaks) == 0:
                    no_peaks = ini_values(line, "npeaks") or no_peaks

    # dreme.txt is assumed to open with this many "#" header lines
    # (version, command, positives, negatives, host, when) - TODO confirm
    # for the DREME version in use.
    header_comment_lines = 6
    # comment lines that are not motif rows
    skip_markers = ("Stopping", "Running", "Word")
    fields_per_row = 6  # word, reverse complement, pos, neg, p-value, e-value

    rows = []
    for name in names:
        for c in conditions:
            for peaks in no_peaks:
                for width in widths:
                    run = '_'.join([name, c, str(peaks), str(width)])
                    dreme_txt = ("meme.chip.dir/" + '_'.join([name, c]) + "."
                                 + '.'.join([str(peaks), str(width)])
                                 + "/dreme_out/dreme.txt")
                    if not os.path.exists(dreme_txt):
                        # this combination was not run (or produced no output)
                        continue

                    with open(dreme_txt, "r") as open_dreme:
                        n_comments = 0
                        for line in open_dreme:
                            if not line.startswith("#"):
                                continue
                            n_comments += 1
                            if n_comments <= header_comment_lines:
                                continue
                            if any(marker in line for marker in skip_markers):
                                continue
                            fields = [x for x in line.split()
                                      if x not in ["BEST", "#"]]
                            # ignore comment lines that are not complete motif rows
                            if len(fields) == fields_per_row:
                                rows.append(fields + [run])

    df = pd.DataFrame(rows, columns=["motif", "motif_rc", "pos", "neg",
                                     "p_value", "dreme_evalue", "run"])
    df = df.drop("p_value", axis=1)

    # add table to db; always release the connection
    connect = sqlite3.connect(db)
    try:
        df.to_sql("dreme_motifs", connect, if_exists="replace", index=False)
    finally:
        connect.close()

    return df

# collect DREME motifs for both directions of change; widths and peak numbers come from pipeline.ini
dreme = dreme_summary(db, conditions=["increase", "decrease"])

In [None]:
def dremechip_summary(db, conditions="", widths="", no_peaks="", samples=""):
    """Join each run's TOMTOM table with the motif lookup table.

    For every combination of sample, condition, number of peaks and width
    the table ``<sample>_<condition>_<peaks>_<width>_Dreme_tomtom`` is
    joined to ``motif_table`` on ``target_id = motif_id``; combinations
    whose table does not exist in the database are skipped.

    Args:
        db: path of the SQLite database holding the TOMTOM tables and
            ``motif_table``.
        conditions: iterable of condition names (e.g. ``["increase", "decrease"]``).
        widths: iterable of peak widths; when empty, read from the
            ``widths=`` line of ``./pipeline.ini``.
        no_peaks: iterable of peak numbers; when empty, read from the
            ``npeaks=`` line of ``./pipeline.ini``.
        samples: iterable of sample names; when empty, derived from the part
            before the first ``_`` of each ``data.dir/*_meme.bed`` file name.

    Returns:
        pandas.DataFrame with one row per TOMTOM match and the columns
        ``query_id``, ``query_consensus``, ``target_id``, ``motif_name``
        (with any ``_MOUSE`` removed), ``database``, ``tomtom_evalue``,
        ``orientation``, ``run``, ``DESeq2_comparison``, ``condition``,
        ``no_peaks`` and ``window``. Empty (same columns) if no table matched.
    """
    def ini_values(line, key):
        # Comma-separated values of a "key=..." line, or None for other lines.
        # The prefix is sliced off; str.lstrip() would strip a character set.
        prefix = key + "="
        if not line.startswith(prefix):
            return None
        return [v.strip() for v in line[len(prefix):].split(",") if v.strip()]

    if len(samples) == 0:
        names = [os.path.basename(bed).split("_")[0]
                 for bed in glob.glob("data.dir/*_meme.bed")]
    else:
        names = samples

    # get meme-chip run params from pipeline.ini (first matching line wins)
    if len(widths) == 0 or len(no_peaks) == 0:
        with open("./pipeline.ini", "r") as ini:
            for line in ini:
                if len(widths) == 0:
                    widths = ini_values(line, "widths") or widths
                if len(no_peaks) == 0:
                    no_peaks = ini_values(line, "npeaks") or no_peaks

    columns = ["query_id", "query_consensus", "target_id", "motif_name",
               "database", "tomtom_evalue", "orientation", "run",
               "DESeq2_comparison", "condition", "no_peaks", "window"]

    frames = []
    dbhandle = sqlite3.connect(db)
    try:
        # get existing tables in db
        cc = dbhandle.cursor()
        cc.execute("SELECT name FROM sqlite_master WHERE type='table';")
        existing_tables = set(row[0] for row in cc.fetchall())

        for name in names:
            for c in conditions:
                for peaks in no_peaks:
                    for width in widths:
                        run = '_'.join([name, c, str(peaks), str(width)])
                        table = run + "_Dreme_tomtom"

                        # exact match: a substring/regex test would also
                        # accept longer table names that merely contain this one
                        if table not in existing_tables:
                            continue

                        # the table name cannot be bound as a parameter; it
                        # was matched against sqlite_master above and is
                        # quoted as an identifier here
                        statement = '''select a.query_id, a.query_consensus, a.target_id,
                                    b.motif_name, b.database, a.e_value as tomtom_evalue, a.orientation
                                    from "%s" a, motif_table b where a.target_id = b.motif_id
                                    order by e_value asc''' % table.replace('"', '""')

                        df = pd.read_sql_query(statement, dbhandle).drop_duplicates()
                        # annotate from the loop variables instead of
                        # re-splitting the run name, which breaks for sample
                        # names that contain "_"
                        df["run"] = run
                        df["DESeq2_comparison"] = name
                        df["condition"] = c
                        df["no_peaks"] = str(peaks)
                        df["window"] = str(width)

                        frames.append(df)
    finally:
        dbhandle.close()

    if frames:
        motifs = pd.concat(frames, ignore_index=True)
        motifs["motif_name"] = motifs["motif_name"].apply(lambda x: x.replace("_MOUSE", ""))
    else:
        motifs = pd.DataFrame(columns=columns)

    return motifs

# TOMTOM matches for both directions of change, annotated with motif names from motif_table
motifs = dremechip_summary(db, conditions=["increase", "decrease"])

In [None]:
def analyse_dreme(motifs, dreme, evalue_threshold=0.05):
    """Merge DREME motifs with their TOMTOM matches and keep significant pairs.

    Args:
        motifs: DataFrame of TOMTOM matches with (at least) the columns
            ``query_id``, ``run``, ``tomtom_evalue``, ``no_peaks`` and
            ``window``. It is not modified.
        dreme: DataFrame of DREME motifs with (at least) the columns
            ``motif``, ``run`` and ``dreme_evalue``.
        evalue_threshold: rows are kept only when both the DREME and the
            TOMTOM e-value are below this value (default 0.05).

    Returns:
        DataFrame of the inner join on ``run`` and ``motif``, with
        ``dreme_evalue`` as float, an added ``settings`` column
        (``<no_peaks>_<window>``), filtered on both e-values and sorted by
        ``dreme_evalue`` then ``tomtom_evalue`` (ascending).
    """
    # rename on a copy so the caller's frame keeps its query_id column
    tomtom = motifs.rename(columns={"query_id": "motif"})

    # merge dfs
    dreme_chip = pd.merge(dreme, tomtom, how="inner", on=["run", "motif"]).drop_duplicates()

    dreme_chip["dreme_evalue"] = dreme_chip["dreme_evalue"].astype(float)  # correct type
    dreme_chip["settings"] = dreme_chip["no_peaks"] + "_" + dreme_chip["window"]  # column for plot annotations

    # subset on sig motifs & sig matches
    significant = ((dreme_chip["tomtom_evalue"] < evalue_threshold)
                   & (dreme_chip["dreme_evalue"] < evalue_threshold))

    return dreme_chip[significant].sort_values(["dreme_evalue", "tomtom_evalue"], ascending=True)

# significant DREME motifs paired with their significant TOMTOM matches
dreme_chip = analyse_dreme(motifs, dreme)

## Effect of MEME-ChIP Settings on DREME & TOMTOM

In [None]:
# Plotting table: an independent frame with a fresh 0..n-1 index, keeping the
# first row for each (target_id, motif_name, run, motif) combination.
dreme_evalue = (
    dreme_chip
    .reset_index(drop=True)
    .drop_duplicates(["target_id", "motif_name", "run", "motif"], keep="first")
)

In [None]:
%%R -i dreme_evalue

# Scatter of TOMTOM vs DREME significance, one panel per settings x comparison.
# Point shape follows the DREME motif: the merged table carries it in the
# `motif` column (the TOMTOM `query_id` column is renamed before the merge, so
# `query_id` does not exist in dreme_evalue).
# NOTE: `a` is only built here; the grid.arrange() call in the later cell draws c, b, d and e.
a <- ggplot(dreme_evalue, aes(y=-log10(tomtom_evalue), x=-log10(dreme_evalue), shape=factor(motif), alpha=condition, colour=database)) +
        geom_jitter(size=3) + 
        scale_alpha_discrete(range=c(0.4, 1)) +
        facet_grid(settings ~ DESeq2_comparison) +
        theme(legend.position="bottom", legend.direction="horizontal")

# DREME significance by peak width, one box per number of peaks.
# NOTE(review): alpha is mapped to the continuous dreme_evalue column inside
# each box - confirm this is intended.
b <- ggplot(dreme_evalue, aes(y=-log10(dreme_evalue), x=window, fill=no_peaks)) + 
        geom_boxplot(aes(alpha=dreme_evalue), position="dodge")  +
        scale_fill_manual(values=Palette) + 
        labs(title="Dreme motif discovery", x="Peak width")

# TOMTOM significance by peak width, one box per number of peaks.
c <- ggplot(dreme_evalue, aes(y=-log10(tomtom_evalue), x=window, fill=no_peaks)) + 
        geom_boxplot(aes(alpha=dreme_evalue), position="dodge")  +
        scale_fill_manual(values=Palette) + 
        labs(title="Tomtom motif comparison", x="Peak width")

In [None]:
# number of motif matches per run (the count lands in the motif_name column),
# with run / window / no_peaks returned as ordinary columns
dreme_motif_no = (
    dreme_evalue
    .groupby(["run", "window", "no_peaks"])
    .agg({"motif_name": "count"})
    .reset_index()
)

In [None]:
%%R -i dreme_motif_no,dreme_chip -w 1200 -h 800

# Bar chart of the number of motif matches per peak width; the motif_name
# column of dreme_motif_no holds the count computed on the Python side.
d <- ggplot(dreme_motif_no, aes(y=motif_name, x=window, fill=no_peaks)) + 
        geom_bar(stat="identity", position="dodge")  +
        scale_fill_manual(values=Palette) + 
        labs(title="Tomtom motif number", x="Peak width", y="No. Motifs (e-value < 0.05)")

# TOMTOM significance split by the motif database the match came from.
e <- ggplot(dreme_chip, aes(y=-log10(tomtom_evalue), x=database, fill=database)) + 
        geom_boxplot() +
        scale_fill_manual(values=Palette) +
        labs(title="Tomtom motif e-value per database")

# 2 x 2 panel; c and b are the boxplots built in the previous R cell.
grid.arrange(c, b, d, e, ncol=2, nrow=2)

## DREME-ChIP Summary
* For simplicity results from all MEME-ChIP runs are aggregated
    * TOMTOM & DREME e-values are averaged across runs
    * The number of motif occurrences across runs and databases is counted
    * All DREME motifs have an e-value < 0.05
* This only provides an overview of the data; do not use it for downstream analysis

In [None]:
# Aggregate across MEME-ChIP runs: mean TOMTOM and DREME e-values plus row
# counts (stored in the run and database columns) for each
# target / motif / comparison / condition combination.
# NOTE: this rebinds the name `dreme_summary`, which earlier in the notebook
# is the function that parses dreme.txt; calling that function again requires
# re-running the cell that defines it.
dreme_summary = (
    dreme_chip
    .groupby(["target_id", "motif_name", "motif", "DESeq2_comparison", "condition"])
    .agg({"tomtom_evalue": "mean", "dreme_evalue": "mean", "run": "count", "database": "count"})
    .reset_index()
)

In [None]:
%%R -i dreme_summary -w 1000 -h 400

# One figure per DESeq2 comparison: aggregated TOMTOM vs DREME significance,
# faceted by condition.
for (comp in unique(dreme_summary$DESeq2_comparison)){

    df <- subset(dreme_summary, DESeq2_comparison == comp)

    # `run` and `database` are the per-group counts from the aggregation on the
    # Python side; they drive point transparency and point size respectively.
    p <- ggplot(df, aes(y=-log10(tomtom_evalue), x=-log10(dreme_evalue), alpha=run, colour=condition)) +
            geom_point(aes(size=database), position=position_jitterdodge()) + 
            scale_size(range=c(2,4), name="database matches") +
            scale_alpha(range=c(0.4,1), name="no. runs") +
            # no shape aesthetic is mapped in this plot
            scale_shape_manual(values=c(16, 17, 15), name="meme motif") +
            # label only the matches above the 90th percentile of -log10(tomtom_evalue) in this comparison
            geom_text_repel(
                data=df[-log10(df$tomtom_evalue) > 
                          quantile(-log10(df$tomtom_evalue), 0.9, na.rm=T), ], 
                aes(label=motif_name), colour="black", alpha=1, position=position_jitterdodge()) +
            facet_wrap(~ condition) +
            theme(legend.position="bottom", legend.direction="horizontal") +
            scale_colour_manual(values=Palette) +
            guides(color=guide_legend(override.aes=list(size=4)), alpha=guide_legend(override.aes=list(size=4))) +
            labs(title=comp)
    
    # draw explicitly: plots are not auto-printed inside a for loop
    grid.arrange(p, ncol=1, nrow=1)
}

## DREME & TOMTOM Results for Each Run & Sample
* Multiple database hits for the same motif discovered by DREME are merged (TOMTOM e-values are averaged)

In [None]:
# Per-run plotting table. When the same DREME motif matches a TF in several
# databases the rows are collapsed: the database column becomes the number of
# matches, the TOMTOM e-value is averaged and the smallest DREME e-value kept.
dreme_graph = (
    dreme_chip
    .groupby(["target_id", "motif_name", "motif", "orientation", "run",
              "DESeq2_comparison", "condition", "settings"])
    .agg({"database": "count", "dreme_evalue": "min", "tomtom_evalue": "mean"})
    .reset_index()
)

In [None]:
%%R -i dreme_graph -w 1200 -h 4000

# TOMTOM vs DREME significance for every run, one panel per settings x comparison.
# Point size is the number of database matches counted on the Python side.
ggplot(dreme_graph, aes(y=-log10(tomtom_evalue), x=-log10(dreme_evalue), colour=condition)) +
    geom_jitter(aes(size=database)) + 
    scale_size(range=c(2,4), name="database matches") +
    # Label a point when any of the following holds (& binds tighter than |):
    #   1. DREME above its 80th percentile AND TOMTOM above its 80th percentile
    #   2. DREME above its 95th percentile AND TOMTOM above its 90th percentile
    #   3. TOMTOM above its 95th percentile
    # Percentiles are taken over the whole table, not per panel.
    geom_text_repel(data=dreme_graph[-log10(dreme_graph$"dreme_evalue") > 
                                   quantile(-log10(dreme_graph$"dreme_evalue"), 0.8, na.rm=T) &
                                   -log10(dreme_graph$"tomtom_evalue") > 
                                   quantile(-log10(dreme_graph$"tomtom_evalue"), 0.8, na.rm=T) |
                                   -log10(dreme_graph$"dreme_evalue") > quantile(-log10(dreme_graph$"dreme_evalue"), 0.95, na.rm=T)  &
                                   -log10(dreme_graph$"tomtom_evalue") > 
                                   quantile(-log10(dreme_graph$"tomtom_evalue"), 0.9, na.rm=T) |
                                   -log10(dreme_graph$"tomtom_evalue") > quantile(-log10(dreme_graph$"tomtom_evalue"), 0.95, na.rm=T), ], 
                    aes(y=-log10(tomtom_evalue), x=-log10(dreme_evalue), label=motif_name), colour="black") +
    facet_grid(settings ~ DESeq2_comparison) +
    theme(legend.position="bottom", legend.direction="horizontal") +
    scale_colour_manual(values=Palette) +
    guides(color=guide_legend(override.aes=list(size=4))) +
    # no shape aesthetic is mapped in this plot
    guides(shape=guide_legend(override.aes=list(size=4)))

### Top Database Matches per DREME Motif 
* All discovered motifs have e-value < 0.05
* Multiple database matches for DREME motifs are grouped by TF name & TOMTOM e-values averaged

In [None]:
# facet labels derived from the "<no_peaks>_<window>" settings string:
# the second part labels the width, the first part the number of peaks
dreme_graph["width"] = "width_" + dreme_graph["settings"].map(lambda s: s.split("_")[1])
dreme_graph["peaks"] = "npeaks_" + dreme_graph["settings"].map(lambda s: s.split("_")[0])

In [None]:
%%R -i dreme_graph -w 1200 -h 1200

# One figure per DESeq2 comparison, with a width x peaks grid of panels.
for (comp in unique(dreme_graph$DESeq2_comparison)){
    df <- subset(dreme_graph, DESeq2_comparison == comp)

    p <- ggplot(df, aes(y=-log10(tomtom_evalue), x=-log10(dreme_evalue), colour=condition)) +
        geom_point(aes(size=database), position=position_jitterdodge()) + 
        scale_size(range=c(2,4), name="database matches") +
        # no alpha aesthetic is mapped in this plot (the labels below use a constant alpha)
        scale_alpha(range=c(0.4,1)) +
        # label only the matches above the 80th percentile of -log10(tomtom_evalue) in this comparison
        geom_text_repel(data=df[-log10(df$"tomtom_evalue") > 
                                       quantile(-log10(df$"tomtom_evalue"), 0.8, na.rm=T), ], 
                        aes(y=-log10(tomtom_evalue), x=-log10(dreme_evalue), label=motif_name), colour="black", alpha=1) +
        facet_grid(width ~ peaks) +
        theme(legend.position="bottom", legend.direction="horizontal") +
        scale_colour_manual(values=Palette) +
        guides(color=guide_legend(override.aes=list(size=6))) +
        guides(alpha=guide_legend(override.aes=list(size=6)))  +
        labs(title=comp)
    
    # draw explicitly: plots are not auto-printed inside a for loop
    grid.arrange(p, ncol=1, nrow=1)
}

## Top Motif Tables:
* Sorted by peak size & TOMTOM e-value

### Peaks with increased accessibility:

In [None]:
# Top 10 matches per comparison for peaks with increased accessibility.
# The table is shown with display() so the cell does not depend on the
# notebook-wide ast_node_interactivity setting, and print() is called as a
# function so the cell parses under both Python 2 and Python 3.
# NOTE: `window` holds strings, so the sort on it is lexicographic.
for comparison in dreme_chip["DESeq2_comparison"].unique():
    print(comparison)
    top_hits = (
        dreme_chip[(dreme_chip.DESeq2_comparison == comparison) & (dreme_chip.condition == "increase")]
        .sort_values(["window", "tomtom_evalue"], ascending=True)
        .drop_duplicates(["target_id", "motif_name", "orientation"], keep="first")
        .head(10)
    )
    display(top_hits)

### Peaks with decreased accessibility:

In [None]:
# Top 10 matches per comparison for peaks with decreased accessibility.
# The table is shown with display() so the cell does not depend on the
# notebook-wide ast_node_interactivity setting, and print() is called as a
# function so the cell parses under both Python 2 and Python 3.
# NOTE: `window` holds strings, so the sort on it is lexicographic.
for comparison in dreme_chip["DESeq2_comparison"].unique():
    print(comparison)
    top_hits = (
        dreme_chip[(dreme_chip.DESeq2_comparison == comparison) & (dreme_chip.condition == "decrease")]
        .sort_values(["window", "tomtom_evalue"], ascending=True)
        .drop_duplicates(["target_id", "motif_name", "orientation"], keep="first")
        .head(10)
    )
    display(top_hits)