In [None]:
########################
#When: 2018-07-07
#What: Get the counts to plot Venn diagram
#Who: Shalu Jhanwar
########################

########################
#Load libraries
########################
import glob, os
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib_venn import venn3, venn3_circles, venn3_unweighted
from scipy.stats import zscore
import seaborn as sns
sns.set()
%matplotlib inline
matplotlib.style.use('ggplot')
%matplotlib notebook

########################
#Define functions
########################
def getListgeneName(fileName):
    """Load a tab-separated DEG table and extract its gene identifiers.

    Parameters
    ----------
    fileName : str
        Path to a tab-delimited file with a header row that contains at
        least the columns 'geneId' and 'gene_name'.

    Returns
    -------
    tuple
        ``(ids, idNameTable)`` where ``ids`` is the 'geneId' column as a
        plain list (file order, duplicates kept) and ``idNameTable`` is a
        DataFrame holding only the 'geneId' and 'gene_name' columns.
    """
    fullTable = pd.read_csv(fileName, sep='\t')
    # Take the id list from the full table before narrowing the columns.
    ids = fullTable['geneId'].tolist()
    idNameTable = fullTable.loc[:, ['geneId', 'gene_name']]
    return ids, idNameTable

def getVennCounts3List(A, B, C):
    """Split three collections into the seven disjoint regions of a 3-set Venn diagram.

    Parameters
    ----------
    A, B, C : iterable of hashable
        Input collections; duplicates are ignored (each is converted to a set).

    Returns
    -------
    tuple of set
        ``(only_A, only_B, only_C, only_AandB, only_AandC, only_BandC, AandBandC)``
        where the ``only_XandY`` sets exclude elements shared by all three.

    Side effects
    ------------
    Prints a per-set breakdown of the region sizes, or a warning naming the
    first set whose region sizes do not add up to its total size.
    """
    setA, setB, setC = set(A), set(B), set(C)

    AandBandC = setA & setB & setC
    # Parentheses make the intent explicit: '-' binds tighter than '&' in Python.
    only_AandB = (setA & setB) - AandBandC
    only_BandC = (setB & setC) - AandBandC
    only_AandC = (setA & setC) - AandBandC
    only_A = setA - (setB | setC)
    only_B = setB - (setA | setC)
    only_C = setC - (setA | setB)

    #Sanity check: the four regions touching each set must add up to that set's size
    # Single-argument print(...) works on both Python 2 and Python 3.
    if len(setA) != (len(AandBandC) + len(only_AandB) + len(only_AandC) + len(only_A)):
        print("listA counts are not fine")
    elif len(setB) != (len(AandBandC) + len(only_AandB) + len(only_BandC) + len(only_B)):
        print("listB counts are not fine")
    elif len(setC) != (len(AandBandC) + len(only_AandC) + len(only_BandC) + len(only_C)):
        print("listC counts are not fine")
    else:
        print("SET A -- onlyA: %d, AandB: %d, AandC: %d,  A&B&C: %d" % (len(only_A), len(only_AandB), len(only_AandC), len(AandBandC)))
        print("SET B -- onlyB: %d, AandB: %d, BandC: %d,  A&B&C: %d" % (len(only_B), len(only_AandB), len(only_BandC), len(AandBandC)))
        print("SET C -- onlyC: %d, AandC: %d, BandC: %d,  A&B&C: %d" % (len(only_C), len(only_AandC), len(only_BandC), len(AandBandC)))

    return only_A, only_B, only_C, only_AandB, only_AandC, only_BandC, AandBandC

def getConcat(df1, df2):
    """Stack two geneId/gene_name tables and drop repeated gene rows.

    Rows are appended in argument order; when the same ('geneId',
    'gene_name') pair occurs more than once, only its first occurrence is
    kept. The returned frame has a fresh 0..n-1 index.
    """
    stacked = pd.concat([df1, df2], axis=0)
    deduplicated = stacked.drop_duplicates(subset=['geneId', 'gene_name'], keep='first')
    return deduplicated.reset_index(drop=True)

def getConcat3df(df1, df2, df3):
    """Three-table variant of the concatenate-and-deduplicate step.

    The three frames are stacked in argument order, rows repeating an
    earlier ('geneId', 'gene_name') pair are discarded, and the result is
    returned with a fresh 0..n-1 index.
    """
    return (
        pd.concat([df1, df2, df3], axis=0)
        .drop_duplicates(subset=['geneId', 'gene_name'], keep='first')
        .reset_index(drop=True)
    )

def getGeneNames_writeFiles(setObj, geneNameId, fileName):
    """Write the gene names belonging to a set of gene ids to ``fileName + '.txt'``.

    Parameters
    ----------
    setObj : set (or other collection) of gene ids
        Ids whose names should be exported. May be empty, in which case an
        empty file is written.
    geneNameId : pandas.DataFrame
        Lookup table with the columns 'geneId' and 'gene_name'.
    fileName : str
        Output path without extension; '.txt' is appended.

    Notes
    -----
    Rows are selected by filtering the lookup table, so names are written in
    the row order of ``geneNameId``. Building a frame from the set itself
    would make the order depend on arbitrary set iteration order and differ
    between runs. Rows with a missing gene name are skipped; the file has no
    header and no index column.
    """
    isWanted = geneNameId['geneId'].isin(setObj)
    geneNames = geneNameId.loc[isWanted, 'gene_name'].dropna()
    geneNames.to_csv(fileName + '.txt', index=False, header=False)
    
def plotVennDiag(only_A, only_B, only_C, only_AandB, only_AandC, only_BandC, AandBandC,
                 set_labels=('A', 'B', 'C')):
    """Draw an area-weighted 3-set Venn diagram with dashed circle outlines.

    Parameters
    ----------
    only_A, only_B, only_C : sized collection
        Elements found in exactly one of the three sets.
    only_AandB, only_AandC, only_BandC : sized collection
        Elements shared by exactly two of the sets.
    AandBandC : sized collection
        Elements shared by all three sets.
    set_labels : tuple of str, optional
        Labels for the three circles; defaults to ('A', 'B', 'C') so existing
        callers keep their output.

    Only ``len()`` of each region is used. A new 8x8 inch figure is created
    and shown.
    """
    # Region sizes in the order used for venn3's 7-element `subsets` tuple:
    # A only, B only, A&B, C only, A&C, B&C, A&B&C.
    regionSizes = (len(only_A), len(only_B), len(only_AandB),
                   len(only_C), len(only_AandC), len(only_BandC), len(AandBandC))
    plt.figure(figsize=(8,8))
    venn3(subsets=regionSizes, set_labels=set_labels)
    venn3_circles(subsets=regionSizes, linestyle='dashed')
    plt.show()
    
def plotVennDiag_unweighted(only_A, only_B, only_C, only_AandB, only_AandC, only_BandC, AandBandC,
                            set_labels=('A', 'B', 'C')):
    """Draw a 3-set Venn diagram using ``venn3_unweighted``.

    Parameters
    ----------
    only_A, only_B, only_C : sized collection
        Elements found in exactly one of the three sets.
    only_AandB, only_AandC, only_BandC : sized collection
        Elements shared by exactly two of the sets.
    AandBandC : sized collection
        Elements shared by all three sets.
    set_labels : tuple of str, optional
        Labels for the three circles; defaults to ('A', 'B', 'C') so existing
        callers keep their output.

    Only ``len()`` of each region is used. A new 8x8 inch figure is created
    and shown.
    """
    # Region sizes in the order used for the 7-element `subsets` tuple:
    # A only, B only, A&B, C only, A&C, B&C, A&B&C.
    regionSizes = (len(only_A), len(only_B), len(only_AandB),
                   len(only_C), len(only_AandC), len(only_BandC), len(AandBandC))
    plt.figure(figsize=(8,8))
    venn3_unweighted(subsets=regionSizes, set_labels=set_labels)
    plt.show()

In [None]:
########################
#Get counts of combination
########################
# Input tables are read from, and gene-name lists written to, the same folder.
inDir = 'DEG_withoutscRNA_1.5FC_3noOfsamples/'
outDir = 'DEG_withoutscRNA_1.5FC_3noOfsamples/'

# ---------------- Mouse ----------------
# Gene id list + geneId/gene_name lookup table for each pairwise comparison.
E10vsE11, E10vsE11_geneName_id = getListgeneName(inDir + 'GLM_e10.5vse11.5_fdr05_withCoordGeneName.bed')
E9vsE10, E9vsE10_geneName_id = getListgeneName(inDir + 'GLM_e9.5vse10.5_fdr05_withCoordGeneName.bed')
E9vsE11, E9vsE11_geneName_id = getListgeneName(inDir + 'GLM_e9.5vse11.5_fdr05_withCoordGeneName.bed')

# Seven Venn regions, with A = E10vsE11, B = E9vsE10, C = E9vsE11.
(only_E10vsE11,
 only_E9vsE10,
 only_E9vsE11,
 only_E10vsE11andE9vsE10,
 only_E10vsE11andE9vsE11,
 only_E9vsE10andE9vsE11,
 E10vsE11andE9vsE10andE9vsE11) = getVennCounts3List(E10vsE11, E9vsE10, E9vsE11)

# Lookup tables for the shared regions combine the tables of the comparisons involved.
only_E10vsE11andE9vsE10_geneName_id = getConcat(E10vsE11_geneName_id, E9vsE10_geneName_id)
only_E10vsE11andE9vsE11_geneName_id = getConcat(E10vsE11_geneName_id, E9vsE11_geneName_id)
only_E9vsE10andE9vsE11_geneName_id = getConcat(E9vsE10_geneName_id, E9vsE11_geneName_id)
E10vsE11andE9vsE10andE9vsE11_geneName_id = getConcat3df(E10vsE11_geneName_id, E9vsE10_geneName_id, E9vsE11_geneName_id)

# One output file per region: (gene ids, lookup table, output file stem).
for regionGenes, nameTable, outStem in [
    (only_E10vsE11, E10vsE11_geneName_id, "only_E10vsE11"),
    (only_E9vsE10, E9vsE10_geneName_id, "only_E9vsE10"),
    (only_E9vsE11, E9vsE11_geneName_id, "only_E9vsE11"),
    (only_E10vsE11andE9vsE10, only_E10vsE11andE9vsE10_geneName_id, "only_E10vsE11andE9vsE10"),
    (only_E10vsE11andE9vsE11, only_E10vsE11andE9vsE11_geneName_id, "only_E10vsE11andE9vsE11"),
    (only_E9vsE10andE9vsE11, only_E9vsE10andE9vsE11_geneName_id, "only_E9vsE10andE9vsE11"),
    (E10vsE11andE9vsE10andE9vsE11, E10vsE11andE9vsE10andE9vsE11_geneName_id, "E10vsE11andE9vsE10andE9vsE11"),
]:
    getGeneNames_writeFiles(regionGenes, nameTable, outDir + outStem)

# ---------------- Chicken ----------------
# Gene id list + geneId/gene_name lookup table for each pairwise comparison.
H1920vsH22, H1920vsH22_geneName_id = getListgeneName(inDir + 'GLM_H1920vsH22_fdr05_withCoordGeneName.bed')
H22vsH24, H22vsH24_geneName_id = getListgeneName(inDir + 'GLM_H22vsH24_fdr05_withCoordGeneName.bed')
H1920vsH24, H1920vsH24_geneName_id = getListgeneName(inDir + 'GLM_H1920vsH24_fdr05_withCoordGeneName.bed')

# Seven Venn regions, with A = H1920vsH22, B = H22vsH24, C = H1920vsH24.
(only_H1920vsH22,
 only_H22vsH24,
 only_H1920vsH24,
 only_H1920vsH22andH22vsH24,
 only_H1920vsH22andH1920vsH24,
 only_H22vsH24andH1920vsH24,
 H1920vsH22andH22vsH24andH1920vsH24) = getVennCounts3List(H1920vsH22, H22vsH24, H1920vsH24)

# Lookup tables for the shared regions combine the tables of the comparisons involved.
only_H1920vsH22andH22vsH24_geneName_id = getConcat(H1920vsH22_geneName_id, H22vsH24_geneName_id)
only_H1920vsH22andH1920vsH24_geneName_id = getConcat(H1920vsH22_geneName_id, H1920vsH24_geneName_id)
only_H22vsH24andH1920vsH24_geneName_id = getConcat(H22vsH24_geneName_id, H1920vsH24_geneName_id)
H1920vsH22andH22vsH24andH1920vsH24_geneName_id = getConcat3df(H1920vsH22_geneName_id, H22vsH24_geneName_id, H1920vsH24_geneName_id)

# One output file per region: (gene ids, lookup table, output file stem).
for regionGenes, nameTable, outStem in [
    (only_H1920vsH22, H1920vsH22_geneName_id, "only_H1920vsH22"),
    (only_H22vsH24, H22vsH24_geneName_id, "only_H22vsH24"),
    (only_H1920vsH24, H1920vsH24_geneName_id, "only_H1920vsH24"),
    (only_H1920vsH22andH22vsH24, only_H1920vsH22andH22vsH24_geneName_id, "only_H1920vsH22andH22vsH24"),
    (only_H1920vsH22andH1920vsH24, only_H1920vsH22andH1920vsH24_geneName_id, "only_H1920vsH22andH1920vsH24"),
    (only_H22vsH24andH1920vsH24, only_H22vsH24andH1920vsH24_geneName_id, "only_H22vsH24andH1920vsH24"),
    (H1920vsH22andH22vsH24andH1920vsH24, H1920vsH22andH22vsH24andH1920vsH24_geneName_id, "H1920vsH22andH22vsH24andH1920vsH24"),
]:
    getGeneNames_writeFiles(regionGenes, nameTable, outDir + outStem)

In [None]:
########################
#plot VennDiagram - weighted and unweighted
########################
# Region sets for each species, in the argument order the plotting helpers expect.
mouseRegions = (only_E10vsE11, only_E9vsE10, only_E9vsE11,
                only_E10vsE11andE9vsE10, only_E10vsE11andE9vsE11, only_E9vsE10andE9vsE11,
                E10vsE11andE9vsE10andE9vsE11)
chickenRegions = (only_H1920vsH22, only_H22vsH24, only_H1920vsH24,
                  only_H1920vsH22andH22vsH24, only_H1920vsH22andH1920vsH24, only_H22vsH24andH1920vsH24,
                  H1920vsH22andH22vsH24andH1920vsH24)

# Weighted diagrams first (mouse, then chicken), followed by the unweighted ones.
for drawVenn in (plotVennDiag, plotVennDiag_unweighted):
    for regions in (mouseRegions, chickenRegions):
        drawVenn(*regions)
