In [1]:
import numpy as np
from tqdm import tqdm
import glob
import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu, ttest_ind, zscore, pearsonr, wilcoxon
import pybedtools as pbt
from scipy.integrate import simps
from IPython.display import clear_output
import json
from QuadFinder import QuadMotifFinder

plt.style.use('fivethirtyeight')
def clean_axis(ax):
    ax.xaxis.set_tick_params(labelsize=9)
    ax.yaxis.set_tick_params(labelsize=9)
    for i in ['top', 'bottom', 'left', 'right']:
        ax.spines[i].set_visible(False)
    ax.grid(which='major', linestyle='--', alpha=0.5)
    return True

### Load names of oncogenes downloaded from COSMIC (cancer Genes census list) and obtain there corresponding ENSEMBL transcript ids

In [2]:
oncogenes = []
with open('../data/oncogenes/Census_allFri Apr 14 21-33-23 2017.tsv') as h:
    next(h)
    for l in h:
        c = l.split('\t', 1)
        oncogenes.append(c[0])

tid_to_gene = json.load(open('../data/annotation/gencode_tid_to_gene_name.json'))
oncogenes_tid = {}
for k,v in tid_to_gene.items():
    if v in oncogenes:
        oncogenes_tid[k] = v
len(oncogenes), len(oncogenes_tid)

(616, 6627)

### Load the TSS inf|ormation for all above transcripts and calculate +/- 1 kb tss region

In [3]:
chroms = ['chr'+str(x) for x in range(1,23)] + ['chrX', 'chrY']
chrom_wise_oncogene_info = {}

min_starts = {}
for chrom in chroms:
    chrom_wise_oncogene_info[chrom] = {'positive': [], 'negative': []}
    with open('../data/annotation/chrom_wise_gencode_tss/%s.bed' % chrom) as h:
        for l in h:
            c = l.rstrip('\n').split('\t')
            if c[3] in oncogenes_tid:
                gene_name = oncogenes_tid[c[3]]
                if c[5] == '+':
                    start = int(c[1])
                else:
                    start= int(c[2])
                up = start - 1000
                down = start + 1000 + 1
                strand = 'positive' if c[5] == '+' else 'negative'
                chrom_wise_oncogene_info[chrom][strand].append(
                    (up, down, gene_name, c[3]))
                if strand == 'positive':
                    if gene_name not in min_starts:
                        min_starts[gene_name] = (up, c[3])
                    else:
                        min_starts[gene_name] = (min(min_starts[gene_name][0], up), c[3])
                else:
                    if gene_name not in min_starts:
                        min_starts[gene_name] = (down, c[3])
                    else:
                        min_starts[gene_name] = (max(min_starts[gene_name][0], down), c[3])
min_transcripts = [x[1] for x in min_starts.values()]
len(min_starts)

602

### Load the SNIPR-Q scores in the above regions

In [4]:
def load_scores(chrom, strand):
    in_dir = '/home/parashar/scratch/quadcomb/data/snipr/'
    fn = '%s/%s_%s_scores.npy' % (in_dir, chrom, strand)
    return np.load(fn, mmap_mode='r')

def get_g4s(seq, strand):
    g4s = []
    for l,b in zip((7, 25, 25), (0, 0, 5)):
        q = QuadMotifFinder([seq], stem=3, loop_start=1, loop_stop=l, greedy=True,
                            bulge=b, is_seq=True, strands=[strand], verbose=False)
        q.run()
        g4s.append([(x.split('\t')[1], x.split('\t')[2]) for x in q.resNov])
    return g4s
    
gene_sense_exp = []
gene_antisense_exp = []
gene_sense_g4 = []
gene_antisense_g4 = []
tid_list = []

for chrom in tqdm(chroms[::-1]):
    fasta_dir = '/home/parashar/scratch/hg19_resource/chromosomes'
    chrom_seq = ''.join([x.rstrip('\n').upper() for x in open("%s/%s.fa" % (
                                                fasta_dir, chrom)).readlines()[1:]])
    pos_exp = load_scores(chrom, 'positive')
    neg_exp = load_scores(chrom, 'negative')
    for i in chrom_wise_oncogene_info[chrom]['positive']:
        seq = chrom_seq[i[0]:i[1]]
        gene_sense_exp.append(pos_exp[i[0]:i[1]])
        gene_antisense_exp.append(neg_exp[i[0]:i[1]])
        tid_list.append(i[3])
        gene_sense_g4.append(get_g4s(seq, '+'))
        gene_antisense_g4.append(get_g4s(seq, '-'))
    for i in chrom_wise_oncogene_info[chrom]['negative']:
        seq = chrom_seq[i[0]:i[1]][::-1]
        gene_sense_exp.append(neg_exp[i[0]:i[1]][::-1])
        gene_antisense_exp.append(pos_exp[i[0]:i[1]][::-1])
        tid_list.append(i[3])
        gene_sense_g4.append(get_g4s(seq, '-'))
        gene_antisense_g4.append(get_g4s(seq, '+'))
gene_sense_exp = np.array(gene_sense_exp)
gene_antisense_exp = np.array(gene_antisense_exp)
gene_sense_exp.shape, gene_antisense_exp.shape, len(tid_list)

100%|██████████| 24/24 [02:31<00:00,  9.71s/it]


((6627, 2001), (6627, 2001), 6627)

In [40]:
for n,transcript in enumerate(tid_list):
    if transcript not in min_transcripts:
        continue
    print ("\r%d" % n, end='', flush=True)
    gene = oncogenes_tid[transcript]
    fig, axis = plt.subplots(1, 2, figsize=(8, 2.5))
    for col in [0, 1]:
        ax = axis[col]
        if col == 0:
            ax.plot(list(range(2001)), gene_sense_exp[n], lw=0.4, alpha=0.6, color='k')
        else:
            ax.plot(list(range(2001)), gene_antisense_exp[n], lw=0.4, alpha=0.6, color='k')
        ax.axvline(1000, ls='--', color='k', alpha=0.7, lw=0.5)
        print_strand = '%s\nSense' % gene if col == 0 else '%s\nAntisense' % gene
        ax.text(0.76, 0.6, print_strand, transform=ax.transAxes, fontsize=9)
        if col == 0:
            spans = gene_sense_g4[n]
        else:
            spans = gene_antisense_g4[n]
        for p,i in zip((-100, -200, -300), range(3)):
            for g4_span in spans[i]:
                r = list(range(int(g4_span[0]), int(g4_span[1])))
                ax.plot(r, [p for x in range(r[-1] - r[0] + 1)], lw=6, color='crimson')
        for j in ['top', 'bottom', 'left', 'right']:
            ax.spines[j].set_visible(False)
        ax.grid(which='major', linestyle='--', alpha=0.5)
        ax.set_yticks([0, 250, 500, 750, 1000])
        if col == 1:
            ax.set_yticklabels([])
        else:
            ax.set_yticklabels([0, 250, 500, 750, 1000])
            ax.set_ylabel('SNIPR-Q score', fontsize=10)
        ax.set_xticks([0, 500, 1000, 1500, 2000])
        ax.set_xticklabels([-1000, -500, 0, 500, 1000], rotation=40, fontsize=9)
        ax.set_xlabel('Distance from TSS', fontsize=10)
        ax.xaxis.set_tick_params(labelsize=9)
        ax.yaxis.set_tick_params(labelsize=9) 
        ax.set_xlim(-10, 2010)
        ax.set_ylim((-400, 1010))
    plt.tight_layout()
    plt.savefig('../images/blow_out_oncogenes/%s_%s.svg' % (gene, transcript),
                 transparent=True, edgecolor='none')
    plt.savefig('../images/blow_out_oncogenes/%s_%s.png' % (gene, transcript),
                transparent=True, edgecolor='none', dpi=150)
    plt.close()

6626

In [38]:
snipr_cutoffs = {
    'sense': { 185: [], 500: [], 750: [], 900: []},
    'antisense': { 185: [], 500: [], 750: [], 900: []},
}
g4s_cutoffs = {
    'sense': { 70: [], 250: [], 255: []},
    'antisense': { 70: [], 250: [], 255: [], }
}
for n,transcript in enumerate(tid_list):
    if transcript not in min_transcripts:
        continue
    gene = oncogenes_tid[transcript]
    print (gene)
    max_sense = max(gene_sense_exp[n])
    for i in snipr_cutoffs['sense']:
        if max_sense > i:
            snipr_cutoffs['sense'][i].append(gene)
    max_antisense = max(gene_antisense_exp[n])
    for i in snipr_cutoffs['antisense']:
        if max_antisense > i:
            snipr_cutoffs['antisense'][i].append(gene)
        
    if len(gene_sense_g4[n][0]) > 0:
        g4s_cutoffs['sense'][70].append(gene)
    if len(gene_sense_g4[n][1]) > 0:
        g4s_cutoffs['sense'][250].append(gene)
    if len(gene_sense_g4[n][2]) > 0:
        g4s_cutoffs['sense'][255].append(gene)
        
    if len(gene_antisense_g4[n][0]) > 0:
        g4s_cutoffs['antisense'][70].append(gene)
    if len(gene_antisense_g4[n][1]) > 0:
        g4s_cutoffs['antisense'][250].append(gene)
    if len(gene_antisense_g4[n][2]) > 0:
        g4s_cutoffs['antisense'][255].append(gene)

CRLF2
P2RY8
STAG2
BCORL1
ATP2B3
RPL10
PHF6
RBM10
KDM6A
DDX3X
ZRSR2
AR
MSN
NONO
FOXO4
MED12
SSX1
SSX4
GATA1
WAS
ELF4
SEPT6
BTK
ATRX
MTCP1
GPC3
BCOR
AMER1
TFE3
KDM5C
SSX2
APOBEC3B
EP300
LZTR1
SEPT5
EWSR1
NF2
BCR
SMARCB1
PDGFB
MYH9
MKL1
MAPK1
CLTCL1
MN1
CHEK2
OLIG2
TMPRSS2
ERG
U2AF1
RUNX1
TOP1
SRC
PLCG1
SS18L1
GNAS
ASXL1
SDC4
MAFB
PTPRT
PTK6
NFATC2
SALL4
CBLC
BCL3
CIC
CD79A
ZNF331
CNOT3
PPP2R1A
KLK2
DNM2
SMARCA4
FSTL3
STK11
GNA11
CRTC1
CCNE1
LSM14A
CALR
TPM4
ERCC2
AKT2
TFPT
MLLT1
KEAP1
TCF3
SH3GL1
MAP2K2
CEP89
CEBPA
DNAJB1
PRKACA
BRD4
LYL1
JAK3
ELL
SMAD4
SETBP1
MALT1
SMAD2
BCL2
KDSR
ZNF521
SS18
HLF
MSI2
PPM1D
CLTC
RNF213
SEPT9
ASPSCR1
PRKAR1A
MAP2K4
USP6
RABEP1
LASP1
MLLT6
TAF15
ERBB2
CDK12
RARA
SPECC1
NF1
SUZ12
COL1A1
SPOP
BRIP1
RNF43
ETV4
BRCA1
CANT1
SRSF2
CD79B
DDX5
AXIN2
H3F3B
GAS7
TP53
PER1
NCOR1
FLCN
YWHAE
STAT5B
STAT3
SMARCE1
CTCF
CBFB
HERPUD1
CYLD
CDH1
CIITA
RMI2
TRAF7
TSC2
IL21R
FUS
TNFRSF17
ERCC4
CDH11
FANCA
CBFA2T3
MAF
ZFHX3
CREBBP
GRIN2A
SOCS1
AXIN1
MYH11
PALB2
PML
MAP2K1
SMAD

In [39]:
for i in ['sense', 'antisense']:
    for j in snipr_cutoffs[i]:
        print (i, j, len(snipr_cutoffs[i][j]))
        print (
            len(set(snipr_cutoffs[i][j]).intersection(g4s_cutoffs[i][70])),
            len(set(snipr_cutoffs[i][j]).intersection(g4s_cutoffs[i][250])),
            len(set(snipr_cutoffs[i][j]).intersection(g4s_cutoffs[i][255])),
        )
        if j in [900]:
            print (set(snipr_cutoffs[i][j]).difference(g4s_cutoffs[i][70]))
        if j in [750]:
            print (set(snipr_cutoffs[i][j]).difference(g4s_cutoffs[i][250]))

sense 185 548
275 448 548
sense 900 82
72 82 82
{'ELF4', 'ELL', 'SET', 'ERBB4', 'CANT1', 'H3F3B', 'EIF4A2', 'WT1', 'DDR2', 'NUMA1'}
sense 500 429
264 403 429
sense 750 257
196 251 257
{'OMD', 'ESR1', 'ACVR1', 'NPM1', 'PHOX2B', 'ATIC'}
antisense 185 527
248 434 527
antisense 900 60
55 60 60
{'ETV1', 'BRD3', 'NCKIPSD', 'RECQL4', 'MLH1'}
antisense 500 401
241 380 401
antisense 750 229
173 222 229
{'CREB3L2', 'BRCA1', 'EZR', 'NPM1', 'TFE3', 'CLP1', 'TP53'}


G4s in the promoters of cancer genes are considered to be viable switches to regulate their expression using ligands which bind to G4 structures. However, using current computational methodologies it remains unclear which promoters can be best targeted and hence, researchers have sought to individually characterize the G4s in the promoters of cancer genes. SNIPR-Q can be specially useful as it can not only indicate the G4 scores, which proxies for their probability of formation, but also help pinpoint the exact bases which would be involved in the structure formation. Chamber et al, showed that BRCA1, BRCA2 and MAP3K8 promoters contains a G-quadruplex structure which were not previously known harbor any G4s (by virtue of presence of G3L1-7 motif), thus opening avenues to therapeutically target these gene using G4 binding ligands.

We tested 602 cancer associated gene promoters and identified those with high SNIPR-Q scores and different G4 motifs (using Quadparser  and QuadBase2) in a strand specific manner. We found that, for the sense strands, 82 promoters had bases with SNIPR-Q scores over 900 and 10 among them (ex. NUMA1, ELF4 and ERBB4) did not harbor PG4s of G3L1-7 configuration. For the antisense stand, we found that 60 promoters had bases with SNIPR-Q scores over 900 and 5 (viz. ETV1, BRD3, NCKIPSD, RECQL4 and MLH1) among them had no G3L1-7 PG4. Further, we found that there were 6 and 7 promoters in the sense and antisense strand respectively that contained scores over 750 but were had no detectable unbulged PG4s with even with considering loop lengths upto 25. These included BRCA1, TP53, ESR1, NPM1 and CLP1.

### Load G4 motifs in the above regions

In [146]:
def g4_intersect(chrom, bed, loop_len=15):
    g4_fn = "g3_%d_0_%s_nov.bed" % (loop_len, chrom)
    g4 = pbt.BedTool("/home/parashar/scratch/quadruplexes/hg19/%s" % g4_fn)
    intersect = bed.intersect(g4, wo=True)
    tids = {}
    for l in intersect:
        c = str(l).rstrip('\n').split('\t')
        start = max(0, int(c[7]) - int(c[1]))
        end = min(2000, int(c[8]) - int(c[1]))
        a = np.zeros(2001)
        a[start:end] += 1
        t_strand = c[5]
        if t_strand == '-':
            a = a[::-1]
        tname = c[4]
        if tname not in tids:
            tids[tname] = {'sense': np.zeros(2001), 'antisense': np.zeros(2001)}
        
        g4_strand = '+' if c[10][0].upper() == 'G' else '-'
        if g4_strand == t_strand:
            tids[tname]['sense'] += a
        else:
            tids[tname]['antisense'] += a
    return tids

tid_arrays = {}
for chrom in tqdm(chroms):
    t_bed = []
    for strand,s in zip(['positive', 'negative'], ['+', '-']):
        for i in chrom_wise_oncogene_info[chrom][strand]:
            bedline = "\t".join(map(str,
                                [chrom, i[0], i[1], i[2], i[3], s]))
            t_bed.append(bedline)
    t_bed = pbt.BedTool('\n'.join(t_bed), from_string=True)    
    temp_dict = g4_intersect(chrom, t_bed)
    for tid in temp_dict:
        tid_arrays[tid] = temp_dict[tid]

gene_sense_g4 = []
gene_antisense_g4 = []
for tid in tid_list:
    if tid in tid_arrays:
        gene_sense_g4.append(tid_arrays[tid]['sense'])
        gene_antisense_g4.append(tid_arrays[tid]['antisense'])
    else:
        gene_sense_g4.append(np.zeros(2001))
        gene_antisense_g4.append(np.zeros(2001))
gene_sense_g4 = np.array(gene_sense_g4)
gene_antisense_g4 = np.array(gene_antisense_g4)
gene_sense_g4.shape, gene_antisense_g4.shape

100%|██████████| 24/24 [00:06<00:00,  3.94it/s]


((5804, 2001), (5804, 2001))

### Load OQs in the above regions

In [196]:
def oq_intersect(chrom, transcript_bed):
    oq_fn = '../data/ROC_data/regions/chrom_wise/Na_K_2_oq_regions_%s.bed' % chrom
    oq_bed = pbt.BedTool(oq_fn)
    intersect = transcript_bed.intersect(oq_bed, wo=True)
    tids = {}
    for l in intersect:
        c = str(l).rstrip('\n').split('\t')
        if int(c[10]) > 1:
            start = max(0, int(c[7]) - int(c[1]))
            end = min(2000, int(c[8]) - int(c[1]))
            a = np.zeros(2001)
            a[start:end] += 1
            t_strand = c[5]
            if t_strand == '-':
                a = a[::-1]
            tname = c[4]
            if tname not in tids:
                tids[tname] = {'sense': np.zeros(2001), 'antisense': np.zeros(2001)}
            oq_strand = '+' if c[9] == '-' else '-'
            if oq_strand == t_strand:
                tids[tname]['sense'] += a
            else:
                tids[tname]['antisense'] += a
    return tids

tid_arrays = {}
for chrom in tqdm(chroms):
    t_bed = []
    for strand,s in zip(['positive', 'negative'], ['+', '-']):
        for i in chrom_wise_oncogene_info[chrom][strand]:
            bedline = "\t".join(map(str,
                                [chrom, i[0], i[1], i[2], i[3], s]))
            t_bed.append(bedline)
    t_bed = pbt.BedTool('\n'.join(t_bed), from_string=True)    
    temp_dict = oq_intersect(chrom, t_bed)
    for tid in temp_dict:
        tid_arrays[tid] = temp_dict[tid]

gene_sense_oq = []
gene_antisense_oq = []
for tid in tid_list:
    if tid in tid_arrays:
        gene_sense_oq.append(tid_arrays[tid]['sense'])
        gene_antisense_oq.append(tid_arrays[tid]['antisense'])
    else:
        gene_sense_oq.append(np.zeros(2001))
        gene_antisense_oq.append(np.zeros(2001))
gene_sense_oq = np.array(gene_sense_oq)
gene_antisense_oq = np.array(gene_antisense_oq)
gene_sense_oq.shape, gene_antisense_oq.shape

100%|██████████| 24/24 [00:04<00:00,  5.77it/s]


((5804, 2001), (5804, 2001))

In [182]:
def make_ranges(a):
    ranges = []
    for s,e in zip(np.where(np.diff(a) > 0)[0], np.where(np.diff(a) < 0)[0]):
        ranges.append(list(range(s+1,e+1)))
    return ranges   

def g4_exp_overlap(g4_arrays, exp_arrays, cutoff=0.01):    
    overlap = []
    total = []
    for g,e in zip(g4_arrays, exp_arrays):
        e = np.where(e > cutoff)[0]
        tg, tl, to, tot = len(e), 0, 0, 0
        for r in make_ranges(g):
            tot += 1
            len_intersect = len(set(r).intersection(e))
            if len_intersect > 0:
                to += 1
        overlap.append(to)
        total.append(tot)
    return np.array(overlap), np.array(total)

s_overlap, s_total_g4s = g4_exp_overlap(gene_sense_g4, gene_sense_exp)
as_overlap, as_total_g4s = g4_exp_overlap(gene_antisense_g4, gene_antisense_exp)

In [101]:
sum(s_total_g4s), sum(as_total_g4s), sum(s_total_g4s) + sum(as_total_g4s)

(12246, 10033, 22279)

In [112]:
print (len(np.where((s_total_g4s + as_total_g4s) > 0)[0]),
       len(np.where((s_total_g4s + as_total_g4s) > 0)[0])/len(as_total_g4s)*100)

4307 74.20744314266024


In [116]:
exp_ctr = 0
for i in range(gene_sense_exp.shape[0]):
    if len(np.where(gene_sense_exp[i] > 0.01)[0]) > 0 or \
       len(np.where(gene_antisense_exp[i] > 0.01)[0]) > 0:
        exp_ctr += 1
print (exp_ctr, exp_ctr/gene_sense_g4.shape[0]*100)

5210 89.7656788421778


In [119]:
sum(s_overlap) + sum(as_overlap), (sum(s_overlap) + sum(as_overlap))/(sum(s_total_g4s) + sum(as_total_g4s))

(17568, 0.78854526684321558)

In [121]:
len(np.where((s_overlap + as_overlap) > 0)[0])

4055

In [240]:
ctr = []
for i,j,a,b,n in zip(gene_sense_g4, gene_antisense_g4,
                   gene_sense_exp, gene_antisense_exp, range(gene_antisense_exp.shape[0])):
    if sum(i)+sum(j) == 0:
        if len(np.where(a > 0.01)[0]) > 0 or len(np.where(b > 0.01)[0]) > 0:
            ctr.append(n)
len(ctr)

938

In [241]:
high_conf_genes = {}
for i in np.array(tid_list)[ctr]:
    if oncogenes_tid[i] not in high_conf_genes:
        high_conf_genes[oncogenes_tid[i]] = []
    high_conf_genes[oncogenes_tid[i]].append(i)
print (len(high_conf_genes))
    
for i in high_conf_genes:
    if len(high_conf_genes[i]) > num_transcripts[i]/2:
        print (i, len(high_conf_genes[i]),  num_transcripts[i])

323
CREBBP 11 20
SETD2 6 8
HIST1H3B 1 1
WRN 2 3
LRIG3 5 9
BUB1B 6 11
SND1 11 16
SBDS 5 5
TAL2 1 1
BIRC3 6 7
MDS2 2 3
LCP1 4 7
PRDM1 6 7
PDCD1LG2 1 1
SFPQ 6 10
FANCD2 9 13
IL7R 8 10
SSX2 2 3
FCRL4 2 3
SSX1 1 1
ARHGAP26 11 20
TNFRSF17 2 3


In [197]:
def g4_oq_overlap(g4_arrays, oq_arrays):
    overlap = []
    total = []
    for g,o in zip(g4_arrays, oq_arrays):
        to, tot = 0, 0
        g = np.where(g > 0)[0]
        for r in make_ranges(o):
            tot += 1
            len_intersect = len(set(r).intersection(g))
            if len_intersect > 0:
                to += 1
        overlap.append(to)
        total.append(tot)
    return np.array(overlap), np.array(total)

gq_s_overlap, gq_s_total_g4s = g4_oq_overlap(gene_sense_g4, gene_sense_oq)
gq_as_overlap, gq_as_total_g4s = g4_oq_overlap(
    gene_antisense_g4, gene_antisense_oq)

In [198]:
gq_s_total_g4s.sum() + gq_as_total_g4s.sum()

2760

In [199]:
print (len(np.where((gq_s_total_g4s + gq_as_total_g4s) > 0)[0]),
       len(np.where((gq_s_total_g4s + gq_as_total_g4s) > 0)[0])/len(gq_s_total_g4s)*100)

2051 35.33769813921433


In [200]:
print (gq_s_overlap.sum() + gq_as_overlap.sum(),
       (gq_s_overlap.sum() + gq_as_overlap.sum())/ (gq_s_total_g4s.sum() + gq_as_total_g4s.sum()))

1981 0.717753623188


In [201]:
print (len(np.where((gq_s_overlap + gq_as_overlap) > 0)[0]),
       len(np.where((gq_s_overlap + gq_as_overlap) > 0)[0])/len(gq_s_total_g4s)*100)

1605 27.653342522398344


In [202]:
def oq_exp_overlap(oq_arrays, exp_arrays, cutoff=0.01):    
    overlap = []
    total = []
    for o,e in zip(oq_arrays, exp_arrays):
        e = np.where(e > cutoff)[0]
        to, tot = 0, 0
        for r in make_ranges(o):
            tot += 1
            len_intersect = len(set(r).intersection(e))
            if len_intersect > 0:
                to += 1
        overlap.append(to)
        total.append(tot)
    return np.array(overlap), np.array(total)

oe_s_overlap, oe_s_total_g4s = oq_exp_overlap(gene_sense_oq, gene_sense_exp)
oe_as_overlap, oe_as_total_g4s = oq_exp_overlap(
    gene_antisense_oq, gene_antisense_exp)

In [203]:
print (len(np.where((oe_s_total_g4s + oe_as_total_g4s) > 0)[0]),
       len(np.where((oe_s_total_g4s + oe_as_total_g4s) > 0)[0])/len(oe_s_total_g4s)*100)

2051 35.33769813921433


In [204]:
print (oe_s_overlap.sum() + oe_as_overlap.sum(),
       (oe_s_overlap.sum() + oe_as_overlap.sum())/ (oe_s_total_g4s.sum() + oe_as_total_g4s.sum()))

2417 0.875724637681


In [180]:
print (len(np.where((oe_s_overlap + oe_as_overlap) > 0)[0]),
       len(np.where((oe_s_overlap + oe_as_overlap) > 0)[0])/len(oe_s_total_g4s)*100)

3052 52.58442453480359
