In [1]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
import math
import re
import subprocess
from os import listdir
from os.path import isfile, join

In [2]:
import matplotlib as mpl 
mpl.use('Agg')
mpl.rcParams['pdf.fonttype'] = 42
import seaborn as sns
import matplotlib.pyplot as plt
plt.close("all")
mpl.rcParams['font.sans-serif'] = "Arial"
mpl.rcParams["font.family"] = 'sans-serif'
mpl.rcParams['font.size'] = 8
sns.set(style="ticks")

In [3]:
cd /Users/davidy/jamboree20crispr/ontarget_analysis/220427_distance_optimization

/Users/davidy/jamboree20crispr/ontarget_analysis/220427_distance_optimization


In [4]:
##### Define functions

def get_lengths(chromcoord):
    chrom, start, end = chromcoord.split('_')
    length = int(end)-int(start)
    return int(length)

def round_down(num, divisor):
    return num - (num%divisor)

In [5]:
### Import full dataframe
full_df = pd.read_csv('HCRFF_ontarget_distance_effect_enhancer_grna_df.csv')
print(full_df.columns.values.tolist())

display(full_df)

print(len(full_df.loc[full_df['epifeature_type']=='dhs_peak']['dhs_peak_cc'].drop_duplicates().tolist()))

['chr', 'start', 'end', 'position', 'strand', 'sequence', 'distance', 'signal_distance', 'second_distance', 'avg_zlog2FC', 'zlog2FC_R1', 'zlog2FC_R2', 'enhancer_max_effect', 'enhancer_top3_mean_effect', 'enhancer_top10_mean_effect', 'enhancer_mean_effect', 'relative_max_effect', 'relative_top3_mean_effect', 'relative_top10_mean_effect', 'relative_mean_effect', 'grna_effect_rank', 'enhancer', 'dhs_peak_cc', 'epifeature_type', 'element', 'enhancer_first_summit', 'enhancer_second_summit', 'gene']


Unnamed: 0,chr,start,end,position,strand,sequence,distance,signal_distance,second_distance,avg_zlog2FC,...,relative_top10_mean_effect,relative_mean_effect,grna_effect_rank,enhancer,dhs_peak_cc,epifeature_type,element,enhancer_first_summit,enhancer_second_summit,gene
0,chr8,127794448,127794451,127794451,-,GGGGGGAGGCGCGCGCGGCC,-268,-33,-669,2.545155,...,1.306373,2.910400,1,chr8_127794273_127795483,chr8_127794273_127795483,dhs_peak,chr8:127794569-127795269:.,127794484,127795120,MYC
1,chr8,127794921,127794924,127794920,+,GGGCGTTTGGGGAGGGTGAG,201,436,-200,2.027260,...,1.040549,2.318183,2,chr8_127794273_127795483,chr8_127794273_127795483,dhs_peak,chr8:127794569-127795269:.,127794484,127795120,MYC
2,chr8,127794696,127794699,127794699,-,GCCGTGTCTCCACAGGTCAC,-20,215,-421,1.999589,...,1.026346,2.286542,3,chr8_127794273_127795483,chr8_127794273_127795483,dhs_peak,chr8:127794569-127795269:.,127794484,127795120,MYC
3,chr8,127794823,127794826,127794822,+,TGAGTAGTCGGACGGAGGAA,103,338,-298,1.980487,...,1.016541,2.264698,4,chr8_127794273_127795483,chr8_127794273_127795483,dhs_peak,chr8:127794569-127795269:.,127794484,127795120,MYC
4,chr8,127794875,127794878,127794878,-,CTCGCCACCAGTCTTGAGGC,159,394,-242,1.916645,...,0.983773,2.191695,5,chr8_127794273_127795483,chr8_127794273_127795483,dhs_peak,chr8:127794569-127795269:.,127794484,127795120,MYC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15018,chr6,109308057,109308060,109308060,-,TACCTGCATATTATATTCTA,-224,-224,-224,0.059064,...,0.052082,0.096724,56,chr6_109308005_109308519,chr6_109308067_109308452,h3k27ac_peak,chr6:109308045-109308645:.,109308284,109308284,CD164
15019,chr6,109308084,109308087,109308083,+,ATATAATATGCAGGTAGGAT,-201,-201,-201,0.054006,...,0.047622,0.088441,57,chr6_109308005_109308519,chr6_109308067_109308452,h3k27ac_peak,chr6:109308045-109308645:.,109308284,109308284,CD164
15020,chr6,109308266,109308269,109308265,+,CACATTTTCTTATGATAGAA,-19,-19,-19,0.028067,...,0.024749,0.045963,58,chr6_109308005_109308519,chr6_109308067_109308452,h3k27ac_peak,chr6:109308045-109308645:.,109308284,109308284,CD164
15021,chr6,109308337,109308340,109308336,+,TTTTCGTAACAGTAGTGATC,52,52,52,0.014060,...,0.012398,0.023025,59,chr6_109308005_109308519,chr6_109308067_109308452,h3k27ac_peak,chr6:109308045-109308645:.,109308284,109308284,CD164


27


In [6]:
##### Get lengths of epigenetic elements that are hits in the screen

# Get respective, unique peaks as lists
h3k27ac_peaks = full_df.loc[full_df['epifeature_type']=='h3k27ac_peak']['enhancer'].drop_duplicates().tolist()
dhs_peaks = full_df.loc[full_df['epifeature_type']=='dhs_peak']['enhancer'].drop_duplicates().tolist()
# Record and report lengths
h3k27ac_peak_lengths = [get_lengths(cc) for cc in h3k27ac_peaks]
dhs_peak_lengths = [get_lengths(cc) for cc in dhs_peaks]
print(len(h3k27ac_peak_lengths), 'H3K27ac peak lengths:', h3k27ac_peak_lengths)
print(len(dhs_peak_lengths), 'DHS peak lengths:', dhs_peak_lengths)
lengths_lod = []
for length in h3k27ac_peak_lengths:
    adf = {'length':length,
          'feature':'H3K27ac'}
    lengths_lod.append(adf)
for length in dhs_peak_lengths:
    adf = {'length':length,
          'feature':'DHS'}
    lengths_lod.append(adf)
lengths_df = pd.DataFrame(lengths_lod)
### Plot
g1 = sns.histplot(data=lengths_df, x='length',hue='feature')
outplot = 'SigHits_EpifeaturePeak_Lengths_Histogram.pdf'
#plt.xlim(0, 3000)
plt.savefig(outplot)
plt.close('all')

##### Get lengths of epigenetic elements across the genome
# Read in files
h3k27ac_bed_df = pd.read_csv('/Users/davidy/misc_resources/chip/results/macs2/h3k27ac_k562_hg38_unique.bed', sep = '\t', names = ['chr','start','end'])
dhs_bed_df = pd.read_csv('/Users/davidy/misc_resources/chip/results/macs2/dhs_k562_hg38_optimal_peak_unique.bed', sep = '\t', names = ['chr','start','end'])
# Create length column
h3k27ac_bed_df['length'] = h3k27ac_bed_df['end'] - h3k27ac_bed_df['start']
dhs_bed_df['length'] = dhs_bed_df['end'] - dhs_bed_df['start']
# Get lengths and record into dataframe for plotting
h3k27ac_bed_lengths = h3k27ac_bed_df['length'].tolist()
dhs_bed_lengths = dhs_bed_df['length'].tolist()
lengths_lod = [] # Caution: Using the same names as above, will overwrite!
for length in h3k27ac_bed_lengths:
    adf = {'length':length,
          'feature':'H3K27ac'}
    lengths_lod.append(adf)
for length in dhs_bed_lengths:
    adf = {'length':length,
          'feature':'DHS'}
    lengths_lod.append(adf)
lengths_df = pd.DataFrame(lengths_lod) # Caution: Using the same names as above, will overwrite!

### Print median and mean lengths of DHS and H3K27ac peaks
DHS_lengths = lengths_df.loc[lengths_df['feature']=='DHS']['length'].tolist()
H3K27ac_lengths = lengths_df.loc[lengths_df['feature']=='H3K27ac']['length'].tolist()
print('DHS mean length: ', np.mean(DHS_lengths), 'DHS median length: ',np.median(DHS_lengths))
print('H3K27ac mean length: ',np.mean(H3K27ac_lengths), 'H3K27ac median length: ',np.median(H3K27ac_lengths))
DHS_median_length = np.median(DHS_lengths)
H3K27ac_median_length = np.median(H3K27ac_lengths)

### Plot
g2 = sns.histplot(data=lengths_df, x='length',hue='feature',
                 log_scale=(False, True),bins=50)
outplot = 'Genomewide_EpifeaturePeak_Lengths_Histogram.pdf'
#plt.xlim(0, 3000)
plt.savefig(outplot)
plt.close('all')


19 H3K27ac peak lengths: [1044, 3261, 4231, 7235, 3849, 2519, 6625, 813, 974, 314, 1456, 2288, 4927, 696, 697, 606, 2057, 2203, 514]
27 DHS peak lengths: [1210, 1402, 1168, 535, 1342, 1004, 796, 721, 405, 736, 244, 304, 533, 630, 566, 230, 210, 443, 833, 488, 294, 657, 544, 282, 320, 1013, 385]
DHS mean length:  468.1554588312918 DHS median length:  358.0
H3K27ac mean length:  950.0982412402859 H3K27ac median length:  627.0


In [7]:
##### Accessible peaks have stronger average effects than H3K27ac peaks

### Trim dataframe to exclude DHSs that dont have a H3K27ac peak
# Get the DHS peak CCs that are in the 'h3k27ac_peak' rows
h3k27ac_intersecting_dhs_peaks = full_df.loc[full_df['epifeature_type']=='h3k27ac_peak']['dhs_peak_cc'].drop_duplicates().tolist()
# Create a trimmed dataframe that includes rows with the DHS peaks that have an h3k27ac intersecting peak
# Note, this will keep h3k27ac_peak, dhs_peak, and slopDHS_peak epifeature_type
trimmed_elementwise_df = full_df.loc[full_df['dhs_peak_cc'].isin(h3k27ac_intersecting_dhs_peaks)][['enhancer_mean_effect','enhancer_top10_mean_effect','enhancer','dhs_peak_cc','epifeature_type','element','gene']].drop_duplicates()
display(trimmed_elementwise_df)
trimmed_elementwise_df.to_csv('testdf.csv', index = False)

### Create a dictionary that records the effects of DHS peaks
# Note: The same DHS may be found multiple times but have different effects if its a hit for multiple gene screens
dhs_peak_effects = {} # Key = dhs_peak_cc_GENE; Value = [[enhancer_mean_effect, enhancer_top10_mean_effect]]
dhs_peak_df = trimmed_elementwise_df.loc[trimmed_elementwise_df['epifeature_type']=='dhs_peak']
for i, row in dhs_peak_df.iterrows():
    dhs_peak_effects[row['dhs_peak_cc']+'_'+row['gene']] = [row['enhancer_mean_effect'],row['enhancer_top10_mean_effect']]

### Get the h3k27ac-dhs paired effects
# Note: An H3K27ac peak may have multiple intersecting DHSs even for a given gene screen
# Super important note: An H3K27ac peak by definition here intersects the DHS. So they can have IDENTICAL top10 gRNAs!!! Therefore the best comparison should be MEAN only.
h3k27ac_dhs_effects_lod = []
paired_effects_lod = []
row_counter = 0
for i, row in trimmed_elementwise_df.loc[trimmed_elementwise_df['epifeature_type']=='h3k27ac_peak'].iterrows():
    row_counter += 1
    h3k27ac_mean = row['enhancer_mean_effect']
    h3k27ac_top10_mean = row['enhancer_top10_mean_effect']
    dhs_peak_cc = row['dhs_peak_cc']
    gene = row['gene']
    # DHS effects
    dhs_mean, dhs_top10 = dhs_peak_effects[dhs_peak_cc+'_'+gene]
    
    adf = {'h3k27ac_peak_cc':row['enhancer'],
          'dhs_peak_cc':dhs_peak_cc,
          'gene':gene,
          'h3k27ac_mean':h3k27ac_mean,
          'h3k27ac_top10_mean':h3k27ac_top10_mean,
          'dhs_mean':dhs_mean,
          'dhs_top10_mean':dhs_top10}
    
    bdf = {'dhs_peak_cc':dhs_peak_cc,
          'enhancer_mean':dhs_mean,
          'epifeature_type':'DHS'}
    cdf = {'dhs_peak_cc':dhs_peak_cc,
          'enhancer_mean':h3k27ac_mean,
          'epifeature_type':'H3K27ac'}
    paired_effects_lod.append(bdf)
    paired_effects_lod.append(cdf)
    
    h3k27ac_dhs_effects_lod.append(adf)
    if h3k27ac_mean > dhs_mean:
        mean_comparison = 'H3K27ac stronger'
    else:
        mean_comparison = 'DHS stronger'
    if h3k27ac_top10_mean > dhs_top10:
        top10_comparison = 'H3K27ac stronger'
    else:
        top10_comparison = 'DHS stronger'
    
    print(row_counter, row['enhancer'], dhs_peak_cc, gene, mean_comparison, [h3k27ac_mean, dhs_mean] , top10_comparison, [h3k27ac_top10_mean, dhs_top10])

### Create the h3k27ac-dhs paired effects dataframe, make it unique
h3k27ac_dhs_effects_df = pd.DataFrame(h3k27ac_dhs_effects_lod).drop_duplicates()
paired_effects_df = pd.DataFrame(paired_effects_lod).drop_duplicates()

### Significance test difference in effects between h3k27ac and dhs
# Welch t-test
tstat, pval = stats.ttest_ind(h3k27ac_dhs_effects_df['dhs_mean'],h3k27ac_dhs_effects_df['h3k27ac_mean'],equal_var=False)
print('Welch t-test difference in DHS vs H3K27ac mean effects. t-statistic:', tstat, 'p-value: ',pval)
# Binomial test
# Out of 28 rows, 3 duplicate DHS-greater rows (same DHS peak), so 25 rows/trials; Based on avg mean effects: 3 failures (H3K27ac greater), 22 successes (DHS greater)
binom_pval = stats.binom_test(x=22, n=25, p=0.5)
print('Binomial test p-value: ',binom_pval)

##### Plotting
g3= sns.scatterplot(data=paired_effects_df, hue='epifeature_type',x='dhs_peak_cc',y='enhancer_mean')
outplot = 'H3K27ac_vs_DHS_mean_effects_enhancerwise.pdf'
plt.savefig(outplot)
plt.close('all')

fig, ax = plt.subplots(figsize=(6,6))

g4= sns.scatterplot(data=h3k27ac_dhs_effects_df,x='dhs_mean',y='h3k27ac_mean', ax=ax)
# g4.set_xlim(0.1, 10)
# g4.set_ylim(0.1, 10)
# g4.set_xticks([0, 1, 10])
# g4.set_yticks([0, 1, 10])

ax.set_xscale('log')
ax.set_yscale('log')

ax.set_xlim(0.4, 10.1)
ax.set_ylim(0.4, 10.1)

ax.set_xticks([0.5, 1, 5, 10])
ax.set_yticks([0.5, 1, 5, 10])

ax.get_xaxis().set_major_formatter(mpl.ticker.ScalarFormatter())
ax.get_yaxis().set_major_formatter(mpl.ticker.ScalarFormatter())

ax.axline((0,0), (1,1), color = '#777777', linestyle='--')

# plt.xlim(0, 7)
# plt.ylim(0, 7)
# x0, x1 = g4.get_xlim()
# y0, y1 = g4.get_ylim()
# lims = [max(x0, y0), min(x1, y1)]
# g4.plot(lims, lims, '-r')
outplot = 'H3K27ac_vs_DHS_mean_effects_scatter.pdf'
plt.savefig(outplot)

plt.close('all')



Unnamed: 0,enhancer_mean_effect,enhancer_top10_mean_effect,enhancer,dhs_peak_cc,epifeature_type,element,gene
0,0.874504,1.948260,chr8_127794273_127795483,chr8_127794273_127795483,dhs_peak,chr8:127794569-127795269:.,MYC
114,1.233250,4.025687,chr8_127898381_127899783,chr8_127898381_127899783,dhs_peak,chr8:127898369-127899769:.,MYC
278,2.835760,6.735703,chr8_127959660_127960828,chr8_127959660_127960828,dhs_peak,chr8:127959469-127961769:.,MYC
390,1.502592,3.157277,chr8_128044844_128045379,chr8_128044844_128045379,dhs_peak,chr8:128044769-128045569:.,MYC
450,0.685169,1.245611,chr8_127793273_127794273,chr8_127794273_127795483,slopDHS_peak,chr8:127794569-127795269:.,MYC
...,...,...,...,...,...,...,...
14224,0.518362,1.568859,chr6_109307067_109308067,chr6_109308067_109308452,slopDHS_peak,chr6:109308045-109308645:.,CD164
14313,0.649555,1.284017,chr6_109308452_109309452,chr6_109308067_109308452,slopDHS_peak,chr6:109308045-109308645:.,CD164
14354,0.722723,2.714657,chr6_109236150_109238207,chr6_109237079_109237399,h3k27ac_peak,chr6:109236645-109237345:.,CD164
14778,5.968775,12.376338,chr6_109303099_109305302,chr6_109303963_109304976,h3k27ac_peak,chr6:109303145-109306545:.,CD164


1 chr8_127794547_127795591 chr8_127794273_127795483 MYC DHS stronger [0.8184157298172073, 0.8745037436161072] DHS stronger [1.8288881924599028, 1.9482603636331184]
2 chr8_127896780_127900041 chr8_127898381_127899783 MYC DHS stronger [1.0068437783532511, 1.2332501147424155] H3K27ac stronger [4.078754468284753, 4.025687073101818]
3 chr8_127958538_127962769 chr8_127959660_127960828 MYC DHS stronger [1.7178089929271336, 2.835759712768372] DHS stronger [6.735703061826347, 6.735703061826347]
4 chr8_128044172_128051407 chr8_128044844_128045379 MYC DHS stronger [0.7727368944109412, 1.5025916694122292] H3K27ac stronger [3.883193035011671, 3.157277351565895]
5 chr11_61833171_61837020 chr11_61834010_61835352 FADS1 DHS stronger [2.071150735731198, 3.458285265421221] DHS stronger [6.103169962624519, 6.103169962624519]
6 chr11_61868786_61871305 chr11_61869482_61870486 FADS1 DHS stronger [0.8265465073382295, 1.123545933991403] H3K27ac stronger [2.4988504712250004, 2.0388591146845263]
7 chr11_33939962

In [8]:
##### Distance dependent gRNA effects
# Only use DHS and slopDHS epifeatures
# The DHS does not need to have an intersecting H3K27ac peak
# if DHS is repeated for multiple genes, only use the best one

### Get the best gene for each DHS peak
max_gene_dhs_effect_dict = {} # Key = dhs_peak_cc; Value = Gene with highest mean effect for the dhs
distance_effect_lod = []

# Create a checker to ensure that no gRNA is recorded twice, based on sequence alone
# This will check across all DHS!!! There are some slopDHS peaks that are overlapping, so this is necessary to exclude duplicate gRNAs
checked_guides = set()

# Binned distance effect dictionary
# Note, bins should be thought of as (0,20), (21, 40), etc.
# So always round gRNA distance down 
bin_length = 20
#round_down_integer = 10
bin_distance_effect_dict = {}
[bin_distance_effect_dict.setdefault(i, []) for i in range(0,2000,bin_length)]

for dhs_peak in full_df.loc[full_df['epifeature_type']=='dhs_peak']['dhs_peak_cc'].drop_duplicates().tolist():
    # Get trimmed dataframe
    temp_dhs_df = full_df.loc[(full_df['dhs_peak_cc']==dhs_peak) & (full_df['epifeature_type']=='dhs_peak')][['dhs_peak_cc','enhancer_mean_effect','gene']].drop_duplicates()
    # Sort by descending if there are multiple entries/genes for a DHS. Note that not all DHSs have multiple entries.
    temp_dhs_df.sort_values(by='enhancer_mean_effect', inplace=True, ascending=False)
    # Get the gene name with the greatest mean effect for the DHS, and record it
    max_gene = temp_dhs_df['gene'].head(1).tolist()[0]
    max_gene_dhs_effect_dict[dhs_peak] = max_gene
    ### Begin creating distance-effect dataframe
    
    
    # Create a checker to ensure that no gRNA is recorded twice, based on sequence alone. 
    # This is specific for each DHS peak, so it will not be written twice if a gRNA intersects both DHS peak and slopDHS peak.
    #checked_guides = set()
    
    ## Retrieve DHS guides
    guide_df = full_df.loc[(full_df['dhs_peak_cc']==dhs_peak) & (full_df['epifeature_type']=='dhs_peak') & (full_df['gene']==max_gene)][['sequence','signal_distance','relative_top10_mean_effect','relative_mean_effect','epifeature_type']]
    # Process guides, recording in distance_effect_lod
    for i, row in guide_df.iterrows():
        # If the gRNA has been seen and recorded already, skip!
        if row['sequence'] in checked_guides:
            continue
        else:
            adf = {'dhs_peak':dhs_peak,
                'sequence':row['sequence'],
                  'signal_distance':row['signal_distance'],
                   'abs_signal_distance': abs(row['signal_distance']),
                   'epifeature_type':row['epifeature_type'],
                  'relative_top10_mean_effect':row['relative_top10_mean_effect'],
                   'relative_mean_effect':row['relative_mean_effect']}
            distance_effect_lod.append(adf)
            checked_guides.add(row['sequence'])
            
            # Binning strategy
            abs_signal_distance = abs(row['signal_distance'])
            rounded_abs_signal_distance = round_down(abs_signal_distance,bin_length)
            bin_distance_effect_dict[rounded_abs_signal_distance].append(row['relative_top10_mean_effect'])
            
    
    ## Retrieve slopDHS guides for the same DHS
    # This should retrieve two separate slopDHS peaks, one for each side of the DHS
    guide_df = full_df.loc[(full_df['dhs_peak_cc']==dhs_peak) & (full_df['epifeature_type']=='slopDHS_peak') & (full_df['gene']==max_gene)][['sequence','signal_distance','relative_top10_mean_effect','relative_mean_effect','enhancer','epifeature_type']]
    slop_dhs_peaks = guide_df['enhancer'].drop_duplicates().tolist()
    #print(dhs_peak, max_gene, slop_dhs_peaks)

    if len(slop_dhs_peaks) < 2:
        print('Missing a slopped DHS peak!!!', slop_dhs_peaks, dhs_peak, max_gene)
        # Note: This is a GATA1 DHS. The GATA1 screen was not tiling, but DHS tiling, so it makes sense there aren't many gRNAs outside peak
    else:
        print(slop_dhs_peaks, dhs_peak) # Print the slop DHS peaks and the DHS peak
    
    for i, row in guide_df.iterrows():
        # If the gRNA has been seen and recorded already, skip!
        if row['sequence'] in checked_guides:
            continue
        else:
            adf = {'dhs_peak':dhs_peak,
                   'sequence':row['sequence'],
                  'signal_distance':row['signal_distance'],
                   'abs_signal_distance': abs(row['signal_distance']),
                   'epifeature_type':row['epifeature_type'],
                  'relative_top10_mean_effect':row['relative_top10_mean_effect'],
                   'relative_mean_effect':row['relative_mean_effect']}
            distance_effect_lod.append(adf) 
            checked_guides.add(row['sequence'])

            # Binning strategy
            abs_signal_distance = abs(row['signal_distance'])
            rounded_abs_signal_distance = round_down(abs_signal_distance,bin_length)
            bin_distance_effect_dict[rounded_abs_signal_distance].append(row['relative_top10_mean_effect'])
    print(len(checked_guides))
            
# Convert to dataframe
distance_effect_df = pd.DataFrame(distance_effect_lod)
#distance_effect_df['abs_signal_distance'] = abs(distance_effect_df['signal_distance'])
distance_effect_df.to_csv('dhs_expanded_distance_effect_df.csv', index = False, header = True)

### Process the binning dataframe
bin_distance_effect_lod = []
for distance in bin_distance_effect_dict:
    if len(bin_distance_effect_dict[distance]) == 0:
        continue
    else:
        avg_effect = np.mean(bin_distance_effect_dict[distance])
        adf = {'bin_distance':distance,
              'bin_avg_relative_top10_mean':avg_effect}
        bin_distance_effect_lod.append(adf)
bin_distance_effect_df = pd.DataFrame(bin_distance_effect_lod)
display(distance_effect_df)



['chr8_127793273_127794273', 'chr8_127795483_127796483'] chr8_127794273_127795483
209
['chr8_127897381_127898381', 'chr8_127899783_127900783'] chr8_127898381_127899783
438
['chr8_127958660_127959660', 'chr8_127960828_127961828'] chr8_127959660_127960828
705
['chr8_128043844_128044844', 'chr8_128045379_128046379'] chr8_128044844_128045379
879
['chr11_61833010_61834010', 'chr11_61835352_61836352'] chr11_61834010_61835352
1218
['chr11_61868482_61869482', 'chr11_61870486_61871486'] chr11_61869482_61870486
1559
['chr11_61869793_61870793', 'chr11_61871589_61872589'] chr11_61870793_61871589
1706
['chr11_33943500_33944500', 'chr11_33945221_33946221'] chr11_33944500_33945221
1814
['chr8_128047327_128048327', 'chr8_128048732_128049732'] chr8_128048327_128048732
1998
['chr8_128047766_128048766', 'chr8_128049502_128050502'] chr8_128048766_128049502
2080
['chr8_128055970_128056970', 'chr8_128057214_128058214'] chr8_128056970_128057214
2204
['chr8_128058002_128059002', 'chr8_128059306_128060306'] ch

Unnamed: 0,dhs_peak,sequence,signal_distance,abs_signal_distance,epifeature_type,relative_top10_mean_effect,relative_mean_effect
0,chr8_127794273_127795483,GGGGGGAGGCGCGCGCGGCC,-33,33,dhs_peak,1.306373,2.910400
1,chr8_127794273_127795483,GGGCGTTTGGGGAGGGTGAG,436,436,dhs_peak,1.040549,2.318183
2,chr8_127794273_127795483,GCCGTGTCTCCACAGGTCAC,215,215,dhs_peak,1.026346,2.286542
3,chr8_127794273_127795483,TGAGTAGTCGGACGGAGGAA,338,338,dhs_peak,1.016541,2.264698
4,chr8_127794273_127795483,CTCGCCACCAGTCTTGAGGC,394,394,dhs_peak,0.983773,2.191695
...,...,...,...,...,...,...,...
4130,chr6_109308067_109308452,ATATGCAGAGGGAAAATACC,911,911,slopDHS_peak,0.131830,0.244827
4131,chr6_109308067_109308452,CCTTACAGCAGATCTGTACA,987,987,slopDHS_peak,0.075213,0.139681
4132,chr6_109308067_109308452,TATGAATTTTCTAGCAGTTG,1109,1109,slopDHS_peak,0.052855,0.098159
4133,chr6_109308067_109308452,GTCTCTTAGTTGTGGAACTA,865,865,slopDHS_peak,0.018127,0.033664


In [13]:
##### Plotting
g5 = sns.kdeplot(data=distance_effect_df, x='signal_distance',y='relative_top10_mean_effect', hue='epifeature_type', fill=False)
plt.ylim(0,3)
outplot = 'SignalDistance_vs_Top10MeanEffect_DHSslopDHS_HueKDEplot.pdf'
plt.savefig(outplot)
plt.close('all')

g6 = sns.kdeplot(data=distance_effect_df, x='signal_distance',y='relative_top10_mean_effect', fill=True)
plt.ylim(0,3)
outplot = 'SignalDistance_vs_Top10MeanEffect_DHSslopDHS_KDEplot.pdf'
plt.savefig(outplot)
plt.close('all')

g7 = sns.scatterplot(data=distance_effect_df, x='signal_distance',y='relative_top10_mean_effect', hue='epifeature_type', alpha=0.5)
plt.ylim(0,3)
outplot = 'SignalDistance_vs_Top10MeanEffect_DHSslopDHS_HueScatter.pdf'
plt.savefig(outplot)
plt.close('all')

g7 = sns.scatterplot(data=distance_effect_df, x='signal_distance',y='relative_top10_mean_effect', alpha=0.5)
plt.ylim(0,3)
outplot = 'SignalDistance_vs_Top10MeanEffect_DHSslopDHS_Scatter.pdf'
plt.savefig(outplot)
plt.close('all')

g8 = sns.jointplot(data=distance_effect_df, x='signal_distance',y='relative_top10_mean_effect',alpha=0.5)
plt.ylim(0,3)
outplot = 'SignalDistance_vs_Top10MeanEffect_DHSslopDHS_Jointplot.pdf'
plt.savefig(outplot)
plt.close('all')

g9 = sns.jointplot(data=distance_effect_df, x='abs_signal_distance',y='relative_top10_mean_effect',alpha=0.5,linewidth=0)
plt.ylim(0,3)
outplot = 'AbsSignalDistance_vs_Top10MeanEffect_DHSslopDHS_Jointplot.pdf'
plt.savefig(outplot)
plt.close('all')

g10 = sns.kdeplot(data=distance_effect_df, x='abs_signal_distance',y='relative_top10_mean_effect', fill=True)
plt.ylim(0,3)
outplot = 'AbsSignalDistance_vs_Top10MeanEffect_DHSslopDHS_KDEplot.pdf'
plt.savefig(outplot)
plt.close('all')

g11 = sns.jointplot(data=bin_distance_effect_df, x='bin_distance',y='bin_avg_relative_top10_mean',
                    kind='reg',
                   scatter_kws={'s':10, 'linewidth':0})
plt.ylim(0,1)
sr, sp = stats.spearmanr(bin_distance_effect_df['bin_distance'], bin_distance_effect_df['bin_avg_relative_top10_mean'])
print('Spearman correlation rho:', sr, 'p-value:', sp)
r, p = stats.pearsonr(bin_distance_effect_df['bin_distance'], bin_distance_effect_df['bin_avg_relative_top10_mean'])
print('correlation:', r, 'p-value:',p)
outplot = 'BinAbsSignalDistance_vs_BinAvgTop10MeanEffect_DHSslopDHS_Jointplot.pdf'
plt.savefig(outplot)
plt.close('all')

g12 = sns.lmplot(data=bin_distance_effect_df, x='bin_distance',y='bin_avg_relative_top10_mean',
                    #lowess=True,
                 logistic=True,
                   scatter_kws={'s':10, 'linewidth':0, 'edgecolor': 'none'})
plt.ylim(0,1)
sr, sp = stats.spearmanr(bin_distance_effect_df['bin_distance'], bin_distance_effect_df['bin_avg_relative_top10_mean'])
print('Spearman correlation rho:', sr, 'p-value:', sp)
r, p = stats.pearsonr(bin_distance_effect_df['bin_distance'], bin_distance_effect_df['bin_avg_relative_top10_mean'])
print('correlation:', r, 'p-value:',p)
outplot = 'BinAbsSignalDistance_vs_BinAvgTop10MeanEffect_DHSslopDHS_lmplot.pdf'
plt.savefig(outplot)
plt.close('all')


Spearman correlation rho: -0.88469164610549 p-value: 1.388389161274329e-33
correlation: -0.8843065833798582 p-value: 1.6141127326401383e-33
Spearman correlation rho: -0.88469164610549 p-value: 1.388389161274329e-33
correlation: -0.8843065833798582 p-value: 1.6141127326401383e-33


In [9]:
# Random observation

##### Process distance_effect_df, beginning to compare ways of selecting gRNAs
# Note this dataframe doesnt contain any h3k27ac guides. But it has all dhs peak and dhs slop peak guides, which should be greater than the median/mean DHS length
num_grnas = 10
length_threshold = np.median(DHS_lengths)/2 # or np.mean(DHS_lengths); note that mean is > median
print('DHS length threshold: ', length_threshold)
effect_type = 'relative_mean_effect'


### Get various selections of num_grna relative_mean_effects
closest = []
furthest = [] # Furthest just means the num_grna closest to length_threshold absolute distance away from the strongest epigenetic summit
random = [] # gRNAs chosen at random
nth = [] # gRNAs chosen spaced apart from signal distance to length_threshold. This will use non-absolute distances, since guides are positioned. This must include the first guide, and as close to the last as possible!!!
nth_within_range = [] # gRNAs chosen spaced apart from signal distance to +/- 150 bp. This will use non-absolute distances. This must include the first guide, and as close to the last as possible!!!

### Iterate over each dhs peak
dhs_peak_counter = 0

for dhs_peak in set(distance_effect_df['dhs_peak'].tolist()):
    dhs_peak_counter += 1
    dhs_guide_df = distance_effect_df.loc[distance_effect_df['dhs_peak']==dhs_peak]
    
    ## Get closest guides
    closest.append(np.mean(dhs_guide_df.sort_values(by='abs_signal_distance', ascending=True)[effect_type].head(num_grnas).tolist()))
    
    ## Get the "furthest" guides, i.e. those that are nearest the length_threshold
    furthest.append(np.mean(dhs_guide_df.iloc[(dhs_guide_df['abs_signal_distance']-length_threshold).abs().argsort()[:num_grnas]][effect_type]))
    
    ## Get random guides
    # Random guides are selected from only within the range of length_threshold; Arbitrarily setting the seed for random_state as 1
    # No replacement
    temp_random = []
    try:
        #random.append(np.mean(dhs_guide_df.iloc[(dhs_guide_df['abs_signal_distance']-length_threshold).abs().argsort()].sample(n = num_grnas, replace=False, random_state = 1)[effect_type]))
        while len(temp_random) < 100:
            temp_random.append(np.mean(dhs_guide_df.sample(n = num_grnas, replace=False)[effect_type]))
        random.append(np.mean(temp_random))
    except:
        print('problem lol')
        random.append(np.nan)
        display(dhs_guide_df)
    
    ## nth
    # Filter for guides within an absolute distance of the length threshold. Then ascending sort by signal distance. Then get relative mean effects and turn that into an ordered list.
    ordered_guide_effects_array = np.array(dhs_guide_df.loc[dhs_guide_df['abs_signal_distance'] <= length_threshold].sort_values(by='signal_distance',ascending=True)[effect_type])

    # Get the indexed position every evenly spaced nth guide--including the first and close to the last
    idx = np.round(np.linspace(0, len(ordered_guide_effects_array) - 1, num_grnas)).astype(int)
    try:
        nth.append(np.mean(ordered_guide_effects_array[idx]))
    except:
        print(dhs_peak, 'Insufficient guides in length threshold distance')
        nth.append(np.nan)

    ## Nth within range
    within_range_array = np.array(dhs_guide_df.loc[dhs_guide_df['abs_signal_distance'] <= 100].sort_values(by='signal_distance',ascending=True)[effect_type])
    # Get the indexed positions
    idx2 = np.round(np.linspace(0, len(within_range_array) - 1, num_grnas)).astype(int)
    try:
        nth_within_range.append(np.mean(within_range_array[idx2]))  
    except:
        print(dhs_peak, 'Insufficient guides in 100 bp of summit')
        nth_within_range.append(np.nan)




categorical_df = pd.DataFrame.from_dict({'closest':closest,
                              'furthest':furthest,
                              'random':random,
                              'nth_within_median':nth,
                              'nth_within_100':nth_within_range}, 
                            orient='index').T

### IMPORTANT. DROPPING ALL ROWS THAT ARE NOT COMPLETE
categorical_df.dropna(inplace=True)
display(categorical_df)

##### T-tests
# Welch t-tests
tstat, pval = stats.ttest_ind(categorical_df['random'],categorical_df['furthest'],equal_var=False)
print(tstat,pval, 'random v furthest')

tstat, pval = stats.ttest_ind(categorical_df['random'],categorical_df['closest'],equal_var=False)
print(tstat,pval, 'random v closest')

tstat, pval = stats.ttest_ind(categorical_df['furthest'],categorical_df['closest'],equal_var=False)
print(tstat,pval, 'furthest v closest')

tstat, pval = stats.ttest_ind(categorical_df['nth_within_median'],categorical_df['closest'],equal_var=False)
print(tstat,pval, 'nth within median v closest')

tstat, pval = stats.ttest_ind(categorical_df['nth_within_100'],categorical_df['closest'],equal_var=False)
print(tstat,pval, 'nth within 100 v closest')




##### Plotting
# Note: mean of relative_mean_effects are plotted on the Y-axis as value
g13 = sns.boxplot(x="variable", y="value", data=pd.melt(categorical_df), color="cyan", order=['random','furthest','nth_within_median','nth_within_100','closest'])
g13 = sns.swarmplot(x="variable", y="value", data=pd.melt(categorical_df), color="black",order=['random','furthest','nth_within_median','nth_within_100','closest'])
plt.ylim(0,2)
outplot = 'CategoricalGuideSelection_Boxplots.pdf'
plt.savefig(outplot)
plt.close('all')




DHS length threshold:  179.0
problem lol


Unnamed: 0,dhs_peak,sequence,signal_distance,abs_signal_distance,epifeature_type,relative_top10_mean_effect,relative_mean_effect
3276,chr11_33942939_33943233,GGCCAGAAGAAACTTTGGGG,101,101,dhs_peak,1.316659,1.47362
3277,chr11_33942939_33943233,CACGCTGGTCACACTGACTG,-25,25,dhs_peak,1.228274,1.374698
3278,chr11_33942939_33943233,AAGGGATTTTTGCCTGAGCA,80,80,dhs_peak,0.524226,0.58672
3279,chr11_33942939_33943233,CTCCTCCCCAAAGTTTCTTC,87,87,dhs_peak,0.498925,0.558402
3280,chr11_33942939_33943233,CCCAAGAACAATGCAAATGA,55,55,dhs_peak,0.222908,0.249481
3281,chr11_33942939_33943233,TTGCAAATTGTACTCAAGCA,204,204,slopDHS_peak,0.357328,0.399926
3282,chr11_33942939_33943233,AAAATTGGCTCTCGATTTTC,154,154,slopDHS_peak,0.133017,0.148874
3283,chr11_33942939_33943233,CCCATTCTGATGAATTTTCT,283,283,slopDHS_peak,0.034474,0.038584
3284,chr11_33942939_33943233,GTTAGTTACTATTCTGAGTT,258,258,slopDHS_peak,0.00283,0.003167


chr11_61870793_61871589 Insufficient guides in 100 bp of summit
chr8_128048766_128049502 Insufficient guides in length threshold distance
chr8_128048766_128049502 Insufficient guides in 100 bp of summit
chr6_109237079_109237399 Insufficient guides in length threshold distance
chr6_109237079_109237399 Insufficient guides in 100 bp of summit


Unnamed: 0,closest,furthest,random,nth_within_median,nth_within_100
0,1.201481,0.834138,0.724288,0.967797,1.053959
1,1.086452,1.049832,0.998478,1.046584,1.251746
2,1.028418,0.75429,0.832256,0.843317,0.979803
3,0.971876,0.900411,0.712741,1.136931,0.883384
4,1.106282,1.120947,0.687242,1.459926,0.784156
5,1.360708,0.852475,0.879222,1.0809,1.210538
6,1.294434,1.264717,0.713519,1.351185,1.411214
7,1.017492,0.603317,0.903541,1.050437,0.919696
8,0.972664,1.474287,0.915337,0.84411,0.903133
9,0.98339,1.065328,0.688854,1.157328,1.059885


-1.5485689085212724 0.1292460511624156 random v furthest
-6.374741161542837 1.0673718774379844e-07 random v closest
-4.066270257081423 0.00019802414822511384 furthest v closest
-2.838933443200214 0.00690282391149817 nth within median v closest
-1.557655110001823 0.12650437263423756 nth within 100 v closest


In [9]:
##### 22-06-15
##### deepTools plotting
### First get all elements, and re-intersect with DHS and H3K27ac
all_elements = set(full_df['element'].tolist())
print('Number of all unique elements before intersections:', len(all_elements))

# Check to see that all elements have a 'h3k27ac_peak' and 'dhs_peak'
dhs_h3k27ac_containing_elements = []
for element in all_elements:
    element_epifeatures = set(full_df.loc[full_df['element']==element]['epifeature_type'].tolist())
    # Do not process the elements lacking either epifeature
    if 'h3k27ac_peak' not in element_epifeatures or 'dhs_peak' not in element_epifeatures:
        print(element, 'is lacking an epifeature!!!')
    else:
        chrom = element.split(':')[0]
        start, end = element.split(':')[1].split('-')
        dhs_h3k27ac_containing_elements.append(chrom+'_'+str(start)+'_'+str(end))

# Get elements dhs_h3k27ac elements
good_counter = 0
with open ('dhs_h3k27ac_elements.bed', 'w') as ofile:
    for good_element in dhs_h3k27ac_containing_elements:
        oline = good_element.split('_') + [good_element]
        ofile.write('\t'.join(oline)+'\n')
        good_counter += 1
ofile.close()

print(good_counter, 'Number of elements')

# Intersect with DHS
fopen = open('dhs_h3k27ac_elements_intersect_dhs.bed', 'w')
subprocess.call(['bedtools','intersect','-a','dhs_h3k27ac_elements.bed','-b', '/Users/davidy/misc_resources/chip/results/macs2/dhs_k562_hg38_optimal_peak_unique.bed', '-wa','-wb'], stdout=fopen)
fopen.close()

# Keep only the DHS intersecting element
dhs_h3k27ac_elements_intersect_dhs_df = pd.read_csv('dhs_h3k27ac_elements_intersect_dhs.bed', sep = '\t', header = None, names = ['chr1','start1','end1','element','chr2','start2','end2'])
print(len(dhs_h3k27ac_elements_intersect_dhs_df), 'DHS peaks')
dhs_h3k27ac_elements_intersect_dhs_df.to_csv('dhs_h3k27ac_elements_dhs_peaks.bed', sep = '\t', columns=['chr2','start2','end2'], header=False, index=False)


        

Number of all unique elements before intersections: 30
chr4:55614441-55616941:. is lacking an epifeature!!!
chr4:55624941-55627441:. is lacking an epifeature!!!
chr4:55670441-55671441:. is lacking an epifeature!!!
27 Number of elements
32 DHS peaks


In [10]:
##### 22-06-20
### Prepare bedGraph for all gRNAs +/- 2kb of dhs_h3k27ac_elements_dhs_peaks.bed from ALL gRNAs
all_files = [f for f in listdir('/Users/davidy/jamboree20crispr/ontarget_analysis/220427_distance_optimization') if isfile(join('/Users/davidy/jamboree20crispr/ontarget_analysis/220427_distance_optimization', f))]
grna_files = [f for f in all_files if 'grna_avg_z_log2FC' in f]

print(grna_files)

### Concatenate the grna_avg_z_log2FC.qBed files for all genes that pass filter (have at least one non-TSS, in-DHS sig element)
fopen = open('allgenes_passfilter_grna_avg_zlog2fc.bed', 'w')
subprocess.call(['cat']+grna_files,stdout=fopen)
fopen.close()

### 2kb Window with the h3k27ac-intersecting element DHS peaks
fopen = open('dhs_h3k27ac_elements_allgenes_passfilter.bed', 'w')
subprocess.call(['bedtools','window','-w', '1000', '-a', 'dhs_h3k27ac_elements_dhs_peaks.bed', '-b', 'allgenes_passfilter_grna_avg_zlog2fc.bed'], stdout=fopen)
fopen.close()








['CAT_grna_avg_z_log2FC.qBed', 'CD164_grna_avg_z_log2FC.qBed', 'NMU_grna_avg_z_log2FC.qBed', 'PVT1_grna_avg_z_log2FC.qBed', 'FADS3_grna_avg_z_log2FC.qBed', 'LMO2_grna_avg_z_log2FC.qBed', 'CAPRIN1_grna_avg_z_log2FC.qBed', 'MYC_grna_avg_z_log2FC.qBed', 'MEF2C_grna_avg_z_log2FC.qBed', 'FADS1_grna_avg_z_log2FC.qBed', 'FADS2_grna_avg_z_log2FC.qBed', 'GATA1_grna_avg_z_log2FC.qBed']


In [11]:
### Process into a bedgraph... need to filter for 
bedgraph_df = pd.read_csv('dhs_h3k27ac_elements_allgenes_passfilter.bed', sep = '\t', header = None, names = ['chr1','start1','end1','chr2','grna_start','grna_end','avg','rep1','rep2','grna_name','strand','sequence'], usecols=['chr1','start1','end1','chr2','grna_start','grna_end','avg','grna_name','sequence'])


bedgraph_df['dhs_cc'] = bedgraph_df['chr1'] + '_' + bedgraph_df['start1'].astype(str) + '_' + bedgraph_df['end1'].astype(str)
bedgraph_df['screen'] = bedgraph_df['grna_name'].str.split('|', expand=True)[0]


max_bedgraph_df = bedgraph_df.groupby(['dhs_cc', 'screen'], as_index=False)['avg'].mean().groupby(['dhs_cc']).max().reset_index()[['dhs_cc','screen']]


max_merged_bedgraph_df = pd.merge(max_bedgraph_df, bedgraph_df,  how='left', left_on=['dhs_cc','screen'], right_on = ['dhs_cc','screen'])


max_merged_bedgraph_df.to_csv('max_bedgraph.bedGraph', sep = '\t', 
                       columns = ['chr2','grna_start','grna_end','avg'], 
                       index=False, 
                       header=False)

display(max_merged_bedgraph_df)

fopen=open('sorted_max_bedgraph.bedGraph', 'w')
subprocess.call(['sort','-k1,1','-k2,2n','max_bedgraph.bedGraph'],stdout = fopen)
fopen.close()

### Need to split into single base bedgraph
sorted_max_df = pd.read_csv('sorted_max_bedgraph.bedGraph', sep = '\t', header=None, names = ['chr1','start1','end1','avg'])
display(sorted_max_df)
lod = []

#krab_window=0
krab_window = 150

for i, row in sorted_max_df.iterrows():
    for i in range(row['start1']-krab_window,row['end1']+krab_window,1):
        adf = {'chr1':row['chr1'],
              'start1':i,
              'end1':i+1,
              'avg':row['avg']}
        lod.append(adf)

expanded_sorted_max_df = pd.DataFrame(lod)
expanded_sorted_max_df.to_csv('expanded_max.bedGraph', sep = '\t', index = False, header = False)

fopen = open('sorted_expanded_max.bedGraph', 'w')
subprocess.call(['sort','-k1,1','-k2,2n', 'expanded_max.bedGraph'],stdout=fopen)
fopen.close()

# Merge, the -d -1 flag is very important to prevent book-ended features from being overlapped, keeping resolution of averaging at per-base. 
fopen = open('averaged_sorted_expanded_max.bedGraph', 'w')
subprocess.call(['bedtools','merge','-c','4','-o','mean','-d','-1' ,'-i','sorted_expanded_max.bedGraph'],stdout=fopen)
fopen.close()

# Finally run bedGraphtoBigWig
#fopen = open('averaged_expanded_max.bw','w')
subprocess.call(['/Users/davidy/pythonscripts/bedGraphToBigWig','averaged_sorted_expanded_max.bedGraph','/Users/davidy/misc_resources/hg38.chrom.sizes','all_sig_enhancer_guides_best_screen.bw'])
#fopen.close()






Unnamed: 0,dhs_cc,screen,chr1,start1,end1,chr2,grna_start,grna_end,avg,grna_name,sequence
0,chr11_33881952_33882395,LMO2,chr11,33881952,33882395,chr11,33880968,33880971,1.592793,LMO2|chr11:33880949-33880968:+,CAGAAGCCCATTTGACTCCA
1,chr11_33881952_33882395,LMO2,chr11,33881952,33882395,chr11,33880954,33880957,0.467571,LMO2|chr11:33880958-33880977:-,TTCTGTCCTTGGAGTCAAAT
2,chr11_33881952_33882395,LMO2,chr11,33881952,33882395,chr11,33881046,33881049,0.794744,LMO2|chr11:33881050-33881069:-,TATGGGATTCTCAGACAGGA
3,chr11_33881952_33882395,LMO2,chr11,33881952,33882395,chr11,33881047,33881050,2.463301,LMO2|chr11:33881051-33881070:-,GTATGGGATTCTCAGACAGG
4,chr11_33881952_33882395,LMO2,chr11,33881952,33882395,chr11,33881050,33881053,1.454777,LMO2|chr11:33881054-33881073:-,TAAGTATGGGATTCTCAGAC
...,...,...,...,...,...,...,...,...,...,...,...
6333,chrX_48782659_48783289,GATA1,chrX,48782659,48783289,chrX,48783281,48783284,3.288470,GATA1|chrX:48783285-48783304:-,AAATCTGTCCTCACAGGGAA
6334,chrX_48782659_48783289,GATA1,chrX,48782659,48783289,chrX,48783286,48783289,3.308078,GATA1|chrX:48783290-48783309:-,TAGGGAAATCTGTCCTCACA
6335,chrX_48782659_48783289,GATA1,chrX,48782659,48783289,chrX,48783287,48783290,4.320396,GATA1|chrX:48783291-48783310:-,ATAGGGAAATCTGTCCTCAC
6336,chrX_48782659_48783289,GATA1,chrX,48782659,48783289,chrX,48783304,48783307,8.281522,GATA1|chrX:48783308-48783327:-,GAAGCAGCGGTCGCAACATA


Unnamed: 0,chr1,start1,end1,avg
0,chr11,33880954,33880957,0.467571
1,chr11,33880968,33880971,1.592793
2,chr11,33881046,33881049,0.794744
3,chr11,33881047,33881050,2.463301
4,chr11,33881050,33881053,1.454777
...,...,...,...,...
6333,chrX,48783286,48783289,3.308078
6334,chrX,48783287,48783290,4.320396
6335,chrX,48783293,48783296,3.458756
6336,chrX,48783304,48783307,8.281522


0

In [62]:
# Compute matrix for just the DHS and H3K27ac
subprocess.call(['computeMatrix', 'reference-point', '-R', 'dhs_h3k27ac_elements_dhs_peaks.bed', '-S', '/users/davidy/misc_resources/chip/results/macs2/dnase.pval.signal.bw', '/users/davidy/misc_resources/chip/results/macs2/ENCFF469JMR_H3K27ac_hg38_K562_pval.bigWig', '-o', 'matrix_dhs_h3k27ac_pval.tab.gz', '--referencePoint', 'center', '-b', '1000', '-a', '1000', '--averageTypeBins', 'mean', '--skipZeros'])

# Compute matrix for sgRNAs
outname='matrix_sgrnas_krabwindow' + str(krab_window) + '.tab.gz'
subprocess.call(['computeMatrix', 'reference-point', '-R', 'dhs_h3k27ac_elements_dhs_peaks.bed', '-S', 'all_sig_enhancer_guides_best_screen.bw', '-o', outname, '--referencePoint', 'center', '-b', '1000', '-a', '1000'])


# plotProfiles
subprocess.call(['plotProfile', '-m', 'matrix_dhs_h3k27ac_pval.tab.gz', '-out', 'DHSandH3K27acEnhancer_bw_pval.pdf', '--perGroup', '--plotHeight', '6', '--plotWidth', '10'])
subprocess.call(['plotProfile', '-m', outname, '-out', outname.split('.')[0]+'.pdf', '--perGroup', '--plotHeight', '6', '--plotWidth', '10'])


0

In [11]:
##### Sequence feature gRNA effect analysis
### Poly-TTTT test
display(distance_effect_df)
triplepolyt_df = distance_effect_df.loc[(distance_effect_df['sequence'].str.contains('TTT')) & (~distance_effect_df['sequence'].str.contains('TTTT'))][['sequence', 'relative_mean_effect']]
loose_triplepolyt_df = distance_effect_df.loc[distance_effect_df['sequence'].str.contains('TTT')][['sequence', 'relative_mean_effect']]
display(triplepolyt_df)
polyt_df = distance_effect_df.loc[distance_effect_df['sequence'].str.contains('TTTT')][['sequence', 'relative_mean_effect']]
nonpolyt_df = distance_effect_df.loc[~distance_effect_df['sequence'].str.contains('TTTT')][['sequence', 'relative_mean_effect']]
notripleorquad_polyt_df = distance_effect_df.loc[~distance_effect_df['sequence'].str.contains('TTT')][['sequence', 'relative_mean_effect']]

print(len(polyt_df), 'polyTTTT')
print(len(nonpolyt_df), 'non-polyTTTT')

# Welch t-test of difference
tstat, pval = stats.ttest_ind(polyt_df['relative_mean_effect'], 
                              nonpolyt_df['relative_mean_effect'],
                              equal_var=False)
print('Welch t-test difference gRNA relative mean effects for gRNA with TTTT vs without. t-statistic:', tstat, 'p-value: ',pval)

tstat, pval = stats.ttest_ind(polyt_df['relative_mean_effect'], 
                              triplepolyt_df['relative_mean_effect'],
                              equal_var=False)
print('Welch t-test difference gRNA relative mean effects for gRNA with TTTT vs TTT. t-statistic:', tstat, 'p-value: ',pval)

tstat, pval = stats.ttest_ind(loose_triplepolyt_df['relative_mean_effect'], 
                              notripleorquad_polyt_df['relative_mean_effect'],
                              equal_var=False)
print('Welch t-test difference gRNA relative mean effects for gRNA with TTT vs without TTT+. t-statistic:', tstat, 'p-value: ',pval)

# Plot boxplot of effects for polyTTTT vs no polyTTTT
polyt_df['gRNA_type'] = 'PolyTTTT'
nonpolyt_df['gRNA_type'] = 'nonPolyTTTT'
concat_poly_df = pd.concat([polyt_df, nonpolyt_df])
display(concat_poly_df)

g14 = sns.boxplot(data=concat_poly_df, x='gRNA_type',y='relative_mean_effect')
outplot = 'PolyTTTT_gRNA_RelativeMeanEffect_Boxplots.pdf'
plt.ylim(-1,7.5)
plt.savefig(outplot)
plt.close('all')

# Plot boxplot of effects for polyTTT+ vs no polyTTT+
loose_triplepolyt_df['gRNA_type'] = 'PolyTTT+'
notripleorquad_polyt_df['gRNA_type'] = 'nonPolyTTT+'
concat_triplepolyt_df = pd.concat([loose_triplepolyt_df, notripleorquad_polyt_df])
display(concat_triplepolyt_df)

g15 = sns.boxplot(data=concat_triplepolyt_df, x='gRNA_type',y='relative_mean_effect')
outplot = 'PolyTTT+_gRNA_RelativeMeanEffect_Boxplots.pdf'
plt.ylim(-1,7.5)
plt.savefig(outplot)
plt.close('all')


Unnamed: 0,dhs_peak,sequence,signal_distance,abs_signal_distance,epifeature_type,relative_top10_mean_effect,relative_mean_effect
0,chr8_127794273_127795483,GGGGGGAGGCGCGCGCGGCC,-33,33,dhs_peak,1.306373,2.910400
1,chr8_127794273_127795483,GGGCGTTTGGGGAGGGTGAG,436,436,dhs_peak,1.040549,2.318183
2,chr8_127794273_127795483,GCCGTGTCTCCACAGGTCAC,215,215,dhs_peak,1.026346,2.286542
3,chr8_127794273_127795483,TGAGTAGTCGGACGGAGGAA,338,338,dhs_peak,1.016541,2.264698
4,chr8_127794273_127795483,CTCGCCACCAGTCTTGAGGC,394,394,dhs_peak,0.983773,2.191695
...,...,...,...,...,...,...,...
4130,chr6_109308067_109308452,ATATGCAGAGGGAAAATACC,911,911,slopDHS_peak,0.131830,0.244827
4131,chr6_109308067_109308452,CCTTACAGCAGATCTGTACA,987,987,slopDHS_peak,0.075213,0.139681
4132,chr6_109308067_109308452,TATGAATTTTCTAGCAGTTG,1109,1109,slopDHS_peak,0.052855,0.098159
4133,chr6_109308067_109308452,GTCTCTTAGTTGTGGAACTA,865,865,slopDHS_peak,0.018127,0.033664


Unnamed: 0,sequence,relative_mean_effect
1,GGGCGTTTGGGGAGGGTGAG,2.318183
9,GCGAGTTTGGGCGTTTGGGG,1.976602
10,ATGCCAGTGTTTGTGGTTCT,1.938657
20,AATGACCATGATGAGGGTTT,1.647622
29,CAGACCTCTAGTTTCGCCAG,1.359221
...,...,...
4097,CAATATTTACTTAAATAATC,1.907982
4108,AAAAGAATCAAACTATTTAA,1.217693
4111,ATATTTACTTAAATAATCAG,1.097846
4125,AATATTTACTTAAATAATCA,0.586932


195 polyTTTT
3940 non-polyTTTT
Welch t-test difference gRNA relative mean effects for gRNA with TTTT vs without. t-statistic: -3.8296122913792514 p-value:  0.00016858773855817904
Welch t-test difference gRNA relative mean effects for gRNA with TTTT vs TTT. t-statistic: -1.5959374650168416 p-value:  0.11166202878524924
Welch t-test difference gRNA relative mean effects for gRNA with TTT vs without TTT+. t-statistic: -6.340316597164033 p-value:  2.934703286691861e-10


Unnamed: 0,sequence,relative_mean_effect,gRNA_type
92,ACACGGAGTTACTGAGTTTT,0.311929,PolyTTTT
108,AATTTTGTCCCCTGCGTTTC,0.052234,PolyTTTT
150,TGGCTGGAGCTTTTCTGCAC,1.240630,PolyTTTT
204,GGTTGTGTTGCTGTTTTGCC,0.134968,PolyTTTT
277,AGAGATCAGGCTTTTGAGGA,1.029287,PolyTTTT
...,...,...,...
4128,TTAAGTTAGAAATGTTACTC,0.350060,nonPolyTTTT
4130,ATATGCAGAGGGAAAATACC,0.244827,nonPolyTTTT
4131,CCTTACAGCAGATCTGTACA,0.139681,nonPolyTTTT
4133,GTCTCTTAGTTGTGGAACTA,0.033664,nonPolyTTTT


Unnamed: 0,sequence,relative_mean_effect,gRNA_type
1,GGGCGTTTGGGGAGGGTGAG,2.318183,PolyTTT+
9,GCGAGTTTGGGCGTTTGGGG,1.976602,PolyTTT+
10,ATGCCAGTGTTTGTGGTTCT,1.938657,PolyTTT+
20,AATGACCATGATGAGGGTTT,1.647622,PolyTTT+
29,CAGACCTCTAGTTTCGCCAG,1.359221,PolyTTT+
...,...,...,...
4128,TTAAGTTAGAAATGTTACTC,0.350060,nonPolyTTT+
4130,ATATGCAGAGGGAAAATACC,0.244827,nonPolyTTT+
4131,CCTTACAGCAGATCTGTACA,0.139681,nonPolyTTT+
4133,GTCTCTTAGTTGTGGAACTA,0.033664,nonPolyTTT+
