In [1]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
import math
import re
import subprocess
from os import listdir
from os.path import isfile, join

In [2]:
import matplotlib as mpl 
mpl.use('Agg')
mpl.rcParams['pdf.fonttype'] = 42
import seaborn as sns
import matplotlib.pyplot as plt
plt.close("all")
mpl.rcParams['font.sans-serif'] = "Arial"
mpl.rcParams["font.family"] = 'sans-serif'
mpl.rcParams['font.size'] = 8
sns.set(style="whitegrid")

In [3]:
cd /Users/davidy/jamboree20crispr/ontarget_analysis/230707_WTC11_Analysis

/Users/davidy/jamboree20crispr/ontarget_analysis/230707_WTC11_Analysis


In [81]:
### I/O
all_files = [f for f in listdir('drive-download-20230708T035752Z-001/crispri/guideQuant') if isfile(join('drive-download-20230708T035752Z-001/crispri/guideQuant', f))]
gq_files = [f for f in all_files if '_EQ_' not in f and '_files' not in f]

### Indicate tags for control and treatment files
control_tag = 'Unt'
treatment_tag = '240mg'


##### Process guideQuant files
gq_directory = 'drive-download-20230708T035752Z-001/crispri/guideQuant'

pla_gq_file = [f for f in gq_files if 'Pla' in f]
control_gq_files = [f for f in gq_files if control_tag in f]
treatment_gq_files = [f for f in gq_files if treatment_tag in f]

all_avg_z_scores = {}
targeting_avg_z_scores = {}

combo_rep_df = []

### Make a combined rep df with both tech reps for both conditions
## For control
control_rep_df = pd.read_csv(gq_directory+'/'+control_gq_files[0], sep = '\t', usecols=['name','SeqCounts']).rename(columns={'SeqCounts':'SeqCounts_Ctrl1'}).merge(pd.read_csv(gq_directory+'/'+control_gq_files[1], sep = '\t', usecols=['name','SeqCounts']).rename(columns={'SeqCounts':'SeqCounts_Ctrl2'}), on='name')
# Create an average count column
control_rep_df['avg_SeqCounts_Ctrl'] = control_rep_df[['SeqCounts_Ctrl1','SeqCounts_Ctrl2']].mean(axis=1)
## For treatment
treatment_rep_df = pd.read_csv(gq_directory+'/'+treatment_gq_files[0], sep = '\t', usecols=['name','SeqCounts']).rename(columns={'SeqCounts':'SeqCounts_Trt1'}).merge(pd.read_csv(gq_directory+'/'+treatment_gq_files[1], sep = '\t', usecols=['name','SeqCounts']).rename(columns={'SeqCounts':'SeqCounts_Trt2'}), on='name')
# Create an average count column
treatment_rep_df['avg_SeqCounts_Trt'] = treatment_rep_df[['SeqCounts_Trt1','SeqCounts_Trt2']].mean(axis=1)

## Merge 
gq_rep_df_merged = control_rep_df.merge(treatment_rep_df, on='name')

## Apply a pseudocount to any element under sequencing count coverage, and get normalization info
SeqCount_pseudocount = 10
condition_normalization_dict = {} # Key = Colname; Value = {'mean':X, 'total_count':Y}
for col in [col for col in gq_rep_df_merged.columns.values if col != 'name']:
    # Apply pseudocount
    gq_rep_df_merged[str(col)].clip(lower=SeqCount_pseudocount,inplace=True)
    # Get mean and total counts
    condition_normalization_dict[col] = {'mean':gq_rep_df_merged[str(col)].mean(),
                                        'total_count':gq_rep_df_merged[str(col)].sum()}


### Calculate log2FC according to eq 3.1
log2FC_pseudocount = 1
gq_rep_df_merged['log2fc_avg'] = np.log2((log2FC_pseudocount + gq_rep_df_merged['avg_SeqCounts_Trt']/float(condition_normalization_dict['avg_SeqCounts_Trt']['mean'])) / (log2FC_pseudocount + gq_rep_df_merged['avg_SeqCounts_Ctrl']/float(condition_normalization_dict['avg_SeqCounts_Ctrl']['mean'])))
gq_rep_df_merged['log2fc_rep1'] = np.log2((log2FC_pseudocount + gq_rep_df_merged['SeqCounts_Trt1']/float(condition_normalization_dict['SeqCounts_Trt1']['mean'])) / (log2FC_pseudocount + gq_rep_df_merged['SeqCounts_Ctrl1']/float(condition_normalization_dict['SeqCounts_Ctrl1']['mean'])))
gq_rep_df_merged['log2fc_rep2'] = np.log2((log2FC_pseudocount + gq_rep_df_merged['SeqCounts_Trt2']/float(condition_normalization_dict['SeqCounts_Trt2']['mean'])) / (log2FC_pseudocount + gq_rep_df_merged['SeqCounts_Ctrl2']/float(condition_normalization_dict['SeqCounts_Ctrl2']['mean'])))

### z-transform log2FC
## Get mean and stdev of all avg_log2FCs for all guides
## Note: DO NOT CALCULATE AVG Z SCORE FROM AN AVG LOG2FC
allgrna_log2fc_avg_mean = np.mean(gq_rep_df_merged['log2fc_avg'])
allgrna_log2fc_avg_sd = np.std(gq_rep_df_merged['log2fc_avg'], ddof = 1)
allgrna_log2fc_rep1_mean = np.mean(gq_rep_df_merged['log2fc_rep1'])
allgrna_log2fc_rep1_sd = np.std(gq_rep_df_merged['log2fc_rep1'], ddof = 1)
allgrna_log2fc_rep2_mean = np.mean(gq_rep_df_merged['log2fc_rep2'])
allgrna_log2fc_rep2_sd = np.std(gq_rep_df_merged['log2fc_rep2'], ddof = 1)

# z-transform
gq_rep_df_merged['z_log2fc_avg'] = (gq_rep_df_merged['log2fc_avg'] - allgrna_log2fc_avg_mean) / allgrna_log2fc_avg_sd
gq_rep_df_merged['z_log2fc_rep1'] = (gq_rep_df_merged['log2fc_rep1'] - allgrna_log2fc_rep1_mean) / allgrna_log2fc_rep1_sd
gq_rep_df_merged['z_log2fc_rep2'] = (gq_rep_df_merged['log2fc_rep2'] - allgrna_log2fc_rep2_mean) / allgrna_log2fc_rep2_sd

# Add average of replicates z-l2fc
gq_rep_df_merged['z_log2fc_repavg'] = (gq_rep_df_merged['z_log2fc_rep1']+gq_rep_df_merged['z_log2fc_rep2'])/2

### Add in meta-data
combo_rep_df = pd.read_csv(gq_directory+'/'+pla_gq_file[0], sep = '\t').drop(columns=['SeqCounts']).merge(gq_rep_df_merged, on='name')

# Add single base chromosomal coordinate based on strand
combo_rep_df['onebase_start'] = np.where(combo_rep_df['strandPerturbationTarget']== '+', combo_rep_df['startPerturbationTarget']-1, combo_rep_df['endPerturbationTarget'])
combo_rep_df['onebase_end'] = np.where(combo_rep_df['strandPerturbationTarget']== '+', combo_rep_df['startPerturbationTarget'], combo_rep_df['endPerturbationTarget']+1)

    
display(combo_rep_df)

##### Write to output
# Write full df
combo_rep_df.to_csv('WTC11_CRISPRi_{0}_{1}_ProcessedGuideQuant.tsv'.format(treatment_tag,control_tag), sep = '\t', index=False)
# Write simple bed (with PAM coordinates) for intersection
combo_rep_df.to_csv('WTC11_CRISPRi_{0}_{1}_Processed_Simple_PamCC.bed'.format(treatment_tag,control_tag), sep = '\t', index=False, header=False, columns=['chrPerturbationTarget','startPerturbationTarget','endPerturbationTarget','name','z_log2fc_repavg'])
# Write simple bed (with onebase coordinates) for intersection
combo_rep_df.to_csv('WTC11_CRISPRi_{0}_{1}_Processed_Simple_SingleBaseCC.bed'.format(treatment_tag,control_tag), sep = '\t', index=False, header=False, columns=['chrPerturbationTarget','onebase_start','onebase_end','name','z_log2fc_repavg'])

# Write simple qBed for plotting
combo_rep_df.to_csv('WTC11_CRISPRi_{0}_{1}_Processed_Simple_SingleBaseCC.qBed'.format(treatment_tag,control_tag), sep = '\t', index=False, header=False, columns=['chrPerturbationTarget','onebase_start','onebase_end','z_log2fc_repavg'])


##### Plotting

f, ax = plt.subplots(figsize=(6, 6))
ax.scatter(x=combo_rep_df['SeqCounts_Trt1'], y=combo_rep_df['SeqCounts_Trt2'])
ax.set_xlabel('Counts Trt1 {}'.format(treatment_tag))
ax.set_ylabel('Counts Trt2 {}'.format(treatment_tag))
plt.savefig('WTC11_CRISPRi_{}_GQ_replicate_counts.pdf'.format(treatment_tag))
plt.close("all")

f, ax = plt.subplots(figsize=(6, 6))
ax.scatter(x=combo_rep_df['SeqCounts_Ctrl1'], y=combo_rep_df['SeqCounts_Ctrl2'])
ax.set_xlabel('Counts Ctrl1 {}'.format(control_tag))
ax.set_ylabel('Counts Ctrl2 {}'.format(control_tag))
plt.savefig('WTC11_CRISPRi_{}_GQ_replicate_counts.pdf'.format(control_tag))
plt.close("all")

## Plot correlation of replicate average z scores
## Note: The following code works 
f, ax = plt.subplots(figsize=(6, 6))
ax.scatter(x=combo_rep_df['z_log2fc_rep1'], y=combo_rep_df['z_log2fc_rep2'])
ax.set_xlabel('Z-score Rep1 {} vs {}'.format(treatment_tag,control_tag))
ax.set_ylabel('Z-score Rep2 {} vs {}'.format(treatment_tag,control_tag))
plt.savefig('WTC11_CRISPRi_ZScores_Replicates_eq31_{0}_{1}_.pdf'.format(treatment_tag,control_tag))
plt.close("all")


"""

    
    h1 = sns.histplot(data=gq_rep_df_merged, x='SeqCounts_Pla')
    plt.savefig('WTC11_CRISPRi_Plasmid_SeqCounts.pdf')
    plt.close('all')
    
    seqcounts = 'SeqCounts_x'
    h2 = sns.histplot(data=gq_rep_df_merged, x=seqcounts)
    ofile = 'WTC11_CRISPRi_' + str(rep) + '_' + seqcounts + '.pdf'
    plt.xscale('log')
    plt.savefig(ofile)
    plt.close('all')
    
    seqcounts = 'SeqCounts_y'
    h2 = sns.histplot(data=gq_rep_df_merged, x=seqcounts)
    ofile = 'WTC11_CRISPRi_' + str(rep) + '_' + seqcounts + '.pdf'
    plt.xscale('log')
    plt.savefig(ofile)
    plt.close('all')
    

    # Add list of average z-scores to dictionary. The following section is sloppy, it's just for graphing
    # I can do this crude list-based approach since the rep dfs all share the same index
    all_avg_z_scores[rep] = gq_rep_df_merged['avg_z_log2fc'].tolist()
    # Exclude NT guides
    typed_df = gq_rep_df_merged.merge(pd.read_csv(gq_directory+'/'+pla_gq_file[0], sep = '\t', usecols = ['name','guideType']))
    #display(typed_df)
    targeting_df = typed_df.loc[typed_df['guideType']=='targeting']
    targeting_avg_z_scores[rep] = typed_df.loc[typed_df['guideType']=='targeting']['avg_z_log2fc'].tolist()
               


    ## Plot correlation of replicate average z scores for only targeting guides
    ## Note: The following code works 
    f, ax = plt.subplots(figsize=(6, 6))
    ax.scatter(x=targeting_avg_z_scores['Rep1'], y=targeting_avg_z_scores['Rep2'])
    #plt.xlim(0,1)
    #plt.ylim(0,1)
    #ax.plot([0, 1], [0, 1], ls="--", c=".3")
    ax.set_xlabel('Avg Z-score X')
    ax.set_ylabel('Avg z-score Y')
    plt.savefig('WTC11_CRISPRi_AvgZScores_TargetingGuides_Replicates_eq31.pdf'.format(rep))
    plt.close("all")



"""


Unnamed: 0,chrPerturbationTarget,startPerturbationTarget,endPerturbationTarget,name,strandPerturbationTarget,PerturbationTargetID,chrTSS,startTSS,endTSS,strandGene,...,avg_SeqCounts_Trt,log2fc_avg,log2fc_rep1,log2fc_rep2,z_log2fc_avg,z_log2fc_rep1,z_log2fc_rep2,z_log2fc_repavg,onebase_start,onebase_end
0,chr2,46464589,46464591,NA|chr2:46464589-46464591:+,+,chr2:46464589-46464591:+,,,,,...,978.0,0.280608,0.150232,0.374436,0.981268,0.710833,1.117771,0.914302,46464588,46464589
1,chr2,46464591,46464593,NA|chr2:46464591-46464593:+,+,chr2:46464591-46464593:+,,,,,...,121.0,-0.643184,-0.638981,-0.646961,-0.422204,-0.475305,-0.306564,-0.390935,46464590,46464591
2,chr2,46464569,46464571,NA|chr2:46464569-46464571:-,-,chr2:46464569-46464571:-,,,,,...,1739.0,1.006482,0.314522,1.562268,2.084051,0.957749,2.774197,1.865973,46464571,46464572
3,chr2,46464607,46464609,NA|chr2:46464607-46464609:-,-,chr2:46464607-46464609:-,,,,,...,29.0,-0.657438,-0.496606,-0.796025,-0.443860,-0.261324,-0.514433,-0.387879,46464609,46464610
4,chr2,46464648,46464650,NA|chr2:46464648-46464650:+,+,chr2:46464648-46464650:+,,,,,...,207.5,-0.907584,-0.628840,-1.201189,-0.823893,-0.460064,-1.079434,-0.769749,46464647,46464648
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12697,chr7,6947558,6947560,NA|chr7:6947558-6947560:+,+,chr7:6947558-6947560:+,,,,,...,88.0,-0.672463,-0.670587,-0.670541,-0.466685,-0.522807,-0.339446,-0.431126,6947557,6947558
12698,chr7,6964585,6964587,NA|chr7:6964585-6964587:-,-,chr7:6964585-6964587:-,,,,,...,482.5,-0.291004,-0.300932,-0.280858,0.112847,0.032761,0.203966,0.118363,6964587,6964588
12699,chr7,6978813,6978815,NA|chr7:6978813-6978815:+,+,chr7:6978813-6978815:+,,,,,...,350.5,-0.409383,-0.572145,-0.253189,-0.067001,-0.374855,0.242549,-0.066153,6978812,6978813
12700,chr7,6993727,6993729,NA|chr7:6993727-6993729:+,+,chr7:6993727-6993729:+,,,,,...,44.5,-0.629059,-0.788572,-0.460804,-0.400744,-0.700130,-0.046969,-0.373550,6993726,6993727


'\n\n    \n    h1 = sns.histplot(data=gq_rep_df_merged, x=\'SeqCounts_Pla\')\n    plt.savefig(\'WTC11_CRISPRi_Plasmid_SeqCounts.pdf\')\n    plt.close(\'all\')\n    \n    seqcounts = \'SeqCounts_x\'\n    h2 = sns.histplot(data=gq_rep_df_merged, x=seqcounts)\n    ofile = \'WTC11_CRISPRi_\' + str(rep) + \'_\' + seqcounts + \'.pdf\'\n    plt.xscale(\'log\')\n    plt.savefig(ofile)\n    plt.close(\'all\')\n    \n    seqcounts = \'SeqCounts_y\'\n    h2 = sns.histplot(data=gq_rep_df_merged, x=seqcounts)\n    ofile = \'WTC11_CRISPRi_\' + str(rep) + \'_\' + seqcounts + \'.pdf\'\n    plt.xscale(\'log\')\n    plt.savefig(ofile)\n    plt.close(\'all\')\n    \n\n    # Add list of average z-scores to dictionary. The following section is sloppy, it\'s just for graphing\n    # I can do this crude list-based approach since the rep dfs all share the same index\n    all_avg_z_scores[rep] = gq_rep_df_merged[\'avg_z_log2fc\'].tolist()\n    # Exclude NT guides\n    typed_df = gq_rep_df_merged.merge(pd.read_

In [84]:
##### Fig 2A. Avg sgRNA effects at element H3K27ac and ATAC (as a proxy of DHS) peaks
"""
Get significant elements. 
61 sig elements in 240mg, 32 sig elements in 160mg

Note -- the order of the intersections below is intentional, and matches what was previously done
Note -- There are some differences in processing algorithms, since gRNAs at the same DHS arent being measured orthogonally multiple times (e.g. FADS1, 2, and 3)

First intersect the significant elements with 1 kb expanded TSSs (using hg38_mart_protein_coding_tss1kb.bed), excluding those that intersect
Second, intersect significant elements with DHS/ATAC
- For now, can ignore multi-summit peaks... Just make sure that I remove any duplicate peaks (these are multi-summit)
Third, intersect the non-TSS sig cCRE DHS peaks with H3K27ac. No slop/range expansion! Remove duplicates.
Fourth, if an element has multiple DHS peaks that intersect H3K27ac, nominate a single DHS peak. Only one DHS peak per sig element!
- Option 1: Use the DHS peak with the highest average (per base) signal.
- Option 2: Empirically select the DHS peak closest to the top 3 sgRNAs
Fifth, get the average effects of all sgRNAs intersecting a given epifeature peak.
- Effect is the average z-transformed log2FC from both screen replicates
- sgRNA intersections are based on PAM coordinates, with no further slop/range expansion
- If a DHS intersects two H3K27ac peaks, treat each H3K27ac peak independently
"""
##### (0)
### Process EQ files, get significant common elements, and write to bed
eq_directory = 'drive-download-20230708T035752Z-001/crispri/elementQuant'
elements_rep1 = pd.read_csv(eq_directory+'/160mg_ENCFF646RAN.tsv',sep='\t',usecols=['chrom','chromStart','chromEnd','PerturbationTargetID'])

# Write table for intersection purposes
common_sig_elements_output = 'WTC11_CRISPRi_Sig_Elements_{}.bed'.format(treatment_tag)
elements_rep1.to_csv(common_sig_elements_output, sep = '\t', index=False, header=False)


In [85]:
##### (1) Intersect with 1 kb expanded TSSs, and exclude any overlapping elements
# Note: Manually copied over hg38_mart_protein_coding_tss1kb.bed into 230707_WTC11_Analysis
illegal_tss_ofile = 'illegal_tss_common_sig_elements_intersect.bed' 
fopen = open(illegal_tss_ofile, 'w')
subprocess.call(['bedtools','intersect','-a',common_sig_elements_output,'-b','hg38_mart_protein_coding_tss1kb.bed','-wa','-wb'],stdout=fopen)
fopen.close()

### Get common sig non-TSS elements 
# Process illegal tss intersecting elements
illegal_tss_common_elements = list(set(pd.read_csv(illegal_tss_ofile, sep = '\t', header = None, names = ['chr1','start1','end1','PerturbTargetID','chr2','start2','end2','symbol'])['PerturbTargetID'].tolist()))
print('There are {} common significant elements that intersect +/- 1kb expanded TSS ranges'.format(len(illegal_tss_common_elements)), illegal_tss_common_elements)
# Exclude the illegal elements from the common sig elements
non_promoter_common_sig_elements = common_sig_elements.loc[~common_sig_elements['PerturbationTargetID'].isin(illegal_tss_common_elements)]
# Write this dataframe to a bed for intersection purposes
tssfiltered_common_sig_elements_output = 'WTC11_CRISPRi_NonTSS_Common_Sig_Elements.bed'
non_promoter_common_sig_elements.to_csv(tssfiltered_common_sig_elements_output, sep = '\t', index=False, header=False)


There are 40 common significant elements that intersect +/- 1kb expanded TSS ranges ['chr3:38024347-38025405:.', 'chr2:47034146-47034911:.', 'chr2:47368918-47370143:.', 'chr20:4221298-4222072:.', 'chr2:46616497-46617342:.', 'chr20:4686246-4687437:.', 'chr20:5911126-5911906:.', 'chrX:135031924-135033120:.', 'chr2:47175627-47177035:.', 'chr3:37860400-37861401:.', 'chrX:135342909-135344870:.', 'chr2:46698689-46699696:.', 'chr7:5513400-5514161:.', 'chr2:46541698-46544265:.', 'chr20:4148481-4149850:.', 'chr20:6005736-6006471:.', 'chr7:6058597-6059412:.', 'chr7:6374225-6375026:.', 'chr20:4249093-4249842:.', 'chr3:37175800-37176855:.', 'chrX:134373043-134373594:.', 'chr2:47570494-47571981:.', 'chr7:5562057-5563839:.', 'chr2:46940390-46941878:.', 'chrX:134796192-134797382:.', 'chr2:48314106-48315728:.', 'chr20:4172153-4172791:.', 'chrX:134914362-134915747:.', 'chr2:48440406-48441214:.', 'chr7:6104367-6105482:.', 'chrX:135098651-135099180:.', 'chr20:4822643-4824017:.', 'chr3:37242911-37243712:.

In [86]:
##### (2) Intersect with DHS
# Note, the DHS file lacks summit positions!!!
WTC11_DHS_file = 'WTC11_DNase_hg38_ENCFF854DSG.bed.gz' # Note, this does not come from the same experiment/line
dhs_intersect_ofile = 'NonTSS_Common_Sig_Elements_DHS_Intersect.bed'
fopen = open(dhs_intersect_ofile, 'w')
subprocess.call(['bedtools','intersect','-a',tssfiltered_common_sig_elements_output,'-b',WTC11_DHS_file,'-wa','-wb'],stdout=fopen)
fopen.close()

# Process and get DHS peaks
dhs_intersect_df = pd.read_csv(dhs_intersect_ofile, sep = '\t', header = None, names = ['chr1','start1','end1','PerturbTargetID','chr2','start2','end2','null1','null2','null3','signal','null4','null5','null6'], usecols=['chr1','start1','end1','PerturbTargetID','chr2','start2','end2','signal'])
dhs_intersect_df['dhs_cc'] = dhs_intersect_df['chr2'].astype(str) + '_' + dhs_intersect_df['start2'].astype(str) + '_' + dhs_intersect_df['end2'].astype(str)

# Remove duplicates
# Note, there are no duplicates
dhs_intersect_df.drop_duplicates(subset = ['PerturbTargetID','dhs_cc'], inplace=True)
display(dhs_intersect_df)

# Record element_dhs dict
element_dhs_dict = dhs_intersect_df.set_index('PerturbTargetID').to_dict()['dhs_cc'] # Key = PerturbTargetID; Value = dhs_cc

# Write DHS to output, keep the PerturbTargetID
dhs_intersect_ofile2 = 'NonTSS_Common_Sig_Elements_DHS_Intersect_cleaned.bed'
dhs_intersect_df.to_csv(dhs_intersect_ofile2, sep = '\t', columns=['chr2','start2','end2','dhs_cc','PerturbTargetID'], header=False, index=False)




Unnamed: 0,chr1,start1,end1,PerturbTargetID,chr2,start2,end2,signal,dhs_cc
0,chr2,47191521,47193363,chr2:47191521-47193363:.,chr2,47191548,47191727,1.01758,chr2_47191548_47191727
1,chr2,47191521,47193363,chr2:47191521-47193363:.,chr2,47192620,47192709,0.866826,chr2_47192620_47192709
2,chr20,4760486,4761348,chr20:4760486-4761348:.,chr20,4760680,4760840,2.90827,chr20_4760680_4760840
3,chr20,4760486,4761348,chr20:4760486-4761348:.,chr20,4760960,4761280,0.829138,chr20_4760960_4761280
4,chr7,5486761,5488068,chr7:5486761-5488068:.,chr7,5486960,5487180,4.02635,chr7_5486960_5487180
5,chr7,5486761,5488068,chr7:5486761-5488068:.,chr7,5487920,5488100,2.45601,chr7_5487920_5488100
6,chr7,5607061,5607714,chr7:5607061-5607714:.,chr7,5607560,5607760,2.13566,chr7_5607560_5607760
7,chr7,6525977,6526746,chr7:6525977-6526746:.,chr7,6526560,6526840,2.11682,chr7_6526560_6526840
8,chr7,5426774,5428241,chr7:5426774-5428241:.,chr7,5426800,5426960,2.8957,chr7_5426800_5426960
9,chr7,5426774,5428241,chr7:5426774-5428241:.,chr7,5427360,5427634,1.33165,chr7_5427360_5427634


In [87]:
##### (3) Intersect with H3K27ac

# Intersect
H3K27ac_file = 'WTC11_H3K27ac-MintChIPSeq_hg38_ENCFF655PNM.bed.gz'
h3k27ac_intersect_ofile = 'NonTSS_Common_Sig_Element_DHS_H3K27ac_intersect.bed'
fopen = open(h3k27ac_intersect_ofile, 'w')
subprocess.call(['bedtools','intersect','-a',dhs_intersect_ofile2,'-b',H3K27ac_file, '-wa','-wb'],stdout=fopen)
fopen.close()

# Process intersect
h3k27ac_intersect_df = pd.read_csv(h3k27ac_intersect_ofile, sep='\t', header=None, names=['dhs_chr','dhs_start','dhs_end','dhs_cc','PerturbTargetID','h3k27ac_chr','h3k27ac_start','h3k27ac_end','h3k27ac_peak_id','h3k27ac_total_signal','null1','null2','null3','h3k27ac_avg_signal','h3k27ac_summit'])
h3k27ac_intersect_df['h3k27ac_cc'] = h3k27ac_intersect_df['h3k27ac_chr'].astype(str) + '_' + h3k27ac_intersect_df['h3k27ac_start'].astype(str) + '_' + h3k27ac_intersect_df['h3k27ac_end'].astype(str)

display(h3k27ac_intersect_df)

# Note, the original intersection only provides 8 intersections

# Write to output, keep the PerturbTargetID
h3k27ac_intersect_ofile2 = 'NonTSS_Common_Sig_Element_DHS_H3K27ac_intersect_cleaned.bed'
h3k27ac_intersect_df.to_csv(h3k27ac_intersect_ofile2, sep = '\t', columns=['h3k27ac_chr','h3k27ac_start','h3k27ac_end','h3k27ac_cc','PerturbTargetID'], index = False, header = False)

# ***** MAY COME BACK HERE TO REPEAT THIS AFTER SLOPPING THE DHS PEAKS *****
# ***** MAY COME BACK HERE TO REPEAT THIS AFTER SLOPPING THE DHS PEAKS *****
# ***** MAY COME BACK HERE TO REPEAT THIS AFTER SLOPPING THE DHS PEAKS *****



Unnamed: 0,dhs_chr,dhs_start,dhs_end,dhs_cc,PerturbTargetID,h3k27ac_chr,h3k27ac_start,h3k27ac_end,h3k27ac_peak_id,h3k27ac_total_signal,null1,null2,null3,h3k27ac_avg_signal,h3k27ac_summit,h3k27ac_cc
0,chr2,47191548,47191727,chr2_47191548_47191727,chr2:47191521-47193363:.,chr2,47191431,47192115,Peak_46664,287,.,5.77803,28.79111,26.25397,588,chr2_47191431_47192115
1,chr2,47192620,47192709,chr2_47192620_47192709,chr2:47191521-47193363:.,chr2,47192356,47193228,Peak_23593,465,.,4.83537,46.57778,43.75097,407,chr2_47192356_47193228
2,chr7,5607560,5607760,chr7_5607560_5607760,chr7:5607061-5607714:.,chr7,5607342,5607850,Peak_208671,101,.,3.9815,10.14974,8.17458,103,chr7_5607342_5607850
3,chr7,5427360,5427634,chr7_5427360_5427634,chr7:5426774-5428241:.,chr7,5427367,5427851,Peak_12084,752,.,13.36647,75.26088,72.10416,157,chr7_5427367_5427851
4,chr7,6081460,6081680,chr7_6081460_6081680,chr7:6081033-6081797:.,chr7,6081662,6081841,Peak_161857,122,.,4.40809,12.24941,10.17617,93,chr7_6081662_6081841
5,chr7,5555880,5556080,chr7_5555880_5556080,chr7:5555056-5556791:.,chr7,5555668,5555887,Peak_110371,160,.,5.11907,16.01559,13.80025,124,chr7_5555668_5555887
6,chr7,5569460,5569820,chr7_5569460_5569820,chr7:5569296-5570610:.,chr7,5569258,5569635,Peak_15187,639,.,11.29301,63.96555,60.92708,269,chr7_5569258_5569635
7,chrX,134549540,134549700,chrX_134549540_134549700,chrX:134549336-134550882:.,chrX,134548604,134549585,Peak_17867,567,.,11.09132,56.76051,53.80297,583,chrX_134548604_134549585


In [29]:
##### (4) Handle elements with multiple unique DHS_CC and H3K27ac peaks
# Note, for a first pass, will proceed with ALL peaks, do not try to nominate one DHS/H3K27ac peak per element
"""
Fourth, if an element has multiple DHS peaks that intersect H3K27ac, nominate a single DHS peak. Only one DHS peak per sig element!
- Option 1: Use the DHS peak with the highest average (per base) signal.
- Option 2: Empirically select the DHS peak closest to the top 3 sgRNAs
- Option 3: ALTERNATIVELY, KEEP ALL UNIQUE DHS/H3K27ac PEAKS, AND TREAT THEM INDEPENDENTLY!
"""

'\nFourth, if an element has multiple DHS peaks that intersect H3K27ac, nominate a single DHS peak. Only one DHS peak per sig element!\n- Option 1: Use the DHS peak with the highest average (per base) signal.\n- Option 2: Empirically select the DHS peak closest to the top 3 sgRNAs\n- Option 3: ALTERNATIVELY, KEEP ALL UNIQUE DHS/H3K27ac PEAKS, AND TREAT THEM INDEPENDENTLY!\n'

In [88]:
##### (5) Get avg effects of all sgRNAs intersecting a given epifeature peak
"""
Fifth, get the average effects of all sgRNAs intersecting a given epifeature peak.
- Effect is the average z-transformed log2FC from both screen replicates
- sgRNA intersections are based on PAM coordinates, with no further slop/range expansion
- If a DHS intersects two H3K27ac peaks, treat each H3K27ac peak independently
"""
# Input files
dhs_element_file = dhs_intersect_ofile2
h3k27ac_element_file = h3k27ac_intersect_ofile2
grna_pam_file = 'WTC11_CRISPRi_{0}_{1}_Processed_Simple_PamCC.bed'.format(treatment_tag,control_tag)

## Intersect epifiles with grna file
# DHS - guide
dhs_grna_intersect_file = 'dhs_grna_intersect.bed'
fopen = open(dhs_grna_intersect_file, 'w')
subprocess.call(['bedtools','intersect','-a',dhs_element_file,'-b',grna_pam_file,'-wa','-wb'],stdout=fopen)

fopen.close()
# H3K27ac - guide
h3k27ac_grna_intersect_file = 'h3k27ac_grna_intersect.bed'
fopen = open(h3k27ac_grna_intersect_file, 'w')
subprocess.call(['bedtools','intersect','-a',h3k27ac_element_file,'-b',grna_pam_file,'-wa','-wb'],stdout=fopen)
fopen.close()

### Process intersections
for f in [dhs_grna_intersect_file, h3k27ac_grna_intersect_file]:
    # Read file to df
    epifeature_grna_intersect_df = pd.read_csv(f, sep = '\t', header=None, names = ['epi_chr','epi_start','epi_end','epi_cc','PerturbTargetID','chrPerturbationTarget','startPerturbationTarget','endPerturbationTarget','name','all_rep_z_log2fc'])
    display(epifeature_grna_intersect_df)
    # For each epi_cc, use groupby to get average all_rep_z_log2fc
    averaged_df = epifeature_grna_intersect_df.groupby('epi_cc')['all_rep_z_log2fc'].mean()
    
    display(averaged_df)




Unnamed: 0,epi_chr,epi_start,epi_end,epi_cc,PerturbTargetID,chrPerturbationTarget,startPerturbationTarget,endPerturbationTarget,name,all_rep_z_log2fc
0,chr2,47191548,47191727,chr2_47191548_47191727,chr2:47191521-47193363:.,chr2,47191563,47191565,NA|chr2:47191563-47191565:+,0.189141
1,chr2,47191548,47191727,chr2_47191548_47191727,chr2:47191521-47193363:.,chr2,47191565,47191567,NA|chr2:47191565-47191567:-,-0.374988
2,chr2,47191548,47191727,chr2_47191548_47191727,chr2:47191521-47193363:.,chr2,47191602,47191604,NA|chr2:47191602-47191604:+,-0.564378
3,chr2,47191548,47191727,chr2_47191548_47191727,chr2:47191521-47193363:.,chr2,47191620,47191622,NA|chr2:47191620-47191622:+,-0.197447
4,chr2,47191548,47191727,chr2_47191548_47191727,chr2:47191521-47193363:.,chr2,47191630,47191632,NA|chr2:47191630-47191632:+,1.647936
...,...,...,...,...,...,...,...,...,...,...
422,chrX,134550780,134551000,chrX_134550780_134551000,chrX:134549336-134550882:.,chrX,134550880,134550882,NA|chrX:134550880-134550882:+,-0.612369
423,chrX,134550780,134551000,chrX_134550780_134551000,chrX:134549336-134550882:.,chrX,134550858,134550860,NA|chrX:134550858-134550860:-,-0.380782
424,chrX,134550780,134551000,chrX_134550780_134551000,chrX:134549336-134550882:.,chrX,134550895,134550897,NA|chrX:134550895-134550897:+,-0.121992
425,chrX,134550780,134551000,chrX_134550780_134551000,chrX:134549336-134550882:.,chrX,134550898,134550900,NA|chrX:134550898-134550900:+,-0.344531


epi_cc
chr20_4760680_4760840       0.143323
chr20_4760960_4761280      -0.148852
chr2_47191548_47191727      0.279352
chr2_47192620_47192709     -0.191931
chr7_5071940_5072180       -0.021583
chr7_5426800_5426960       -0.023514
chr7_5427360_5427634       -0.045770
chr7_5427860_5428240       -0.060878
chr7_5486960_5487180       -0.221552
chr7_5487920_5488100        0.007402
chr7_5554920_5555060       -1.038194
chr7_5555120_5555340       -0.249265
chr7_5555880_5556080        0.256044
chr7_5556580_5556660       -0.111073
chr7_5569460_5569820       -0.176050
chr7_5570160_5570340        0.014211
chr7_5607560_5607760        0.012159
chr7_6081460_6081680        0.085489
chr7_6526560_6526840       -0.038710
chr7_6663840_6664040       -0.303666
chrX_134549540_134549700   -0.171844
chrX_134550780_134551000   -0.367152
Name: all_rep_z_log2fc, dtype: float64

Unnamed: 0,epi_chr,epi_start,epi_end,epi_cc,PerturbTargetID,chrPerturbationTarget,startPerturbationTarget,endPerturbationTarget,name,all_rep_z_log2fc
0,chr2,47191431,47192115,chr2_47191431_47192115,chr2:47191521-47193363:.,chr2,47191530,47191532,NA|chr2:47191530-47191532:+,-0.870077
1,chr2,47191431,47192115,chr2_47191431_47192115,chr2:47191521-47193363:.,chr2,47191521,47191523,NA|chr2:47191521-47191523:-,-0.381478
2,chr2,47191431,47192115,chr2_47191431_47192115,chr2:47191521-47193363:.,chr2,47191522,47191524,NA|chr2:47191522-47191524:-,-0.484247
3,chr2,47191431,47192115,chr2_47191431_47192115,chr2:47191521-47193363:.,chr2,47191563,47191565,NA|chr2:47191563-47191565:+,0.189141
4,chr2,47191431,47192115,chr2_47191431_47192115,chr2:47191521-47193363:.,chr2,47191565,47191567,NA|chr2:47191565-47191567:-,-0.374988
...,...,...,...,...,...,...,...,...,...,...
227,chrX,134548604,134549585,chrX_134548604_134549585,chrX:134549336-134550882:.,chrX,134549536,134549538,NA|chrX:134549536-134549538:+,0.726574
228,chrX,134548604,134549585,chrX_134548604_134549585,chrX:134549336-134550882:.,chrX,134549560,134549562,NA|chrX:134549560-134549562:+,1.150136
229,chrX,134548604,134549585,chrX_134548604_134549585,chrX:134549336-134550882:.,chrX,134549567,134549569,NA|chrX:134549567-134549569:+,-0.293044
230,chrX,134548604,134549585,chrX_134548604_134549585,chrX:134549336-134550882:.,chrX,134549575,134549577,NA|chrX:134549575-134549577:+,-0.124921


epi_cc
chr2_47191431_47192115      0.053818
chr2_47192356_47193228     -0.099076
chr7_5427367_5427851       -0.027801
chr7_5555668_5555887       -0.070976
chr7_5569258_5569635        0.019631
chr7_5607342_5607850        0.056293
chr7_6081662_6081841        0.074098
chrX_134548604_134549585   -0.058290
Name: all_rep_z_log2fc, dtype: float64