In [9]:
import pandas as pd
import numpy as np
import scipy
import scipy.stats as stats
import math
import re
import subprocess
from os import listdir
from os.path import isfile, join

In [2]:
import matplotlib as mpl 
mpl.use('Agg')
mpl.rcParams['pdf.fonttype'] = 42
import seaborn as sns
import matplotlib.pyplot as plt
plt.close("all")
mpl.rcParams['font.sans-serif'] = "Arial"
mpl.rcParams["font.family"] = 'sans-serif'
mpl.rcParams['font.size'] = 8
sns.set(style="ticks")

In [3]:
cd /Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide

/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide


In [4]:
### Get screen genes
all_eq_files = [f for f in listdir('/Users/davidy/jamboree20crispr/ontarget_analysis/all_series') if isfile(join('/Users/davidy/jamboree20crispr/ontarget_analysis/all_series', f))]
eq_files = [f for f in all_eq_files if '_EQ_' in f]


screen_genes = set([f.split('_')[0] for f in eq_files])


print(screen_genes)

{'HDAC6', 'FADS2', 'GATA1', 'ERP29', 'LMO2', 'MEF2C', 'CAT', 'CD164', 'CAPRIN1', 'FEN1', 'FADS3', 'PVT1', 'FADS1', 'NMU', 'MYC'}


In [5]:
### Get the TSS coordinates of the genes
hg38_mart_df = pd.read_csv('/Users/davidy/misc_resources/hg38_mart_export.txt', sep = ' ', header = 1, names = ['ensg','symbol','type','chr','gene_start','gene_end','tss','tx_start','tx_end'])

# Get only the rows matching our genes of interest
hg38_mart_df = hg38_mart_df.loc[hg38_mart_df['symbol'].isin(screen_genes)]

# Create a new TSS window. TSS +/- 250. This will be intersected with DHS peaks later.
hg38_mart_df['tss_start'] = hg38_mart_df['tss']-250
hg38_mart_df['tss_end'] = hg38_mart_df['tss']+250

display(hg38_mart_df)

# Write out to bedfile
hg38_mart_df.to_csv('HCRFF_gene_tss.bed', sep = '\t', columns=['chr','tss_start','tss_end','symbol'], index = False, header = False)

### Intersect with DHS peaks
int_file = 'gene_tss_dhs_peak_intersect.bed'
fopen = open(int_file, 'w')
subprocess.call(['bedtools','intersect','-a','HCRFF_gene_tss.bed','-b','/Users/davidy/misc_resources/chip/results/macs2/dhs_k562_hg38_optimal_peak.narrowPeak.bed', '-wa','-wb'],stdout=fopen)
fopen.close()

### Process intersect file. For each DHS peak that was intersected, only take the strongest summit. 
gene_tss_dhs_df = pd.read_csv('gene_tss_dhs_peak_intersect.bed', sep = '\t', header = None, names = ['chr1','start1','end1','symbol' ,'chr2','start2','end2','blank1','blank2','blank3','signal1','signal2','signal3','summit'])
gene_tss_dhs_df['dhs_peak_cc'] = gene_tss_dhs_df['chr2'].astype(str) + '_' + gene_tss_dhs_df['start2'].astype(str) + '_' + gene_tss_dhs_df['end2'].astype(str)
# Get only strongest summit
gene_tss_dhs_df = gene_tss_dhs_df.loc[gene_tss_dhs_df.groupby(['dhs_peak_cc', 'symbol'])['signal1'].idxmax()]

gene_tss_dhs_df.to_csv('gene_tss_dhs_peak_intersect.bed', sep = '\t', index = False, columns=['chr2','start2','end2','dhs_peak_cc','symbol','summit'], header = False)
display(gene_tss_dhs_df)



Unnamed: 0,ensg,symbol,type,chr,gene_start,gene_end,tss,tx_start,tx_end,tss_start,tss_end
29665,ENSG00000221968,FADS3,protein_coding,chr11,61873519,61892051,61892051,61873519,61892051,61891801,61892301
29666,ENSG00000221968,FADS3,protein_coding,chr11,61873519,61892051,61882526,61873524,61882526,61882276,61882776
29667,ENSG00000221968,FADS3,protein_coding,chr11,61873519,61892051,61891545,61873526,61891545,61891295,61891795
29668,ENSG00000221968,FADS3,protein_coding,chr11,61873519,61892051,61880702,61873527,61880702,61880452,61880952
29669,ENSG00000221968,FADS3,protein_coding,chr11,61873519,61892051,61878582,61873735,61878582,61878332,61878832
...,...,...,...,...,...,...,...,...,...,...,...
220327,ENSG00000094631,HDAC6,protein_coding,chrX,48801377,48824982,48819447,48819447,48824976,48819197,48819697
220328,ENSG00000094631,HDAC6,protein_coding,chrX,48801377,48824982,48823518,48823518,48824976,48823268,48823768
222670,ENSG00000102145,GATA1,protein_coding,chrX,48786554,48794311,48786554,48786554,48794311,48786304,48786804
222671,ENSG00000102145,GATA1,protein_coding,chrX,48786554,48794311,48786562,48786562,48794297,48786312,48786812


Unnamed: 0,chr1,start1,end1,symbol,chr2,start2,end2,blank1,blank2,blank3,signal1,signal2,signal3,summit,dhs_peak_cc
131,chr11,33869286,33869786,LMO2,chr11,33869527,33869751,.,639,.,2.98784,16.79869,14.61977,78,chr11_33869527_33869751
133,chr11,33869566,33870066,LMO2,chr11,33869786,33870019,.,864,.,3.48115,24.17932,21.84239,106,chr11_33869786_33870019
140,chr11,33870062,33870562,LMO2,chr11,33870097,33871195,.,970,.,3.94733,31.98774,29.49923,514,chr11_33870097_33871195
136,chr11,33891826,33892326,LMO2,chr11,33891749,33892410,.,1000,.,4.2016,36.67241,34.09317,362,chr11_33891749_33892410
149,chr11,34051481,34051981,CAPRIN1,chr11,34051352,34052969,.,1000,.,5.0197,51.86429,48.95948,159,chr11_34051352_34052969
199,chr11,34052731,34053231,CAPRIN1,chr11,34053012,34053606,.,941,.,4.23407,36.98389,34.39718,492,chr11_34053012_34053606
130,chr11,34438684,34439184,CAT,chr11,34438478,34439511,.,1000,.,7.22638,71.08998,67.63677,244,chr11_34438478_34439511
29,chr11,61792730,61793230,FADS2,chr11,61792013,61793571,.,1000,.,5.39764,50.24253,47.37516,623,chr11_61792013_61793571
55,chr11,61792661,61793161,FEN1,chr11,61792013,61793571,.,1000,.,5.39764,50.24253,47.37516,623,chr11_61792013_61793571
74,chr11,61815041,61815541,FADS1,chr11,61814898,61815720,.,1000,.,4.57494,47.89878,45.08473,465,chr11_61814898_61815720


In [6]:
### Get the positive control sgRNAs from the Broad library...
dolcetto_all_df = pd.read_csv('/Users/davidy/misc_resources/guide_libraries/broadgpp-dolcetto-targets-all.txt', sep = '\t', header = 0, names = ['spacer','symbol','geneid'],usecols=['spacer','symbol'])

# Subset to just spacers targeting our genes of interest
dolcetto_all_df = dolcetto_all_df.loc[dolcetto_all_df['symbol'].isin(screen_genes)]
display(dolcetto_all_df)

dolcetto_spacers = set(dolcetto_all_df['spacer'].tolist())


Unnamed: 0,spacer,symbol
616,AAATCCACTCCTGGAGCCCG,FADS1
652,AAATGCGGCGCGGGGCACAG,ERP29
1469,AAGCAGCGGGGAGAGAAATG,ERP29
1931,AAGGTGCGAGGGGCTCCGGG,LMO2
3462,ACCCGCCGCTAAGCTGAGAA,FEN1
...,...,...
109436,TGCAGCGAGCAGCCGGCGCG,FADS2
109469,TGCAGGAGGCCTCGGCTCGT,CAT
109925,TGCGCTGCGCCACGCGTAGC,NMU
110086,TGCTATTGGATTGACTTTGA,MEF2C


In [54]:
### Get the positive control sgRNAs from the Weissman library...
weissman_all_df = pd.read_csv('/Users/davidy/misc_resources/guide_libraries/crisprI_human_20160604_ref.csv', header=None, names=['name','sequence'])
# Subset to just spacers targeting oru genes of interest
weissman_all_df['symbol'] = weissman_all_df['name'].str.split('_',expand=True)[0]
weissman_all_df['spacer'] = weissman_all_df['sequence'].str.split('CCACCTTGTTGG',expand=True)[1].str.split('GTTTAAGAGCTAAGCTG',expand=True)[0]
weissman_all_df = weissman_all_df.loc[(weissman_all_df['symbol']!='0Safe')&(weissman_all_df['symbol']!='0NONE')]
weissman_all_df = weissman_all_df[['symbol','spacer']]

weissman_spacers = set(weissman_all_df['spacer'].tolist())
display(weissman_all_df)


Unnamed: 0,symbol,spacer
0,A1BG,GAGACCCAGCGCTAACCAG
1,A1BG,GGGCACCCAGGAGCGGTAG
2,A1BG,CTCCGGGCGACGTGGAGTG
3,A1BG,AACCAGGGGTGCCCAAGGG
4,A1BG,GCGAGGAACCGCCCAGCAA
...,...,...
205275,ZZZ3,GACCCACCTGGAAGCGCCG
205276,ZZZ3,TCCACTAGACCCAACATGG
205277,ZZZ3,TGTGCGCGGGCGAAAGGGG
205278,ZZZ3,GATTCTCGCGGGACCTCGT


In [7]:
pseudocount = 10

### For each gene, calculate the avg_z_log2FCs
all_files = [f for f in listdir('/Users/davidy/jamboree20crispr/ontarget_analysis/all_series') if isfile(join('/Users/davidy/jamboree20crispr/ontarget_analysis/all_series', f))]
eq_files = [f for f in all_files if '_EQ_' in f]
gq_files = [f for f in all_files if '_EQ_' not in f and '_files' not in f]
gene_list = set([f.split('_')[0] for f in eq_files])

# Iterate over genes
for gene in gene_list:
    new_df = pd.DataFrame()
    z_lfc_labels = []
    lfc_labels = []
    gene_gq_files = ['/Users/davidy/jamboree20crispr/ontarget_analysis/all_series' + '/' + f for f in gq_files if re.match(r'{}'.format(gene), f)]

    ### Sort files into their HS/LS and reps
    HS_R1 = [f for f in gene_gq_files if 'HS_R1' in f][0]
    HS_R2 = [f for f in gene_gq_files if 'HS_R2' in f][0]
    LS_R1 = [f for f in gene_gq_files if 'LS_R1' in f][0]
    LS_R2 = [f for f in gene_gq_files if 'LS_R2' in f][0]

    ### Calculate screen/rep-specific LFCs
    for screens in [(HS_R1, LS_R1),(HS_R2, LS_R2)]:

        screen1 = screens[0] # HS
        screen2 = screens[1] # LS
        locus = screen1.split('/')[-1].split('_')[0]
        rep = screen1.split('/')[-1].split('_')[2]

        # Create a df for each rep
        rep_start = pd.read_csv(screens[0], sep = '\t', usecols = [0,1,2,3,4,5,13,15], names = ['chr','start','end','name','countsHS'+'_'+locus+'_'+rep, 'strand', 'sequence', 'guideType'], index_col = ['chr','start','end','name','strand','sequence','guideType'])
        rep_end = pd.read_csv(screens[1], sep = '\t', usecols = [0,1,2,3,4,5,13,15], names = ['chr','start','end','name','countsLS'+'_'+locus+'_'+rep, 'strand', 'sequence', 'guideType'], index_col = ['chr','start','end','name','strand','sequence','guideType'])

        # Apply a pseudocount
        rep_start['countsHS'+'_'+locus+'_'+rep].clip(lower=pseudocount, inplace=True)
        rep_end['countsLS'+'_'+locus+'_'+rep].clip(lower=pseudocount, inplace=True)

        # Get the total counts
        total_start_counts = rep_start['countsHS'+'_'+locus+'_'+rep].sum()
        total_end_counts = rep_end['countsLS'+'_'+locus+'_'+rep].sum()

        #print(screens, total_start_counts, total_end_counts)

        # Merge replicate information together and reset index
        merged_rep = pd.concat([rep_start, rep_end], axis = 1).reset_index()

        # Create log2FC score. Add a pseudocount of 1 to everything, just in case; ((final/total_final) / (initial/total_initial) + 1)
        merged_rep['log2FC_'+locus+'_'+rep] = np.log2(((merged_rep['countsLS'+'_'+locus+'_'+rep]/total_end_counts) / (merged_rep['countsHS'+'_'+locus+'_'+rep]/total_start_counts))+1)

        # Get the mean and standard deviation of negative control scores
        # Need to remove nans
        negctrl_log2fc = merged_rep.loc[merged_rep['guideType']=='negative_control', 'log2FC_'+locus+'_'+rep].dropna().tolist()
        negctrl_log2fc_mean = np.mean(negctrl_log2fc)
        negctrl_log2fc_sd = np.std(negctrl_log2fc, ddof=1)

        # Get mean and std of all scores
        allgrna_log2fc = merged_rep['log2FC_'+locus+'_'+rep].dropna().tolist()
        allgrna_log2fc_mean = np.mean(allgrna_log2fc)
        allgrna_log2fc_sd = np.std(allgrna_log2fc, ddof = 1)

        # Z-score transform all gRNA
        #merged_rep['z_log2FC_'+locus+'_'+rep] = (merged_rep['log2FC_'+locus+'_'+rep] - negctrl_log2fc_mean) / negctrl_log2fc_sd
        merged_rep['z_log2FC_'+locus+'_'+rep] = (merged_rep['log2FC_'+locus+'_'+rep] - allgrna_log2fc_mean) / allgrna_log2fc_sd

        # Fill out new_df that contains information across all screens-reps
        if new_df.empty:
            new_df = merged_rep
        else:
            new_df['countsHS'+'_'+locus+'_'+rep] = merged_rep['countsHS'+'_'+locus+'_'+rep]
            new_df['countsLS'+'_'+locus+'_'+rep] = merged_rep['countsLS'+'_'+locus+'_'+rep]
            new_df['log2FC_'+locus+'_'+rep] = merged_rep['log2FC_'+locus+'_'+rep]
            new_df['z_log2FC_'+locus+'_'+rep] = merged_rep['z_log2FC_'+locus+'_'+rep]

        # Record the pairing information for later deriving averages
        pairing = ('log2FC_'+locus+'_R1', 'log2FC_'+locus+'_R2')
        z_pairing = ('z_log2FC_'+locus+'_R1', 'z_log2FC_'+locus+'_R2')

        if z_pairing not in z_lfc_labels:
            z_lfc_labels.append(z_pairing)
        if pairing not in lfc_labels:
            lfc_labels.append(pairing)  

    ### Calculate average LFCs
    new_df['avg_log2FC_'+locus] = abs((new_df[lfc_labels[0][0]] + new_df[lfc_labels[0][1]])/2)
    new_df['avg_zlog2FC_'+locus] = abs((new_df[z_lfc_labels[0][0]] + new_df[z_lfc_labels[0][1]])/2)

    ## Record the average negative control statistics
    negative_control_mean = np.mean(new_df.loc[new_df['guideType']=='negative_control']['avg_zlog2FC_'+locus])
    negative_control_sd = np.std(new_df.loc[new_df['guideType']=='negative_control']['avg_zlog2FC_'+locus], ddof = 1)
    #print(negative_control_mean, negative_control_sd)

    ### Only keep the targeting guides
    nonnan_df = new_df.dropna(subset=['chr'])
    nonnan_df.start = nonnan_df.start.astype(int)
    nonnan_df.end = nonnan_df.end.astype(int)

    ### Create some intermediate files
    targeting_guide_qbed = locus + '_grna_avg_z_log2FC.qBed'
    nonnan_df.to_csv(targeting_guide_qbed, columns = ['chr','start','end','avg_zlog2FC_'+locus, 'z_log2FC_'+locus+'_R1', 'z_log2FC_'+locus+'_R2','name', 'strand', 'sequence'], sep = '\t', index = False, header = False)

    # Repwise lfc
    targeting_guide_qbed_rep1 = locus + '_grna_zlog2FC_Rep1.qBed'
    targeting_guide_qbed_rep2 = locus + '_grna_zlog2FC_Rep2.qBed'
    nonnan_df.to_csv(targeting_guide_qbed_rep1, columns = ['chr','start','end','z_log2FC_'+locus+'_R1', 'name', 'strand', 'sequence'], sep = '\t', index = False, header = False)
    nonnan_df.to_csv(targeting_guide_qbed_rep2, columns = ['chr','start','end','z_log2FC_'+locus+'_R2', 'name', 'strand', 'sequence'], sep = '\t', index = False, header = False)



    
    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the d

In [85]:
### Process TSS-DHS peaks using FANTOM5 hg19->hg38 liftOver coordinates
fantom_df = pd.read_csv('/Users/davidy/misc_resources/LiftOver_Timo_FANTOM5_hg38_TSS_human.bed', sep = '\t', names = ['chr','start','end','simple','signal','strand','start1','start2','rgb'], usecols=['chr','start','end','simple','strand'])
fantom_df['gene'] = fantom_df['simple'].str.split('@', expand=True)[1].str.split(',', expand=True)[0]

fantom_df.to_csv('/Users/davidy/misc_resources/LiftOver_Timo_FANTOM5_hg38_TSS_human_simplified.bed', columns=['chr','start','end','gene','strand','simple'], sep = '\t', header=False, index=False)

# Get only the rows of interest
fantom_df = fantom_df.loc[fantom_df['gene'].isin(gene_list)]
display(fantom_df)

fantom_df.to_csv('HCRFF_FANTOM5_hg38.bed', sep = '\t', index=False, header=False) # ['chr','start','end','simple','strand','gene']


Unnamed: 0,chr,start,end,simple,strand,gene
10946,chr11,33869521,33869530,"p14@LMO2,0.5901",-,LMO2
10947,chr11,33869534,33869556,"p8@LMO2,0.6821",-,LMO2
10948,chr11,33869559,33869571,"p11@LMO2,0.6461",-,LMO2
10949,chr11,33869575,33869619,"p7@LMO2,0.5652",-,LMO2
10950,chr11,33869679,33869709,"p12@LMO2,0.4828",-,LMO2
...,...,...,...,...,...,...
293421,chrX,48823279,48823290,"p23@HDAC6,0.1002",+,HDAC6
293422,chrX,48823319,48823324,"p26@HDAC6,0.0784",+,HDAC6
293423,chrX,48823328,48823336,"p22@HDAC6,0.0638",+,HDAC6
293424,chrX,48823358,48823385,"p5@HDAC6,0.0691",+,HDAC6


In [126]:
### Prioritize FANTOM TSS by Pol 2 ChIP signal
### Highest avg Pol 2 chip within +/- 500 bp of TSS position, using ENCFF914WIS.bigWig
pol2bigwig_file = '/Users/davidy/misc_resources/chip/results/macs2/ENCFF914WIS_POL2RA_hg38_K562.bigWig'
distance_range_window = 50

# Write out FANTOM TSSs of genes of interest only
# Have to update the coordinates to make them +/- 500 bp
# Note that the TSS positions are variable width
# Add +/-X bases as evenly as possible to both sides
# If imbalanced, add more bases in a strand-specific way further from tx start
lod = []
for i, row in fantom_df.iterrows():
    tss_length = row['end'] - row['start']
    split_difference = ((distance_range_window-tss_length)/2)//1
    new_start = row['start']-split_difference
    new_end = row['end']+split_difference
    if new_end-new_start < distance_range_window: # If smaller, need to add 1 base, arbitrarily add in strand-specific manner further from tx start
        if row['strand'] == '+':
            new_start -= 1
        else:
            new_end += 1
    adf = {'chrom':row['chr'],
          'start':new_start,
          'end':new_end,
          'simple':row['simple'],
          'strand':row['strand'],
          'gene':row['gene']}
    lod.append(adf)
# Create df
fantom_expanded500tss_df = pd.DataFrame(lod)
fantom_expanded500tss_df['start'] = fantom_expanded500tss_df['start'].astype(int)
fantom_expanded500tss_df['end'] = fantom_expanded500tss_df['end'].astype(int)
# Write df
genesofinterest_fantom_file = 'HCRFF_FANTOM_500bpExpanded_TSS.bed'
fantom_expanded500tss_df.to_csv(genesofinterest_fantom_file, columns = ['chrom','start','end','simple'],sep = '\t', index=False, header=False)

### Perform bigwig average over bed
path_to_bwtool = '/Users/davidy/pythonscripts/bigWigAverageOverBed'
out_bwavg = 'HCRFF_FANTOM500TSS_bwAvg.tsv'
subprocess.call([path_to_bwtool, pol2bigwig_file, genesofinterest_fantom_file, out_bwavg])
fopen.close()

bwavg_df = pd.read_csv(out_bwavg, sep = '\t', header=None, names = ['simple','size','covered','sum','mean0','mean'], usecols=['simple','covered','mean'])

# join the datafarmes
merged_fantom_tss_bw = fantom_expanded500tss_df.merge(bwavg_df, on='simple')
# Remove any rows that are not complete
merged_fantom_tss_bw = merged_fantom_tss_bw.dropna()
display(merged_fantom_tss_bw)
merged_fantom_tss_bw.to_csv('HCRFF_FANTOM_500bpExpanded_TSS_MergedBwAvgOverBed.tsv', sep = '\t', index=False, header=False)

### Note, running the below requires additional code several blocks down to be run. 
### Specifically, select strongest bwavg tss
strongest_tss_set = set(strongest_bwavg_fantom_df['simple'].tolist())
print(strongest_tss_set)


Unnamed: 0,chrom,start,end,simple,strand,gene,covered,mean
0,chr11,33869501,33869551,"p14@LMO2,0.5901",-,LMO2,50,0.135218
1,chr11,33869520,33869570,"p8@LMO2,0.6821",-,LMO2,50,0.396460
2,chr11,33869540,33869590,"p11@LMO2,0.6461",-,LMO2,50,0.627420
3,chr11,33869572,33869622,"p7@LMO2,0.5652",-,LMO2,50,0.708983
4,chr11,33869669,33869719,"p12@LMO2,0.4828",-,LMO2,50,0.386701
...,...,...,...,...,...,...,...,...
402,chrX,48823259,48823309,"p23@HDAC6,0.1002",+,HDAC6,50,0.219044
403,chrX,48823296,48823346,"p26@HDAC6,0.0784",+,HDAC6,50,0.456612
404,chrX,48823307,48823357,"p22@HDAC6,0.0638",+,HDAC6,50,0.701838
405,chrX,48823346,48823396,"p5@HDAC6,0.0691",+,HDAC6,50,2.327140


{'p44@FADS1,0.5744', 'p3@NMU,0.4754', 'p24@MEF2C,0.5372', 'p1@FEN1,0.3317', 'p38@FADS2,0.5200', 'p1@PVT1,0.3058', 'p21@LMO2,0.2997', 'p1@MYC,0.3321', 'p7@HDAC6,0.4811', 'p1@CD164,0.4826', 'p3@GATA1,0.4284', 'p4@ERP29,0.3598', 'p1@CAT,0.2381', 'p2@CAPRIN1,0.4267', 'p11@FADS3,0.2877'}


In [133]:
##### Get effects of nearest guides and library guides
new_screen_genes = set(merged_fantom_tss_bw['gene'].tolist())

all_gq_files = [f for f in listdir('/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide') if isfile(join('/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide', f))]
gq_files = [f for f in all_gq_files if 'grna_avg_z_log2FC.qBed' in f]
print(gq_files)
lod = []

for gene in new_screen_genes:
    avg_dolcetto_effect = 0
    nearest_10_avg_effect = 0
    adf = {}
    gene_gq_files = ['/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide' + '/' + f for f in gq_files if re.match(r'{}'.format(gene), f)]
    
    print(gene)
    try:
        print(gene_gq_files[0])
    except:
        print(gene, 'is missing a guide file for whatever fucking reason, skip. Probably no significant elements?')
        continue
        
    gene_gq_df = pd.read_csv(gene_gq_files[0], sep = '\t', names = ['chr','start','end','avg_zlog2FC', 'z_log2FC_R1', 'z_log2FC_R2','name', 'strand', 'sequence'])
    #display(gene_gq_df)
    
    ### Get the Dolcetto library spacers
    temp_dolcetto_df = dolcetto_all_df.loc[dolcetto_all_df['symbol']==gene]
    dolcetto_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(temp_dolcetto_df['spacer'])]
    #dolcetto_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(dolcetto_spacers)]
    #display(dolcetto_spacers)
    if len(dolcetto_spacers) == 0:
        print('Defaulting {}, there are no dolcetto spacers'.format(gene))
        dolcetto_spacers = [0]
        avg_dolcetto_effect = 0
    else:
        avg_dolcetto_effect = np.mean(dolcetto_spacers['avg_zlog2FC'])
        adf = {'gene':gene,
              'variable':'dolcetto',
              'value':avg_dolcetto_effect}
        lod.append(adf)
    ### Get the Weissman library spacers
    temp_weissman_df = weissman_all_df.loc[weissman_all_df['symbol']==gene]
    weissman_spacers = gene_gq_df.loc[gene_gq_df['sequence'].str[1:].isin(temp_weissman_df['spacer'])]
    #weissman_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(weissman_spacers)]
    if len(weissman_spacers) == 0:
        print('Defaulting {}, there are no weissman spacers'.format(gene))
        weissman_spacers = [0]
        avg_weissman_effect = 0
        
    else:
        avg_weissman_effect = np.mean(weissman_spacers['avg_zlog2FC'])
        adf = {'gene':gene,
              'variable':'weissman',
              'value':avg_weissman_effect}
        lod.append(adf)
    
    # Get the gRNAs that intersect the TSS ranges
    # Note, using window. +/- 1000 around TSS. Just to make sure we have enough gRNAs to sample from if the summit is close to a TSS range edge. 
    fopen = open('temp_gene_grna_tss_intersect.bed', 'w')
    subprocess.call(['bedtools','window','-a','HCRFF_FANTOM5_hg38.bed', '-b',gene_gq_files[0], '-w', '500'],stdout=fopen)
    fopen.close()
    
    # Only keep rows relating to the gene, to avoid duplicates
    first_columns = ['chr1','start1','end1','simple','strand1','gene']
    second_columns = ['chr','start','end','avg_zlog2FC', 'z_log2FC_R1', 'z_log2FC_R2','name', 'strand', 'sequence']
    int_df = pd.read_csv('temp_gene_grna_tss_intersect.bed', sep = '\t', names = first_columns+second_columns)
    int_df = int_df.loc[int_df['gene']==gene]
    
    
    # Get absolute distance from grna to promoter.
    # Distance is relative to the strand-aware TSS edge (closest to tx start), 0-based index for end position.
    int_df['tss_position'] = np.where(int_df['strand1']=='+',int_df['end1']-1,int_df['start1'])
    int_df['pam_coordinate'] = np.where(int_df['strand']=='-',int_df['end'],int_df['start'].astype(int)-1)
    int_df['abs_distance_to_summit'] = abs(int_df['pam_coordinate'] - int_df['tss_position'])
    #display(int_df)
    # There may be multiple DHS peaks if a gene has multiple TSSs
    # Get the effects local to each DHS summit
    
    int_df2 = int_df.groupby('simple').apply(lambda x : np.mean(x.sort_values(by = 'abs_distance_to_summit').head(10))).reset_index()
    #display(int_df2)
    for i, row in int_df2.iterrows():
        
        #adf = {'gene':gene,
        #       'simple':row['simple'],
        #      'nearest_10_avg_effect':row['avg_zlog2FC'],
        #      'dolcetto_number':len(dolcetto_spacers),
        #      'dolcetto_avg_effect':avg_dolcetto_effect,
        #      'weissman_number':len(weissman_spacers),
        #      'weissman_avg_effect':avg_weissman_effect}

        adf = {'gene':gene,
              'variable':'nearest10',
              'value':row['avg_zlog2FC']}
        lod.append(adf)
    

# Create and transform df
promoter_df = pd.DataFrame(lod)
promoter_df.to_csv('allfantom_nearest10.csv',index=False)
display(promoter_df)

### Melt the datafarme
#promoter_df2 = promoter_df.melt(id_vars=['gene','simple'],value_vars=['nearest_10_avg_effect','dolcetto_avg_effect','weissman_avg_effect'])
#display(promoter_df2)

g1 = sns.scatterplot(data=promoter_df,x='gene',y='value',hue='variable',alpha=0.1)
outplot = 'Scatterplot_DolcettoWeissmanPromoter_vs_NarrowNearest10TSS.pdf'
#g1.plot([-1,10],[-1,10], 'red', linewidth=10)
plt.savefig(outplot)
plt.close('all')

        


['CAT_grna_avg_z_log2FC.qBed', 'CD164_grna_avg_z_log2FC.qBed', 'FEN1_grna_avg_z_log2FC.qBed', 'NMU_grna_avg_z_log2FC.qBed', 'ERP29_grna_avg_z_log2FC.qBed', 'PVT1_grna_avg_z_log2FC.qBed', 'FADS3_grna_avg_z_log2FC.qBed', 'LMO2_grna_avg_z_log2FC.qBed', 'CAPRIN1_grna_avg_z_log2FC.qBed', 'MYC_grna_avg_z_log2FC.qBed', 'HDAC6_grna_avg_z_log2FC.qBed', 'MEF2C_grna_avg_z_log2FC.qBed', 'FADS1_grna_avg_z_log2FC.qBed', 'FADS2_grna_avg_z_log2FC.qBed', 'GATA1_grna_avg_z_log2FC.qBed']
MYC
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/MYC_grna_avg_z_log2FC.qBed
HDAC6
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/HDAC6_grna_avg_z_log2FC.qBed
NMU
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/NMU_grna_avg_z_log2FC.qBed
FADS2
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/FADS2_grna_avg_z_log2FC.qBed
GATA1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/GATA1_grna_avg_z_log2FC.qBed
LMO2

Unnamed: 0,gene,variable,value
0,MYC,dolcetto,0.547720
1,MYC,weissman,1.379206
2,MYC,nearest10,0.876055
3,MYC,nearest10,0.818828
4,MYC,nearest10,1.083189
...,...,...,...
423,FEN1,nearest10,3.227156
424,FEN1,nearest10,2.463599
425,FEN1,nearest10,0.708041
426,FEN1,nearest10,2.520629


In [95]:
### This time, evaluate the FANTOM TSS DHS summit-proximal sgRNAs (rather than strictly closest to TSS)
merged_fantom_tss_bw_file = 'HCRFF_FANTOM_500bpExpanded_TSS_MergedBwAvgOverBed.tsv'
fopen=open('HCRFF_FANTOM_500bpExpanded_TSS_MergedBwAvgOverBed_DHSSummitIntersection.tsv', 'w')
subprocess.call(['bedtools','intersect','-a',merged_fantom_tss_bw_file, '-b', '/Users/davidy/misc_resources/chip/results/macs2/dhs_k562_hg38_optimal_peak.narrowPeak.bed', '-wa','-wb'],stdout=fopen)
fopen.close()

# Process intersect file. 
first_columns=['chr','start','end','simple','strand1','gene','coverage','mean']
second_columns=['chr2','start2','end2','dot1','dot2','dot3','score1','score2','score3','summit']
fantomtss_dhsint_df = pd.read_csv('HCRFF_FANTOM_500bpExpanded_TSS_MergedBwAvgOverBed_DHSSummitIntersection.tsv', sep='\t',
                                 header=None, names=first_columns+second_columns)
display(fantomtss_dhsint_df)
# For each DHS peak that is intersected by a TSS only keep strongest summit
fantomtss_dhsint_df2 = fantomtss_dhsint_df.sort_values('score3',ascending=False).drop_duplicates(['simple'],keep='first')

# In many cases, alternative TSSs share the same DHS summit/peak. Create a CC, and for each gene groupby, keep unique peaks
fantomtss_dhsint_df2['dhs_cc'] = fantomtss_dhsint_df2['chr2'] +'_'+ fantomtss_dhsint_df2['start2'].astype(str) + '_'+ fantomtss_dhsint_df2['end2'].astype(str)
display(fantomtss_dhsint_df2)

fantomtss_dhsint_df2 = fantomtss_dhsint_df2.drop_duplicates(subset=['gene','dhs_cc'],keep='first')
display(fantomtss_dhsint_df2)

# Create a new summit-oriented bed, 1 bp width
fantomtss_dhsint_df2['start3'] = fantomtss_dhsint_df2['start2'] + fantomtss_dhsint_df2['summit']
fantomtss_dhsint_df2['end3'] = fantomtss_dhsint_df2['start3']+1

tss_dhsint_file = 'fantomtss_dhsint.bed'
fantomtss_dhsint_df2.to_csv(tss_dhsint_file, sep = '\t', index=False, header=False, columns=['chr2','start3','end3','gene','mean','dhs_cc'])

##### Get effects of nearest guides and library guides
new_screen_genes = set(merged_fantom_tss_bw['gene'].tolist())

all_gq_files = [f for f in listdir('/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide') if isfile(join('/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide', f))]
gq_files = [f for f in all_gq_files if 'grna_avg_z_log2FC.qBed' in f]
print(gq_files)
lod = []

for gene in new_screen_genes:
    avg_dolcetto_effect = 0
    nearest_10_avg_effect = 0
    adf = {}
    gene_gq_files = ['/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide' + '/' + f for f in gq_files if re.match(r'{}'.format(gene), f)]
    
    print(gene)
    try:
        print(gene_gq_files[0])
    except:
        print(gene, 'is missing a guide file for whatever fucking reason, skip. Probably no significant elements?')
        continue
        
    gene_gq_df = pd.read_csv(gene_gq_files[0], sep = '\t', names = ['chr','start','end','avg_zlog2FC', 'z_log2FC_R1', 'z_log2FC_R2','name', 'strand', 'sequence'])
    #display(gene_gq_df)
    
    ### Get the Dolcetto library spacers
    temp_dolcetto_df = dolcetto_all_df.loc[dolcetto_all_df['symbol']==gene]
    dolcetto_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(temp_dolcetto_df['spacer'])]
    #dolcetto_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(dolcetto_spacers)]
    #display(dolcetto_spacers)
    if len(dolcetto_spacers) == 0:
        print('Defaulting {}, there are no dolcetto spacers'.format(gene))
        dolcetto_spacers = [0]
        avg_dolcetto_effect = 0
    else:
        avg_dolcetto_effect = np.mean(dolcetto_spacers['avg_zlog2FC'])

    ### Get the Weissman library spacers
    temp_weissman_df = weissman_all_df.loc[weissman_all_df['symbol']==gene]
    weissman_spacers = gene_gq_df.loc[gene_gq_df['sequence'].str[1:].isin(temp_weissman_df['spacer'])]
    #weissman_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(weissman_spacers)]
    if len(weissman_spacers) == 0:
        print('Defaulting {}, there are no weissman spacers'.format(gene))
        weissman_spacers = [0]
        avg_weissman_effect = 0
        
    else:
        avg_weissman_effect = np.mean(weissman_spacers['avg_zlog2FC'])
    
    # Get the gRNAs that intersect the TSS ranges
    # Note, using window. +/- 1000 around TSS. Just to make sure we have enough gRNAs to sample from if the summit is close to a TSS range edge. 
    fopen = open('temp_gene_grna_tss_intersect.bed', 'w')
    subprocess.call(['bedtools','window','-a',tss_dhsint_file, '-b',gene_gq_files[0], '-w', '500'],stdout=fopen)
    fopen.close()
    
    # Only keep rows relating to the gene, to avoid duplicates
    first_columns = ['chr1','start1','end1','gene','mean','dhs_cc']
    second_columns = ['chr','start','end','avg_zlog2FC', 'z_log2FC_R1', 'z_log2FC_R2','name', 'strand', 'sequence']
    int_df = pd.read_csv('temp_gene_grna_tss_intersect.bed', sep = '\t', names = first_columns+second_columns)
    int_df = int_df.loc[int_df['gene']==gene]
    
    # Get absolute distance from grna to promoter.
    # Distance is relative to the strand-aware TSS edge (closest to tx start), 0-based index for end position.
    int_df['tss_position'] = int_df['start1']
    int_df['pam_coordinate'] = np.where(int_df['strand']=='-',int_df['end'],int_df['start'].astype(int)-1)
    int_df['abs_distance_to_summit'] = abs(int_df['pam_coordinate'] - int_df['tss_position'])
    
    # There may be multiple DHS peaks if a gene has multiple TSSs
    # Get the effects local to each DHS summit
    int_df2 = int_df.groupby('dhs_cc').apply(lambda x : np.mean(x.sort_values(by = 'abs_distance_to_summit').head(10))).reset_index()

    for i, row in int_df2.iterrows():
        
        adf = {'gene':gene,
               'dhs_cc':row['dhs_cc'],
              'nearest_10_avg_effect':row['avg_zlog2FC'],
              'dolcetto_number':len(dolcetto_spacers),
              'dolcetto_avg_effect':avg_dolcetto_effect,
              'weissman_number':len(weissman_spacers),
              'weissman_avg_effect':avg_weissman_effect}
        lod.append(adf)

# Create and transform df
promoter_df = pd.DataFrame(lod)
display(promoter_df)

### Melt the datafarme
promoter_df2 = promoter_df.melt(id_vars=['gene','dhs_cc'],value_vars=['nearest_10_avg_effect','dolcetto_avg_effect','weissman_avg_effect'])
display(promoter_df2)

g2 = sns.scatterplot(data=promoter_df2,x='gene',y='value',hue='variable')
outplot = 'Scatterplot_DolcettoWeissmanPromoter_vs_Nearest10TSS_StrongestDHSSummit.pdf'
#g1.plot([-1,10],[-1,10], 'red', linewidth=10)
plt.savefig(outplot)
plt.close('all')



Unnamed: 0,chr,start,end,simple,strand1,gene,coverage,mean,chr2,start2,end2,dot1,dot2,dot3,score1,score2,score3,summit
0,chr11,33869276,33869776,"p14@LMO2,0.5901",-,LMO2,500,0.225887,chr11,33869527,33869751,.,639,.,2.98784,16.79869,14.61977,78
1,chr11,33869295,33869795,"p8@LMO2,0.6821",-,LMO2,500,0.224961,chr11,33869527,33869751,.,639,.,2.98784,16.79869,14.61977,78
2,chr11,33869295,33869795,"p8@LMO2,0.6821",-,LMO2,500,0.224961,chr11,33869786,33870019,.,864,.,3.48115,24.17932,21.84239,106
3,chr11,33869315,33869815,"p11@LMO2,0.6461",-,LMO2,500,0.224520,chr11,33869527,33869751,.,639,.,2.98784,16.79869,14.61977,78
4,chr11,33869315,33869815,"p11@LMO2,0.6461",-,LMO2,500,0.224520,chr11,33869786,33870019,.,864,.,3.48115,24.17932,21.84239,106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
870,chrX,48802411,48802911,"p10@HDAC6,0.7051",+,HDAC6,500,72.082300,chrX,48801224,48802871,.,1000,.,2.47566,12.31018,10.24427,1510
871,chrX,48802411,48802911,"p10@HDAC6,0.7051",+,HDAC6,500,72.082300,chrX,48801224,48802871,.,1000,.,3.90863,36.76175,34.17915,126
872,chrX,48802411,48802911,"p10@HDAC6,0.7051",+,HDAC6,500,72.082300,chrX,48801224,48802871,.,1000,.,4.42496,46.87328,44.08230,700
873,chrX,48802411,48802911,"p10@HDAC6,0.7051",+,HDAC6,500,72.082300,chrX,48801224,48802871,.,1000,.,3.79647,33.95245,31.42549,1225


Unnamed: 0,chr,start,end,simple,strand1,gene,coverage,mean,chr2,start2,end2,dot1,dot2,dot3,score1,score2,score3,summit,dhs_cc
115,chr11,34438707,34439207,"p1@CAT,0.2381",+,CAT,500,125.368000,chr11,34438478,34439511,.,1000,.,7.22638,71.08998,67.63677,244,chr11_34438478_34439511
652,chr11,34438431,34438931,"p13@CAT,0.1858",+,CAT,500,24.913600,chr11,34438478,34439511,.,1000,.,7.22638,71.08998,67.63677,244,chr11_34438478_34439511
655,chr11,34438463,34438963,"p5@CAT,0.1649",+,CAT,500,35.998700,chr11,34438478,34439511,.,1000,.,7.22638,71.08998,67.63677,244,chr11_34438478_34439511
658,chr11,34438499,34438999,"p11@CAT,0.2190",+,CAT,500,52.705200,chr11,34438478,34439511,.,1000,.,7.22638,71.08998,67.63677,244,chr11_34438478_34439511
730,chr11,61891523,61892023,"p9@FADS3,0.4067",-,FADS3,500,7.987580,chr11,61890554,61892139,.,1000,.,7.13587,67.51185,64.17668,1168,chr11_61890554_61892139
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,chr11,33869295,33869795,"p8@LMO2,0.6821",-,LMO2,500,0.224961,chr11,33869786,33870019,.,864,.,3.48115,24.17932,21.84239,106,chr11_33869786_33870019
8,chr11,33869444,33869944,"p12@LMO2,0.4828",-,LMO2,500,0.990574,chr11,33869786,33870019,.,864,.,3.48115,24.17932,21.84239,106,chr11_33869786_33870019
6,chr11,33869347,33869847,"p7@LMO2,0.5652",-,LMO2,500,0.225452,chr11,33869786,33870019,.,864,.,3.48115,24.17932,21.84239,106,chr11_33869786_33870019
0,chr11,33869276,33869776,"p14@LMO2,0.5901",-,LMO2,500,0.225887,chr11,33869527,33869751,.,639,.,2.98784,16.79869,14.61977,78,chr11_33869527_33869751


Unnamed: 0,chr,start,end,simple,strand1,gene,coverage,mean,chr2,start2,end2,dot1,dot2,dot3,score1,score2,score3,summit,dhs_cc
115,chr11,34438707,34439207,"p1@CAT,0.2381",+,CAT,500,125.368,chr11,34438478,34439511,.,1000,.,7.22638,71.08998,67.63677,244,chr11_34438478_34439511
730,chr11,61891523,61892023,"p9@FADS3,0.4067",-,FADS3,500,7.98758,chr11,61890554,61892139,.,1000,.,7.13587,67.51185,64.17668,1168,chr11_61890554_61892139
287,chr12,112013143,112013643,"p3@ERP29,0.5629",+,ERP29,500,170.8,chr12,112012583,112014198,.,1000,.,6.50712,64.04637,60.81848,630,chr12_112012583_112014198
136,chr11,61800698,61801198,"p27@FADS1,0.1473",-,FADS1,500,61.9069,chr11,61800578,61800938,.,1000,.,5.50026,52.27794,49.36228,218,chr11_61800578_61800938
640,chr11,34051272,34051772,"p17@CAPRIN1,0.3996",+,CAPRIN1,500,170.162,chr11,34051352,34052969,.,1000,.,5.0197,51.86429,48.95948,159,chr11_34051352_34052969
132,chr11,61792661,61793161,"p1@FEN1,0.3317",+,FEN1,500,227.166,chr11,61792013,61793571,.,1000,.,5.39764,50.24253,47.37516,623,chr11_61792013_61793571
138,chr11,61814891,61815391,"p7@FADS1,0.2794",-,FADS1,500,400.115,chr11,61814898,61815720,.,1000,.,4.57494,47.89878,45.08473,465,chr11_61814898_61815720
414,chr6,109382158,109382658,"p1@CD164,0.4826",-,CD164,500,492.563,chr6,109381490,109383470,.,1000,.,4.65871,47.48898,44.68362,956,chr6_109381490_109383470
545,chrX,48802218,48802718,"p15@HDAC6,0.4809",+,HDAC6,500,316.863,chrX,48801224,48802871,.,1000,.,4.42496,46.87328,44.0823,700,chrX_48801224_48802871
844,chr8,127854146,127854646,"p23@PVT1,0.1943",+,PVT1,500,2.67368,chr8,127854280,127854805,.,919,.,6.07482,45.68789,42.92287,105,chr8_127854280_127854805


['CAT_grna_avg_z_log2FC.qBed', 'CD164_grna_avg_z_log2FC.qBed', 'FEN1_grna_avg_z_log2FC.qBed', 'NMU_grna_avg_z_log2FC.qBed', 'ERP29_grna_avg_z_log2FC.qBed', 'PVT1_grna_avg_z_log2FC.qBed', 'FADS3_grna_avg_z_log2FC.qBed', 'LMO2_grna_avg_z_log2FC.qBed', 'CAPRIN1_grna_avg_z_log2FC.qBed', 'MYC_grna_avg_z_log2FC.qBed', 'HDAC6_grna_avg_z_log2FC.qBed', 'MEF2C_grna_avg_z_log2FC.qBed', 'FADS1_grna_avg_z_log2FC.qBed', 'FADS2_grna_avg_z_log2FC.qBed', 'GATA1_grna_avg_z_log2FC.qBed']
MYC
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/MYC_grna_avg_z_log2FC.qBed
HDAC6
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/HDAC6_grna_avg_z_log2FC.qBed
NMU
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/NMU_grna_avg_z_log2FC.qBed
FADS2
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/FADS2_grna_avg_z_log2FC.qBed
GATA1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/GATA1_grna_avg_z_log2FC.qBed
LMO2

Unnamed: 0,gene,dhs_cc,nearest_10_avg_effect,dolcetto_number,dolcetto_avg_effect,weissman_number,weissman_avg_effect
0,MYC,chr8_127734994_127736600,1.235346,2,0.54772,8,1.379206
1,HDAC6,chrX_48801224_48802871,1.68009,3,7.365781,8,5.373309
2,NMU,chr4_55635806_55636498,2.28599,6,8.173992,10,4.98231
3,FADS2,chr11_61815972_61817449,2.287985,4,0.275205,13,1.333576
4,GATA1,chrX_48785819_48786123,4.25587,4,4.850091,6,8.975002
5,GATA1,chrX_48786179_48786788,5.872799,4,4.850091,6,8.975002
6,LMO2,chr11_33869527_33869751,0.997039,3,1.99231,7,2.070285
7,LMO2,chr11_33869786_33870019,2.798485,3,1.99231,7,2.070285
8,LMO2,chr11_33870097_33871195,2.202582,3,1.99231,7,2.070285
9,LMO2,chr11_33891749_33892410,0.799109,3,1.99231,7,2.070285


Unnamed: 0,gene,dhs_cc,variable,value
0,MYC,chr8_127734994_127736600,nearest_10_avg_effect,1.235346
1,HDAC6,chrX_48801224_48802871,nearest_10_avg_effect,1.680090
2,NMU,chr4_55635806_55636498,nearest_10_avg_effect,2.285990
3,FADS2,chr11_61815972_61817449,nearest_10_avg_effect,2.287985
4,GATA1,chrX_48785819_48786123,nearest_10_avg_effect,4.255870
...,...,...,...,...
82,FADS1,chr11_61800578_61800938,weissman_avg_effect,4.979456
83,FADS1,chr11_61814898_61815720,weissman_avg_effect,4.979456
84,FADS1,chr11_61815972_61817449,weissman_avg_effect,4.979456
85,ERP29,chr12_112012583_112014198,weissman_avg_effect,10.336433


In [125]:
##### Now prioritize by bigwig signal
# First need to trim back down to minimal TSS range
bwavg_fantom_df = fantom_df.merge(bwavg_df, on='simple')
display(bwavg_fantom_df)
# Get only the strongest TSS per gene, by mean bigwigavgoverbed
strongest_bwavg_fantom_df = bwavg_fantom_df.sort_values('mean',ascending=False).drop_duplicates('gene')
display(strongest_bwavg_fantom_df)
strongest_file = 'strongest_bwavg_hcrff_fantom.bed'
strongest_bwavg_fantom_df.to_csv(strongest_file, sep = '\t', index=False, header=False)

##### Get effects of nearest guides and library guides
new_screen_genes = set(merged_fantom_tss_bw['gene'].tolist())

all_gq_files = [f for f in listdir('/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide') if isfile(join('/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide', f))]
gq_files = [f for f in all_gq_files if 'grna_avg_z_log2FC.qBed' in f]
print(gq_files)
lod = []

for gene in new_screen_genes:
    avg_dolcetto_effect = 0
    nearest_10_avg_effect = 0
    adf = {}
    gene_gq_files = ['/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide' + '/' + f for f in gq_files if re.match(r'{}'.format(gene), f)]
    
    print(gene)
    try:
        print(gene_gq_files[0])
    except:
        print(gene, 'is missing a guide file for whatever fucking reason, skip. Probably no significant elements?')
        continue
        
    gene_gq_df = pd.read_csv(gene_gq_files[0], sep = '\t', names = ['chr','start','end','avg_zlog2FC', 'z_log2FC_R1', 'z_log2FC_R2','name', 'strand', 'sequence'])
    #display(gene_gq_df)
    
    ### Get the Dolcetto library spacers
    temp_dolcetto_df = dolcetto_all_df.loc[dolcetto_all_df['symbol']==gene]
    dolcetto_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(temp_dolcetto_df['spacer'])]
    #dolcetto_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(dolcetto_spacers)]
    #display(dolcetto_spacers)
    if len(dolcetto_spacers) == 0:
        print('Defaulting {}, there are no dolcetto spacers'.format(gene))
        dolcetto_spacers = [0]
        avg_dolcetto_effect = 0
    else:
        avg_dolcetto_effect = np.mean(dolcetto_spacers['avg_zlog2FC'])
        for i in dolcetto_spacers['avg_zlog2FC'].tolist():
            adf = {'gene':gene,
                  'variable':'dolcetto',
                  'value':float(i)}
            lod.append(adf)

    ### Get the Weissman library spacers
    temp_weissman_df = weissman_all_df.loc[weissman_all_df['symbol']==gene]
    weissman_spacers = gene_gq_df.loc[gene_gq_df['sequence'].str[1:].isin(temp_weissman_df['spacer'])]
    #weissman_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(weissman_spacers)]
    if len(weissman_spacers) == 0:
        print('Defaulting {}, there are no weissman spacers'.format(gene))
        weissman_spacers = [0]
        avg_weissman_effect = 0
    else:
        avg_weissman_effect = np.mean(weissman_spacers['avg_zlog2FC'])
        for i in weissman_spacers['avg_zlog2FC'].tolist():
            adf = {'gene':gene,
                  'variable':'weissman',
                  'value':float(i)}
            lod.append(adf)
    
    # Get the gRNAs that intersect the TSS ranges
    # Note, using window. +/- 1000 around TSS. Just to make sure we have enough gRNAs to sample from if the summit is close to a TSS range edge. 
    fopen = open('temp_gene_grna_tss_intersect.bed', 'w')
    subprocess.call(['bedtools','window','-a',strongest_file, '-b',gene_gq_files[0], '-w', '500'],stdout=fopen)
    fopen.close()
    
    # Only keep rows relating to the gene, to avoid duplicates
    first_columns = ['chr1','start1','end1','simple','strand1','gene','covered','mean']
    second_columns = ['chr','start','end','avg_zlog2FC', 'z_log2FC_R1', 'z_log2FC_R2','name', 'strand', 'sequence']
    int_df = pd.read_csv('temp_gene_grna_tss_intersect.bed', sep = '\t', names = first_columns+second_columns)
    int_df = int_df.loc[int_df['gene']==gene]
    
    # Get absolute distance from grna to promoter.
    # Distance is relative to the strand-aware TSS edge (closest to tx start), 0-based index for end position.
    int_df['tss_position'] = np.where(int_df['strand1']=='+',int_df['end1']-1,int_df['start1'])
    int_df['pam_coordinate'] = np.where(int_df['strand']=='-',int_df['end'],int_df['start'].astype(int)-1)
    int_df['abs_distance_to_summit'] = abs(int_df['pam_coordinate'] - int_df['tss_position'])
    
    # Sort the dataframe
    int_df.sort_values('abs_distance_to_summit',ascending=True,inplace=True)
    display(int_df)
    for i, row in int_df.head(10).iterrows():
        
        adf = {'gene':gene,
               'variable':'nearest10',
               'value':row['avg_zlog2FC']}
        lod.append(adf)

# Create and transform df
promoter_df = pd.DataFrame(lod)
display(promoter_df)
promoter_df.to_csv('test.csv',index=False)


#g3 = sns.scatterplot(data=promoter_df2,x='gene',y='value',hue='variable')
#outplot = 'Scatterplot_DolcettoWeissmanPromoter_vs_Nearest10TSS_StrongestDHSSummit.pdf'
#plt.savefig(outplot)
#plt.close('all')

g3 = sns.pointplot(
    data=promoter_df, x="gene", y="value", hue='variable', palette='pastel',orient='v',
    errorbar=("sd", 100), capsize=.4, join=False, color=".5",
)
plt.xticks(rotation=45)
outplot='pointplot_DolcettoWeissmanPromoter_vs_Nearest10TSS_StrongestbwAvgSignal.pdf'
plt.savefig(outplot)
plt.close('all')

Unnamed: 0,chr,start,end,simple,strand,gene,covered,mean
0,chr11,33869521,33869530,"p14@LMO2,0.5901",-,LMO2,50,0.135218
1,chr11,33869534,33869556,"p8@LMO2,0.6821",-,LMO2,50,0.396460
2,chr11,33869559,33869571,"p11@LMO2,0.6461",-,LMO2,50,0.627420
3,chr11,33869575,33869619,"p7@LMO2,0.5652",-,LMO2,50,0.708983
4,chr11,33869679,33869709,"p12@LMO2,0.4828",-,LMO2,50,0.386701
...,...,...,...,...,...,...,...,...
402,chrX,48823279,48823290,"p23@HDAC6,0.1002",+,HDAC6,50,0.219044
403,chrX,48823319,48823324,"p26@HDAC6,0.0784",+,HDAC6,50,0.456612
404,chrX,48823328,48823336,"p22@HDAC6,0.0638",+,HDAC6,50,0.701838
405,chrX,48823358,48823385,"p5@HDAC6,0.0691",+,HDAC6,50,2.327140


Unnamed: 0,chr,start,end,simple,strand,gene,covered,mean
170,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,50,980.988
179,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,50,859.923
16,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,50,646.079
49,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,50,606.805
289,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,50,587.13
219,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,50,531.188
119,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,50,270.889
39,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,50,268.967
30,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,50,209.739
120,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,50,178.549


['CAT_grna_avg_z_log2FC.qBed', 'CD164_grna_avg_z_log2FC.qBed', 'FEN1_grna_avg_z_log2FC.qBed', 'NMU_grna_avg_z_log2FC.qBed', 'ERP29_grna_avg_z_log2FC.qBed', 'PVT1_grna_avg_z_log2FC.qBed', 'FADS3_grna_avg_z_log2FC.qBed', 'LMO2_grna_avg_z_log2FC.qBed', 'CAPRIN1_grna_avg_z_log2FC.qBed', 'MYC_grna_avg_z_log2FC.qBed', 'HDAC6_grna_avg_z_log2FC.qBed', 'MEF2C_grna_avg_z_log2FC.qBed', 'FADS1_grna_avg_z_log2FC.qBed', 'FADS2_grna_avg_z_log2FC.qBed', 'GATA1_grna_avg_z_log2FC.qBed']
MYC
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/MYC_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
56,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,50,859.923,chr8,127736255,127736258,0.769436,0.936529,0.602342,MYC|chr8:127736236-127736255:+,+,GCTGTAGTAATTCCAGCGAG,127736240,127736254,14
55,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,50,859.923,chr8,127736209,127736212,0.059029,0.376295,-0.258236,MYC|chr8:127736213-127736232:-,-,TTAGATAAAGCCCCGAAAAC,127736240,127736212,28
57,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,50,859.923,chr8,127736272,127736275,1.707560,2.104375,1.310745,MYC|chr8:127736253-127736272:+,+,GAGAGGCAGAGGGAGCGAGC,127736240,127736271,31
54,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,50,859.923,chr8,127736210,127736213,0.155769,0.493577,-0.182039,MYC|chr8:127736191-127736210:+,+,ATCGCGCTGAGTATAAAAGC,127736240,127736209,31
61,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,50,859.923,chr8,127736278,127736281,0.788394,-1.007830,-0.568957,MYC|chr8:127736282-127736301:-,-,CCCGGCTCTTCCACCCTAGC,127736240,127736281,41
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,50,859.923,chr8,127736719,127736722,0.079021,-0.109766,-0.048275,MYC|chr8:127736723-127736742:-,-,GAGAAAAGTGTCAATAGCGC,127736240,127736722,482
3,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,50,859.923,chr8,127735755,127735758,0.117600,-0.000644,-0.234556,MYC|chr8:127735736-127735755:+,+,TGATTTATACTCACAGGACA,127736240,127735754,486
1,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,50,859.923,chr8,127735749,127735752,0.260961,-0.166001,-0.355922,MYC|chr8:127735730-127735749:+,+,CTGCGATGATTTATACTCAC,127736240,127735748,492
2,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,50,859.923,chr8,127735728,127735731,0.186258,-0.047650,-0.324865,MYC|chr8:127735732-127735751:-,-,CTGTGAGTATAAATCATCGC,127736240,127735731,509


HDAC6
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/HDAC6_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
90,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,50,531.188,chrX,48802249,48802252,0.965390,0.966558,0.964223,HDAC6|chrX:48802230-48802249:+,+,GAAAGGGTGGGTGATTATCC,48802245,48802248,3
91,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,50,531.188,chrX,48802250,48802253,5.924046,7.230679,4.617413,HDAC6|chrX:48802231-48802250:+,+,AAAGGGTGGGTGATTATCCC,48802245,48802249,4
96,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,50,531.188,chrX,48802247,48802250,0.930678,1.799094,0.062261,HDAC6|chrX:48802251-48802270:-,-,GCCCTTTCGGCCTATCTCCC,48802245,48802250,5
97,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,50,531.188,chrX,48802248,48802251,1.278761,2.918030,-0.360508,HDAC6|chrX:48802252-48802271:-,-,TGCCCTTTCGGCCTATCTCC,48802245,48802251,6
88,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,50,531.188,chrX,48802237,48802240,8.756741,10.066376,7.447106,HDAC6|chrX:48802218-48802237:+,+,GGCCGGCTGAGTGAAAGGGT,48802245,48802236,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,50,531.188,chrX,48801766,48801769,0.210301,1.263385,-0.842782,HDAC6|chrX:48801747-48801766:+,+,GAGAGGTGTGGCTCGCGCGA,48802245,48801765,480
176,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,50,531.188,chrX,48802723,48802726,0.810244,-0.829248,-0.791240,HDAC6|chrX:48802727-48802746:-,-,CTGCCTACTTCTTCGCTGCC,48802245,48802726,481
1,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,50,531.188,chrX,48801754,48801757,0.825255,1.042766,0.607744,HDAC6|chrX:48801735-48801754:+,+,TTGTGAGCTCGCGAGAGGTG,48802245,48801753,492
175,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,50,531.188,chrX,48802740,48802743,4.963681,5.842392,4.084969,HDAC6|chrX:48802721-48802740:+,+,CAACCAGGCAGCGAAGAAGT,48802245,48802739,494


NMU
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/NMU_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
171,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,50,178.549,chr4,55636257,55636260,11.047789,10.347026,11.748553,NMU|chr4:55636261-55636280:-,-,CGAGGCACAGCCAGGGCACC,55636262,55636260,2
169,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,50,178.549,chr4,55636259,55636262,7.069244,5.758599,8.379888,NMU|chr4:55636240-55636259:+,+,CGCGTAGCTGGTGCTCCACC,55636262,55636258,4
170,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,50,178.549,chr4,55636267,55636270,9.728978,7.481434,11.976522,NMU|chr4:55636248-55636267:+,+,TGGTGCTCCACCTGGTGCCC,55636262,55636266,4
173,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,50,178.549,chr4,55636264,55636267,5.527811,3.806717,7.248905,NMU|chr4:55636268-55636287:-,-,CGGGCCCCGAGGCACAGCCA,55636262,55636267,5
174,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,50,178.549,chr4,55636265,55636268,3.070211,2.388941,3.751481,NMU|chr4:55636269-55636288:-,-,CCGGGCCCCGAGGCACAGCC,55636262,55636268,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
264,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,50,178.549,chr4,55636758,55636761,9.385516,11.805563,6.965470,NMU|chr4:55636762-55636781:-,-,ACACTGCCCTGATATGCACC,55636262,55636761,499
65,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,50,178.549,chr4,55635762,55635765,0.136091,0.574230,-0.846413,NMU|chr4:55635743-55635762:+,+,ACCCGTTTCCACGGGCCGGG,55636262,55635761,501
64,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,50,178.549,chr4,55635761,55635764,0.213881,-0.423460,-0.004302,NMU|chr4:55635742-55635761:+,+,TACCCGTTTCCACGGGCCGG,55636262,55635760,502
63,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,50,178.549,chr4,55635760,55635763,0.207178,-0.590572,0.176217,NMU|chr4:55635741-55635760:+,+,GTACCCGTTTCCACGGGCCG,55636262,55635759,503


FADS2
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/FADS2_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
173,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,50,587.13,chr11,61816117,61816120,3.182857,4.038292,2.327422,FADS2|chr11:61816098-61816117:+,+,CCGGCACCTAGGTCCAGACG,61816122,61816116,6
177,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,50,587.13,chr11,61816128,61816131,7.805215,9.478265,6.132164,FADS2|chr11:61816132-61816151:-,-,CCTGTCTCGATGGCTAGGAG,61816122,61816131,9
175,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,50,587.13,chr11,61816103,61816106,4.778085,6.701829,2.854341,FADS2|chr11:61816107-61816126:-,-,GCGCAGCCGCGTCTGGACCT,61816122,61816106,16
172,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,50,587.13,chr11,61816106,61816109,3.508405,4.190653,2.826158,FADS2|chr11:61816087-61816106:+,+,GCGAGTGGAGACCGGCACCT,61816122,61816105,17
178,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,50,587.13,chr11,61816138,61816141,4.312068,4.916642,3.707494,FADS2|chr11:61816142-61816161:-,-,GGAGTCACATCCTGTCTCGA,61816122,61816141,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,50,587.13,chr11,61815634,61815637,3.834943,7.008400,0.661486,FADS2|chr11:61815638-61815657:-,-,TCTGCTGGGTTGGAGTCCTG,61816122,61815637,485
228,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,50,587.13,chr11,61816605,61816608,4.081816,5.828488,2.335144,FADS2|chr11:61816609-61816628:-,-,ACATCAGCGAGTTCACCCGC,61816122,61816608,486
223,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,50,587.13,chr11,61816609,61816612,3.331579,5.785993,0.877164,FADS2|chr11:61816590-61816609:+,+,GGGAGCCCCCTGGATGCCGG,61816122,61816608,486
224,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,50,587.13,chr11,61816610,61816613,2.587091,3.870377,1.303806,FADS2|chr11:61816591-61816610:+,+,GGAGCCCCCTGGATGCCGGC,61816122,61816609,487


GATA1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/GATA1_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
233,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,50,140.505,chrX,48786601,48786604,6.059515,6.348261,5.770768,GATA1|chrX:48786605-48786624:-,-,GTTCGGCCGCCTTGGGGATG,48786606,48786604,2
234,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,50,140.505,chrX,48786607,48786610,6.167791,7.341844,4.993739,GATA1|chrX:48786611-48786630:-,-,CGGAGGGTTCGGCCGCCTTG,48786606,48786610,4
235,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,50,140.505,chrX,48786608,48786611,5.784577,6.774893,4.794260,GATA1|chrX:48786612-48786631:-,-,GCGGAGGGTTCGGCCGCCTT,48786606,48786611,5
231,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,50,140.505,chrX,48786612,48786615,8.252070,9.732671,6.771470,GATA1|chrX:48786593-48786612:+,+,CTGAGCTTGCCACATCCCCA,48786606,48786611,5
236,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,50,140.505,chrX,48786609,48786612,0.213733,-0.029814,-0.397651,GATA1|chrX:48786613-48786632:-,-,TGCGGAGGGTTCGGCCGCCT,48786606,48786612,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,50,140.505,chrX,48786347,48786350,4.498167,4.097466,4.898868,GATA1|chrX:48786351-48786370:-,-,TCGCTCGACCTAGGCTCCCG,48786606,48786350,256
180,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,50,140.505,chrX,48786351,48786354,4.650292,4.334688,4.965895,GATA1|chrX:48786332-48786351:+,+,GTCTGGACACTACAGTCCAC,48786606,48786350,256
179,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,50,140.505,chrX,48786350,48786353,4.437524,5.674050,3.200998,GATA1|chrX:48786331-48786350:+,+,TGTCTGGACACTACAGTCCA,48786606,48786349,257
178,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,50,140.505,chrX,48786319,48786322,8.178027,7.863331,8.492722,GATA1|chrX:48786323-48786342:-,-,AGTGTCCAGACAAGCAAAAT,48786606,48786322,284


LMO2
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/LMO2_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
385,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,50,41.8122,chr11,33870651,33870654,2.159335,-0.060647,4.379317,LMO2|chr11:33870632-33870651:+,+,CGGGGCTCCTCTCAGTCGGC,33870651,33870650,1
386,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,50,41.8122,chr11,33870654,33870657,1.198144,0.899144,1.497144,LMO2|chr11:33870635-33870654:+,+,GGCTCCTCTCAGTCGGCTGG,33870651,33870653,2
383,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,50,41.8122,chr11,33870647,33870650,2.289392,2.670982,1.907803,LMO2|chr11:33870628-33870647:+,+,GGGCCGGGGCTCCTCTCAGT,33870651,33870646,5
387,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,50,41.8122,chr11,33870657,33870660,1.092278,0.391756,1.792800,LMO2|chr11:33870638-33870657:+,+,TCCTCTCAGTCGGCTGGTGG,33870651,33870656,5
388,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,50,41.8122,chr11,33870638,33870641,2.493317,0.690151,4.296483,LMO2|chr11:33870642-33870661:-,-,TCCGCCACCAGCCGACTGAG,33870651,33870641,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
340,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,50,41.8122,chr11,33870164,33870167,1.078364,1.712396,0.444333,LMO2|chr11:33870145-33870164:+,+,TTCTTAAAGGGGCCAGTGTC,33870651,33870163,488
339,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,50,41.8122,chr11,33870163,33870166,0.036319,0.592588,-0.665227,LMO2|chr11:33870144-33870163:+,+,CTTCTTAAAGGGGCCAGTGT,33870651,33870162,489
343,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,50,41.8122,chr11,33870156,33870159,1.219042,2.154609,0.283474,LMO2|chr11:33870160-33870179:-,-,GCTCTCCCCGTCCCCGACAC,33870651,33870159,492
338,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,50,41.8122,chr11,33870152,33870155,2.161498,1.674550,2.648446,LMO2|chr11:33870133-33870152:+,+,GACTGACTTGGCTTCTTAAA,33870651,33870151,500


MEF2C
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/MEF2C_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
86,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,50,92.2192,chr5,88883382,88883385,2.615784,4.095897,1.135671,MEF2C|chr5:88883363-88883382:+,+,AGTGGCTCTCAGCGGCCGGT,88883363,88883381,18
87,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,50,92.2192,chr5,88883396,88883399,0.517998,0.847483,0.188513,MEF2C|chr5:88883400-88883419:-,-,CGCAGTTTTCTGGACGAGTC,88883363,88883399,36
88,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,50,92.2192,chr5,88883422,88883425,3.306926,7.274405,-0.660553,MEF2C|chr5:88883403-88883422:+,+,TCGTCCAGAAAACTGCGTCC,88883363,88883421,58
84,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,50,92.2192,chr5,88883306,88883309,6.372319,11.405101,1.339538,MEF2C|chr5:88883287-88883306:+,+,CCGAGGCCGCTCGGAAGAGG,88883363,88883305,58
83,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,50,92.2192,chr5,88883303,88883306,1.425222,2.102225,0.74822,MEF2C|chr5:88883284-88883303:+,+,GCGCCGAGGCCGCTCGGAAG,88883363,88883302,61
89,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,50,92.2192,chr5,88883426,88883429,1.329604,1.360232,1.298977,MEF2C|chr5:88883407-88883426:+,+,CCAGAAAACTGCGTCCAGGA,88883363,88883425,62
82,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,50,92.2192,chr5,88883297,88883300,0.17034,-0.563777,0.223098,MEF2C|chr5:88883278-88883297:+,+,TCGCGCGCGCCGAGGCCGCT,88883363,88883296,67
85,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,50,92.2192,chr5,88883286,88883289,0.707277,-0.465464,1.880018,MEF2C|chr5:88883290-88883309:-,-,CCTCCTCTTCCGAGCGGCCT,88883363,88883289,74
80,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,50,92.2192,chr5,88883289,88883292,1.683688,1.429935,1.937441,MEF2C|chr5:88883270-88883289:+,+,CCGCGCATTCGCGCGCGCCG,88883363,88883288,75
81,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,50,92.2192,chr5,88883269,88883272,0.145201,0.327506,-0.617907,MEF2C|chr5:88883273-88883292:-,-,CCTCGGCGCGCGCGAATGCG,88883363,88883272,91


CAT
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/CAT_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
270,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,50,209.739,chr11,34439014,34439017,7.430443,8.008706,6.852179,CAT|chr11:34438995-34439014:+,+,GCACAGCAAACCGCACGCTA,34439013,34439013,0
271,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,50,209.739,chr11,34439025,34439028,2.411193,2.660512,2.161873,CAT|chr11:34439006-34439025:+,+,CGCACGCTATGGCTGACAGC,34439013,34439024,11
272,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,50,209.739,chr11,34439026,34439029,2.130650,3.114399,1.146901,CAT|chr11:34439007-34439026:+,+,GCACGCTATGGCTGACAGCC,34439013,34439025,12
273,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,50,209.739,chr11,34439024,34439027,8.363015,7.769929,8.956101,CAT|chr11:34439028-34439047:-,-,TCTGGTCGCTGGCGGGATCC,34439013,34439027,14
274,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,50,209.739,chr11,34439031,34439034,0.602180,-0.661315,-0.543045,CAT|chr11:34439035-34439054:-,-,TGCTGCATCTGGTCGCTGGC,34439013,34439034,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,50,209.739,chr11,34438513,34438516,0.723489,-0.639677,-0.807301,CAT|chr11:34438517-34438536:-,-,TCTTAAAAAGTCAGCAATCC,34439013,34438516,497
198,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,50,209.739,chr11,34438515,34438518,8.494420,8.791831,8.197008,CAT|chr11:34438496-34438515:+,+,TGTACAGAGTAATTTAACCC,34439013,34438514,499
197,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,50,209.739,chr11,34438446,34438449,2.532412,2.094929,2.969894,CAT|chr11:34438450-34438469:-,-,TAAGTATTATTTGCAACCAA,34439013,34438449,564
195,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,50,209.739,chr11,34438450,34438453,2.550512,3.108920,1.992104,CAT|chr11:34438431-34438450:+,+,AAAAGCATCCATCCATCCTT,34439013,34438449,564


CAPRIN1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/CAPRIN1_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
59,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,50,646.079,chr11,34051739,34051742,3.966681,4.285446,3.647916,CAPRIN1|chr11:34051720-34051739:+,+,GGCTCTCGCCTCACTAGGAG,34051739,34051738,1
60,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,50,646.079,chr11,34051746,34051749,2.931081,3.064672,2.797490,CAPRIN1|chr11:34051727-34051746:+,+,GCCTCACTAGGAGCGGCTCT,34051739,34051745,6
56,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,50,646.079,chr11,34051734,34051737,1.418708,1.064795,1.772620,CAPRIN1|chr11:34051715-34051734:+,+,CTCCCGGCTCTCGCCTCACT,34051739,34051733,6
61,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,50,646.079,chr11,34051754,34051757,1.673882,1.161603,2.186162,CAPRIN1|chr11:34051735-34051754:+,+,AGGAGCGGCTCTCGGTGCAG,34051739,34051753,14
62,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,50,646.079,chr11,34051755,34051758,3.714979,3.893900,3.536059,CAPRIN1|chr11:34051736-34051755:+,+,GGAGCGGCTCTCGGTGCAGC,34051739,34051754,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,50,646.079,chr11,34052218,34052221,0.250910,0.203635,0.298185,CAPRIN1|chr11:34052222-34052241:-,-,CCGCCAGCGGGGCGCCCGGC,34051739,34052221,482
129,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,50,646.079,chr11,34052222,34052225,2.324980,2.246559,2.403400,CAPRIN1|chr11:34052226-34052245:-,-,CGACCCGCCAGCGGGGCGCC,34051739,34052225,486
130,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,50,646.079,chr11,34052230,34052233,0.548402,-0.046034,1.142838,CAPRIN1|chr11:34052234-34052253:-,-,CGCGCGCGCGACCCGCCAGC,34051739,34052233,494
131,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,50,646.079,chr11,34052231,34052234,1.764499,2.064059,1.464939,CAPRIN1|chr11:34052235-34052254:-,-,GCGCGCGCGCGACCCGCCAG,34051739,34052234,495


FADS3
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/FADS3_grna_avg_z_log2FC.qBed
Defaulting FADS3, there are no dolcetto spacers


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
419,chr11,61891295,61891308,"p11@FADS3,0.2877",-,FADS3,50,18.7952,chr11,61891295,61891298,0.612224,-0.609795,-0.614652,FADS3|chr11:61891276-61891295:+,+,CGCCGGGCTGGTCGTGCGCG,61891295,61891294,1
424,chr11,61891295,61891308,"p11@FADS3,0.2877",-,FADS3,50,18.7952,chr11,61891305,61891308,5.976078,5.759413,6.192743,FADS3|chr11:61891309-61891328:-,-,GCCGCTGCCCACCTTCTGCT,61891295,61891308,13
417,chr11,61891295,61891308,"p11@FADS3,0.2877",-,FADS3,50,18.7952,chr11,61891279,61891282,3.075464,2.726980,3.423949,FADS3|chr11:61891260-61891279:+,+,ATGACCAGCCACTTGTCGCC,61891295,61891278,17
420,chr11,61891295,61891308,"p11@FADS3,0.2877",-,FADS3,50,18.7952,chr11,61891314,61891317,5.315120,5.175822,5.454418,FADS3|chr11:61891295-61891314:+,+,GCGGATCTGCTCCCAGCAGA,61891295,61891313,18
416,chr11,61891295,61891308,"p11@FADS3,0.2877",-,FADS3,50,18.7952,chr11,61891278,61891281,4.214660,4.103227,4.326094,FADS3|chr11:61891259-61891278:+,+,GATGACCAGCCACTTGTCGC,61891295,61891277,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,chr11,61891295,61891308,"p11@FADS3,0.2877",-,FADS3,50,18.7952,chr11,61890803,61890806,0.922583,0.860487,0.984679,FADS3|chr11:61890807-61890826:-,-,GCTTGCTGAGTGCTAGGGGT,61891295,61890806,489
515,chr11,61891295,61891308,"p11@FADS3,0.2877",-,FADS3,50,18.7952,chr11,61891782,61891785,4.015581,3.721071,4.310091,FADS3|chr11:61891786-61891805:-,-,GCCGGCAGCCCGGGAAAAGG,61891295,61891785,490
353,chr11,61891295,61891308,"p11@FADS3,0.2877",-,FADS3,50,18.7952,chr11,61890797,61890800,3.137139,2.806396,3.467882,FADS3|chr11:61890801-61890820:-,-,TGAGTGCTAGGGGTAGGGCA,61891295,61890800,495
512,chr11,61891295,61891308,"p11@FADS3,0.2877",-,FADS3,50,18.7952,chr11,61891794,61891797,2.234073,2.408941,2.059205,FADS3|chr11:61891775-61891794:+,+,CCGTCCCGCCCCCTTTTCCC,61891295,61891793,498


CD164
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/CD164_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
116,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,50,980.988,chr6,109382399,109382402,12.114747,11.043089,13.186405,CD164|chr6:109382380-109382399:+,+,GTGTCCTCAGCGCTGGCGTT,109382397,109382398,1
117,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,50,980.988,chr6,109382400,109382403,5.646423,4.315129,6.977716,CD164|chr6:109382381-109382400:+,+,TGTCCTCAGCGCTGGCGTTC,109382397,109382399,2
115,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,50,980.988,chr6,109382392,109382395,8.427411,7.847494,9.007328,CD164|chr6:109382373-109382392:+,+,CGACATCGTGTCCTCAGCGC,109382397,109382391,6
118,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,50,980.988,chr6,109382383,109382386,10.020977,7.000458,13.041495,CD164|chr6:109382387-109382406:-,-,TCTCCCGAACGCCAGCGCTG,109382397,109382386,11
119,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,50,980.988,chr6,109382412,109382415,7.676821,8.109027,7.244616,CD164|chr6:109382393-109382412:+,+,TGGCGTTCGGGAGAAAGCTA,109382397,109382411,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
209,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,50,980.988,chr6,109382888,109382891,1.013584,0.260975,1.766193,CD164|chr6:109382869-109382888:+,+,GGGCGGAAGTTGGTTTTGTG,109382397,109382887,490
0,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,50,980.988,chr6,109381906,109381909,0.421911,-0.314936,-0.528886,CD164|chr6:109381887-109381906:+,+,CCCATTTCCGACCCCAAGTG,109382397,109381905,492
4,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,50,980.988,chr6,109381899,109381902,0.431882,-0.094046,-0.769718,CD164|chr6:109381903-109381922:-,-,TCGTAATTCCTAGCCTCACT,109382397,109381902,495
3,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,50,980.988,chr6,109381898,109381901,0.116620,0.133226,-0.366466,CD164|chr6:109381902-109381921:-,-,CGTAATTCCTAGCCTCACTT,109382397,109381901,496


PVT1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/PVT1_grna_avg_z_log2FC.qBed
Defaulting PVT1, there are no dolcetto spacers
Defaulting PVT1, there are no weissman spacers


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
141,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,50,10.1724,chr8,127794566,127794569,0.064302,-0.277569,0.406174,PVT1|chr8:127794547-127794566:+,+,CGTGTGGCGGCCGAGCACAT,127794568,127794565,3
140,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,50,10.1724,chr8,127794565,127794568,0.667675,-0.414635,-0.920716,PVT1|chr8:127794546-127794565:+,+,GCGTGTGGCGGCCGAGCACA,127794568,127794564,4
142,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,50,10.1724,chr8,127794579,127794582,0.212533,-0.441147,0.016080,PVT1|chr8:127794560-127794579:+,+,AGCACATGGGCCCGCGGGCC,127794568,127794578,10
139,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,50,10.1724,chr8,127794553,127794556,0.296573,-0.368022,0.961168,PVT1|chr8:127794534-127794553:+,+,TCCGGGCAGAGCGCGTGTGG,127794568,127794552,16
138,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,50,10.1724,chr8,127794550,127794553,1.941871,1.462807,2.420934,PVT1|chr8:127794531-127794550:+,+,TCCTCCGGGCAGAGCGCGTG,127794568,127794549,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,50,10.1724,chr8,127795063,127795066,0.075572,-0.576182,0.425037,PVT1|chr8:127795067-127795086:-,-,CAGACCTCTAGTTTCGCCAG,127794568,127795066,498
117,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,50,10.1724,chr8,127794057,127794060,0.463055,-0.573099,1.499209,PVT1|chr8:127794061-127794080:-,-,GGCGAGTTCAGTGAAATAGG,127794568,127794060,508
116,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,50,10.1724,chr8,127794035,127794038,0.376384,-0.452936,-0.299832,PVT1|chr8:127794039-127794058:-,-,GGCCAGAGATGCTCCGGGAT,127794568,127794038,530
115,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,50,10.1724,chr8,127794025,127794028,1.223419,3.752820,-1.305981,PVT1|chr8:127794006-127794025:+,+,CTCAGTCGGCGGTCTTCTGC,127794568,127794024,544


FADS1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/FADS1_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
60,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,50,606.805,chr11,61816091,61816094,3.268773,3.587657,2.949889,FADS1|chr11:61816072-61816091:+,+,TCTCCGCTCCTGCTGGCGAG,61816091,61816090,1
57,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,50,606.805,chr11,61816084,61816087,2.708075,2.349841,3.066309,FADS1|chr11:61816065-61816084:+,+,TGCTCCCTCTCCGCTCCTGC,61816091,61816083,8
65,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,50,606.805,chr11,61816097,61816100,3.948044,4.131121,3.764968,FADS1|chr11:61816101-61816120:-,-,CCGCGTCTGGACCTAGGTGC,61816091,61816100,9
62,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,50,606.805,chr11,61816079,61816082,1.146076,0.579593,1.712559,FADS1|chr11:61816083-61816102:-,-,GCCGGTCTCCACTCGCCAGC,61816091,61816082,9
63,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,50,606.805,chr11,61816106,61816109,6.563594,7.254408,5.872779,FADS1|chr11:61816087-61816106:+,+,GCGAGTGGAGACCGGCACCT,61816091,61816105,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,50,606.805,chr11,61815607,61815610,0.089589,0.192481,-0.013303,FADS1|chr11:61815611-61815630:-,-,AGTCCTAGGGACTCGAAAGT,61816091,61815610,481
3,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,50,606.805,chr11,61815605,61815608,2.091440,1.991028,2.191852,FADS1|chr11:61815609-61815628:-,-,TCCTAGGGACTCGAAAGTCG,61816091,61815608,483
110,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,50,606.805,chr11,61816576,61816579,5.123604,5.197955,5.049253,FADS1|chr11:61816557-61816576:+,+,TGGCATCCTGCCCGGCGTAG,61816091,61816575,484
1,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,50,606.805,chr11,61815598,61815601,2.671447,2.678321,2.664573,FADS1|chr11:61815579-61815598:+,+,CAAGGGGGAAACGCGGTGAA,61816091,61815597,494


ERP29
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/ERP29_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
62,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,50,270.889,chr12,112013571,112013574,9.421944,13.493027,5.350860,ERP29|chr12:112013552-112013571:+,+,CGGCAGCGGCCTGCACACCA,112013538,112013570,32
63,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,50,270.889,chr12,112013572,112013575,12.093925,15.577785,8.610065,ERP29|chr12:112013553-112013572:+,+,GGCAGCGGCCTGCACACCAA,112013538,112013571,33
61,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,50,270.889,chr12,112013482,112013485,7.499764,8.616430,6.383098,ERP29|chr12:112013486-112013505:-,-,GCGGGGAGAGAAATGCGGCG,112013538,112013485,53
60,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,50,270.889,chr12,112013481,112013484,6.449639,9.551034,3.348244,ERP29|chr12:112013485-112013504:-,-,CGGGGAGAGAAATGCGGCGC,112013538,112013484,54
59,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,50,270.889,chr12,112013480,112013483,10.908533,15.886417,5.930649,ERP29|chr12:112013484-112013503:-,-,GGGGAGAGAAATGCGGCGCG,112013538,112013483,55
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,50,270.889,chr12,112013047,112013050,1.453099,1.471845,1.434353,ERP29|chr12:112013028-112013047:+,+,CTGCCTGACGGCCCAAAGGC,112013538,112013046,492
1,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,50,270.889,chr12,112013043,112013046,2.023805,1.892952,2.154658,ERP29|chr12:112013024-112013043:+,+,CCAACTGCCTGACGGCCCAA,112013538,112013042,496
127,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,50,270.889,chr12,112014038,112014041,9.596340,12.424374,6.768305,ERP29|chr12:112014042-112014061:-,-,GAAGAGGGGCGCGGACACGC,112013538,112014041,503
0,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,50,270.889,chr12,112013035,112013038,1.653160,0.834384,2.471936,ERP29|chr12:112013016-112013035:+,+,GGTCCCAACCAACTGCCTGA,112013538,112013034,504


FEN1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/FEN1_grna_avg_z_log2FC.qBed


Unnamed: 0,chr1,start1,end1,simple,strand1,gene,covered,mean,chr,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand,sequence,tss_position,pam_coordinate,abs_distance_to_summit
283,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,50,268.967,chr11,61792946,61792949,2.074577,-0.178871,4.328025,FEN1|chr11:61792927-61792946:+,+,ACCCGCCGCTAAGCTGAGAA,61792946,61792945,1
286,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,50,268.967,chr11,61792931,61792934,2.609592,1.862148,3.357036,FEN1|chr11:61792935-61792954:-,-,GCTCTCCCTTCTCAGCTTAG,61792946,61792934,12
285,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,50,268.967,chr11,61792928,61792931,0.223999,-0.178871,0.626870,FEN1|chr11:61792932-61792951:-,-,CTCCCTTCTCAGCTTAGCGG,61792946,61792931,15
284,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,50,268.967,chr11,61792927,61792930,4.692887,2.086313,7.299461,FEN1|chr11:61792931-61792950:-,-,TCCCTTCTCAGCTTAGCGGC,61792946,61792930,16
282,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,50,268.967,chr11,61792924,61792927,4.107559,5.814942,2.400176,FEN1|chr11:61792928-61792947:-,-,CTTCTCAGCTTAGCGGCGGG,61792946,61792927,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
234,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,50,268.967,chr11,61792434,61792437,2.185505,4.125841,0.245170,FEN1|chr11:61792438-61792457:-,-,CTAGACGCTCCTGGAACCTC,61792946,61792437,509
233,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,50,268.967,chr11,61792428,61792431,4.083541,2.916144,5.250939,FEN1|chr11:61792432-61792451:-,-,GCTCCTGGAACCTCCGGCTT,61792946,61792431,515
231,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,50,268.967,chr11,61792402,61792405,0.018061,-0.782514,0.746391,FEN1|chr11:61792383-61792402:+,+,CCTAAGGAGTTCATGGCAAG,61792946,61792401,545
230,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,50,268.967,chr11,61792401,61792404,1.433677,1.058042,1.809312,FEN1|chr11:61792382-61792401:+,+,ACCTAAGGAGTTCATGGCAA,61792946,61792400,546


Unnamed: 0,gene,variable,value
0,MYC,dolcetto,0.307046
1,MYC,dolcetto,0.788394
2,MYC,weissman,2.809243
3,MYC,weissman,1.714057
4,MYC,weissman,1.532038
...,...,...,...
310,FEN1,nearest10,4.078306
311,FEN1,nearest10,5.021488
312,FEN1,nearest10,5.333635
313,FEN1,nearest10,3.733789


In [157]:
##### Now try refgene
# First just plot 10 grnas closest to each refgene
refgene_schema = ['bin','name','chrom','strand','txstart','txend','cdsstart','cdsend','exoncount','exonstarts','exonends','score','symbol','cdsstartstat','cdsendstat','exonframes']
refgene_file = '/Users/davidy/misc_resources/refGene.txt'
isoform_priority = '/Users/davidy/misc_resources/isoform_index_with_K562_pol2_TSS.out'

refgene_df = pd.read_csv(refgene_file, sep = '\t', header=None, names=refgene_schema, usecols=['chrom','txstart','txend','strand','symbol'])
refgene_df['tss_start'] = np.where(refgene_df['strand']=='+', refgene_df['txstart'], refgene_df['txend'])
refgene_df['tss_end'] = refgene_df['tss_start']+1
refgene_df['tss_cc'] = refgene_df['chrom'] + '_' + refgene_df['tss_start'].astype(str) + '_' + refgene_df['tss_end'].astype(str)

refgene_df = refgene_df.loc[refgene_df['symbol'].isin(screen_genes)]
display(refgene_df)
refgene_df.to_csv('HCRFF_refgene.bed', sep = '\t', index=False, header=False, columns=['chrom','tss_start','tss_end','tss_cc','strand','symbol'])

lod = []

for gene in new_screen_genes:
    avg_dolcetto_effect = 0
    nearest_10_avg_effect = 0
    adf = {}
    gene_gq_files = ['/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide' + '/' + f for f in gq_files if re.match(r'{}'.format(gene), f)]
    
    print(gene)
    try:
        print(gene_gq_files[0])
    except:
        print(gene, 'is missing a guide file for whatever fucking reason, skip. Probably no significant elements?')
        continue
        
    gene_gq_df = pd.read_csv(gene_gq_files[0], sep = '\t', names = ['chr','start','end','avg_zlog2FC', 'z_log2FC_R1', 'z_log2FC_R2','name', 'strand', 'sequence'])
    #display(gene_gq_df)

    # Get the gRNAs that intersect the TSS ranges
    # Note, using window. +/- 1000 around TSS. Just to make sure we have enough gRNAs to sample from if the summit is close to a TSS range edge. 
    fopen = open('temp_gene_grna_tss_intersect.bed', 'w')
    subprocess.call(['bedtools','window','-a','HCRFF_refgene.bed', '-b',gene_gq_files[0], '-w', '500'],stdout=fopen)
    fopen.close()
    
    # Only keep rows relating to the gene, to avoid duplicates
    first_columns = ['chrom','tss_start','tss_end','tss_cc','strand','symbol']
    second_columns = ['chr','start','end','avg_zlog2FC', 'z_log2FC_R1', 'z_log2FC_R2','name', 'strand', 'sequence']
    int_df = pd.read_csv('temp_gene_grna_tss_intersect.bed', sep = '\t', names = first_columns+second_columns)
    int_df = int_df.loc[int_df['symbol']==gene]
    
    
    # Get absolute distance from grna to promoter.
    # Distance is relative to the strand-aware TSS edge (closest to tx start), 0-based index for end position.
    int_df['tss_position'] = int_df['start1']
    int_df['pam_coordinate'] = np.where(int_df['strand']=='-',int_df['end'],int_df['start'].astype(int)-1)
    int_df['abs_distance_to_summit'] = abs(int_df['pam_coordinate'] - int_df['tss_position'])
    #display(int_df)
    # There may be multiple DHS peaks if a gene has multiple TSSs
    # Get the effects local to each DHS summit
    
    int_df2 = int_df.groupby('tss_cc').apply(lambda x : np.mean(x.sort_values(by = 'abs_distance_to_summit').head(10))).reset_index()
    if gene == 'MYC':
        display(int_df2)
    for i, row in int_df2.iterrows():
        if gene == 'MYC':
            print(row)
        #adf = {'gene':gene,
        #       'simple':row['simple'],
        #      'nearest_10_avg_effect':row['avg_zlog2FC'],
        #      'dolcetto_number':len(dolcetto_spacers),
        #      'dolcetto_avg_effect':avg_dolcetto_effect,
        #      'weissman_number':len(weissman_spacers),
        #      'weissman_avg_effect':avg_weissman_effect}

        adf = {'gene':gene,
              'variable':'refgene',
              'value':row['avg_zlog2FC']}
        lod.append(adf)
    

# Create and transform df
refgene_grna_df = pd.DataFrame(lod)
display(refgene_grna_df)

fantom_refgene_library_df = pd.concat([promoter_df, refgene_grna_df]).drop_duplicates()
fantom_refgene_library_df.to_csv('test_refgene.csv')
### Melt the datafarme
#promoter_df2 = promoter_df.melt(id_vars=['gene','simple'],value_vars=['nearest_10_avg_effect','dolcetto_avg_effect','weissman_avg_effect'])
#display(promoter_df2)

sns.set(rc={'figure.figsize':(15,5)})
sns.set(style="ticks")
g4 = sns.stripplot(data=fantom_refgene_library_df,x='gene',y='value',hue='variable',palette='colorblind',
                   size=3, dodge=True, jitter=False)
outplot = 'Scatterplot_DolcettoWeissmanPromoter_vs_FANTOM_Refgene_NarrowNearestTSS10.pdf'
plt.xticks(rotation=45)
plt.savefig(outplot)
plt.close('all')


Unnamed: 0,chrom,strand,txstart,txend,symbol,tss_start,tss_end,tss_cc
2207,chr8,+,127736230,127742951,MYC,127736230,127736231,chr8_127736230_127736231
2454,chr5,-,88717116,88883184,MEF2C,88883184,88883185,chr5_88883184_88883185
5303,chr11,-,33858575,33869878,LMO2,33869878,33869879,chr11_33869878_33869879
5304,chr11,-,33858575,33869878,LMO2,33869878,33869879,chr11_33869878_33869879
6361,chr5,-,88718242,88883184,MEF2C,88883184,88883185,chr5_88883184_88883185
...,...,...,...,...,...,...,...,...
80866,chr5,-,88718242,88785511,MEF2C,88785511,88785512,chr5_88785511_88785512
84791,chr11,+,34051730,34099060,CAPRIN1,34051730,34051731,chr11_34051730_34051731
86136,chrX,+,48802066,48806960,HDAC6,48802066,48802067,chrX_48802066_48802067
86137,chrX,+,48802066,48814854,HDAC6,48802066,48802067,chrX_48802066_48802067


MYC
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/MYC_grna_avg_z_log2FC.qBed


Unnamed: 0,tss_cc,start1,end1,start,end,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr8_127735433_127735434,127735433.0,127735434.0,127735441.1,127735444.1,0.433534,0.075298,0.402245,127735433.0,127735442.1,21.7
1,chr8_127736230_127736231,127736230.0,127736231.0,127736233.5,127736236.5,0.892306,0.946643,0.384959,127736230.0,127736233.3,43.9


tss_cc                    chr8_127735433_127735434
start1                                 127735433.0
end1                                   127735434.0
start                                  127735441.1
end                                    127735444.1
avg_zlog2FC                               0.433534
z_log2FC_R1                               0.075298
z_log2FC_R2                               0.402245
tss_position                           127735433.0
pam_coordinate                         127735442.1
abs_distance_to_summit                        21.7
Name: 0, dtype: object
tss_cc                    chr8_127736230_127736231
start1                                 127736230.0
end1                                   127736231.0
start                                  127736233.5
end                                    127736236.5
avg_zlog2FC                               0.892306
z_log2FC_R1                               0.946643
z_log2FC_R2                               0.384959
tss_posi

Unnamed: 0,gene,variable,value
0,MYC,refgene,0.433534
1,MYC,refgene,0.892306
2,HDAC6,refgene,0.327047
3,HDAC6,refgene,1.47595
4,HDAC6,refgene,7.673144
5,HDAC6,refgene,0.807141
6,NMU,refgene,4.291657
7,NMU,refgene,3.662439
8,FADS2,refgene,4.667887
9,FADS2,refgene,3.079941


In [176]:
fantom_tss_file = 'HCRFF_FANTOM5_hg38.bed' # ['chr','start','end','simple','strand','gene']
refgene_tss_file = 'HCRFF_refgene.bed' # ['chrom','tss_start','tss_end','tss_cc','strand','symbol']

path_to_bwtool = '/Users/davidy/pythonscripts/bigWigAverageOverBed'
pol2bigwig_file = '/Users/davidy/misc_resources/chip/results/macs2/ENCFF914WIS_POL2RA_hg38_K562.bigWig'
distance_range_window = 250

# Write out FANTOM TSSs of genes of interest only
# Have to update the coordinates to make them +/- distance bp
# Note that the TSS positions are variable width
# Add +/-X bases as evenly as possible to both sides
# If imbalanced, add more bases in a strand-specific way further from tx start

### Process fantom and refseq get strongest peaks
fantom_df_again = pd.read_csv(fantom_tss_file, sep = '\t', header=None, names = ['chr','start','end','cc','strand','gene'])
refgene_df_again = pd.read_csv(refgene_tss_file, sep = '\t', header=None, names = ['chr','start','end','gene','strand','cc'])
def strongest_bw(df, dataset):
    lod = []
    for i, row in df.iterrows():
        tss_length = row['end']-row['start']
        split_difference = ((distance_range_window-tss_length)/2)//1
        new_start = row['start']-split_difference
        new_end = row['end']+split_difference
        if new_end-new_start < distance_range_window: # If smaller, need to add 1 base, arbitrarily add in strand-specific manner further from tx start
            if row['strand'] == '+':
                new_start -= 1
            else:
                new_end += 1
        adf = {'chr':row['chr'],
              'start':new_start,
              'end':new_end,
              'cc':row['cc'],
              'strand':row['strand'],
              'gene':row['gene']}
        lod.append(adf)
    # Write df
    temp_df = pd.DataFrame(lod)
    temp_df['start'] = temp_df['start'].astype(int)
    temp_df['end'] = temp_df['end'].astype(int)
    temp_df.drop_duplicates().to_csv('temp_df.bed', sep = '\t', index=False, header=False, columns = ['chr','start','end','cc'])
    # Run bigwig average over bed
    out_bwavg = 'temp_df_bigwigaverage.bed'
    subprocess.call([path_to_bwtool, pol2bigwig_file, 'temp_df.bed', out_bwavg])
    # Process
    bwavg_df = pd.read_csv(out_bwavg, sep = '\t', header=None, names = ['cc','size','covered','sum','mean0','mean'], usecols=['cc','covered','mean'])
    # Join dataframes, dropping any incomplete lines
    merged_df = df.merge(bwavg_df, on='cc').dropna()
    # Select the strongest bwavg mean signal cc for each gene
    merged_df = merged_df.sort_values('mean',ascending=False).drop_duplicates('gene',keep='first')
    # Provide a screen type row
    merged_df['variable'] = dataset
    # Return df
    return(merged_df)

fantom_refseq_strongest_df = pd.concat([strongest_bw(fantom_df_again, 'fantom'), 
                                        strongest_bw(refgene_df_again, 'refseq')])

display(fantom_refseq_strongest_df)
fantom_refseq_strongest_file = 'HCRFF_FANTOM_refGene_bestTSS_{}bp_bwavgoverbed.bed'.format(distance_range_window)
fantom_refseq_strongest_df.to_csv(fantom_refseq_strongest_file, sep = '\t', index=False, header=False)

### Now get the guides for each
lod = []

for gene in new_screen_genes:
    avg_dolcetto_effect = 0
    avg_weissman_effect = 0
    adf = {}
    gene_gq_files = ['/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide' + '/' + f for f in gq_files if re.match(r'{}'.format(gene), f)]
    
    print(gene)
    try:
        print(gene_gq_files[0])
    except:
        print(gene, 'is missing a guide file for whatever fucking reason, skip. Probably no significant elements?')
        continue
        
    gene_gq_df = pd.read_csv(gene_gq_files[0], sep = '\t', names = ['chr','start','end','avg_zlog2FC', 'z_log2FC_R1', 'z_log2FC_R2','name', 'strand', 'sequence'])
    
    ### Get the Dolcetto library spacers
    temp_dolcetto_df = dolcetto_all_df.loc[dolcetto_all_df['symbol']==gene]
    dolcetto_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(temp_dolcetto_df['spacer'])]
    #dolcetto_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(dolcetto_spacers)]
    #display(dolcetto_spacers)
    if len(dolcetto_spacers) == 0:
        print('Defaulting {}, there are no dolcetto spacers'.format(gene))
        dolcetto_spacers = [0]
        avg_dolcetto_effect = 0
    else:
        avg_dolcetto_effect = np.mean(dolcetto_spacers['avg_zlog2FC'])
        #for i in dolcetto_spacers['avg_zlog2FC'].tolist():
        adf = {'gene':gene,
              'variable':'dolcetto',
              'value':avg_dolcetto_effect}
               #'value':float(i)}
        lod.append(adf)

    ### Get the Weissman library spacers
    temp_weissman_df = weissman_all_df.loc[weissman_all_df['symbol']==gene]
    weissman_spacers = gene_gq_df.loc[gene_gq_df['sequence'].str[1:].isin(temp_weissman_df['spacer'])]
    #weissman_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(weissman_spacers)]
    if len(weissman_spacers) == 0:
        print('Defaulting {}, there are no weissman spacers'.format(gene))
        weissman_spacers = [0]
        avg_weissman_effect = 0
    else:
        avg_weissman_effect = np.mean(weissman_spacers['avg_zlog2FC'])
        #for i in weissman_spacers['avg_zlog2FC'].tolist():
        adf = {'gene':gene,
              'variable':'weissman',
              'value':avg_weissman_effect}
               #'value':float(i)}
        lod.append(adf)
            
            
    # Get the gRNAs that intersect the TSS ranges
    # Note, using window. +/- 1000 around TSS. Just to make sure we have enough gRNAs to sample from if the summit is close to a TSS range edge. 
    fopen = open('temp_gene_grna_tss_intersect.bed', 'w')
    subprocess.call(['bedtools','window','-a',fantom_refseq_strongest_file, '-b',gene_gq_files[0], '-w', '500'],stdout=fopen)
    fopen.close()
    
    # Only keep rows relating to the gene, to avoid duplicates
    first_columns = ['chr','start','end','cc','strand','gene','covered','mean','variable']
    second_columns = ['chr2','start2','end2','avg_zlog2FC', 'z_log2FC_R1', 'z_log2FC_R2', 'name', 'strand2', 'sequence']
    int_df = pd.read_csv('temp_gene_grna_tss_intersect.bed', sep = '\t', names = first_columns+second_columns)
    int_df = int_df.loc[int_df['gene']==gene]
    
    # Get absolute distance from grna to promoter.
    # Distance is relative to the strand-aware TSS edge (closest to tx start), 0-based index for end position.
    int_df['tss_position'] = np.where(int_df['strand']=='+',int_df['end']-1,int_df['start'])
    int_df['pam_coordinate'] = np.where(int_df['strand']=='-',int_df['end'],int_df['start'].astype(int)-1)
    int_df['abs_distance_to_summit'] = abs(int_df['pam_coordinate'] - int_df['tss_position'])
    display(int_df)
    int_df2 = int_df.groupby('cc').apply(lambda x : np.mean(x.sort_values(by = 'abs_distance_to_summit').head(10))).reset_index()
    display(int_df2)
    for i, row in int_df2.iterrows():
        if '@' in row['cc']:
            variable = 'FANTOM'
        else:
            variable='refGene'
            
        adf = {'gene':gene,
              'variable':variable,
              'value':row['avg_zlog2FC']}
        lod.append(adf)
    
fantom_refseq_pol2_df = pd.DataFrame(lod)

display(fantom_refseq_pol2_df)
sns.set(rc={'figure.figsize':(15,5)})
sns.set(style="ticks")
g4 = sns.stripplot(data=fantom_refseq_pol2_df,x='gene',y='value',hue='variable',palette='colorblind',
                   size=3, dodge=True, jitter=False)
outplot = 'Scatterplot_DolcettoWeissmanPromoter_vs_FANTOM_Refgene_Pol2OptimalTSS.pdf'
plt.xticks(rotation=45)
plt.savefig(outplot)
plt.close('all')
    



Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable
170,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,250,778.782,fantom
179,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,250,776.357,fantom
49,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,250,573.169,fantom
289,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,250,556.746,fantom
16,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,250,548.255,fantom
219,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,250,468.251,fantom
39,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,250,264.718,fantom
119,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,250,211.533,fantom
30,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,250,178.581,fantom
120,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,250,154.871,fantom


MYC
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/MYC_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
0,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,250,776.357,fantom,chr8,...,127735728,0.241474,0.329632,-0.812581,MYC|chr8:127735729-127735748:-,-,TGAGTATAAATCATCGCAGG,127736240,127736223,17
1,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,250,776.357,fantom,chr8,...,127735752,0.260961,-0.166001,-0.355922,MYC|chr8:127735730-127735749:+,+,CTGCGATGATTTATACTCAC,127736240,127736223,17
2,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,250,776.357,fantom,chr8,...,127735731,0.186258,-0.047650,-0.324865,MYC|chr8:127735732-127735751:-,-,CTGTGAGTATAAATCATCGC,127736240,127736223,17
3,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,250,776.357,fantom,chr8,...,127735758,0.117600,-0.000644,-0.234556,MYC|chr8:127735736-127735755:+,+,TGATTTATACTCACAGGACA,127736240,127736223,17
4,chr8,127736224,127736241,"p1@MYC,0.3321",+,MYC,250,776.357,fantom,chr8,...,127735787,3.750333,4.811261,2.689405,MYC|chr8:127735765-127735784:+,+,TTTGTCAAACAGTACTGCTA,127736240,127736223,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319,chr8,127736230,127736231,chr8_127736230_127736231,+,MYC,250,774.255,refseq,chr8,...,127736703,1.116844,1.362151,0.871537,MYC|chr8:127736681-127736700:+,+,ATGAGTCGAATGCCTAAATA,127736230,127736229,1
320,chr8,127736230,127736231,chr8_127736230_127736231,+,MYC,250,774.255,refseq,chr8,...,127736695,0.377912,-0.279541,-0.476283,MYC|chr8:127736696-127736715:-,-,GGAGAAAAGACACCCTATTT,127736230,127736229,1
321,chr8,127736230,127736231,chr8_127736230_127736231,+,MYC,250,774.255,refseq,chr8,...,127736716,0.651603,0.789056,0.514149,MYC|chr8:127736717-127736736:-,-,AGTGTCAATAGCGCAGGAAT,127736230,127736229,1
322,chr8,127736230,127736231,chr8_127736230_127736231,+,MYC,250,774.255,refseq,chr8,...,127736717,0.963298,0.263854,1.662742,MYC|chr8:127736718-127736737:-,-,AAGTGTCAATAGCGCAGGAA,127736230,127736229,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr8_127736230_127736231,127736230.0,127736231.0,250.0,774.255,127736347.3,127736350.3,0.350121,-0.010705,0.331058,127736230.0,127736229.0,1.0
1,"p1@MYC,0.3321",127736224.0,127736241.0,250.0,776.357,127736344.9,127736347.9,0.348172,0.038858,0.285392,127736240.0,127736223.0,17.0


HDAC6
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/HDAC6_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
0,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,250,468.2510,fantom,chrX,...,48801752,2.093878,2.109288,2.078468,HDAC6|chrX:48801730-48801749:+,+,GCCGCTTGTGAGCTCGCGAG,48802245,48802235,10
1,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,250,468.2510,fantom,chrX,...,48801757,0.825255,1.042766,0.607744,HDAC6|chrX:48801735-48801754:+,+,TTGTGAGCTCGCGAGAGGTG,48802245,48802235,10
2,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,250,468.2510,fantom,chrX,...,48801769,0.210301,1.263385,-0.842782,HDAC6|chrX:48801747-48801766:+,+,GAGAGGTGTGGCTCGCGCGA,48802245,48802235,10
3,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,250,468.2510,fantom,chrX,...,48801798,4.403949,5.289515,3.518382,HDAC6|chrX:48801776-48801795:+,+,CCTTGAGGCACGGTCCCCTC,48802245,48802235,10
4,chrX,48802236,48802246,"p7@HDAC6,0.4811",+,HDAC6,250,468.2510,fantom,chrX,...,48801801,5.014956,6.079639,3.950273,HDAC6|chrX:48801779-48801798:+,+,TGAGGCACGGTCCCCTCTGG,48802245,48802235,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,chrX,48801397,48801398,chrX_48801397_48801398,+,HDAC6,250,69.2027,refseq,chrX,...,48801868,4.191198,4.918947,3.463449,HDAC6|chrX:48801869-48801888:-,-,CAAGTTTCTGGCGACCTGTG,48801397,48801396,1
511,chrX,48801397,48801398,chrX_48801397_48801398,+,HDAC6,250,69.2027,refseq,chrX,...,48801892,1.693225,3.085562,0.300888,HDAC6|chrX:48801870-48801889:+,+,ACAGGTCGCCAGAAACTTGG,48801397,48801396,1
512,chrX,48801397,48801398,chrX_48801397_48801398,+,HDAC6,250,69.2027,refseq,chrX,...,48801869,1.437713,1.786357,1.089069,HDAC6|chrX:48801870-48801889:-,-,CCAAGTTTCTGGCGACCTGT,48801397,48801396,1
513,chrX,48801397,48801398,chrX_48801397_48801398,+,HDAC6,250,69.2027,refseq,chrX,...,48801870,0.492322,-0.739130,-0.245514,HDAC6|chrX:48801871-48801890:-,-,ACCAAGTTTCTGGCGACCTG,48801397,48801396,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chrX_48801397_48801398,48801397.0,48801398.0,250.0,69.2027,48801543.3,48801546.3,1.035372,1.207344,0.552806,48801397.0,48801396.0,1.0
1,"p7@HDAC6,0.4811",48802236.0,48802246.0,250.0,468.251,48802298.6,48802301.6,3.042227,3.751161,2.113879,48802245.0,48802235.0,10.0


NMU
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/NMU_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
63,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,250,154.871,fantom,chr4,...,55635763,0.207178,-0.590572,0.176217,NMU|chr4:55635741-55635760:+,+,GTACCCGTTTCCACGGGCCG,55636262,55636273,11
64,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,250,154.871,fantom,chr4,...,55635764,0.213881,-0.423460,-0.004302,NMU|chr4:55635742-55635761:+,+,TACCCGTTTCCACGGGCCGG,55636262,55636273,11
65,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,250,154.871,fantom,chr4,...,55635765,0.136091,0.574230,-0.846413,NMU|chr4:55635743-55635762:+,+,ACCCGTTTCCACGGGCCGGG,55636262,55636273,11
66,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,250,154.871,fantom,chr4,...,55635770,0.002267,0.157013,-0.161547,NMU|chr4:55635748-55635767:+,+,TTTCCACGGGCCGGGGGGTC,55636262,55636273,11
67,chr4,55636262,55636273,"p3@NMU,0.4754",-,NMU,250,154.871,fantom,chr4,...,55635781,2.141908,0.706615,3.577202,NMU|chr4:55635759-55635778:+,+,CGGGGGGTCTGGAAATCCCG,55636262,55636273,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509,chr4,55636298,55636299,chr4_55636298_55636299,-,NMU,250,133.044,refseq,chr4,...,55636763,6.869965,7.961481,5.778449,NMU|chr4:55636741-55636760:+,+,CCAGTTCCAAGCGCAATGCC,55636298,55636299,1
510,chr4,55636298,55636299,chr4_55636298_55636299,-,NMU,250,133.044,refseq,chr4,...,55636743,5.052001,5.942644,4.161357,NMU|chr4:55636744-55636763:-,-,CCAGGCATTGCGCTTGGAAC,55636298,55636299,1
511,chr4,55636298,55636299,chr4_55636298_55636299,-,NMU,250,133.044,refseq,chr4,...,55636775,8.203020,8.862285,7.543755,NMU|chr4:55636753-55636772:+,+,GCAATGCCTGGTGCATATCA,55636298,55636299,1
512,chr4,55636298,55636299,chr4_55636298_55636299,-,NMU,250,133.044,refseq,chr4,...,55636761,9.385516,11.805563,6.965470,NMU|chr4:55636762-55636781:-,-,ACACTGCCCTGATATGCACC,55636298,55636299,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr4_55636298_55636299,55636298.0,55636299.0,250.0,133.044,55636344.4,55636347.4,7.990274,7.961473,7.912621,55636298.0,55636299.0,1.0
1,"p3@NMU,0.4754",55636262.0,55636273.0,250.0,154.871,55636319.9,55636322.9,8.379961,8.514787,8.162263,55636262.0,55636273.0,11.0


FADS2
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/FADS2_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
113,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,250,556.746,fantom,chr11,...,61815627,0.869831,0.951571,0.788092,FADS2|chr11:61815605-61815624:+,+,GCCCCGACTTTCGAGTCCCT,61816122,61816109,13
114,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,250,556.746,fantom,chr11,...,61815637,3.834943,7.008400,0.661486,FADS2|chr11:61815638-61815657:-,-,TCTGCTGGGTTGGAGTCCTG,61816122,61816109,13
115,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,250,556.746,fantom,chr11,...,61815661,3.011269,4.303484,1.719053,FADS2|chr11:61815639-61815658:+,+,AGGACTCCAACCCAGCAGAG,61816122,61816109,13
116,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,250,556.746,fantom,chr11,...,61815667,0.321505,1.112573,-0.469564,FADS2|chr11:61815645-61815664:+,+,CCAACCCAGCAGAGAGGCGC,61816122,61816109,13
117,chr11,61816110,61816123,"p38@FADS2,0.5200",+,FADS2,250,556.746,fantom,chr11,...,61815669,3.939668,5.486278,2.393057,FADS2|chr11:61815647-61815666:+,+,AACCCAGCAGAGAGGCGCCG,61816122,61816109,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
620,chr11,61816202,61816203,chr11_61816202_61816203,+,FADS2,250,467.085,refseq,chr11,...,61816664,4.053246,3.881500,4.224992,FADS2|chr11:61816665-61816684:-,-,GGCCCAGCGCTCAGGGTGCG,61816202,61816201,1
621,chr11,61816202,61816203,chr11_61816202_61816203,+,FADS2,250,467.085,refseq,chr11,...,61816697,2.914532,4.187452,1.641613,FADS2|chr11:61816675-61816694:+,+,GCGCTGGGCCACCTCGTCCC,61816202,61816201,1
622,chr11,61816202,61816203,chr11_61816202_61816203,+,FADS2,250,467.085,refseq,chr11,...,61816685,0.116349,0.784164,-0.551467,FADS2|chr11:61816686-61816705:-,-,CTACTTCACCTGGGACGAGG,61816202,61816201,1
623,chr11,61816202,61816203,chr11_61816202_61816203,+,FADS2,250,467.085,refseq,chr11,...,61816688,1.584732,2.607831,0.561633,FADS2|chr11:61816689-61816708:-,-,GCGCTACTTCACCTGGGACG,61816202,61816201,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr11_61816202_61816203,61816202.0,61816203.0,250.0,467.085,61816360.3,61816363.3,2.799634,3.352317,2.246951,61816202.0,61816201.0,1.0
1,"p38@FADS2,0.5200",61816110.0,61816123.0,250.0,556.746,61816242.5,61816245.5,3.091559,4.054536,1.893345,61816122.0,61816109.0,13.0


GATA1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/GATA1_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
177,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,250,134.706,fantom,chrX,...,48786288,1.848547,2.151641,1.545453,GATA1|chrX:48786289-48786308:-,-,ATAAATGAAAATAGCAGATA,48786606,48786601,5
178,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,250,134.706,fantom,chrX,...,48786322,8.178027,7.863331,8.492722,GATA1|chrX:48786323-48786342:-,-,AGTGTCCAGACAAGCAAAAT,48786606,48786601,5
179,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,250,134.706,fantom,chrX,...,48786353,4.437524,5.674050,3.200998,GATA1|chrX:48786331-48786350:+,+,TGTCTGGACACTACAGTCCA,48786606,48786601,5
180,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,250,134.706,fantom,chrX,...,48786354,4.650292,4.334688,4.965895,GATA1|chrX:48786332-48786351:+,+,GTCTGGACACTACAGTCCAC,48786606,48786601,5
181,chrX,48786602,48786607,"p3@GATA1,0.4284",+,GATA1,250,134.706,fantom,chrX,...,48786362,0.608500,-1.218354,0.001355,GATA1|chrX:48786340-48786359:+,+,ACTACAGTCCACGGGAGCCT,48786606,48786601,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
374,chrX,48786589,48786590,chrX_48786589_48786590,+,GATA1,250,119.211,refseq,chrX,...,48786771,8.862115,9.783830,7.940400,GATA1|chrX:48786772-48786791:-,-,GTGGGTTGGAGGAGCTGTAA,48786589,48786588,1
375,chrX,48786589,48786590,chrX_48786589_48786590,+,GATA1,250,119.211,refseq,chrX,...,48786789,8.676219,9.008302,8.344137,GATA1|chrX:48786790-48786809:-,-,TGGGAGTTGGAAAGACAGGT,48786589,48786588,1
376,chrX,48786589,48786590,chrX_48786589_48786590,+,GATA1,250,119.211,refseq,chrX,...,48786790,4.513327,4.462199,4.564454,GATA1|chrX:48786791-48786810:-,-,ATGGGAGTTGGAAAGACAGG,48786589,48786588,1
377,chrX,48786589,48786590,chrX_48786589_48786590,+,GATA1,250,119.211,refseq,chrX,...,48786793,7.098818,7.063353,7.134283,GATA1|chrX:48786794-48786813:-,-,GAAATGGGAGTTGGAAAGAC,48786589,48786588,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chrX_48786589_48786590,48786589.0,48786590.0,250.0,119.211,48786624.5,48786627.5,6.262643,5.891966,6.633321,48786589.0,48786588.0,1.0
1,"p3@GATA1,0.4284",48786602.0,48786607.0,250.0,134.706,48786624.5,48786627.5,6.262643,5.891966,6.633321,48786606.0,48786601.0,5.0


LMO2
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/LMO2_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
337,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,250,37.9279,fantom,chr11,...,33870154,1.741352,0.563157,2.919547,LMO2|chr11:33870132-33870151:+,+,GGACTGACTTGGCTTCTTAA,33870651,33870663,12
338,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,250,37.9279,fantom,chr11,...,33870155,2.161498,1.674550,2.648446,LMO2|chr11:33870133-33870152:+,+,GACTGACTTGGCTTCTTAAA,33870651,33870663,12
339,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,250,37.9279,fantom,chr11,...,33870166,0.036319,0.592588,-0.665227,LMO2|chr11:33870144-33870163:+,+,CTTCTTAAAGGGGCCAGTGT,33870651,33870663,12
340,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,250,37.9279,fantom,chr11,...,33870167,1.078364,1.712396,0.444333,LMO2|chr11:33870145-33870164:+,+,TTCTTAAAGGGGCCAGTGTC,33870651,33870663,12
341,chr11,33870651,33870663,"p21@LMO2,0.2997",-,LMO2,250,37.9279,fantom,chr11,...,33870168,3.031485,2.561122,3.501848,LMO2|chr11:33870146-33870165:+,+,TCTTAAAGGGGCCAGTGTCG,33870651,33870663,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
825,chr11,33869878,33869879,chr11_33869878_33869879,-,LMO2,250,6.0681,refseq,chr11,...,33870263,0.946168,-0.930926,2.823261,LMO2|chr11:33870264-33870283:-,-,TCTGCCTTAGTTAAACCAAA,33869878,33869879,1
826,chr11,33869878,33869879,chr11_33869878_33869879,-,LMO2,250,6.0681,refseq,chr11,...,33870292,0.272003,0.380489,0.163518,LMO2|chr11:33870293-33870312:-,-,TGCTTGCTCGAGCTGTTCGT,33869878,33869879,1
827,chr11,33869878,33869879,chr11_33869878_33869879,-,LMO2,250,6.0681,refseq,chr11,...,33870293,0.029088,-0.556203,0.614379,LMO2|chr11:33870294-33870313:-,-,TTGCTTGCTCGAGCTGTTCG,33869878,33869879,1
828,chr11,33869878,33869879,chr11_33869878_33869879,-,LMO2,250,6.0681,refseq,chr11,...,33870363,4.217595,4.265870,4.169320,LMO2|chr11:33870364-33870383:-,-,GGGACGGGCGGAAAATCCAC,33869878,33869879,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr11_33869878_33869879,33869878.0,33869879.0,250.0,6.0681,33869797.2,33869800.2,2.336979,2.434844,1.726624,33869878.0,33869879.0,1.0
1,"p21@LMO2,0.2997",33870651.0,33870663.0,250.0,37.9279,33870633.6,33870636.6,0.897178,0.366095,1.156964,33870651.0,33870663.0,12.0


MEF2C
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/MEF2C_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
63,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,250,70.6537,fantom,chr5,...,88882882,1.646124,2.837276,0.454971,MEF2C|chr5:88882860-88882879:+,+,GATAGATAGACACAGTGCCG,88883363,88883368,5
64,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,250,70.6537,fantom,chr5,...,88882917,0.412193,0.733572,0.090814,MEF2C|chr5:88882895-88882914:+,+,CAGATTTTAGAGACAATACC,88883363,88883368,5
65,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,250,70.6537,fantom,chr5,...,88882915,0.483816,-0.082650,-0.884983,MEF2C|chr5:88882916-88882935:-,-,CTCGAGGAAATAGAAAACCC,88883363,88883368,5
66,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,250,70.6537,fantom,chr5,...,88882945,0.007370,-0.583346,0.568606,MEF2C|chr5:88882923-88882942:+,+,CTATTTCCTCGAGAAATATC,88883363,88883368,5
67,chr5,88883363,88883368,"p24@MEF2C,0.5372",-,MEF2C,250,70.6537,fantom,chr5,...,88882947,1.373166,1.912919,0.833413,MEF2C|chr5:88882925-88882944:+,+,ATTTCCTCGAGAAATATCAG,88883363,88883368,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
214,chr5,88883184,88883185,chr5_88883184_88883185,-,MEF2C,250,45.2235,refseq,chr5,...,88883538,2.962157,2.653126,3.271188,MEF2C|chr5:88883516-88883535:+,+,GGGGATTGAAGGATACTGGG,88883184,88883185,1
215,chr5,88883184,88883185,chr5_88883184_88883185,-,MEF2C,250,45.2235,refseq,chr5,...,88883575,2.450940,2.066334,2.835547,MEF2C|chr5:88883553-88883572:+,+,TAGGGAAGACGGAGCACGAA,88883184,88883185,1
216,chr5,88883184,88883185,chr5_88883184_88883185,-,MEF2C,250,45.2235,refseq,chr5,...,88883579,1.147504,1.198445,1.096563,MEF2C|chr5:88883557-88883576:+,+,GAAGACGGAGCACGAATGGT,88883184,88883185,1
217,chr5,88883184,88883185,chr5_88883184_88883185,-,MEF2C,250,45.2235,refseq,chr5,...,88883580,1.139575,1.141925,1.137226,MEF2C|chr5:88883558-88883577:+,+,AAGACGGAGCACGAATGGTT,88883184,88883185,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr5_88883184_88883185,88883184.0,88883185.0,250.0,45.2235,88883249.0,88883252.0,1.448501,2.024527,0.556164,88883184.0,88883185.0,1.0
1,"p24@MEF2C,0.5372",88883363.0,88883368.0,250.0,70.6537,88883490.5,88883493.5,1.331737,1.433677,1.16422,88883363.0,88883368.0,5.0


CAT
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/CAT_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
195,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,250,178.581,fantom,chr11,...,34438453,2.550512,3.108920,1.992104,CAT|chr11:34438431-34438450:+,+,AAAAGCATCCATCCATCCTT,34439013,34438899,114
196,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,250,178.581,fantom,chr11,...,34438445,2.423527,2.310936,2.536118,CAT|chr11:34438446-34438465:-,-,TATTATTTGCAACCAAAGGA,34439013,34438899,114
197,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,250,178.581,fantom,chr11,...,34438449,2.532412,2.094929,2.969894,CAT|chr11:34438450-34438469:-,-,TAAGTATTATTTGCAACCAA,34439013,34438899,114
198,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,250,178.581,fantom,chr11,...,34438518,8.494420,8.791831,8.197008,CAT|chr11:34438496-34438515:+,+,TGTACAGAGTAATTTAACCC,34439013,34438899,114
199,chr11,34438900,34439014,"p1@CAT,0.2381",+,CAT,250,178.581,fantom,chr11,...,34438516,0.723489,-0.639677,-0.807301,CAT|chr11:34438517-34438536:-,-,TCTTAAAAAGTCAGCAATCC,34439013,34438899,114
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
736,chr11,34438933,34438934,chr11_34438933_34438934,+,CAT,250,157.833,refseq,chr11,...,34439342,8.756016,10.480953,7.031080,CAT|chr11:34439343-34439362:-,-,GCTGCGACCTCTGTCCCAGT,34438933,34438932,1
737,chr11,34438933,34438934,chr11_34438933_34438934,+,CAT,250,157.833,refseq,chr11,...,34439387,10.965486,11.353348,10.577624,CAT|chr11:34439365-34439384:+,+,GAGGACAGATTGAGGGCTCA,34438933,34438932,1
738,chr11,34438933,34438934,chr11_34438933_34438934,+,CAT,250,157.833,refseq,chr11,...,34439388,9.349662,8.450295,10.249029,CAT|chr11:34439366-34439385:+,+,AGGACAGATTGAGGGCTCAT,34438933,34438932,1
739,chr11,34438933,34438934,chr11_34438933_34438934,+,CAT,250,157.833,refseq,chr11,...,34439389,9.765957,10.670391,8.861524,CAT|chr11:34439367-34439386:+,+,GGACAGATTGAGGGCTCATG,34438933,34438932,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr11_34438933_34438934,34438933.0,34438934.0,250.0,157.833,34439060.1,34439063.1,4.774662,5.151794,4.39753,34438933.0,34438932.0,1.0
1,"p1@CAT,0.2381",34438900.0,34439014.0,250.0,178.581,34439074.8,34439077.8,5.252934,5.622435,4.883432,34439013.0,34438899.0,114.0


CAPRIN1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/CAPRIN1_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
0,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,250,548.255,fantom,chr11,...,34051290,0.506901,0.598324,0.415477,CAPRIN1|chr11:34051268-34051287:+,+,TTGCTCGCTCTGCATACAGA,34051739,34051721,18
1,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,250,548.255,fantom,chr11,...,34051322,1.097761,-1.488068,-0.707454,CAPRIN1|chr11:34051323-34051342:-,-,CTGCAAGTAAGGGGTGCGTG,34051739,34051721,18
2,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,250,548.255,fantom,chr11,...,34051324,4.796797,4.570265,5.023329,CAPRIN1|chr11:34051325-34051344:-,-,TTCTGCAAGTAAGGGGTGCG,34051739,34051721,18
3,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,250,548.255,fantom,chr11,...,34051332,0.281696,-0.215491,-0.347901,CAPRIN1|chr11:34051333-34051352:-,-,GCAAAGGTTTCTGCAAGTAA,34051739,34051721,18
4,chr11,34051722,34051740,"p2@CAPRIN1,0.4267",+,CAPRIN1,250,548.255,fantom,chr11,...,34051333,1.397740,1.624458,1.171021,CAPRIN1|chr11:34051334-34051353:-,-,GGCAAAGGTTTCTGCAAGTA,34051739,34051721,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,chr11,34051730,34051731,chr11_34051730_34051731,+,CAPRIN1,250,545.963,refseq,chr11,...,34052170,2.235905,1.332818,3.138993,CAPRIN1|chr11:34052171-34052190:-,-,CCCCCGCGGAGCCGCCGGCT,34051730,34051729,1
543,chr11,34051730,34051731,chr11_34051730_34051731,+,CAPRIN1,250,545.963,refseq,chr11,...,34052220,4.525121,5.226875,3.823368,CAPRIN1|chr11:34052221-34052240:-,-,CGCCAGCGGGGCGCCCGGCT,34051730,34051729,1
544,chr11,34051730,34051731,chr11_34051730_34051731,+,CAPRIN1,250,545.963,refseq,chr11,...,34052221,0.250910,0.203635,0.298185,CAPRIN1|chr11:34052222-34052241:-,-,CCGCCAGCGGGGCGCCCGGC,34051730,34051729,1
545,chr11,34051730,34051731,chr11_34051730_34051731,+,CAPRIN1,250,545.963,refseq,chr11,...,34052225,2.324980,2.246559,2.403400,CAPRIN1|chr11:34052226-34052245:-,-,CGACCCGCCAGCGGGGCGCC,34051730,34051729,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr11_34051730_34051731,34051730.0,34051731.0,250.0,545.963,34051893.8,34051896.8,0.866457,1.019007,0.628496,34051730.0,34051729.0,1.0
1,"p2@CAPRIN1,0.4267",34051722.0,34051740.0,250.0,548.255,34051909.5,34051912.5,0.977233,0.997895,0.956571,34051739.0,34051721.0,18.0


FADS3
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/FADS3_grna_avg_z_log2FC.qBed
Defaulting FADS3, there are no dolcetto spacers


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
353,chr11,61891873,61891896,"p13@FADS3,0.3990",-,FADS3,250,14.38830,fantom,chr11,...,61891376,5.855943,5.659037,6.052848,FADS3|chr11:61891377-61891396:-,-,ACTCGTGCGTGCAGCATGGG,61891873,61891896,23
354,chr11,61891873,61891896,"p13@FADS3,0.3990",-,FADS3,250,14.38830,fantom,chr11,...,61891380,2.630664,2.662029,2.599298,FADS3|chr11:61891381-61891400:-,-,CAGGACTCGTGCGTGCAGCA,61891873,61891896,23
355,chr11,61891873,61891896,"p13@FADS3,0.3990",-,FADS3,250,14.38830,fantom,chr11,...,61891403,7.346751,9.141723,5.551779,FADS3|chr11:61891381-61891400:+,+,TGCTGCACGCACGAGTCCTG,61891873,61891896,23
356,chr11,61891873,61891896,"p13@FADS3,0.3990",-,FADS3,250,14.38830,fantom,chr11,...,61891411,8.392367,8.630351,8.154383,FADS3|chr11:61891389-61891408:+,+,GCACGAGTCCTGGGGATCCC,61891873,61891896,23
357,chr11,61891873,61891896,"p13@FADS3,0.3990",-,FADS3,250,14.38830,fantom,chr11,...,61891414,7.925714,7.907395,7.944033,FADS3|chr11:61891392-61891411:+,+,CGAGTCCTGGGGATCCCAGG,61891873,61891896,23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1024,chr11,61891545,61891546,chr11_61891545_61891546,-,FADS3,250,1.16056,refseq,chr11,...,61892010,0.434633,-0.752045,-0.117220,FADS3|chr11:61892011-61892030:-,-,ATGGTTTGAAGGCTGTTCAC,61891545,61891546,1
1025,chr11,61891545,61891546,chr11_61891545_61891546,-,FADS3,250,1.16056,refseq,chr11,...,61892032,0.922951,0.852773,0.993130,FADS3|chr11:61892010-61892029:+,+,TGTGAACAGCCTTCAAACCA,61891545,61891546,1
1026,chr11,61891545,61891546,chr11_61891545_61891546,-,FADS3,250,1.16056,refseq,chr11,...,61892033,1.275063,0.488814,2.061312,FADS3|chr11:61892011-61892030:+,+,GTGAACAGCCTTCAAACCAT,61891545,61891546,1
1027,chr11,61891545,61891546,chr11_61891545_61891546,-,FADS3,250,1.16056,refseq,chr11,...,61892021,1.677598,1.290476,2.064720,FADS3|chr11:61892022-61892041:-,-,GATTGGGGCCCATGGTTTGA,61891545,61891546,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr11_61891545_61891546,61891545.0,61891546.0,250.0,1.16056,61891567.4,61891570.4,1.768654,1.525068,1.678067,61891545.0,61891546.0,1.0
1,"p13@FADS3,0.3990",61891873.0,61891896.0,250.0,14.3883,61891893.7,61891896.7,2.273965,2.020934,2.526996,61891873.0,61891896.0,23.0


CD164
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/CD164_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
0,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,250,778.782,fantom,chr6,...,109381909,0.421911,-0.314936,-0.528886,CD164|chr6:109381887-109381906:+,+,CCCATTTCCGACCCCAAGTG,109382397,109382418,21
1,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,250,778.782,fantom,chr6,...,109381914,0.574457,0.578347,0.570567,CD164|chr6:109381892-109381911:+,+,TTCCGACCCCAAGTGAGGCT,109382397,109382418,21
2,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,250,778.782,fantom,chr6,...,109381900,0.648063,1.024567,0.271559,CD164|chr6:109381901-109381920:-,-,GTAATTCCTAGCCTCACTTG,109382397,109382418,21
3,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,250,778.782,fantom,chr6,...,109381901,0.116620,0.133226,-0.366466,CD164|chr6:109381902-109381921:-,-,CGTAATTCCTAGCCTCACTT,109382397,109382418,21
4,chr6,109382397,109382418,"p1@CD164,0.4826",-,CD164,250,778.782,fantom,chr6,...,109381902,0.431882,-0.094046,-0.769718,CD164|chr6:109381903-109381922:-,-,TCGTAATTCCTAGCCTCACT,109382397,109382418,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,chr6,109382467,109382468,chr6_109382467_109382468,-,CD164,250,600.823,refseq,chr6,...,109382930,0.895054,0.793839,0.996268,CD164|chr6:109382931-109382950:-,-,AAGATATAAGGAAGCTCCCT,109382467,109382468,1
476,chr6,109382467,109382468,chr6_109382467_109382468,-,CD164,250,600.823,refseq,chr6,...,109382931,0.624173,0.175897,1.072450,CD164|chr6:109382932-109382951:-,-,GAAGATATAAGGAAGCTCCC,109382467,109382468,1
477,chr6,109382467,109382468,chr6_109382467_109382468,-,CD164,250,600.823,refseq,chr6,...,109382953,0.017141,0.394712,-0.428994,CD164|chr6:109382931-109382950:+,+,AGGGAGCTTCCTTATATCTT,109382467,109382468,1
478,chr6,109382467,109382468,chr6_109382467_109382468,-,CD164,250,600.823,refseq,chr6,...,109382957,0.768769,-0.269902,-1.267637,CD164|chr6:109382935-109382954:+,+,AGCTTCCTTATATCTTCGGC,109382467,109382468,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr6_109382467_109382468,109382467.0,109382468.0,250.0,600.823,109382507.3,109382510.3,1.179372,1.153273,0.923249,109382467.0,109382468.0,1.0
1,"p1@CD164,0.4826",109382397.0,109382418.0,250.0,778.782,109382451.1,109382454.1,0.45383,0.330178,-0.110326,109382397.0,109382418.0,21.0


PVT1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/PVT1_grna_avg_z_log2FC.qBed
Defaulting PVT1, there are no dolcetto spacers
Defaulting PVT1, there are no weissman spacers


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
114,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,250,14.9222,fantom,chr8,...,127794017,0.454858,0.741450,0.168266,PVT1|chr8:127793995-127794014:+,+,TGGAGCTGGAGCTCAGTCGG,127794568,127794506,62
115,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,250,14.9222,fantom,chr8,...,127794028,1.223419,3.752820,-1.305981,PVT1|chr8:127794006-127794025:+,+,CTCAGTCGGCGGTCTTCTGC,127794568,127794506,62
116,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,250,14.9222,fantom,chr8,...,127794038,0.376384,-0.452936,-0.299832,PVT1|chr8:127794039-127794058:-,-,GGCCAGAGATGCTCCGGGAT,127794568,127794506,62
117,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,250,14.9222,fantom,chr8,...,127794060,0.463055,-0.573099,1.499209,PVT1|chr8:127794061-127794080:-,-,GGCGAGTTCAGTGAAATAGG,127794568,127794506,62
118,chr8,127794507,127794569,"p1@PVT1,0.3058",+,PVT1,250,14.9222,fantom,chr8,...,127794100,1.079223,-0.655769,-1.502677,PVT1|chr8:127794078-127794097:+,+,GCCTTCCTCAGCAGGAAAGT,127794568,127794506,62
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
406,chr8,127794532,127794533,chr8_127794532_127794533,+,PVT1,250,13.9803,refseq,chr8,...,127794924,2.302932,2.439398,2.166466,PVT1|chr8:127794902-127794921:+,+,GGGCGTTTGGGGAGGGTGAG,127794532,127794531,1
407,chr8,127794532,127794533,chr8_127794532_127794533,+,PVT1,250,13.9803,refseq,chr8,...,127794958,0.429218,-0.885198,0.026763,PVT1|chr8:127794936-127794955:+,+,ACCGAATGGCCGGTGCATAG,127794532,127794531,1
408,chr8,127794532,127794533,chr8_127794532_127794533,+,PVT1,250,13.9803,refseq,chr8,...,127794988,0.856022,0.411560,1.300485,PVT1|chr8:127794966-127794985:+,+,GGATTTATGAACTTGATCAA,127794532,127794531,1
409,chr8,127794532,127794533,chr8_127794532_127794533,+,PVT1,250,13.9803,refseq,chr8,...,127795018,1.064969,-0.697500,-1.432438,PVT1|chr8:127795019-127795038:-,-,AAATCTGTTCTGACTGCAGA,127794532,127794531,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr8_127794532_127794533,127794532.0,127794533.0,250.0,13.9803,127794709.6,127794712.6,0.705793,0.082553,-0.088817,127794532.0,127794531.0,1.0
1,"p1@PVT1,0.3058",127794507.0,127794569.0,250.0,14.9222,127794728.9,127794731.9,0.566526,-0.330157,-0.168574,127794568.0,127794506.0,62.0


FADS1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/FADS1_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
0,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,250,573.169,fantom,chr11,...,61815594,2.108518,1.709205,2.507831,FADS1|chr11:61815572-61815591:+,+,TGGCCGCCAAGGGGGAAACG,61816091,61816094,3
1,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,250,573.169,fantom,chr11,...,61815601,2.671447,2.678321,2.664573,FADS1|chr11:61815579-61815598:+,+,CAAGGGGGAAACGCGGTGAA,61816091,61816094,3
2,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,250,573.169,fantom,chr11,...,61815627,3.147205,2.298819,3.995592,FADS1|chr11:61815605-61815624:+,+,GCCCCGACTTTCGAGTCCCT,61816091,61816094,3
3,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,250,573.169,fantom,chr11,...,61815608,2.091440,1.991028,2.191852,FADS1|chr11:61815609-61815628:-,-,TCCTAGGGACTCGAAAGTCG,61816091,61816094,3
4,chr11,61816091,61816094,"p44@FADS1,0.5744",-,FADS1,250,573.169,fantom,chr11,...,61815610,0.089589,0.192481,-0.013303,FADS1|chr11:61815611-61815630:-,-,AGTCCTAGGGACTCGAAAGT,61816091,61816094,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
861,chr11,61817003,61817004,chr11_61817003_61817004,-,FADS1,250,128.875,refseq,chr11,...,61817455,4.333662,4.422758,4.244566,FADS1|chr11:61817433-61817452:+,+,ggcttggtggtggttggggg,61817003,61817004,1
862,chr11,61817003,61817004,chr11_61817003_61817004,-,FADS1,250,128.875,refseq,chr11,...,61817456,2.907527,3.215362,2.599693,FADS1|chr11:61817434-61817453:+,+,gcttggtggtggttggggga,61817003,61817004,1
863,chr11,61817003,61817004,chr11_61817003_61817004,-,FADS1,250,128.875,refseq,chr11,...,61817499,0.286593,0.051679,0.521507,FADS1|chr11:61817500-61817519:-,-,acagggttataaaaattgtg,61817003,61817004,1
864,chr11,61817003,61817004,chr11_61817003_61817004,-,FADS1,250,128.875,refseq,chr11,...,61817500,0.303679,-0.467337,-0.140022,FADS1|chr11:61817501-61817520:-,-,aacagggttataaaaattgt,61817003,61817004,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr11_61817003_61817004,61817003.0,61817004.0,250.0,128.875,61817106.6,61817109.6,3.414045,3.28255,3.2049,61817003.0,61817004.0,1.0
1,"p44@FADS1,0.5744",61816091.0,61816094.0,250.0,573.169,61816171.8,61816174.8,3.80425,3.879403,3.729096,61816091.0,61816094.0,3.0


ERP29
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/ERP29_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
63,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,250,211.533,fantom,chr12,...,112013038,1.653160,0.834384,2.471936,ERP29|chr12:112013016-112013035:+,+,GGTCCCAACCAACTGCCTGA,112013538,112013523,15
64,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,250,211.533,fantom,chr12,...,112013046,2.023805,1.892952,2.154658,ERP29|chr12:112013024-112013043:+,+,CCAACTGCCTGACGGCCCAA,112013538,112013523,15
65,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,250,211.533,fantom,chr12,...,112013026,1.194571,0.721521,1.667620,ERP29|chr12:112013027-112013046:-,-,CCTTTGGGCCGTCAGGCAGT,112013538,112013523,15
66,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,250,211.533,fantom,chr12,...,112013050,1.453099,1.471845,1.434353,ERP29|chr12:112013028-112013047:+,+,CTGCCTGACGGCCCAAAGGC,112013538,112013523,15
67,chr12,112013524,112013539,"p4@ERP29,0.3598",+,ERP29,250,211.533,fantom,chr12,...,112013084,0.047869,0.244177,-0.148439,ERP29|chr12:112013062-112013081:+,+,GTAGGCCTTGTCATCTTCGA,112013538,112013523,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
365,chr12,112013425,112013426,chr12_112013425_112013426,+,ERP29,250,132.855,refseq,chr12,...,112013852,1.748093,3.931777,-0.435591,ERP29|chr12:112013853-112013872:-,-,CCCTCGGCTGCATCTTTGCA,112013425,112013424,1
366,chr12,112013425,112013426,chr12_112013425_112013426,+,ERP29,250,132.855,refseq,chr12,...,112013879,9.492315,12.263898,6.720732,ERP29|chr12:112013857-112013876:+,+,AAGATGCAGCCGAGGGAACT,112013425,112013424,1
367,chr12,112013425,112013426,chr12_112013425_112013426,+,ERP29,250,132.855,refseq,chr12,...,112013901,4.335562,7.172372,1.498752,ERP29|chr12:112013879-112013898:+,+,GTGCCCCTTCCCTTCACCTG,112013425,112013424,1
368,chr12,112013425,112013426,chr12_112013425_112013426,+,ERP29,250,132.855,refseq,chr12,...,112013917,1.937426,0.904213,2.970638,ERP29|chr12:112013918-112013937:-,-,CTCGAGGGTTGGATCTCGCC,112013425,112013424,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr12_112013425_112013426,112013425.0,112013426.0,250.0,132.855,112013579.4,112013582.4,4.857642,5.350171,4.310391,112013425.0,112013424.0,1.0
1,"p4@ERP29,0.3598",112013524.0,112013539.0,250.0,211.533,112013668.3,112013671.3,4.547857,5.947475,3.12624,112013538.0,112013523.0,15.0


FEN1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/FEN1_grna_avg_z_log2FC.qBed


Unnamed: 0,chr,start,end,cc,strand,gene,covered,mean,variable,chr2,...,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,name,strand2,sequence,tss_position,pam_coordinate,abs_distance_to_summit
229,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,250,264.718,fantom,chr11,...,61792389,0.677131,0.737490,0.616771,FEN1|chr11:61792367-61792386:+,+,ATCGCGAGCTGAGAAACCTA,61792946,61792875,71
230,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,250,264.718,fantom,chr11,...,61792404,1.433677,1.058042,1.809312,FEN1|chr11:61792382-61792401:+,+,ACCTAAGGAGTTCATGGCAA,61792946,61792875,71
231,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,250,264.718,fantom,chr11,...,61792405,0.018061,-0.782514,0.746391,FEN1|chr11:61792383-61792402:+,+,CCTAAGGAGTTCATGGCAAG,61792946,61792875,71
232,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,250,264.718,fantom,chr11,...,61792441,0.814120,0.632018,0.996221,FEN1|chr11:61792419-61792438:+,+,CACCCTTCAGCCCAAGCCGG,61792946,61792875,71
233,chr11,61792876,61792947,"p1@FEN1,0.3317",+,FEN1,250,264.718,fantom,chr11,...,61792431,4.083541,2.916144,5.250939,FEN1|chr11:61792432-61792451:-,-,GCTCCTGGAACCTCCGGCTT,61792946,61792875,71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,chr11,61792910,61792911,chr11_61792910_61792911,+,FEN1,250,263.607,refseq,chr11,...,61793383,5.010797,7.710298,2.311295,FEN1|chr11:61793361-61793380:+,+,CAGTCCACACTCCAGCAGCG,61792910,61792909,1
740,chr11,61792910,61792911,chr11_61792910_61792911,+,FEN1,250,263.607,refseq,chr11,...,61793387,0.766983,0.655776,0.878190,FEN1|chr11:61793365-61793384:+,+,CCACACTCCAGCAGCGCGGC,61792910,61792909,1
741,chr11,61792910,61792911,chr11_61792910_61792911,+,FEN1,250,263.607,refseq,chr11,...,61793386,0.603474,0.515788,0.691159,FEN1|chr11:61793387-61793406:-,-,CTTCGGCTCTTGCCTCAAGC,61792910,61792909,1
742,chr11,61792910,61792911,chr11_61792910_61792911,+,FEN1,250,263.607,refseq,chr11,...,61793408,4.116666,4.028515,4.204818,FEN1|chr11:61793386-61793405:+,+,GGCTTGAGGCAAGAGCCGAA,61792910,61792909,1


Unnamed: 0,cc,start,end,covered,mean,start2,end2,avg_zlog2FC,z_log2FC_R1,z_log2FC_R2,tss_position,pam_coordinate,abs_distance_to_summit
0,chr11_61792910_61792911,61792910.0,61792911.0,250.0,263.607,61793060.1,61793063.1,3.453485,4.081068,2.825903,61792910.0,61792909.0,1.0
1,"p1@FEN1,0.3317",61792876.0,61792947.0,250.0,264.718,61793058.2,61793061.2,3.161291,3.609008,2.713574,61792946.0,61792875.0,71.0


Unnamed: 0,gene,variable,value
0,MYC,dolcetto,0.54772
1,MYC,weissman,1.379206
2,MYC,refGene,0.350121
3,MYC,FANTOM,0.348172
4,HDAC6,dolcetto,7.365781
5,HDAC6,weissman,5.373309
6,HDAC6,refGene,1.035372
7,HDAC6,FANTOM,3.042227
8,NMU,dolcetto,8.173992
9,NMU,weissman,4.98231


In [75]:
### Get new TSS coords from FANTOM data
fantom_df.to_csv('fantom_gene_tss.bed', sep = '\t', columns=['chr','start','end','gene','strand','simple'],header=False,index=False)

### Intersect with DHS peaks
int_file = 'gene_tss_dhs_peak_intersect.bed'
fopen = open(int_file, 'w')
subprocess.call(['bedtools','intersect','-a','fantom_gene_tss.bed','-b','/Users/davidy/misc_resources/chip/results/macs2/dhs_k562_hg38_optimal_peak.narrowPeak.bed', '-wa','-wb'],stdout=fopen)
fopen.close()

### Process intersect file. For each DHS peak that was intersected, only take the strongest summit. 
gene_tss_dhs_df = pd.read_csv('gene_tss_dhs_peak_intersect.bed', sep = '\t', header = None, names = ['chr1','start1','end1','symbol','strand','simple','chr2','start2','end2','blank1','blank2','blank3','signal1','signal2','signal3','summit'])
gene_tss_dhs_df['dhs_peak_cc'] = gene_tss_dhs_df['chr2'].astype(str) + '_' + gene_tss_dhs_df['start2'].astype(str) + '_' + gene_tss_dhs_df['end2'].astype(str)
# Get only strongest summit
gene_tss_dhs_df = gene_tss_dhs_df.loc[gene_tss_dhs_df.groupby(['dhs_peak_cc', 'symbol'])['signal1'].idxmax()]

gene_tss_dhs_df.to_csv('gene_tss_dhs_peak_intersect.bed', sep = '\t', index = False, columns=['chr2','start2','end2','dhs_peak_cc','symbol','summit'], header = False)
display(gene_tss_dhs_df)

Unnamed: 0,chr1,start1,end1,symbol,strand,simple,chr2,start2,end2,blank1,blank2,blank3,signal1,signal2,signal3,summit,dhs_peak_cc
0,chr11,33869521,33869530,LMO2,-,"p14@LMO2,0.5901",chr11,33869527,33869751,.,639,.,2.98784,16.79869,14.61977,78,chr11_33869527_33869751
5,chr11,33869816,33869888,LMO2,-,"p1@LMO2,0.5001",chr11,33869786,33870019,.,864,.,3.48115,24.17932,21.84239,106,chr11_33869786_33870019
13,chr11,33870319,33870357,LMO2,-,"p17@LMO2,0.2641",chr11,33870097,33871195,.,970,.,3.94733,31.98774,29.49923,514,chr11_33870097_33871195
20,chr11,33892161,33892194,LMO2,-,"p15@LMO2,0.2528",chr11,33891749,33892410,.,1000,.,4.2016,36.67241,34.09317,362,chr11_33891749_33892410
28,chr11,34051562,34051585,CAPRIN1,+,"p29@CAPRIN1,0.2856",chr11,34051352,34052969,.,1000,.,5.0197,51.86429,48.95948,159,chr11_34051352_34052969
84,chr11,34053441,34053459,CAPRIN1,+,"p23@CAPRIN1,0.3043",chr11,34053012,34053606,.,941,.,4.23407,36.98389,34.39718,492,chr11_34053012_34053606
87,chr11,34438900,34439014,CAT,+,"p1@CAT,0.2381",chr11,34438478,34439511,.,1000,.,7.22638,71.08998,67.63677,244,chr11_34438478_34439511
92,chr11,61792670,61792696,FEN1,+,"p2@FEN1,0.4160",chr11,61792013,61793571,.,1000,.,5.39764,50.24253,47.37516,623,chr11_61792013_61793571
106,chr11,61800803,61800814,FADS1,-,"p35@FADS1,0.2600",chr11,61800578,61800938,.,1000,.,5.50026,52.27794,49.36228,218,chr11_61800578_61800938
110,chr11,61815107,61815174,FADS1,-,"p7@FADS1,0.2794",chr11,61814898,61815720,.,1000,.,4.57494,47.89878,45.08473,465,chr11_61814898_61815720


In [76]:
### Retrieve the gRNA avg_z_log2FCs
all_gq_files = [f for f in listdir('/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide') if isfile(join('/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide', f))]
gq_files = [f for f in all_gq_files if 'grna_avg_z_log2FC.qBed' in f]
print(gq_files)

lod = []

for gene in screen_genes:
    avg_dolcetto_effect = 0
    nearest_10_avg_effect = 0
    adf = {}
    gene_gq_files = ['/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide' + '/' + f for f in gq_files if re.match(r'{}'.format(gene), f)]
    
    print(gene)
    try:
        print(gene_gq_files[0])
    except:
        print(gene, 'is missing a guide file for whatever fucking reason, skip. Probably no significant elements?')
        continue
        
    gene_gq_df = pd.read_csv(gene_gq_files[0], sep = '\t', names = ['chr','start','end','avg_zlog2FC', 'z_log2FC_R1', 'z_log2FC_R2','name', 'strand', 'sequence'])
    #display(gene_gq_df)
    
    ### Get the Dolcetto library spacers
    dolcetto_spacers = gene_gq_df.loc[gene_gq_df['sequence'].isin(spacers)]
    #display(dolcetto_spacers)
    if len(dolcetto_spacers) == 0:
        print('Skipping {}, there are no dolcetto spacers'.format(gene))
        continue
    else:
        avg_dolcetto_effect = np.mean(dolcetto_spacers['avg_zlog2FC'])

    
    # Get the gRNAs that intersect the TSS ranges
    # Note, using window. +/- 1000 around TSS. Just to make sure we have enough gRNAs to sample from if the summit is close to a TSS range edge. 
    fopen = open('temp_gene_grna_tss_intersect.bed', 'w')
    subprocess.call(['bedtools','window','-a','gene_tss_dhs_peak_intersect.bed','-b',gene_gq_files[0], '-w', '250'],stdout=fopen)
    fopen.close()
    
    # Only keep rows relating to the gene, to avoid duplicates
    int_df = pd.read_csv('temp_gene_grna_tss_intersect.bed', sep = '\t', names = ['chr2','start2','end2','dhs_peak_cc','symbol','summit','chr','start','end','avg_zlog2FC', 'z_log2FC_R1', 'z_log2FC_R2','name', 'strand', 'sequence'])
    int_df = int_df.loc[int_df['symbol']==gene]
    
    # Get absolute distance to summit using single base pam coordinates
    int_df['position'] = int_df['start2'].astype(int) + int_df['summit'].astype(int)
    int_df['pam_coordinate'] = np.where(int_df['strand']=='-',int_df['end'],int_df['start'].astype(int)-1)
    int_df['abs_distance_to_summit'] = abs(int_df['pam_coordinate'] - int_df['position'])
    
    # There may be multiple DHS peaks if a gene has multiple TSSs
    # Get the effects local to each DHS summit
    int_df2 = int_df.groupby('dhs_peak_cc').apply(lambda x : np.mean(x.sort_values(by = 'abs_distance_to_summit').head(10))).reset_index()

    for i, row in int_df2.iterrows():
        
        adf = {'gene':gene,
               'dhs_peak_cc':row['dhs_peak_cc'],
              'nearest_10_avg_effect':row['avg_zlog2FC'],
              'dolcetto_number':len(dolcetto_spacers),
              'dolcetto_avg_effect':avg_dolcetto_effect}
        lod.append(adf)



    



['CAT_grna_avg_z_log2FC.qBed', 'CD164_grna_avg_z_log2FC.qBed', 'FEN1_grna_avg_z_log2FC.qBed', 'NMU_grna_avg_z_log2FC.qBed', 'ERP29_grna_avg_z_log2FC.qBed', 'PVT1_grna_avg_z_log2FC.qBed', 'FADS3_grna_avg_z_log2FC.qBed', 'LMO2_grna_avg_z_log2FC.qBed', 'CAPRIN1_grna_avg_z_log2FC.qBed', 'MYC_grna_avg_z_log2FC.qBed', 'HDAC6_grna_avg_z_log2FC.qBed', 'MEF2C_grna_avg_z_log2FC.qBed', 'FADS1_grna_avg_z_log2FC.qBed', 'FADS2_grna_avg_z_log2FC.qBed', 'GATA1_grna_avg_z_log2FC.qBed']
PVT1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/PVT1_grna_avg_z_log2FC.qBed
MEF2C
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/MEF2C_grna_avg_z_log2FC.qBed
CAPRIN1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/CAPRIN1_grna_avg_z_log2FC.qBed
HDAC6
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/HDAC6_grna_avg_z_log2FC.qBed
FEN1
/Users/davidy/jamboree20crispr/ontarget_analysis/220914_promoter_guide/FEN1_grna_avg_z_log2FC.q

In [79]:
promoter_df = pd.DataFrame(lod)
display(promoter_df)

g1 = sns.scatterplot(data=promoter_df,x='dolcetto_avg_effect',y='nearest_10_avg_effect',hue='gene')
outplot = 'Scatterplot_DolcettoPromoter_vs_Nearest10TSSSummit.pdf'
g1.set_xlim(0, 10)
g1.set_ylim(0, 10)
g1.plot([-1,10],[-1,10], 'red', linewidth=10)
plt.savefig(outplot)
plt.close('all')


Unnamed: 0,gene,dhs_peak_cc,nearest_10_avg_effect,dolcetto_number,dolcetto_avg_effect
0,PVT1,chr8_127794273_127795483,0.836118,2,0.86652
1,PVT1,chr8_127854280_127854805,0.462264,2,0.86652
2,PVT1,chr8_127890008_127890316,0.716704,2,0.86652
3,MEF2C,chr5_88785435_88785700,0.492593,7,2.671793
4,MEF2C,chr5_88785751_88786218,0.516932,7,2.671793
5,MEF2C,chr5_88827041_88827590,0.622206,7,2.671793
6,MEF2C,chr5_88883046_88883597,2.023804,7,2.671793
7,MEF2C,chr5_88884042_88884790,1.4218,7,2.671793
8,CAPRIN1,chr11_34051352_34052969,0.754021,14,1.468398
9,CAPRIN1,chr11_34053012_34053606,4.975914,14,1.468398
