# Per-Figure Computational Notebook (Abdulhay, Hsieh, McNally et al)

##  *In vitro* analyses

All following analyses proceed using processed MTase footprint calls, which are derived using our custom NN-HMM pipeline described above. Pre-processing of the footprint / accessibility calls is handled by the script `process_midpoint_distance_freq.py`, which outputs site_inaccess & density files for unremodeled and remodeled S1 and S2 fibers. For plotting, dependencies are tidyverse and patchwork.  

UMAP embedding and Leiden clustering were carried out using the `scanpy` package, via the script `process_unremodeled_fibers.py`. This script writes the S\*_unremodeled_leiden_umap.data files, which can be used to reproduce the final plots in these figures. Molecule sampling for the final panels was done using `sample_molecules_for_plotting_unremodeled.py`, which outputs the S\*_unremodeled_sampled_mols.data files.

*Footprint distributions*   
Footprint distributions are plotted from the S\*_site_inaccess.csv files produced by the NN-HMM pipeline described above, using the R code below.

*Density distributions*  
Density distributions are plotted using the S\*_site_density.csv files from the NN-HMM pipeline, and are plotted using the R code below.

*Horizon plots*  
Horizon plots are plotted from the S\*_site_inaccess.csv files produced by the NN-HMM pipeline described above, using the R code below.  

*UMAP plots & sampled molecules*  
UMAP plots are plotted using the S\*_unremodeled_leiden_umap.data files, and sampled molecules are plotted using the S\*_unremodeled_sampled_mols.data.  

## Figure 1B-F, Supplementary Figure 2A-E

In [None]:
library(tidyverse)
library(ggplot2)
library(patchwork)
library(ggrastr)
library(ggalluvial)
library(qvalue)
#Use R kernel
# ---- S1 fibers: footprint, density, horizon, UMAP and sampled-molecule panels ----
# Builds panels p1-p5 from the unremodeled ('Native') S1 data, writes the
# combined layout to s1_allplots_sfig1.pdf, then writes the UMAP inset with
# selected clusters highlighted to s1_inset.pdf.
# geom_point_rast() comes from the ggrastr package, which must be attached.

# samp_label is split on '_' into these five fields; density then becomes a
# factor with levels ordered 5:1 -> 20:1 so facets and colours follow that order.
label_fields <- c('density','remodeler','fiber','turnover','replicate')
density_levels <- c('5:1','10:1','15:1','20:1')
density_palette <- c('#fdcc8a','#fc8d59','#e34a33','#b30000')

s1_fiber_inaccess <- read.csv("./S1_site_inaccess.csv", sep = ',') %>%
  separate(samp_label, label_fields, sep = '_')
s1_fiber_inaccess$names <- factor(s1_fiber_inaccess$density, density_levels)

s1_fiber_density <- read.csv('./S1_site_density.csv', sep = ',') %>%
  separate(samp_label, label_fields, sep = '_')
s1_fiber_density$names <- factor(s1_fiber_density$density, density_levels)

# Only inaccessible runs of at least 25 bp on unremodeled fibers are plotted.
s1_native_footprints <- subset(s1_fiber_inaccess, length >= 25 & remodeler == 'Native')

# p1: histogram of inaccessible-run widths, one facet per density.
p1 <- ggplot(s1_native_footprints, aes(x = length, y = ..density.., fill = names)) +
  geom_histogram(binwidth = 2) +
  theme_bw() +
  labs(x = "Inaccessible run width (bp)", y = "Density") +
  theme(text = element_text(size = 15)) +
  theme(legend.position = "none") +
  xlim(24, 400) +
  facet_grid(names ~ .) +
  scale_fill_manual(values = density_palette)

# p2: histogram of inferred nucleosome counts per template, one facet per density.
p2 <- ggplot(subset(s1_fiber_density, remodeler == 'Native'),
             aes(x = nucs, fill = names, y = ..density..)) +
  geom_histogram(binwidth = 1) +
  facet_grid(names ~ .) +
  theme_bw() +
  labs(x = "Inferred nucleosomes per template", y = "Density") +
  scale_fill_manual(values = density_palette) +
  theme(text = element_text(size = 20), legend.position = "none")

# p3: horizon plot (run midpoint vs run length), hex-binned with log10 fill.
p3 <- ggplot(s1_native_footprints, aes(y = length, x = midpoint)) +
  geom_hex(binwidth = 10) +
  scale_fill_viridis_c(trans = 'log10', option = 'B') +
  ylim(24, 1000) +
  theme_bw() +
  labs(x = "Inaccessible run midpoint (bp)", y = "Inaccessible run length (bp)") +
  theme(text = element_text(size = 25)) +
  theme(panel.grid = element_blank()) +
  facet_grid(names ~ .)

# UMAP table: V1 = sample label, V2 = cluster id, V3/V4 = UMAP coordinates.
s1_unremodeled_umap <- read.table('S1_unremodeled_leiden_umap.data') %>%
  separate(V1, label_fields, sep = '_')
s1_unremodeled_umap$names <- factor(s1_unremodeled_umap$density, density_levels)

# Cluster 7 is left out of every S1 UMAP panel.
# NOTE(review): presumably an artifactual cluster -- confirm.
s1_umap_kept <- subset(s1_unremodeled_umap, V2 != 7)

# p4: UMAP coloured by density.
p4 <- ggplot(s1_umap_kept, aes(x = V3, y = V4, colour = names)) +
  geom_point_rast(stroke = 0, size = 1, alpha = 0.15) +
  theme_bw() +
  scale_colour_manual(values = density_palette) +
  labs(x = "UMAP1", y = 'UMAP2') +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        legend.position = 'none', text = element_text(size = 20))

# p5: sampled molecules for clusters 0, 3, 5 and 8 (V4 = cluster id),
# rastered by position (V2) and molecule (V1), filled by the 0/1 call in V3.
s1_samps <- read.table('S1_unremodeled_sampled_mols.data')
s1_samps <- subset(s1_samps, V4 %in% c(0, 3, 5, 8))
p5 <- ggplot(s1_samps) +
  geom_raster(aes(x = V2, y = V1, fill = as.logical(V3))) +
  scale_fill_manual(values = c('#e0ecf4','#6e016b')) +
  facet_grid(. ~ V4) +
  theme_bw() +
  theme(legend.position = "none", panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  theme(text = element_text(size = 18))

# Pass the layout to ggsave() explicitly: last_plot() is only the patchwork if
# the layout has been printed, which is not guaranteed when this is source()d.
s1_panels <- (p1 | p2 | p3) / (p4 | p5)
s1_panels
ggsave("s1_allplots_sfig1.pdf", plot = s1_panels, width = 20, height = 20, units = "in")

# Inset: all retained molecules in grey, with clusters 0, 3, 5 and 8 overlaid in colour.
s1_inset <- ggplot() +
  geom_point_rast(data = s1_umap_kept, aes(x = V3, y = V4),
                  stroke = 0, size = 0.4, raster.dpi = 500, colour = 'grey', alpha = 0.8) +
  geom_point_rast(data = subset(s1_umap_kept, V2 %in% c(0, 3, 5, 8)),
                  aes(x = V3, y = V4, colour = as.factor(V2)),
                  stroke = 0, size = 0.4, raster.dpi = 500) +
  scale_colour_brewer(palette = 'Set1') +
  theme_bw() +
  labs(x = "UMAP1", y = "UMAP2") +
  theme(legend.position = "none", panel.border = element_rect(size = 1),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank())
s1_inset
ggsave("s1_inset.pdf", plot = s1_inset, width = 4, height = 4, units = "in")


# ---- S2 fibers: footprint, density, horizon, UMAP and sampled-molecule panels ----
# Builds panels p1-p5 from the unremodeled ('Native') S2 data, writes the
# combined layout to s2_allplots_sfig1.pdf, then writes the UMAP inset with
# selected clusters highlighted to s2_inset.pdf.
# geom_point_rast() comes from the ggrastr package, which must be attached.

# samp_label is split on '_' into these five fields; density then becomes a
# factor with levels ordered 5:1 -> 20:1 so facets and colours follow that order.
label_fields <- c('density','remodeler','fiber','turnover','replicate')
density_levels <- c('5:1','10:1','15:1','20:1')
density_palette <- c('#fdcc8a','#fc8d59','#e34a33','#b30000')

s2_fiber_inaccess <- read.csv("./S2_site_inaccess.csv", sep = ',') %>%
  separate(samp_label, label_fields, sep = '_')
s2_fiber_inaccess$names <- factor(s2_fiber_inaccess$density, density_levels)

s2_fiber_density <- read.csv('./S2_site_density.csv', sep = ',') %>%
  separate(samp_label, label_fields, sep = '_')
s2_fiber_density$names <- factor(s2_fiber_density$density, density_levels)

# Only footprints of at least 25 bp on unremodeled fibers are plotted.
s2_native_footprints <- subset(s2_fiber_inaccess, length >= 25 & remodeler == 'Native')

# p1: histogram of footprint widths, one facet per density.
p1 <- ggplot(s2_native_footprints, aes(x = length, y = ..density.., fill = names)) +
  geom_histogram(binwidth = 2) +
  theme_bw() +
  labs(x = "Footprint width (bp)", y = "Density") +
  theme(text = element_text(size = 15)) +
  theme(legend.position = "none") +
  xlim(24, 400) +
  facet_grid(names ~ .) +
  scale_fill_manual(values = density_palette)

# p2: histogram of inferred nucleosome counts per template, one facet per density.
p2 <- ggplot(subset(s2_fiber_density, remodeler == 'Native'),
             aes(x = nucs, fill = names, y = ..density..)) +
  geom_histogram(binwidth = 1) +
  facet_grid(names ~ .) +
  theme_bw() +
  labs(x = "Inferred nucleosomes per template", y = "Density") +
  scale_fill_manual(values = density_palette) +
  theme(text = element_text(size = 20), legend.position = "none")

# p3: horizon plot (footprint midpoint vs footprint length), hex-binned with log10 fill.
p3 <- ggplot(s2_native_footprints, aes(y = length, x = midpoint)) +
  geom_hex(binwidth = 10) +
  scale_fill_viridis_c(trans = 'log10', option = 'B') +
  ylim(24, 1000) +
  theme_bw() +
  labs(x = "Footprint midpoint (bp)", y = "Footprint length (bp)") +
  theme(text = element_text(size = 25)) +
  theme(panel.grid = element_blank()) +
  facet_grid(names ~ .)

# UMAP table: V1 = sample label, V2 = cluster id, V3/V4 = UMAP coordinates.
s2_unremodeled_umap <- read.table('S2_unremodeled_leiden_umap.data') %>%
  separate(V1, label_fields, sep = '_')
s2_unremodeled_umap$names <- factor(s2_unremodeled_umap$density, density_levels)

# Cluster 6 is left out of every S2 UMAP panel.
# NOTE(review): presumably an artifactual cluster -- confirm.
s2_umap_kept <- subset(s2_unremodeled_umap, V2 != 6)

# p4: UMAP coloured by density.
p4 <- ggplot(s2_umap_kept, aes(x = V3, y = V4, colour = names)) +
  geom_point_rast(stroke = 0, size = 1, alpha = 0.15) +
  theme_bw() +
  scale_colour_manual(values = density_palette) +
  labs(x = "UMAP1", y = 'UMAP2') +
  theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
        legend.position = 'none', text = element_text(size = 20))

# p5: sampled molecules for clusters 1, 3, 5 and 7 (V4 = cluster id),
# rastered by position (V2) and molecule (V1), filled by the 0/1 call in V3.
s2_samps <- read.table('S2_unremodeled_sampled_mols.data')
s2_samps <- subset(s2_samps, V4 %in% c(1, 3, 5, 7))
p5 <- ggplot(s2_samps) +
  geom_raster(aes(x = V2, y = V1, fill = as.logical(V3))) +
  scale_fill_manual(values = c('#e0ecf4','#6e016b')) +
  facet_grid(. ~ V4) +
  theme_bw() +
  theme(legend.position = "none", panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  theme(text = element_text(size = 18)) +
  labs(x = "Position along sequence (bp)")

# Pass the layout to ggsave() explicitly: last_plot() is only the patchwork if
# the layout has been printed, which is not guaranteed when this is source()d.
s2_panels <- (p1 | p2 | p3) / (p4 | p5)
s2_panels
ggsave("s2_allplots_sfig1.pdf", plot = s2_panels, width = 20, height = 20, units = "in")

# Inset: all retained molecules in grey, with clusters 1, 3, 5 and 7 overlaid in colour.
s2_inset <- ggplot() +
  geom_point_rast(data = s2_umap_kept, aes(x = V3, y = V4),
                  stroke = 0, size = 0.4, raster.dpi = 500, colour = 'grey', alpha = 0.8) +
  geom_point_rast(data = subset(s2_umap_kept, V2 %in% c(1, 3, 5, 7)),
                  aes(x = V3, y = V4, colour = as.factor(V2)),
                  stroke = 0, size = 0.4, raster.dpi = 500) +
  scale_colour_brewer(palette = 'Set1') +
  theme_bw() +
  labs(x = "UMAP1", y = "UMAP2") +
  theme(legend.position = "none", panel.border = element_rect(size = 1),
        panel.grid.major = element_blank(), panel.grid.minor = element_blank())
s2_inset
ggsave("s2_inset.pdf", plot = s2_inset, width = 4, height = 4, units = "in")

In the main text, we report statistics for the median numbers of nucleosomes for each prep we did; these calculations are recapitulated here.

In [None]:
library(tidyverse)

# ---- Nucleosomes per template for each prep (S1 and S2) ----
# Reports "median +/- MAD" of the inferred nucleosome count (nucs) for each
# density level. The previous version stored mean() and sd() in variables
# named med*/mad*, which did not match the "median" statistic described in the
# accompanying text.
#
# NOTE(review): mad() uses R's default scaling constant (1.4826); pass
# constant = 1 if the unscaled median absolute deviation was the reported value.
# NOTE(review): as before, rows from every remodeler condition in the density
# file are pooled here; the figure panels restrict to remodeler == 'Native'.
# Confirm which set the reported statistics are meant to cover.

# Read one *_site_density.csv file and split samp_label into its five fields.
read_fiber_density <- function(path) {
  fiber_density <- read.csv(path, sep = ',') %>%
    separate(samp_label, c('density','remodeler','fiber','turnover','replicate'), sep = '_')
  fiber_density$names <- factor(fiber_density$density, c('5:1','10:1','15:1','20:1'))
  fiber_density
}

# Return one "<density> <fiber>: <median> +/- <MAD>" string per density level.
summarise_nucs <- function(fiber_density, fiber_label) {
  vapply(levels(fiber_density$names), function(ratio) {
    nucs <- fiber_density$nucs[which(fiber_density$names == ratio)]
    sprintf("%s %s: %s +/- %s", ratio, fiber_label,
            median(nucs, na.rm = TRUE), mad(nucs, na.rm = TRUE))
  }, character(1), USE.NAMES = FALSE)
}

s1_fiber_density <- read_fiber_density('./S1_site_density.csv')
summarise_nucs(s1_fiber_density, "S1")

# The S2 table gets its own name; it previously overwrote s1_fiber_density.
s2_fiber_density <- read_fiber_density('./S2_site_density.csv')
summarise_nucs(s2_fiber_density, "S2")

## Figure 2
Data files for remodeling experiments were processed identically to unremodeled fibers, but using  `process_remodeled_fibers.py`, which generated the UMAP and leiden clusters shown in the paper. Again, all data processing was handled using tidyverse and visualizations were made using ggplot2 / patchwork. 

In [None]:
#FIGURE 2
# ---- Horizon plots comparing remodeling conditions (10:1, single-turnover) ----
# geom_point_rast() comes from ggrastr; geom_stratum()/geom_alluvium() come
# from ggalluvial. Both packages must be attached.

label_fields <- c('density','remodeler','fiber','turnover','replicate')
density_levels <- c('5:1','10:1','15:1','20:1')
remodeler_levels <- c('Native','(-)ATP','(+)ADP','(+)ATP')

# Read a *_site_inaccess.csv file and add density (names) and remodeler
# (cat_order) factors used for faceting.
read_remodeling_footprints <- function(path) {
  footprints <- read.csv(path, sep = ',') %>%
    separate(samp_label, label_fields, sep = '_')
  footprints$names <- factor(footprints$density, density_levels)
  footprints$cat_order <- factor(footprints$remodeler, remodeler_levels)
  footprints
}

# Horizon plot (footprint midpoint vs length), faceted density x remodeler.
remodeling_horizon <- function(footprints) {
  ggplot(footprints, aes(y = length, x = midpoint)) +
    geom_hex(binwidth = 10) +
    scale_fill_viridis_c(trans = 'log10', option = 'B') +
    ylim(24, 1000) +
    theme_bw() +
    labs(x = "Footprint midpoint (bp)", y = "Footprint length (bp)") +
    theme(text = element_text(size = 25)) +
    theme(panel.grid = element_blank()) +
    facet_grid(names ~ cat_order)
}

s1_fiber_inaccess <- read_remodeling_footprints("./S1_site_inaccess.csv")
# S1 panel: Native and (+)ATP only.
p1 <- remodeling_horizon(subset(
  s1_fiber_inaccess,
  length >= 25 & (remodeler == 'Native' | remodeler == '(+)ATP') &
    turnover == 'Single-turnover' & density == '10:1'
))

s2_fiber_inaccess <- read_remodeling_footprints("./S2_site_inaccess.csv")
# S2 panel: every remodeler condition present in the file.
p2 <- remodeling_horizon(subset(
  s2_fiber_inaccess,
  length >= 25 & turnover == 'Single-turnover' & density == '10:1'
))

# Pass the layout to ggsave() explicitly rather than relying on last_plot().
remodeling_panels <- p1 / p2
remodeling_panels
ggsave('figsx_remodeling_comparisons.pdf', plot = remodeling_panels,
       height = 10, width = 18, units = 'in')

# ---- UMAP / Leiden clusters of remodeled S1 fibers ----
# V1 = sample label, V2 = cluster id, V3/V4 = UMAP coordinates.
s1_remodeler_clust <- read.table('S1_remodeled_leiden_umap.data') %>%
  separate(V1, label_fields, sep = '_')
s1_remodeler_clust$names <- paste(s1_remodeler_clust$remodeler, s1_remodeler_clust$turnover)
s1_remodeler_clust$ordered <- factor(
  s1_remodeler_clust$names,
  c('Native Single-turnover','(-)ATP Single-turnover', '(+)ADP Multi-turnover',
    '(+)ADP Single-turnover', '(+)ATP Multi-turnover', '(+)ATP Single-turnover')
)

# Only the 10:1 density is shown, and cluster 7 is excluded from every panel.
s1_10to1 <- subset(s1_remodeler_clust, density == '10:1' & V2 != 7)
condition_palette <- c('#377eb8', '#4daf4a','#984ea3','#ff7f00','#e5d8bd','#e41a1c')

# UMAP split by catalytic condition (displayed, not stored).
ggplot(s1_10to1, aes(x = V3, y = V4, colour = ordered)) +
  geom_point_rast(raster.dpi = 250, stroke = 0, size = 0.1) +
  scale_colour_manual(values = condition_palette) +
  facet_wrap(~ ordered, nrow = 2) +
  theme_bw() +
  theme(legend.position = "none")

# p1: UMAP coloured by cluster id.
p1 <- ggplot(s1_10to1, aes(x = V3, y = V4, colour = as.factor(V2))) +
  geom_point_rast(raster.dpi = 250, stroke = 0, size = 1, alpha = 0.5) +
  scale_colour_brewer(palette = "Set3") +
  theme_classic() +
  theme(legend.position = "none") +
  theme(text = element_text(size = 21))

# p2: one UMAP facet per cluster, coloured by catalytic condition.
p2 <- ggplot() +
  geom_point_rast(data = s1_10to1, aes(x = V3, y = V4, colour = ordered),
                  raster.dpi = 250, stroke = 0, size = 0.1) +
  facet_grid(. ~ as.factor(V2)) +
  theme_bw() +
  theme(legend.position = "none") +
  theme(text = element_text(size = 21)) +
  scale_colour_manual(values = condition_palette)

# The table behind the alluvial plot was never built for this figure, so p3
# failed on an undefined `plotme` (or picked up an unrelated `plotme` from
# another cell).
# NOTE(review): the construction below is inferred from p3's labels --
# `clust` = catalytic condition (x axis), `name` = cluster id (strata, Set3 as
# in p1), `freq` = fraction of that condition's molecules in each cluster.
# Confirm this against the script that produced the published panel.
cluster_counts <- as.data.frame(
  table(clust = droplevels(s1_10to1$ordered), name = s1_10to1$V2),
  responseName = "n"
)
cluster_counts$freq <- cluster_counts$n /
  ave(cluster_counts$n, cluster_counts$clust, FUN = sum)
plotme <- cluster_counts

# p3: alluvial plot of cluster composition across catalytic conditions.
p3 <- ggplot(plotme, aes(x = clust, y = freq, stratum = name, alluvium = name,
                         fill = as.factor(name))) +
  geom_stratum() +
  geom_alluvium(aes(fill = as.factor(name))) +
  scale_fill_brewer(palette = "Set3") +
  theme_bw() +
  labs(x = "Catalytic condition", y = "Fraction of molecules per cluster") +
  theme(text = element_text(size = 21))

(p1 | (p2 / p3))


## Figure 3
\*trinucs.csv files for S1 and S2 fibers were generated using `process_midpoint_distance_freq.py` and then plotted using the R code shown below. Included inline below is also code for computing the correlations shown in Supplementary Figure 5A.

In [None]:
#Trinuc plots
# ---- Figure 3 / Supplementary Figure 5A: trinucleosome interdyad distances ----
# For S1 and S2 fibers: 2D histograms of the two interdyad distances (d1, d2)
# for Native and single-turnover (+)ATP samples, plus the per-density Pearson
# correlation between d1 and d2 for each condition.
# rasterize() comes from the ggrastr package, which must be attached.
#
# Fixes relative to the previous version:
#   * s1_plot$order was built with factor() on the whole data frame rather
#     than on its density column;
#   * p3/p4 were built from trinucs_s2 before trinucs_s2 had been read;
#   * the S2 correlations were accumulated but never assembled into a table;
#   * the accumulator called `names` shadowed base::names().

# Read a *_site_trinucs.csv file, keep rows with both distances <= 750 bp, and
# split samp_label into its five fields.
load_trinucs <- function(path) {
  trinucs <- subset(read.csv(path), d1 <= 750 & d2 <= 750) %>%
    separate(samp_label, c('density','remodeler','fiber','turnover','replicate'), sep = '_')
  trinucs$names <- factor(trinucs$density, c('5:1','10:1','15:1','20:1'))
  trinucs
}

# 2D histogram of d1 vs d2 (4 bp bins, log10 fill), one facet per density.
plot_trinucs <- function(trinucs) {
  ggplot(trinucs, aes(x = d1, y = d2)) +
    rasterize(geom_bin2d(binwidth = 4), dpi = 200) +
    scale_fill_viridis_c(trans = 'log10') +
    facet_grid(. ~ names) +
    theme_bw() +
    labs(x = "Interdyad distance (n1-to-n2) (bp)",
         y = "Interdyad distance (n2-to-n3) (bp)") +
    theme(text = element_text(size = 15))
}

# Pearson correlation of d1 and d2 per density, for Native and for
# single-turnover (+)ATP ("Remodeled") molecules. Returns a data frame with
# columns cor, density, condition and order (density as an ordered factor).
# A density with fewer than two complete (d1, d2) pairs yields NA instead of
# making cor() stop with an error.
trinuc_correlations <- function(trinucs) {
  conditions <- list(
    Native = subset(trinucs, remodeler == 'Native'),
    Remodeled = subset(trinucs, remodeler == '(+)ATP' & turnover == 'Single-turnover')
  )
  densities <- levels(droplevels(trinucs$names))
  per_condition <- lapply(names(conditions), function(condition) {
    molecules <- conditions[[condition]]
    cors <- vapply(densities, function(ratio) {
      keep <- which(molecules$names == ratio)
      d1 <- molecules$d1[keep]
      d2 <- molecules$d2[keep]
      if (sum(complete.cases(d1, d2)) < 2) {
        return(NA_real_)
      }
      cor(d1, d2, method = 'pearson', use = 'complete.obs')
    }, numeric(1), USE.NAMES = FALSE)
    data.frame(cor = cors, density = densities, condition = condition)
  })
  result <- do.call(rbind, per_condition)
  result$order <- factor(result$density, c('5:1','10:1','15:1','20:1'))
  result
}

trinucs_s1 <- load_trinucs("./S1_site_trinucs.csv")
trinucs_s2 <- load_trinucs("./S2_site_trinucs.csv")

p1 <- plot_trinucs(subset(trinucs_s1, remodeler == 'Native'))
p2 <- plot_trinucs(subset(trinucs_s1, remodeler == '(+)ATP' & turnover == 'Single-turnover'))
p3 <- plot_trinucs(subset(trinucs_s2, remodeler == 'Native'))
p4 <- plot_trinucs(subset(trinucs_s2, remodeler == '(+)ATP' & turnover == 'Single-turnover'))

# Correlation tables; printed so the values are visible in the notebook.
s1_plot <- trinuc_correlations(trinucs_s1)
s2_plot <- trinuc_correlations(trinucs_s2)
s1_plot
s2_plot

(p1 | p3) | (p2 | p4)


## Figure 4
Autocorrelations for all fibers were calculated using the script `fishers_tests.py`, which output the \*fishers_remodeling.data and \*autocor_clusters.data files for both S1 and S2 fibers. These data files are then used as input for the plotting scripts shown below. For both S1 and S2 fibers, one artifactual cluster was manually filtered out prior to plotting. Clusters were manually arranged in order of increasing apparent NRL following visual inspection of cluster averages.

In [None]:
# ---- Figure 4: per-cluster autocorrelograms and Fisher's-test heatmaps ----
# S1 panels are p1-p3 and S2 panels are p4-p6. Clusters are shown in a manually
# chosen order (the factor levels below), and one cluster per fiber type is
# dropped before plotting (6 for S1, 5 for S2).
#
# Fix: the S2 Fisher table was read into `tst2`, but every later S2 statement
# referred to an undefined `tst`; `tst2` is now used throughout.

# Log2 ratio of V7 (Remodeled over Native) per row of the Fisher table.
# NOTE(review): this pairs Native and Remodeled rows by position, so it assumes
# both subsets list the same (cluster, nucleosome count) rows in the same order.
fishers_lods <- function(fishers) {
  native <- subset(fishers, V6 == 'Native')
  remodeled <- subset(fishers, V6 == 'Remodeled')
  # Guard against silent recycling if the two subsets differ in size.
  stopifnot(nrow(native) == nrow(remodeled))
  data.frame(ordered = native$ordered, nucs = native$V2,
             lod = log2(remodeled$V7 / native$V7))
}

# Heatmap of the log2 ratio by estimated nucleosome count and cluster.
plot_lods <- function(lods, dropped_cluster) {
  ggplot(lods[which(lods$ordered != dropped_cluster), ],
         aes(x = nucs, y = ordered, fill = lod)) +
    geom_raster() +
    scale_fill_gradient2(mid = 'white', low = 'blue', high = 'red') +
    theme_bw() +
    labs(y = "Clusters", x = "Estimated nucleosome count") +
    theme(text = element_text(size = 15))
}

# Heatmap of per-cluster autocorrelation (V3) at offsets (V2) of 150-750 bp.
plot_autocor <- function(autocor, dropped_cluster) {
  shown <- autocor[which(autocor$V2 >= 150 & autocor$V2 <= 750 &
                           autocor$ordered != dropped_cluster), ]
  ggplot(shown, aes(x = V2, fill = V3, y = as.factor(ordered))) +
    geom_raster() +
    scale_fill_gradient2(mid = 'white', low = 'blue', high = '#fec44f') +
    labs(x = "Offset (bp)", y = "Cluster") +
    scale_x_continuous(breaks = c(175,225,275,325,500,750)) +
    theme_bw() +
    theme(text = element_text(size = 15))
}

# Heatmap of log2(V4) by estimated nucleosome count and cluster, faceted by
# condition (V6).
plot_fishers <- function(fishers, dropped_cluster) {
  ggplot(fishers[which(fishers$ordered != dropped_cluster), ],
         aes(x = V2, y = ordered, fill = log2(V4))) +
    geom_raster() +
    scale_fill_gradient2(low = 'blue', mid = 'white', high = 'red') +
    facet_grid(. ~ V6) +
    labs(x = "Estimated nucleosomes per template", y = "Cluster") +
    theme_bw() +
    theme(text = element_text(size = 15))
}

# S1 fibers
s1_cluster_order <- c(0,4,2,1,5,3,7,8,6)
tst1 <- read.table('S1_fishers_remodeling.data')
tst1$ordered <- factor(tst1$V1, s1_cluster_order)
t2s1 <- read.table('S1_autocor_clusters.data')
t2s1$ordered <- factor(t2s1$V1, s1_cluster_order)
p1 <- plot_autocor(t2s1, 6)
p2 <- plot_fishers(tst1, 6)
p3 <- plot_lods(fishers_lods(tst1), 6)

# S2 fibers
s2_cluster_order <- c(0,3,2,1,4,6,7,5)
tst2 <- read.table('S2_fishers_remodeling.data')
tst2$ordered <- factor(tst2$V1, s2_cluster_order)
t2 <- read.table('S2_autocor_clusters.data')
t2$ordered <- factor(t2$V1, s2_cluster_order)
p4 <- plot_autocor(t2, 5)
p5 <- plot_fishers(tst2, 5)
# `lods` keeps the S2 log-ratio table, as it did after the original cell ran.
lods <- fishers_lods(tst2)
p6 <- plot_lods(lods, 5)

(p1 | p2) / (p4 | p5)
(p1 | p3) / (p4 | p6)


## Figure 5
CTCF motif accessibility analyses were carried out using the scripts `process_site_accessibility_frequency.py` and `flatten_sa_data.py`. Specifically, we used these scripts to output S\*_site_accessibility_new.data.flattened files, which we used to create the heatmaps in Figure 5B and the line plots in Figure 5C-E. CTCF motif locations were found using FIMO, with the CTCF PWM from Jolma et al., Cell (2013).

In [None]:
# ---- Figure 5: site accessibility heatmaps (p1, p2) and line plots (p3-p5) ----
# Column roles, as implied by the axis labels used below: V1 = midpoint
# position along the template, V2 = condition ("Native" / "(+)ATP"),
# V3 = estimated nucleosomes per template, V4 / V5 = fraction of accessible
# templates.
# NOTE(review): the vertical lines at 1359 (S1) and 1360 / 1569 (S2) coincide
# with the sites drawn in the line plots; presumably the CTCF motif positions
# -- confirm.

# log2 odds-style ratio of (+)ATP over Native accessibility, attached to the
# Native rows.
# NOTE(review): rows are paired by position, so this assumes the "(+)ATP" and
# "Native" subsets are listed in the same order.
native_with_lor <- function(flattened) {
  remodeled <- subset(flattened, V2 == "(+)ATP")
  native <- subset(flattened, V2 == "Native")
  native$lor <- log2((remodeled$V4 / remodeled$V5) / (native$V4 / native$V5))
  native
}

# Fraction accessible vs nucleosome count at one site, coloured by condition.
site_lineplot <- function(site_rows) {
  ggplot(site_rows, aes(x = V3, y = V4 / V5, colour = V2)) +
    geom_point() +
    geom_smooth() +
    scale_colour_manual(values = c('#e41a1c', '#377eb8')) +
    theme_bw() +
    labs(x = "Estimated nucleosomes per template", y = "Fraction accessible templates") +
    theme(text = element_text(size = 10), legend.position = "none")
}

# --- S1: heatmap up to 18 nucleosomes, line plot at site 1359 ---
tst <- read.table('S1_site_accesibility_new.data.flattened')
p1 <- ggplot(subset(native_with_lor(tst), V3 <= 18), aes(x = V1, y = V3, fill = lor)) +
  geom_raster() +
  geom_vline(aes(xintercept = 1359)) +
  scale_fill_gradient2(low = 'blue', mid = 'white', high = 'red') +
  theme_bw() +
  labs(x = "Midpoint position along S1 (bp)", y = "Estimated nucleosomes per template") +
  theme(text = element_text(size = 10))

site <- 1359
plotme <- subset(tst, V1 == site & V3 <= 18)
p3 <- site_lineplot(plotme)

# --- S2: heatmap up to 20 nucleosomes, line plots at sites 1360 and 1569 ---
tst <- read.table('S2_site_accesibility_new.data.flattened')
p2 <- ggplot(subset(native_with_lor(tst), V3 <= 20), aes(x = V1, y = V3, fill = lor)) +
  geom_raster() +
  geom_vline(aes(xintercept = 1360)) +
  geom_vline(aes(xintercept = 1569)) +
  scale_fill_gradient2(low = 'blue', mid = 'white', high = 'red') +
  theme_bw() +
  labs(x = "Midpoint position along S2 (bp)", y = "Estimated nucleosomes per template") +
  theme(text = element_text(size = 10))

# Per-condition S2 subsets (not used by the plots below).
tst2 <- subset(tst, V2 == "(+)ATP" & V3 <= 20)
tst1 <- subset(tst, V2 == "Native" & V3 <= 20)

# NOTE(review): the S2 line plots cut off at 18 nucleosomes, whereas the S2
# heatmap and the subsets above use 20 -- confirm which cutoff is intended.
site <- 1360
plotme <- subset(tst, V1 == site & V3 <= 18)
p4 <- site_lineplot(plotme)

site <- 1569
plotme <- subset(tst, V1 == site & V3 <= 18)
p5 <- site_lineplot(plotme)

(p1 | p2) / (p3 | p4| p5)


## *In vivo* analyses
## Figure 6
*In vivo* autocorrelations were calculated as in Abdulhay et al. (2020), using three main scripts. First, we computed autocorrelations for all sequenced fibers using the script `compute_autocors_persample.py`, which takes as input HMM accessibility predictions in pickle format, and saves a numpy array of 500 nt offset autocorrelograms for each molecule in the input pickle as \*.autocors.npy. Second, we filtered these numpy arrays for all molecules that fall within one of the epigenomic domains surveyed in the study and Leiden clustered all such molecules using the script `cluster_autocorrelograms.py`. This script outputs cluster IDs and cluster averages as invivo_final_autocor\*.data. Finally, we performed a series of Fisher's exact tests (using `fishers_epigenome_global.py`) to calculate significant enrichment and depletion of clusters (1) across knockout, addback, and E14 samples (only knockout and addback are shown in main text Figure 6; all are shown in the Supplementary Figures), and (2) across epigenomic domains within each cell line. The output of `fishers_epigenome_global.py` is a series of fishers_tests\*.txt files that facilitate easy plotting in ggplot. These are all then plotted using the code below. Additionally, all code for the reproducibility analyses (and plotting) is included below as well. Leiden cluster order was again determined by visual inspection of cluster averages and manual ordering.

In [None]:
# ---- Figure 6: in vivo fiber-type clusters and Fisher's-test enrichments ----
# qvalue() comes from the qvalue package, which must be attached.

# Shared factor orderings: manual cluster order, sample order, and the order
# in which epigenomic domains are faceted.
cluster_order <- c(0,3,4,7,5,6,2,1)
sample_order <- c('SNF2hKO','SNF2hWTAB','E14')
domain_order <- c('H3K4me3_peaks.bed.gz','H3K4me1_peaks.bed.gz','H3K36me3_peaks.bed.gz',
                  'H3K27me3_peaks.bed.gz','H3K9me3_peaks.bed.gz','telomere.zmws',
                  'major_satellite.zmws','minor_satellite.zmws',
                  'atac_deseq_res.peaks.open.gz','atac_deseq_res.peaks.closed.gz')

# Average methylation line plot (displayed, not stored).
avg_fig <- read.table('avg_lineplots_snf2hko_rescue.txt')
ggplot(avg_fig, aes(x = V1, y = V2, colour = V3)) +
  geom_line() +
  theme_bw() +
  scale_colour_manual(values = c('#e41a1c', '#377eb8')) +
  theme(text = element_text(size = 15)) +
  labs(x = "Distance to 5' MNase cut (bp)", y = "Average methylation")

# Cluster averages (cluster id in V3) and density enrichments (cluster id in V1).
nrl_fig <- read.table('in_vivo_cluster_sig_averages_102821.data')
nrl_fig$ordered <- factor(nrl_fig$V3, cluster_order)
all_mols <- read.table('density_enrich_110521.txt')
all_mols$ordered <- factor(all_mols$V1, cluster_order)

# Per-domain Fisher's tests: q-values are computed from V6 with lambda = 0.
fish <- read.table('fishers_tests_smarca5_invivo_bydomain.txt')
fishq <- qvalue(fish$V6, lambda = 0)
fish$qval <- fishq$qvalues
fish$ordered <- factor(fish$V2, cluster_order)
fish$eordered <- factor(fish$V3, domain_order)
fish$ordered_sample <- factor(fish$V1, sample_order)

# Replicate-level per-domain tests: q-values are computed from V7 here.
fish_rep <- read.table('fishers_tests_smarca5_invivo_bydomain_replicates.txt')
fishq_rep <- qvalue(fish_rep$V7, lambda = 0)
fish_rep$qval <- fishq_rep$qvalues
fish_rep$ordered <- factor(fish_rep$V2, cluster_order)
fish_rep$eordered <- factor(fish_rep$V3, domain_order)
fish_rep$ordered_sample <- factor(fish_rep$V1, sample_order)

# Across-sample tests, pooled and per replicate (only rows with V4 == 'Overall').
fish_remodeler <- read.table('fishers_tests_smarca5_invivo_across_samp.txt')
fish_remodeler$ordered_sample <- factor(fish_remodeler$V1, sample_order)
fish_remodeler$ordered <- factor(fish_remodeler$V2, cluster_order)

fish_remodeler_reps <- subset(
  read.table('fishers_tests_smarca5_invivo_across_samp_replicates.txt'),
  V4 == 'Overall'
)
fish_remodeler_reps$ordered_sample <- factor(fish_remodeler_reps$V1, sample_order)
fish_remodeler_reps$ordered <- factor(fish_remodeler_reps$V2, cluster_order)

# p1: cluster-average heatmap; p2: density enrichment per cluster;
# p3: across-sample log2(V5) without E14. Legends are hidden on all three.
p1 <- ggplot(nrl_fig, aes(x = V1, y = ordered, fill = V2)) +
  geom_raster() +
  theme_bw() +
  scale_fill_distiller(palette = 'BuPu', direction = 1) +
  labs(x = "Distance to 5' MNase cut (bp)", y = "Fiber types") +
  theme(text = element_text(size = 10), legend.position = "none")

p2 <- ggplot(all_mols, aes(x = V2, y = ordered, fill = V3)) +
  geom_raster() +
  scale_fill_gradient2(low = 'navy', mid = 'white', high = 'orange') +
  theme_bw() +
  labs(x = "Estimated nucleosomes per kilobase", y = "Clusters") +
  theme(text = element_text(size = 10), legend.position = "none")

p3 <- ggplot(subset(fish_remodeler, ordered_sample != 'E14'),
             aes(x = ordered_sample, y = ordered, fill = log2(V5))) +
  geom_raster() +
  scale_fill_gradient2(low = 'blue', mid = 'white', high = 'red') +
  theme_bw() +
  theme(text = element_text(size = 10), legend.position = "none")

# p4: per-domain log2(V5) without E14 and without the two ATAC peak sets;
# grey points mark cells with q-value above 0.05.
fish_shown <- subset(fish, ordered_sample != 'E14' &
                       eordered != 'atac_deseq_res.peaks.closed.gz' &
                       eordered != 'atac_deseq_res.peaks.open.gz')
p4 <- ggplot(fish_shown, aes(x = ordered_sample, y = ordered, fill = log2(V5))) +
  geom_raster() +
  scale_fill_gradient2(low = 'blue', mid = 'white', high = 'red') +
  facet_grid( . ~ eordered) +
  theme_bw() +
  theme(text = element_text(size = 10)) +
  geom_point(data = subset(fish_shown, qval > 0.05),
             aes(x = ordered_sample, y = ordered), colour = 'grey')

(p1 | p2 | p3 ) / (p4)

# Rebuild p1-p3 with their legends visible, so the legends can be added back
# to the patchwork layout. Relies on nrl_fig, all_mols and fish_remodeler
# already being loaded.
p1 <- ggplot(nrl_fig, aes(x = V1, y = ordered, fill = V2)) +
  geom_raster() +
  theme_bw() +
  scale_fill_distiller(palette = 'BuPu', direction = 1) +
  labs(x = "Distance to 5' MNase cut (bp)", y = "Fiber types") +
  theme(text = element_text(size = 10))

p2 <- ggplot(all_mols, aes(x = V2, y = ordered, fill = V3)) +
  geom_raster() +
  scale_fill_gradient2(low = 'navy', mid = 'white', high = 'orange') +
  theme_bw() +
  labs(x = "Estimated nucleosomes per kilobase", y = "Clusters") +
  theme(text = element_text(size = 10))

fish_remodeler_no_e14 <- subset(fish_remodeler, ordered_sample != 'E14')
p3 <- ggplot(fish_remodeler_no_e14, aes(x = ordered_sample, y = ordered, fill = log2(V5))) +
  geom_raster() +
  scale_fill_gradient2(low = 'blue', mid = 'white', high = 'red') +
  theme_bw() +
  theme(text = element_text(size = 10))

(p1 | p2 | p3 )

#supplementary figures reproducibility
# Keep only the 'Overall' rows of the per-replicate Fisher's test table.
fish_remodeler_reps <- subset(
  read.table('fishers_tests_smarca5_invivo_across_samp_replicates.txt'),
  V4 == 'Overall'
)
fish_remodeler_reps$ordered_sample <- factor(fish_remodeler_reps$V1,
                                             levels = c('SNF2hKO','SNF2hWTAB','E14'))
fish_remodeler_reps$ordered <- factor(fish_remodeler_reps$V2,
                                      levels = c(0,3,4,7,5,6,2,1))

# Heatmap of log2(V6), one facet column per value of V3 (replicate label).
ggplot(fish_remodeler_reps, aes(x=ordered_sample, y=ordered, fill=log2(V6))) +
  geom_raster() +
  scale_fill_gradient2(low='blue', mid='white', high='red') +
  theme_bw() +
  theme(text=element_text(size=10)) +
  facet_grid(. ~ V3)

#scatter plots for samp cors
# NOTE(review): the pairwise comparisons below pair values by row position, so
# they assume each replicate's rows appear in the same order in the input file
# -- confirm.
rep1_ors <- log2(subset(fish_remodeler_reps, V3 == 'rep1')$V6)
rep2_ors <- log2(subset(fish_remodeler_reps, V3 == 'rep2')$V6)
rep3_ors <- log2(subset(fish_remodeler_reps, V3 == 'rep3')$V6)

scatter_theme <- theme_bw() + theme(text=element_text(size=15))

p1 <- ggplot() +
  geom_point(aes(x=rep1_ors, y=rep2_ors)) +
  labs(x="Rep1", y="Rep2") +
  scatter_theme
cor.test(rep1_ors, rep2_ors)

p2 <- ggplot() +
  geom_point(aes(x=rep1_ors, y=rep3_ors)) +
  labs(x="Rep1", y="Rep3") +
  scatter_theme
cor.test(rep1_ors, rep3_ors)

p3 <- ggplot() +
  geom_point(aes(x=rep2_ors, y=rep3_ors)) +
  labs(x="Rep2", y="Rep3") +
  scatter_theme
cor.test(rep2_ors, rep3_ors)

p1 | p2 | p3

#domain-level reproducibility
fish_rep <- read.table('fishers_tests_smarca5_invivo_bydomain_replicates.txt')
# q-values are computed from column V7 and stored alongside the raw table.
fishq_rep <- qvalue(fish_rep$V7, lambda=0)
fish_rep$qval <- fishq_rep$qvalues
fish_rep$ordered <- factor(fish_rep$V2, levels = c(0,3,4,7,5,6,2,1))
fish_rep$eordered <- factor(
  fish_rep$V3,
  levels = c('H3K4me3_peaks.bed.gz','H3K4me1_peaks.bed.gz','H3K36me3_peaks.bed.gz','H3K27me3_peaks.bed.gz','H3K9me3_peaks.bed.gz','telomere.zmws','major_satellite.zmws','minor_satellite.zmws','atac_deseq_res.peaks.open.gz','atac_deseq_res.peaks.closed.gz')
)
fish_rep$ordered_sample <- factor(fish_rep$V1, levels = c('SNF2hKO','SNF2hWTAB','E14'))

# Everything below excludes the two ATAC peak sets; filter once and reuse.
fish_rep_domains <- subset(fish_rep,
                           eordered != 'atac_deseq_res.peaks.closed.gz' &
                             eordered != 'atac_deseq_res.peaks.open.gz')

ggplot(fish_rep_domains, aes(x=ordered_sample, y=ordered, fill=log2(V6))) +
  geom_raster() +
  scale_fill_gradient2(low='blue', mid='white', high='red') +
  theme_bw() +
  theme(text=element_text(size=10)) +
  facet_grid(V4 ~ eordered)

# NOTE(review): values are paired across replicates by row position, which
# assumes identical row order within each replicate -- confirm.
rep1_ors <- log2(subset(fish_rep_domains, V4 == 'rep1')$V6)
rep2_ors <- log2(subset(fish_rep_domains, V4 == 'rep2')$V6)
rep3_ors <- log2(subset(fish_rep_domains, V4 == 'rep3')$V6)

scatter_theme <- theme_bw() + theme(text=element_text(size=15))

p1 <- ggplot() +
  geom_point(aes(x=rep1_ors, y=rep2_ors)) +
  labs(x="Rep1", y="Rep2") +
  scatter_theme
cor.test(rep1_ors, rep2_ors)

p2 <- ggplot() +
  geom_point(aes(x=rep1_ors, y=rep3_ors)) +
  labs(x="Rep1", y="Rep3") +
  scatter_theme
cor.test(rep1_ors, rep3_ors)

p3 <- ggplot() +
  geom_point(aes(x=rep2_ors, y=rep3_ors)) +
  labs(x="Rep2", y="Rep3") +
  scatter_theme
cor.test(rep2_ors, rep3_ors)

p1 | p2 | p3


## Figure 7
Ctcf site analyses were carried out as in Abdulhay et al. (2020). Specifically, we used the script `zmw_selector.py` to extract, from all aligned CCS files, every ZMW that contains a bona fide Ctcf site (or a control, unbound Ctcf motif match) in the mESC genome. We then computed and visualized the average signal at these sites using numpy, and wrote these averages out in a readable text format. For ATAC analyses, code from Ramani et al. (2019) was used to compute ATAC fragment midpoint enrichment surrounding ENCODE-defined Ctcf sites and to output these as numpy array archives (\*.npz files). Python code to carry this out is included in this notebook, as well as R code for reproducing the plots in Figure 7. 

In [None]:
#PYTHON CODE

#ATAC analysis / normalization
def sumShorts(npz):
    """Element-wise sum of every array stored in ``npz``.

    Parameters
    ----------
    npz : mapping of str -> array
        An archive as returned by ``np.load`` on a ``.npz`` file, or any
        dict-like object. It must contain the key ``'50.59'``, which seeds
        the running total.

    Returns
    -------
    numpy.ndarray
        A new array holding the sum over all keys. The input is not modified.
    """
    # Copy the seed so the in-place additions below never write into an array
    # owned by the caller (relevant when ``npz`` is an in-memory mapping).
    new_mat = npz['50.59'].copy()
    # ``keys()`` is available on both NpzFile and plain dicts; the previous
    # ``iterkeys()`` call is a Python 2-era method that dicts no longer have.
    for key in npz.keys():
        if key == '50.59':
            continue
        new_mat += npz[key]
    return new_mat

def normShorts(arr):
    """Smooth each row of ``arr`` with a rolling mean, then normalize.

    Parameters
    ----------
    arr : 2-D array-like
        Rows are smoothed independently along the column axis.

    Returns
    -------
    The output of ``normalize`` applied to the smoothed matrix, with NaNs
    replaced via ``np.nan_to_num`` beforehand.
    """
    # Centered 33-column rolling mean along each row. Rolling over the
    # transposed frame is equivalent to the former ``rolling(..., axis=1)``,
    # whose ``axis`` argument is deprecated in recent pandas releases.
    smoothed = pd.DataFrame(arr).T.rolling(33, center=True, min_periods=1).mean().T
    # NOTE(review): ``normalize`` is not defined in the visible part of this
    # notebook -- presumably sklearn.preprocessing.normalize; confirm.
    mat = normalize(np.nan_to_num(smoothed))
    # Previously returned the undefined name ``m`` (NameError on every call).
    return mat

# ATAC archives at Ctcf sites. Per the file names: WT and SNF2HKO samples,
# two replicates each.
wtr1 = np.load('/avicenna/vramani/analyses/dna_damage/Hmaps_Schubeler_ATAC_WT_output.deduped.bam.50.200.npz')
wtr2 = np.load('/avicenna/vramani/analyses/dna_damage/Hmaps_Schubeler_ATAC_WT_rep2_output.deduped.bam.50.200.npz')
kor1 = np.load('/avicenna/vramani/analyses/dna_damage/Hmaps_Schubeler_ATAC_SNF2HKO_output.deduped.bam.50.200.npz') 
kor2 = np.load('/avicenna/vramani/analyses/dna_damage/Hmaps_Schubeler_ATAC_SNF2HKO_rep2_output.deduped.bam.50.200.npz') 

# Matching archives computed at the control sites ("cntrl_sites" files).
wtn1 = np.load('/avicenna/vramani/analyses/dna_damage/Hmaps_Schubeler_ATAC_WT_output.deduped.bam_cntrl_sites.50.200.npz')
wtn2 = np.load('/avicenna/vramani/analyses/dna_damage/Hmaps_Schubeler_ATAC_WT_rep2_output.deduped.bam_cntrl_sites.50.200.npz')
kon1 = np.load('/avicenna/vramani/analyses/dna_damage/Hmaps_Schubeler_ATAC_SNF2HKO_output.deduped.bam_cntrl_sites.50.200.npz') 
kon2 = np.load('/avicenna/vramani/analyses/dna_damage/Hmaps_Schubeler_ATAC_SNF2HKO_rep2_output.deduped.bam_cntrl_sites.50.200.npz') 


# Collapse every array inside each archive into a single matrix.
mat1, mat2, mat3, mat4 = [sumShorts(archive) for archive in (kor1, kor2, wtr1, wtr2)]
mat5, mat6, mat7, mat8 = [sumShorts(archive) for archive in (kon1, kon2, wtn1, wtn2)]

# Pool the two replicates of each condition.
kor_norm = mat1 + mat2
wt_norm = mat3 + mat4

kon = mat5 + mat6
wtn = mat7 + mat8

# Column-wise mean of each pooled matrix divided by a scalar matrix-wide mean,
# restricted to columns 750..1249. Computed once and reused for both the
# preview plot and the exported table.
# NOTE(review): p1 is divided by the mean of `kon` (knockout control matrix),
# whereas p2, p3 and p4 are each divided by the mean of their own matrix.
# Left unchanged here; confirm which denominator is intended for p1/p2.
p1 = (np.nanmean(kor_norm,axis=0) / np.nanmean(kon))[750:1250]
p2 = (np.nanmean(wt_norm,axis=0) / np.nanmean(wt_norm))[750:1250]
p3 = (np.nanmean(kon,axis=0) / np.nanmean(kon))[750:1250]
p4 = (np.nanmean(wtn,axis=0) / np.nanmean(wtn))[750:1250]

plt.figure(figsize=(20,20))
for profile in (p1, p2, p3, p4):
    plt.plot(range(500), profile)
plt.show()

#ATAC line plot file
# Long-format table: position offset (index - 250), value, series label.
# The context manager guarantees the file is closed even if a write fails.
with open('atac_plots','w') as fho:
    for i in range(len(p1)):
        print("%s\t%s\t%s" % (i - 250, p1[i], 'Knockout Ctcf sites'), file=fho)
        print("%s\t%s\t%s" % (i - 250, p2[i], 'WT Ctcf sites'), file=fho)
        print("%s\t%s\t%s" % (i - 250, p3[i], 'Knockout random matches'), file=fho)
        print("%s\t%s\t%s" % (i - 250, p4[i], 'WT random matches'), file=fho)



In [None]:
#PYTHON
#ATAC single site
import pandas as pd
def smooth(vec):
    """Return a centered 50-point rolling mean of ``vec`` as a pandas Series.

    Windows that run past either end are averaged over the values that are
    available (``min_periods=1``), so the output has the same length as the
    input.
    """
    windows = pd.Series(vec).rolling(window=50, min_periods=1, center=True)
    return windows.mean()

chrid = 'chr12'
site = 115605169

site_lookup = pd.read_csv('/avicenna/vramani/analyses/pacbio/snf2h_peak_analyses/CTCF_mES.sites.flat.sorted.distances.filtered',sep='\t',names=['chrid','site','strand','score','d1','d2'])
# Bare expression: it has no visible effect unless it is the last statement of
# a notebook cell. Presumably used interactively to look up the row of this
# site -- verify.
site_lookup[(site_lookup['chrid'] == chrid) & (site_lookup['site'] == site)]

# NOTE(review): hard-coded row of kor_norm / wt_norm; presumably the row that
# corresponds to chr12:115605169 in `site_lookup` -- confirm.
site_row = 4174

kor_total = np.sum(kor_norm)
wt_total = np.sum(wt_norm)
print(kor_total)
print(wt_total)

# Signal in columns 750..1249 of the selected row, scaled to per-million of
# the matrix total and smoothed. Computed once, then plotted and exported.
kor_site = smooth(kor_norm[site_row][750:1250] / kor_total * 1000000)
wt_site = smooth(wt_norm[site_row][750:1250] / wt_total * 1000000)

plt.figure(figsize=(10,10))
plt.bar(range(500), kor_site)
plt.fill(range(500), wt_site, c='blue')
plt.show()

# Long-format table: position offset (index - 250), value, sample label.
# The context manager guarantees the file is closed even if a write fails.
with open('atac_sig_chr12_115605169.txt','w') as fho:
    for i in range(500):
        print("%s\t%s\t%s" % (i - 250, kor_site[i], 'K'), file = fho)
        print("%s\t%s\t%s" % (i - 250, wt_site[i], 'WT'), file = fho)

In [None]:
#PYTHON CODE

#ISOLATING SINGLE-SITE MOLECULES FOR PLOTTING#
chrid = 'chr12'
site = 115605169
mols = np.load('ATAC_Ctcf_mols_500bp.npy')
print(len(mols))
lookup = pd.read_csv('CTCF_mols_sites_and_clusters.csv')

# Select the lookup rows for this site once, then pull every column from that
# selection. The 'Unnamed: 0' column is used as the row index into `mols`.
at_site = lookup[(lookup['chrid'] == chrid) & (lookup['sites'] == site)]
submols = mols[at_site['Unnamed: 0'].values]
sublabs_cluster = at_site['clusters'].values
sublabs_labels = at_site['labels_agg'].values

# Running 0-based count of molecules written per sample, across all clusters.
s_counter = {}

# Output columns: molecule index within its cluster, position offset
# (column - 250), signal value, cluster id, sample label, per-sample counter.
# The context manager guarantees the file is closed even if a write fails.
with open('heatmap_mols_singlesite.data','w') as fho:
    # Clusters are written in this fixed order. (The former `> 5` guard could
    # never trigger for this list and has been removed.)
    for clust in [0,2,1,4,5,3]:
        in_clust = sublabs_cluster == clust
        subsubmols = submols[in_clust]
        subsubsamps = sublabs_labels[in_clust]
        for i in range(len(subsubmols)):
            # The sample label is the part of the label before the first '_'.
            sample_lab = subsubsamps[i].split('_')[0]
            # First molecule of a sample gets 0, the next 1, and so on.
            s_counter[sample_lab] = s_counter.get(sample_lab, -1) + 1
            for j in range(len(subsubmols[i])):
                print("%s\t%s\t%s\t%s\t%s\t%s" % (i, j - 250, subsubmols[i][j], clust, sample_lab, s_counter[sample_lab]), file=fho)

#NORMALIZED SAMOSA signal @ CTCF sites
# For each biological replicate, molecules at the control motif set and at the
# Ctcf site set are both divided by the mean of that replicate's CONTROL
# molecules.
# NOTE(review): `tst` and `labs` are defined outside this cell; the masks mix
# columns from both, so they are presumably row-aligned with `mols` -- confirm.
ctrl_factor = 'mm10.archetype_motifs.CTCF.v1.0.bed.sampled.sorted.flat'
ctcf_factor = 'CTCF_mES.sites.flat.sorted.distances.filtered'

ctcf_ab, ctrl_ab = [], []
for rep in np.unique(labs['bio_rep']):
    sub1 = mols[(tst['sample_id'] == 'SNF2hWTAB') & (labs['bio_rep'] == rep) & (labs['factor'] == ctrl_factor)]
    sub2 = mols[(tst['sample_id'] == 'SNF2hWTAB') & (labs['bio_rep'] == rep) & (labs['factor'] == ctcf_factor)]
    control_mean = np.nanmean(sub1)
    ctrl_ab.append(sub1 / control_mean)
    ctcf_ab.append(sub2 / control_mean)

ctcf_ko, ctrl_ko = [], []
for rep in np.unique(labs['bio_rep']):
    sub1 = mols[(tst['sample_id'] == 'SNF2hKO') & (labs['bio_rep'] == rep) & (labs['factor'] == ctrl_factor)]
    sub2 = mols[(tst['sample_id'] == 'SNF2hKO') & (labs['bio_rep'] == rep) & (labs['factor'] == ctcf_factor)]
    control_mean = np.nanmean(sub1)
    ctrl_ko.append(sub1 / control_mean)
    ctcf_ko.append(sub2 / control_mean)

# Stack the per-replicate blocks into one matrix per condition.
ctcf_ab = np.vstack(ctcf_ab)
ctcf_ko = np.vstack(ctcf_ko)
ctrl_ab = np.vstack(ctrl_ab)
ctrl_ko = np.vstack(ctrl_ko)

# Column-wise mean profile of each stacked matrix on a shared x axis.
positions = range(len(ctcf_ab[0]))
plt.figure(figsize=(10,10))
for stacked in (ctcf_ab, ctcf_ko, ctrl_ab, ctrl_ko):
    plt.plot(positions, np.nanmean(stacked,axis=0))
plt.show()

In [None]:
#CTCF enrichments
# Cluster-average heatmap. This plot and the cluster-fraction bar chart below
# were previously stored in `p1` / `p2`, which the single-site section then
# overwrote before either was rendered. They now have their own names and are
# printed explicitly.
avplots <- read.table('cluster_averages_CTCF_111021.txt')
avplots$ordered <- factor(avplots$V1, c(0,2,1,4,5,3))
p_cluster_avg <- ggplot(avplots, aes(x=V2, fill=V3, y=ordered)) +
  geom_raster() +
  scale_fill_distiller(palette = "BuPu", direction=1) +
  theme_bw()

# Fisher's test table restricted to rows with V2 <= 5; q-values are computed
# from column V6.
fish <- subset(read.table('CTCF_fishers_allmols.data'), V2 <= 5)
fishq <- qvalue(fish$V6, lambda=0)
fish$qval <- fishq$qvalues
fish$ordered <- factor(fish$V2, rev(c(0,2,1,4,5,3)))
p_cluster_frac <- ggplot(fish, aes(x=V1, y=V4, fill=as.factor(ordered))) +
  geom_bar(position = 'fill', stat='identity') +
  theme_bw() +
  scale_fill_brewer(palette = 'Reds') +
  theme(text = element_text(size=20)) +
  labs(x="Sample", y="Fraction of molecules in each cluster")

p_cluster_avg
p_cluster_frac

#single-site analysis
# atac_sig columns (as written by the Python cell): V1 = position offset,
# V2 = smoothed signal, V3 = sample label ('K' or 'WT').
atac_sig <- read.table('atac_sig_chr12_115605169.txt')
p0 <- ggplot(atac_sig) +
  theme_classic() +
  geom_bar(aes(x=V1+115605169, y=V2, fill=V3), stat='identity') +
  facet_grid(V3 ~ .) +
  scale_fill_discrete(type = c('#4575b4','#d73027'))

# heatmap_mols_singlesite.data columns (as written by the Python cell):
# V1 = molecule index within cluster, V2 = position offset, V3 = signal,
# V4 = cluster id, V5 = sample label, V6 = per-sample molecule counter.
mols <- subset(read.table('heatmap_mols_singlesite.data'), V4 <= 5)

# For each sample: a per-molecule signal heatmap plus a one-column strip
# coloured by cluster, both using the per-sample counter V6 as the y axis.
submol1 <- subset(mols, V5 == 'SNF2hKO')
submol1$ordered <- factor(submol1$V4, rev(c(0,2,1,4,5,3)))
p1 <- ggplot(submol1) +
  theme_bw() +
  geom_raster(aes(x=V2+115605169, fill=as.integer(V3), y=V6)) +
  facet_grid(V5 ~ .) +
  scale_fill_distiller(palette = 'BuPu', direction=1) +
  theme(legend.position="none")
p2 <- ggplot(submol1) +
  theme_bw() +
  geom_raster(aes(x=as.factor(1), fill=ordered, y=V6)) +
  facet_grid(V5 ~ .) +
  scale_fill_brewer(palette = 'Reds') +
  theme(legend.position="none")

submol2 <- subset(mols, V5 == 'SNF2hWTAB')
submol2$ordered <- factor(submol2$V4, rev(c(0,2,1,4,5,3)))
p3 <- ggplot(submol2) +
  theme_bw() +
  geom_raster(aes(x=V2+115605169, fill=as.integer(V3), y=V6)) +
  facet_grid(V5 ~ .) +
  scale_fill_distiller(palette = 'BuPu', direction=1) +
  theme(legend.position="none")
p4 <- ggplot(submol2) +
  theme_bw() +
  geom_raster(aes(x=as.factor(1), fill=ordered, y=V6)) +
  facet_grid(V5 ~ .) +
  scale_fill_brewer(palette = 'Reds') +
  theme(legend.position="none")

# NOTE(review): p1 and p2 are stacked vertically here while p3 and p4 sit side
# by side; `p0 / (p1 | p2) / (p3 | p4)` may have been intended -- confirm.
(p0 / p1 / p2) / (p3 | p4)