# Snf2h paper analysis and figures

21/09/16

Code for generating supplemental figures and statistics for the Snf2h SAMOSA 2 paper

## Average accessibility in E14
21/09/16

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
from tqdm import tqdm
import pickle
import os
import socket
from Bio import Seq, SeqIO


# Pick the root folder of the PacBio results from the machine's hostname.
# NOTE(review): the later "Cutoffs for nucleosome counting" setup cell keys the
# '/data/users/goodarzilab/...' path on 'assembler4' instead of 'titan' -
# confirm which hostname is current.
hostname = socket.gethostname()

if 'rumi' in hostname:
    raise Exception('no pacbio results folder on rumi')

hostDataRoots = {
    'biochem1': '/avicenna/vramani/analyses/pacbio/',
    'titan': '/data/users/goodarzilab/colin/results/pacbio/',
    'wynton': '/wynton/group/goodarzilab/ramanilab/results/pacbio/',
}
matchedRoots = [root for key, root in hostDataRoots.items() if key in hostname]

# Without this check an unrecognised host leaves dataPBase undefined and the
# notebook only fails further down with an unhelpful NameError.
if not matchedRoots:
    raise RuntimeError('no pacbio results folder configured for host ' + repr(hostname))

# If several keys match, the last one wins (same as a chain of plain `if`s).
dataPBase = matchedRoots[-1]

# The figure output folder is only configured on biochem1.
if 'biochem1' in hostname:
    figPBase = '/avicenna/cmcnally/pbanalysis/'


# Sample reference table; the integer sample numbers used throughout this
# section are index labels of this table.
sampleRef = pd.read_csv(dataPBase + 'sampleRef_K562_mESC.csv', sep=',')

In [29]:
# sampleRef index labels of the E14 samples, grouped by treatment. The labels
# in the trailing comments are the ones sampShortN assigns to these samples.
negSamples, chrSamples, posSamples = (
    [12, 13],   # 'E14_chromatin'
    [14, 15],   # 'E14_chromatin_methylated'
    [28],       # 'E14_gDNA_methylated'
)

In [None]:
# Per-sample average of the HMM output at each of the first 1000 positions,
# keyed by sampleRef index label.
accessFdic = {}

for samp in tqdm([12, 13, 14, 15, 28], position=0):

    hmmPath = '{0}{1}/processed/binarized/{1}_{2}_NNsingle_HMM.pickle'.format(
        dataPBase, sampleRef['cell'][samp], sampleRef['sampleName'][samp])
    with open(hmmPath, 'rb') as fin:
        hmmRes = pickle.load(fin)

    zmws = list(hmmRes.keys())

    # Running sum of values and number of contributing molecules per position.
    accessCount = np.zeros(1000, dtype='float')
    contribCount = np.zeros(1000, dtype='int')

    # Only the first 24000 molecules of each sample are used.
    for zmw in zmws[:24000]:
        trace = hmmRes[zmw]
        # Positions with a finite value, restricted to the first 1000 bases.
        finitePos = np.nonzero(np.isfinite(trace))[0]
        finitePos = finitePos[finitePos < 1000]
        accessCount[finitePos] += trace[finitePos]
        contribCount[finitePos] += 1

    # Positions that no molecule covered come out as NaN (0 / 0).
    accessFdic[samp] = accessCount / contribCount

In [26]:
# Long-format table (one row per sample and position) for plotting in R.
combdic = {'sample':[], 'rep':[], 'position':[], 'fraction':[]}
sampShortN = {12:'E14_chromatin', 13:'E14_chromatin', 14:'E14_chromatin_methylated',
              15:'E14_chromatin_methylated', 28:'E14_gDNA_methylated'}
sampRep = {12:'rep1', 13:'rep2', 14:'rep1', 15:'rep2', 28:'rep1'}

for samp in [12,13,14,15,28]:
    fractions = accessFdic[samp]
    # Skip positions whose average is not finite.
    keptPos = [pos for pos in range(1000) if np.isfinite(fractions[pos])]
    nKept = len(keptPos)

    combdic['sample'].extend([sampShortN[samp]] * nKept)
    combdic['rep'].extend([sampRep[samp]] * nKept)
    combdic['position'].extend(keptPos)
    combdic['fraction'].extend(fractions[pos] for pos in keptPos)

combdf = pd.DataFrame(combdic)

outPath = '{0}{1}/processed/e14AccessFig.csv'.format(dataPBase, sampleRef['cell'][12])
combdf.to_csv(outPath, index=False)

In [18]:
# Load the HMM results and inaccessible-region table for one sample (13).
samp = 13

cellName = sampleRef['cell'][samp]
sampName = sampleRef['sampleName'][samp]

hmmPath = '{0}{1}/processed/binarized/{1}_{2}_NNsingle_HMM.pickle'.format(dataPBase, cellName, sampName)
with open(hmmPath, 'rb') as fin:
    hmmRes = pickle.load(fin)

# Same subset of molecules as the accessibility average: the first 24000 keys.
zmws = list(hmmRes.keys())
usezmws = zmws[:24000]

inacPath = dataPBase + '{0}/processed/inaccessibleRegions/{0}_{1}_inacRegions.csv'.format(cellName, sampName)
inac = pd.read_csv(inacPath, index_col=0)

In [None]:
# Lengths of the inaccessible regions found on the first 24000 molecules of
# each sample, keyed by sampleRef index label.
inacWd = {}
for samp in tqdm([12, 13, 14, 15, 28], position=0):

    cellName = sampleRef['cell'][samp]
    sampName = sampleRef['sampleName'][samp]

    # The HMM pickle is opened only to obtain the molecule ids in stored order.
    hmmPath = '{0}{1}/processed/binarized/{1}_{2}_NNsingle_HMM.pickle'.format(dataPBase, cellName, sampName)
    with open(hmmPath, 'rb') as fin:
        hmmRes = pickle.load(fin)

    zmws = list(hmmRes.keys())
    usezmws = zmws[:24000]

    inacPath = dataPBase + '{0}/processed/inaccessibleRegions/{0}_{1}_inacRegions.csv'.format(cellName, sampName)
    inac = pd.read_csv(inacPath, index_col=0)

    onUsedZmw = np.isin(inac['zmw'], usezmws)
    inacWd[samp] = inac.loc[onUsedZmw, 'length'].to_numpy()

In [29]:
# Long-format table (one row per inaccessible region) for plotting in R.
combdic = {'sample':[], 'rep':[], 'length':[]}
sampShortN = {12:'E14_chromatin', 13:'E14_chromatin', 14:'E14_chromatin_methylated',
              15:'E14_chromatin_methylated', 28:'E14_gDNA_methylated'}
sampRep = {12:'rep1', 13:'rep2', 14:'rep1', 15:'rep2', 28:'rep1'}

for samp in [12,13,14,15,28]:
    lengths = inacWd[samp]
    nRegions = len(lengths)

    combdic['sample'].extend([sampShortN[samp]] * nRegions)
    combdic['rep'].extend([sampRep[samp]] * nRegions)
    combdic['length'].extend(lengths)

combdf = pd.DataFrame(combdic)

outPath = '{0}{1}/processed/e14FootprintsFig.csv'.format(dataPBase, sampleRef['cell'][12])
combdf.to_csv(outPath, index=False)

### R code to generate figures from these CSV inputs (run in R, not in the Python kernel)

In [None]:
library(ggplot2)
library(patchwork)
library(extrafont)

# Local copy of the analysis folder; the two CSVs are the ones written by the
# Python cells of this section. Edit this one path to run elsewhere.
dataDir <- "C:/Users/Colin/OneDrive/Ramani Lab/Data/pbanalysis"
figDir <- file.path(dataDir, "Figures", "SAMOSA2_paper")

theme_update(text = element_text(family="Arial", size=10))

# theme_bw() is a complete theme: adding it to a plot replaces the session
# default modified by theme_update() above, so the Arial / size-10 text setting
# has to be re-applied after theme_bw() for it to reach the figure.
arialText <- theme(text = element_text(family="Arial", size=10))

accessibility <- read.csv(file=file.path(dataDir, "processed", "e14AccessFig.csv"))


# Panel 1: average accessibility along the molecule, one line per sample/replicate.
p1 <- ggplot(accessibility, aes(x=position, y=fraction, color=sample, linetype=rep)) + 
  geom_line() +
  labs(x="Distance from 5' end of molecule",
       y="Fraction of molecules that are accessible",
       title='Average accessibility',
       color='Sample',
       linetype='Replicate') +
  theme_bw() +
  arialText


footprints <- read.csv(file=file.path(dataDir, "processed", "e14FootprintsFig.csv"))

# Panel 2: distribution of footprint lengths, truncated at 1000.
p2 <- ggplot(footprints, aes(length, color=sample, linetype=rep)) +
  geom_freqpoly(bins=200) +
  xlim(0,1000) +
  labs(x='Footprint length',
       y='Number of footprints',
       title='Footprint size distribution',
       color='Sample',
       linetype='Replicate') +
  theme_bw() +
  arialText


# Side-by-side panels sharing a single legend placed underneath.
combined <- ((p1 + p2) & theme(legend.position = "bottom")) + plot_layout(guides="collect")

#combined 

ggsave(file.path(figDir, 'E14.png'), combined, width=8, height=4, units='in', dpi=300)
ggsave(file.path(figDir, 'E14.pdf'), combined, width=8, height=4, units='in', dpi=300)

#dev.off()

## Cutoffs for nucleosome counting
21/10/12

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib
from tqdm import tqdm
import pickle
import os
import socket
from Bio import Seq, SeqIO


# Pick the root folder of the PacBio results from the machine's hostname.
# NOTE(review): the first setup cell of this notebook keys the
# '/data/users/goodarzilab/...' path on 'titan' instead of 'assembler4' -
# confirm which hostname is current.
hostname = socket.gethostname()

if 'rumi' in hostname:
    raise Exception('no pacbio results folder on rumi')

hostDataRoots = {
    'biochem1': '/avicenna/vramani/analyses/pacbio/',
    'assembler4': '/data/users/goodarzilab/colin/results/pacbio/',
    'wynton': '/wynton/group/goodarzilab/ramanilab/results/pacbio/',
}
matchedRoots = [root for key, root in hostDataRoots.items() if key in hostname]

# Without this check an unrecognised host leaves dataPBase undefined and the
# cell only fails further down with an unhelpful NameError.
if not matchedRoots:
    raise RuntimeError('no pacbio results folder configured for host ' + repr(hostname))

# If several keys match, the last one wins (same as a chain of plain `if`s).
dataPBase = matchedRoots[-1]

# The figure output folder is only configured on biochem1.
if 'biochem1' in hostname:
    figPBase = '/avicenna/cmcnally/pbanalysis/'
    
    
# CTCF sites
def _readSingleFasta(path):
    """Return the sequence of the only record in the FASTA file at `path`.

    Raises ValueError if the file holds no record or more than one record.
    (The original guard raised the undefined name `InputError`, which surfaced
    as a NameError, and an empty file silently left the sequence unset.)
    """
    records = list(SeqIO.parse(path, 'fasta'))
    if len(records) > 1:
        raise ValueError('Reference fasta has multiple entries')
    if not records:
        raise ValueError('Reference fasta has no entries: ' + path)
    return records[0].seq


# load in reference sequences for both the independent and dependent CTCF sites
refFile = dataPBase + 'pbrun10_CTCFpool_2/snf2h_independent_site_observed.fasta'
irefseq = _readSingleFasta(refFile) # reference sequence for independent CTCF site
refFile = dataPBase + 'pbrun10_CTCFpool_2/snf2h_dependent_site_observed.fasta'
drefseq = _readSingleFasta(refFile) # reference sequence for dependent CTCF site

# Load in the sample reference tables for the CTCF site samples and stack them
# in this order under a fresh 0..n-1 index.
sampleRefFiles = [
    'pbrun10_CTCFpool_2/pbrun10_CTCFpool_2.sampleReference.csv',
    'pbrun10_CTCFpool_1/pbrun10_CTCFpool_1.sampleReference.csv',
    '210516_NA_SNF2hCTCFarray_ST_rep2/210516_NA_SNF2hCTCFarray_ST_rep2.sampleReference.wynton.csv',
    '210520_NA_SNF2hCTCFarray_MT_rep1/210520_NA_SNF2hCTCFarray_MT_rep1.sampleReference.wynton.csv',
    '210608_NA_SNF2hCTCFarray_MT_rep2/210608_NA_SNF2hCTCFarray_MT_rep2.sampleReference.wynton.csv',
]
sampleRef = pd.concat([pd.read_csv(dataPBase + relPath) for relPath in sampleRefFiles],
                      ignore_index=True)

# Drop the 'index' column carried over from the input tables.
sampleRef = sampleRef.drop(columns='index')

# Row positions of the samples whose name marks them as the independent or the
# dependent CTCF site.
indepSamples = np.nonzero([name.startswith(('Indep', 'CTCF_Ind')) for name in sampleRef['sampleName']])[0]
depSamples = np.nonzero([name.startswith(('Depen', 'CTCF_Dep')) for name in sampleRef['sampleName']])[0]

def _loadInacRegions(samples, ref):
    """Read and stack the inaccessible-region tables of several samples.

    samples : iterable of index labels into `ref`
    ref     : sample reference table providing the 'cell' and 'sampleName' columns

    Returns one DataFrame holding every sample's table plus two added columns:
    'mid' (midpoint between 'start' and 'end') and 'sample' (the sample label).
    An empty `samples` gives an empty DataFrame.
    """
    frames = []
    for samp in tqdm(samples, position=0):
        regionFile = dataPBase + '{0}/processed/inaccessibleRegions/{0}_{1}_inacRegions.csv'.format(ref['cell'][samp],
                                                                                                    ref['sampleName'][samp])
        regiondf = pd.read_csv(regionFile, index_col=0)

        regiondf['mid'] = regiondf['start'] + (regiondf['end'] - regiondf['start']) / 2
        regiondf['sample'] = samp

        frames.append(regiondf)

    # Concatenate once at the end: growing a DataFrame with pd.concat inside
    # the loop re-copies all previously loaded rows on every iteration.
    return pd.concat(frames) if frames else pd.DataFrame()


# The first two samples of each group are skipped.
# NOTE(review): the reason for dropping them is not recorded here - confirm.
regionAllInd = _loadInacRegions(indepSamples[2:], sampleRef)
regionAllDep = _loadInacRegions(depSamples[2:], sampleRef)

# Kept for backward compatibility: the original cell left `regionAll` bound to
# the dependent-site table.
regionAll = regionAllDep

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:20<00:00,  2.13it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████| 44/44 [00:15<00:00,  2.93it/s]


In [2]:
# NOTE: this rebinds `sampleRef` from the CTCF-array table to the K562/mESC
# table; the sample numbers below are index labels of the latter.
sampleRef = pd.read_csv(dataPBase + 'sampleRef_K562_mESC.csv', sep=',')

mescFrames = []
for samp in tqdm([4,5,8,9,14,15,20,21,22,23,24,25,26,27,32,33,34,35,36,37,38,40,41,42,44,45,46,48], position=0):
    regionFile = dataPBase + '{0}/processed/inaccessibleRegions/{0}_{1}_inacRegions.csv'.format(sampleRef['cell'][samp],
                                                                                                sampleRef['sampleName'][samp])
    regiondf = pd.read_csv(regionFile, index_col=0)

    # Midpoint of each region and the sample it came from.
    regiondf['mid'] = regiondf['start'] + (regiondf['end'] - regiondf['start']) / 2
    regiondf['sample'] = samp

    mescFrames.append(regiondf)

# Concatenate once at the end: growing a DataFrame with pd.concat inside the
# loop re-copies all previously loaded rows on every iteration.
regionAll = pd.concat(mescFrames) if mescFrames else pd.DataFrame()

regionAllmESC = regionAll

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 28/28 [06:35<00:00, 14.12s/it]


In [None]:
matplotlib.rcParams['font.sans-serif'] = "Arial"
matplotlib.rcParams['font.family'] = "sans-serif"

# Inaccessible-region lengths per molecule type.
# (Despite its name this is a dict of Series, not a DataFrame.)
regiondf = {'mesc':regionAllmESC['length'],
            'ind':regionAllInd['length'],
            'dep':regionAllDep['length']}

# Length cutoffs drawn as dashed lines; the interval between consecutive
# cutoffs i and i+1 is labelled i+1 on the figure.
divs = {'mesc': [50, 200, 370, 500, 630, 800, 970, 1090, 1200],
        'ind': [70, 205, 340, 475, 610, 780, 950, 1120, 1300, 1500, 1650, 1770, 1980, 2170, 2350, 2520, 2660],
        'dep': [70, 205, 345, 470, 605, 740, 880, 1005, 1175, 1320, 1500, 1740, 1890, 2035, 2180, 2310, 2480, 2740, 2870, 3000]}

# Upper end of the length histogram for each molecule type
# (presumably the reference length in bp - TODO confirm).
refL = {'mesc': 3080,
        'ind': 2706,
        'dep': 3087}

# Panel titles.
names = {'mesc': 'mESC',
         'ind': 'S1',
         'dep': 'S2'}

smWidth = 21                      # full width of the smoothing window, in bins
smWidthH = int((smWidth-1) / 2)   # bins on each side of the window centre
nSmoothPasses = 3                 # the plotted curve is the third smoothing pass


def smooth1time(inp, halfWidth=smWidthH):
    """Apply one pass of a centred moving average to a 1-D float array.

    Every finite entry is replaced by the NaN-ignoring mean of the entries
    within `halfWidth` bins on either side (window clipped at the array ends).
    Non-finite entries are left as they are. Returns a new array; `inp` is not
    modified.
    """
    smoothn = inp.copy()
    nBins = len(inp)
    for windc in np.flatnonzero(np.isfinite(inp)):
        windlow = max(0, windc - halfWidth)
        # +1 because the slice end is exclusive; without it the window covers
        # only 2*halfWidth bins and sits half a bin left of windc.
        windhigh = min(nBins, windc + halfWidth + 1)
        smoothn[windc] = np.nanmean(inp[windlow:windhigh])
    return smoothn


fig, ax = plt.subplots(3,1, figsize=(7,7), sharex=True, sharey=True)

for imol, molecule in enumerate(['ind', 'dep', 'mesc']):

    # One bin per integer length 0 .. refL-1; binc holds the bin centres.
    hist, bine = np.histogram(regiondf[molecule], bins=np.arange(-0.5, refL[molecule] + 0.5))
    binc = bine[0:-1] + 0.5

    hist = hist.astype('float')

    # Blank out the run of empty bins before the first observed length so it
    # is neither smoothed nor plotted. Also safe when every bin is empty.
    nonEmpty = np.flatnonzero(hist)
    firstData = nonEmpty[0] if nonEmpty.size else len(hist)
    hist[:firstData] = np.nan

    # Repeated smoothing; smooths[k] is the result after k+1 passes.
    smooths = []
    smoothc = hist.copy()
    for i in range(nSmoothPasses):
        smoothc = smooth1time(smoothc)
        smooths.append(smoothc)

    usesm = smooths[nSmoothPasses - 1]

    # Bins that are still zero after smoothing become -inf; silence the warning.
    with np.errstate(divide='ignore'):
        logsm = np.log10(usesm)

    ax[imol].plot(binc, logsm)
    usedivs = divs[molecule]
    for p in usedivs:
        ax[imol].axvline(x=p, ls='--', color='limegreen')
    for i in range(len(usedivs)-1):
        midp = usedivs[i] + (usedivs[i+1] - usedivs[i])/2
        ax[imol].text(y=0, x=midp, s=str(i+1), ha='center')
    # print the zero
    midp = usedivs[0] - (usedivs[1] - usedivs[0])/2
    ax[imol].text(y=0, x=midp, s='0', ha='center')
    # print the max
    midp = usedivs[-1] + (usedivs[-1] - usedivs[-2])/4
    ax[imol].text(y=0, x=midp, s=str(len(usedivs))+'+', ha='left')

    ax[imol].set_title(names[molecule])
ax[2].set_xlabel('Inaccessible region length')
ax[2].set_ylabel('log$_{10}$( smoothed counts )')
plt.tight_layout()

# NOTE(review): absolute path that only exists on one machine; it matches the
# figPBase folder configured for biochem1 in the setup cell.
fname = '/avicenna/cmcnally/pbanalysis/Figures/SAMOSA2_paper/nucCutoffs'

plt.savefig(fname + '.png', dpi=300)
plt.savefig(fname + '.svg', dpi=300)