## Process new SAMOSA samples

This code takes in CSV files from the Ramani Lab google drive pacbio run set ups and outputs scripts to do the read processing on Wynton, as well as a fasta file of the barcodes and a sample reference CSV.

Instructions:
1. Find the sheet for the run you want to process in the google drive, under Ramani Lab > instruments > Pacbio > runs > Run Set Ups
2. Download the first and last tabs (*Samples* and *Demux FASTA files*) as CSV files
3. Transfer these CSV files to the UCSF server where you are running this notebook.
4. Run this notebook. Adjust the hardcoded name checks as needed to include your cell and use the correct reference.
5. If not running on wynton, transfer the output (scripts, barcodes.fasta, and samplereference.csv) to Wynton
6. qsub the runLima.sh script for your cell
7. If it's an amplicon reference, you will then need to qsub the runExtractAll.sh script after that script and daughter scripts are finished running


In [4]:
import pandas as pd
import os
import socket
import csv
import numpy as np

# Choose the working directory and the data locations from the name of the machine this
# notebook is running on. Every check is an independent substring test on the hostname.
_hostName = socket.gethostname()

if 'biochem1' in _hostName:
    # pipeline is the folder I used to contain the input and output from this notebook. You could make your
    # own pipeline folder and use it in a similar way
    os.chdir('/avicenna/cmcnally/pipeline')

    dataPBase, rawPBase, figPBase = (
        '/avicenna/vramani/analyses/pacbio/',
        '/avicenna/vramani/data/pacbio/',
        '/avicenna/cmcnally/pbanalysis/',
    )
if 'titan' in _hostName:
    # only the results location is set on this host
    dataPBase = '/data/users/goodarzilab/colin/results/pacbio/'
if 'wynton' in _hostName:
    dataPBase, rawPBase = (
        '/wynton/group/goodarzilab/ramanilab/results/pacbio/',
        '/wynton/group/goodarzilab/ramanilab/data/pacbio/',
    )
if 'rumi' in _hostName:
    raise Exception('no pacbio results folder on rumi')

In [None]:
# Generate the processing scripts, barcode fasta and sample reference CSV for every run sheet.
#
# Inputs (relative to the current working directory):
#   gdoc_sheets/*Samples.csv and the matching "Demux FASTA files" sheet for the same run
# Outputs (under pipelineDir):
#   barcodes/<cell>.barcodes.fasta
#   scripts/<cell>/runLima.sh, scripts/<cell>/process_<sample>.sh, scripts/<cell>/runExtractAll.sh
#   sampleReferences/<cell>.sampleReference.wynton.csv

csvfs = os.listdir('gdoc_sheets') # looks for files in the gdoc_sheets directory within pipeline
sampfs = [cv for cv in csvfs if 'Samples.csv' in cv] # Takes all the Samples csv files

# Local folder that receives all generated output
pipelineDir = '/avicenna/cmcnally/pipeline'
barcodeDir = os.path.join(pipelineDir, 'barcodes')
scriptsDir = os.path.join(pipelineDir, 'scripts')
sampleRefDir = os.path.join(pipelineDir, 'sampleReferences')

# Uses the assumption that the order of the cells in the csv is the same as the order they were sequenced on
# the machine. This is usually true but be careful of possible exceptions to this assumption
cellIfolder = {1:'1_B03', 2:'2_B05', 3:'3_B07', 4:'4_B09'}

# If any of these substrings are in the cell name, assume it is SAMOSA and make output for it
samosaTags = ('SAM', '_NA_', 'BrdU', 'dUTP')

# Replacements applied, in this order, to clean up sample names. These names will be used as file names and
# some symbols cause issues with the Pacbio software or unix in general
sampleNameReplacements = (
    (' ', '_'),
    ('+m', 'plusM'),
    ('+M', 'plusM'),
    ('-m', 'minusM'),
    ('-M', 'minusM'),
    ('+ATP', 'plusATP'),
    ('+ADP', 'plusADP'),
    (':', 'to'),
    ('.', '_'), #dislike this one, better to avoid in first place
)

# Paths written into the sample reference are for the target host, not the host running this notebook
targethost = 'wynton'
if targethost == 'wynton':
    pathbase = '/wynton/group/goodarzilab/ramanilab/results/pacbio'


def _cleanSampleName(headerLine):
    """Return a file-name-safe sample name built from one '>' header line of the barcode fasta text."""
    sampn = headerLine.replace('>', '')
    for old, new in sampleNameReplacements:
        sampn = sampn.replace(old, new)
    # A trailing 1 or 2 is rewritten as _rep1 / _rep2 unless the name already mentions 'rep'.
    # The emptiness guard avoids an IndexError on a header line that holds only '>'.
    if sampn and sampn[-1] in ('1', '2') and 'rep' not in sampn:
        sampn = sampn[:-1] + '_rep' + sampn[-1]
    return sampn.rstrip(' _')


def _parseBarcodeFasta(fastaText):
    """Split the fasta text of one cell into (sampleNames, barcodes).

    Lines containing '>' are sample names, every other non-empty line is a barcode.
    Raises Exception when the two lists end up with different lengths.
    """
    names = []
    seqs = []
    for line in fastaText.split('\n'):
        if '>' in line:
            names.append(_cleanSampleName(line))
        elif line != '':  # value comparison; 'is not' on a str literal tests identity
            seqs.append(line)
    if len(names) != len(seqs):
        raise Exception('sample names and barcodes not matching up')
    return names, seqs


# Goes through all of the Samples csv files and generates output scripts, fasta, and samplereference for them
# This could be modified to only generate the output for a single new input
for sampf in sampfs:
    cellNames = []
    cellInd = {}
    cellI = 1
    runName = ''  # reset for every sheet so a value from a previous sheet can never be reused
    with open('gdoc_sheets/{0}'.format(sampf)) as csvfile:
        inSamples = False
        for row in csv.reader(csvfile, delimiter=','):
            if len(row) < 2:
                # blank or single-column rows carry nothing this loop reads
                continue

            # The script looks for a header field called Directory Name and takes the entry as the folder name
            # of the sequencing output. This will look something like r64182_20211123_032844
            # This field may not be present in the google doc. If it isn't, you need to look up the folder name
            # and add it to the google doc before you download the Samples sheet
            if row[0] in ('Directory Name', 'Directory Name:'):
                runName = row[1]
            if inSamples:
                if len(row[1]) > 3:
                    # Keep track of how many cells have been seen
                    cellInd[row[1]] = cellI
                    cellI += 1

                if any(tag in row[1] for tag in samosaTags):
                    cellNames.append(row[1])
            # Checked last so that the header row itself is not counted as a cell
            if row[1] in ('Pool / Run Name', 'Pool / Cell Name'):
                inSamples = True

    if not cellNames:
        # nothing to generate for this sheet
        print('No SAMOSA cells found in %s' % (sampf))
        continue

    # fix for one cell that didn't have the directory name field; applied before the check below
    if cellNames[0] == '210421_NA_SAMv2_mESCs':
        runName = 'r64182_20210421_233732'

    if runName == '':
        raise Exception('No directory for the files')

    cellSamples = {}
    # Now open the other sheet and from it take the sample names and barcodes
    with open('gdoc_sheets/{0}'.format(sampf.replace('Samples', 'Demux FASTA files'))) as csvfile:
        for row in csv.reader(csvfile, delimiter=','):
            if len(row) >= 2 and row[0] in cellNames:
                cellSamples[row[0]] = row[1]

    for cell in cellNames:
        if cell not in cellSamples:
            raise KeyError('no barcode entry for cell {0} in the Demux FASTA files sheet'.format(cell))

        # get sample names and barcodes, clean up sample names
        sampleNames, barcodes = _parseBarcodeFasta(cellSamples[cell])

        cellScriptDir = os.path.join(scriptsDir, cell)
        for outDir in (barcodeDir, cellScriptDir, sampleRefDir):
            os.makedirs(outDir, exist_ok=True)

        # write out barcodes to a fasta
        with open(os.path.join(barcodeDir, '{0}.barcodes.fasta'.format(cell)), 'w') as fout:
            for sampn, barcode in zip(sampleNames, barcodes):
                fout.write('>' + sampn + '\n')
                fout.write(barcode + '\n')

        # Folder of this cell inside the run directory, looked up before the script file is opened
        cellFolder = cellIfolder[cellInd[cell]]

        # write out the runLima.sh script. This will make the output folder (OUTDIR), run lima, and then start running
        # the individual scripts to process each sample separately after lima completes
        with open(os.path.join(cellScriptDir, 'runLima.sh'), 'w') as fout:
            fout.write('#!/bin/env bash\n#\n#$ -pe smp 24\n#$ -l h_rt=72:00:00\n\n\n')
            fout.write('INFILE=/wynton/group/goodarzilab/ramanilab/data/pacbio/{0}/{1}/m64182_*.subreads.bam\n'.format(runName, cellFolder))
            fout.write('OUTDIR=/wynton/group/goodarzilab/ramanilab/results/pacbio/{0}\n\n'.format(cell))
            fout.write('mkdir -p ${OUTDIR}/{aligned,ccs,processed}\n')
            fout.write('cd ${OUTDIR}\n\n')

            fout.write('cp /wynton/home/goodarzi/cpmcnally/pipeline/barcodes/{0}.barcodes.fasta ${{OUTDIR}}\n\n'.format(cell))
            fout.write('/wynton/home/goodarzi/cpmcnally/bin/smrtlink/smrtcmds/bin/lima --same --split-bam-named ')
            fout.write('-j ${NSLOTS:-1} ${INFILE} ${OUTDIR}/')
            fout.write('{0}.barcodes.fasta ${{OUTDIR}}/{0}.split.bam\n\n'.format(cell))
            for samp in sampleNames:
                fout.write('qsub /wynton/home/goodarzi/cpmcnally/pipeline/scripts/{0}/process_{1}.sh\n'.format(cell, samp))

        # prepare for sample reference
        refd = {'cell':[], 'sampleName':[], 'unalignedSubreadsFile':[], 'ccsFile':[], 'alignedSubreadsFile':[],
                'alignedCcsFile':[], 'reference':[], 'processed':[]}

        # write extractIPD script for the cell. Add a line for each sample within the below loop
        with open(os.path.join(cellScriptDir, 'runExtractAll.sh'), 'w') as extractFile:
            extractFile.write('#!/usr/bin/env bash\n\n')

            # Write script for each sample, to make ccs and align subreads
            for isamp, samp in enumerate(sampleNames):
                genomicRef = False
                ampliconRef = False
                # reset per sample so a reference chosen for an earlier sample is never reused
                refmmi = None
                reffasta = None

                # Figure out whether this sample is genomic or amplicon, and what the correct reference is
                # Obviously this needs to be manually adjusted to make sure it correctly identifies your cells
                if 'mESC' in samp or 'E14' in samp or '211122_NA_Snf2h' in cell:
                    genomicRef = True
                    refmmi = '/wynton/group/goodarzilab/ramanilab/genomes/mm10/mm10.mmi'
                if 'K562' in samp:
                    genomicRef = True
                    refmmi = '/wynton/group/goodarzilab/ramanilab/genomes/hg38/hg38.mmi'

                if '_observed' in samp or 'array' in samp or 'CTCF' in samp:
                    ampliconRef = True
                    if samp[0:5].lower() == 'depen':
                        reffasta = 'snf2h_dependent_site_observed.fasta'
                    elif samp[0:8] == 'CTCF_Dep':
                        reffasta = 'snf2h_dependent_site_observed.fasta'
                    elif samp[0:5].lower() == 'indep':
                        reffasta = 'snf2h_independent_site_observed.fasta'
                    elif samp[0:8] == 'CTCF_Ind':
                        reffasta = 'snf2h_independent_site_observed.fasta'
                    else:
                        print('Ref unknown')
                        print(samp)

                if 'MO_dUTP_PCR' in cell:
                    ampliconRef = True
                    reffasta = 'snf2h_independent_site_observed.fasta'

                # An amplicon sample without a resolved fasta is as unusable as an unidentified sample
                if not genomicRef and (not ampliconRef or reffasta is None):
                    print('Reference unknown: %s' % (samp))
                    # don't write output if the cell doesn't get identified properly
                    continue

                # NOTE(review): isamp is the position in sampleNames, while the sample reference csv is indexed
                # by row. The two differ once a sample has been skipped above. Confirm what the extract
                # scripts expect as their second argument.
                with open(os.path.join(cellScriptDir, 'process_{0}.sh'.format(samp)), 'w') as fout:
                    if len(sampleNames) >= 4:
                        fout.write('#!/bin/env bash\n#\n#$ -pe smp 8\n#$ -l h_rt=48:00:00\n')
                    else: # if small number of samples, request more computational resources to process each one
                        fout.write('#!/bin/env bash\n#\n#$ -pe smp 16\n#$ -l h_rt=72:00:00\n')
                    fout.write('#$ -l hostname="!qb3-ad*"\n\n\n') # request to not use nodes that don't support SSE4.1, and thus pbmm2 won't work on
                    fout.write('cd /wynton/group/goodarzilab/ramanilab/results/pacbio/{0}\n\n'.format(cell))

                    fout.write('/wynton/home/goodarzi/cpmcnally/bin/smrtlink/smrtcmds/bin/ccs -j ${NSLOTS:-1} ')
                    fout.write('{0}.split.{1}--{1}.bam ccs/{0}.split.{1}.ccs.bam\n\n'.format(cell,samp))
                    fout.write('/wynton/home/goodarzi/cpmcnally/bin/smrtlink/smrtcmds/bin/pbmm2 align ')
                    if genomicRef:
                        fout.write('{2} ccs/{0}.split.{1}.ccs.bam aligned/{0}.split.{1}.ccs.aligned.sorted.bam '.format(cell,samp,refmmi))
                        fout.write('--sort --preset CCS -j ${NSLOTS:-1}\n\n')
                        fout.write('/wynton/home/goodarzi/cpmcnally/bin/smrtlink/smrtcmds/bin/pbindex aligned/{0}.split.{1}.ccs.aligned.sorted.bam\n'.format(cell,samp))
                        fout.write('\nqsub /wynton/home/goodarzi/cpmcnally/pipeline/scripts/runExtractGenomicLinear.sh {0} {1}\n'.format(cell, isamp))

                        extractFile.write('qsub /wynton/home/goodarzi/cpmcnally/pipeline/scripts/runExtractGenomicLinear.sh {0} {1}\n'.format(cell, isamp))
                    elif ampliconRef:
                        fout.write('%s/pbrun10_CTCFpool_2/%s ' % (pathbase,reffasta))
                        fout.write('{0}.split.{1}--{1}.bam aligned/{0}.split.{1}.subreads.aligned.bam '.format(cell,samp))
                        fout.write('-j ${NSLOTS:-1}\n\n')
                        fout.write('/wynton/home/goodarzi/cpmcnally/bin/smrtlink/smrtcmds/bin/pbindex aligned/{0}.split.{1}.subreads.aligned.bam\n'.format(cell,samp))

                        extractFile.write('qsub /wynton/home/goodarzi/cpmcnally/pipeline/scripts/runExtractAmplicon.sh {0} {1}\n'.format(cell, isamp))

                # also write the sample reference entry for this sample
                refd['cell'].append(cell)
                refd['sampleName'].append(samp)
                refd['unalignedSubreadsFile'].append('{0}/{1}/{1}.split.{2}--{2}.bam'.format(pathbase, cell, samp))
                refd['ccsFile'].append('%s/%s/ccs/%s.split.%s.ccs.bam' % (pathbase,cell, cell, samp))
                if genomicRef:
                    refd['alignedSubreadsFile'].append('')
                    refd['alignedCcsFile'].append('{0}/{1}/aligned/{1}.split.{2}.ccs.aligned.sorted.bam'.format(pathbase, cell, samp))
                    refd['reference'].append(refmmi)
                elif ampliconRef:
                    refd['alignedSubreadsFile'].append('%s/%s/aligned/%s.split.%s.subreads.aligned.bam' %
                                                       (pathbase, cell, cell, samp))
                    refd['alignedCcsFile'].append('')
                    refd['reference'].append('%s/pbrun10_CTCFpool_2/%s' % (pathbase,reffasta))
                refd['processed'].append('%s/%s/processed' % (pathbase,cell))

        # compile and write out sample reference csv
        sref = pd.DataFrame(refd)
        sref.to_csv(os.path.join(sampleRefDir, '%s.sampleReference.wynton.csv' % (cell)), index_label='index')
        

In [12]:
# This cell writes out shell wrappers for the extractIPD scripts. This only needs to be run once as these
# are generic shell scripts. The first one wraps extractIPDlinear.py for genomic input, the second one
# wraps extractIPDfull3.py for amplicon input. Both take the cell name as $1 and a sample index as $2.

# conda initialisation block shared by both wrappers, one shell line per entry
_condaInit = '\n'.join([
    '# >>> conda initialize >>>',
    '# !! Contents within this block are managed by conda init !!',
    '__conda_setup="$(/wynton/home/goodarzi/cpmcnally/bin/miniconda3/bin/conda shell.bash hook 2> /dev/null)"',
    'if [ $? -eq 0 ]; then',
    '\teval "$__conda_setup"',
    'else',
    '\tif [ -f "/wynton/home/goodarzi/cpmcnally/bin/miniconda3/etc/profile.d/conda.sh" ]; then',
    '\t\t. "/wynton/home/goodarzi/cpmcnally/bin/miniconda3/etc/profile.d/conda.sh"',
    '\telse',
    '\t\texport PATH="/wynton/home/goodarzi/cpmcnally/bin/miniconda3/bin:$PATH"',
    '\tfi',
    'fi',
    'unset __conda_setup',
    '# <<< conda initialize <<<',
]) + '\n\n'

# move into the results folder of the cell and switch to the processing environment
_enterEnv = ('cd /wynton/group/goodarzilab/ramanilab/results/pacbio/$1\n'
             'conda activate procSAMOSA\n\n')

# trailing arguments common to both python calls: the sample reference csv and the sample index
_sampleRefArgs = '/wynton/home/goodarzi/cpmcnally/pipeline/sampleReferences/$1.sampleReference.wynton.csv $2\n'

# (file name, scheduler header, python call up to and including the -o output folder)
_wrapperSpecs = [
    # generic extractIPD script for genomic input
    ('runExtractGenomicLinear.sh',
     '#!/usr/bin/env bash\n#\n#$ -pe smp 10\n#$ -l mem_free=1G\n#$ -l h_rt=96:00:00\n#$ -l hostname="!qb3-ad*"\n\n\n',
     'python /wynton/home/goodarzi/cpmcnally/code/scripts/extractIPDlinear.py -j 8 -o /wynton/group/goodarzilab/ramanilab/results/pacbio/$1 '),
    # generic extractIPD script for amplicon input
    ('runExtractAmplicon.sh',
     '#!/usr/bin/env bash\n#\n#$ -pe smp 1\n#$ -l mem_free=12G\n#$ -l h_rt=48:00:00\n\n\n',
     'python /wynton/home/goodarzi/cpmcnally/code/scripts/extractIPDfull3.py -o /wynton/group/goodarzilab/ramanilab/results/pacbio/$1 '),
]

for _wrapperName, _header, _pythonCall in _wrapperSpecs:
    with open('/avicenna/cmcnally/pipeline/scripts/' + _wrapperName, 'w') as fout:
        fout.write(_header)
        fout.write(_condaInit)
        fout.write(_enterEnv)
        fout.write(_pythonCall + _sampleRefArgs)
    