In [1]:
%%bash
# Create the output directory for this notebook's results.
# -p also creates missing parent directories and is a no-op if the directory already exists.
mkdir -p /data/reddylab/Revathy/collabs/Christian/grnas

In [2]:
# Read gRNAs
import pandas as pd
# Load the guide list as a one-column table named "gRNA" (the file has no
# header row, so the column name is supplied here) and normalise every
# sequence to upper case in the same expression.
grnas = pd.read_csv(
    "/data/reddylab/Revathy/collabs/Christian/Enhancer_CM_guides.txt",
    sep=',',
    names=["gRNA"],
).assign(gRNA=lambda table: table["gRNA"].str.upper())

In [3]:
# Preview the first rows of the guide table to check that the sequences loaded as expected.
grnas.head()

Unnamed: 0,gRNA
0,TGAGAGGTGATTGTACCATAA
1,ATGGATTAGTTTCCTCTGAGA
2,AGGAATGGATTAGTTTCCTCT
3,AAATTCATATTTCGAAACCCC
4,TTCCTCTATAAGTGCTATGGT


In [4]:
import numpy as np

In [5]:
import pandas as pd
from random import random
# Nucleotide order used for every frequency vector in this notebook.
# (The former module-level `global DNA_ALPHABET` statement was a no-op:
# names assigned at module level are already global.)
DNA_ALPHABET = ['A', 'T', 'C', 'G']


def extract_freq_from_np_uniq(a):
    """Return the relative frequency of each DNA_ALPHABET symbol in `a`.

    Parameters
    ----------
    a : array-like of str
        Symbols to tabulate (e.g. one alignment column). Anything accepted
        by ``np.asarray`` works, including plain Python lists.

    Returns
    -------
    list of float
        Four frequencies in DNA_ALPHABET order (A, T, C, G). Symbols that
        are not in DNA_ALPHABET still count toward the total, so the result
        sums to less than 1 when such symbols are present. An empty input
        yields all zeros.
    """
    a = np.asarray(a)
    freqs = dict.fromkeys(DNA_ALPHABET, 0.0)
    if a.size:
        kk, vv = np.unique(a, return_counts=True)
        # np.unique counts over every element, so divide by the total
        # element count to obtain proportions.
        freqs.update(zip(kk, vv / float(a.size)))
    return [freqs[k] for k in DNA_ALPHABET]

In [7]:
# Calculate the nucleotide frequency at each base
from scipy.stats import itemfreq
import numpy as np
from Bio.Seq import Seq
# Explode every guide into its individual characters: one row per guide,
# one column per position (np.vstack requires all guides to have equal length).
grna_char_matrix = np.vstack([list(guide) for guide in grnas["gRNA"].values])

# Tabulate nucleotide frequencies column by column. The helper returns four
# values per column, so `freqs` has one row per nucleotide (DNA_ALPHABET
# order) and one column per guide position.
freqs = np.apply_along_axis(extract_freq_from_np_uniq, 0, grna_char_matrix)


In [8]:
# Average each nucleotide's frequency over all guide positions.
# Rows of `freqs` follow DNA_ALPHABET order, so the four values are A, T, C, G.
freqs.mean(axis=1)

array([0.2787226 , 0.28155707, 0.21371882, 0.22600151])

In [9]:
def subsample_alphabet_given_freqs(freqs):
    """Draw a single symbol from DNA_ALPHABET with probabilities `freqs`.

    `freqs` must hold one probability per DNA_ALPHABET entry, in the same
    order. The draw uses NumPy's global random state.
    """
    nucleotide = np.random.choice(a=DNA_ALPHABET, p=freqs)
    return nucleotide

In [10]:
# Number of random sequences drawn per round by the generation loop below;
# that loop also keeps running until at least this many sequences have been accepted.
n_random_samples = 38

In [11]:
def outputTempBowtieFastq(libraryTable, outputFileName):
    """Write one FASTQ record per row of `libraryTable` for Bowtie alignment.

    Each read is 'CCN' followed by the reverse complement of the row's
    'sequence' value with its first base removed. The record name is the
    row's index label.

    Parameters
    ----------
    libraryTable : pandas.DataFrame
        Table with a 'sequence' column; index labels become read names.
    outputFileName : str
        Path of the FASTQ file to create (overwritten if it exists).
    """
    # weighting for how impactful mismatches are along sgRNA sequence
    phredString = 'I4!=======44444+++++++'
    with open(outputFileName, 'w') as outfile:
        for name, row in libraryTable.iterrows():
            read = 'CCN' + str(Seq(row['sequence'][1:]).reverse_complement())
            # FASTQ needs exactly one quality character per base, but the
            # template above covers 22 positions only. Fit it to the read:
            # truncate for shorter reads, and extend longer reads with the
            # template's final weight (the tail of the read is the far end
            # of the reverse-complemented guide, which already carries that
            # weight). Reads of length 22 get the template unchanged.
            quality = phredString[:len(read)].ljust(len(read), phredString[-1])
            # str() lets non-string index labels be used as read names.
            outfile.write('@' + str(name) + '\n')
            outfile.write(read + '\n')
            outfile.write('+\n')
            outfile.write(quality + '\n')


In [12]:
import subprocess 
import os  # used below to clear stale Bowtie output between rounds

# Temporary FASTQ handed to Bowtie; rewritten on every round.
fqFile = '/data/reddylab/Revathy/ref_data/temp.fastq'
# One tuple per Bowtie run: (-e threshold, -m limit, index prefix, run name).
alignmentList = [(31,1,'/data/reddylab/Reference_Data/Genomes/hg19/hg19','hg19_all_cosgrove')]

# Accumulates accepted sequences across rounds. Because a whole round is added
# at once, the final set can hold more than n_random_samples entries.
nt_final_set = set()
while len(nt_final_set)<n_random_samples:
    # Draw one candidate per requested sample: each position is sampled
    # independently from the per-position nucleotide frequencies in `freqs`.
    random_samples = [''.join(np.apply_along_axis(subsample_alphabet_given_freqs, 0,  freqs)) for ii in range(n_random_samples)]
    negTable = pd.DataFrame(random_samples,
                            index=['NT_%d' %i  for i in range(1, len(random_samples)+1)],
                            columns = ['sequence'])
    # Write to the same path Bowtie reads from (previously a duplicated literal).
    outputTempBowtieFastq(negTable, fqFile)

    # Must start empty every round: columns left over from earlier rounds
    # describe earlier candidates and would no longer match this negTable.
    alignmentColumns = []
    for btThreshold, mflag, bowtieIndex, runname in alignmentList:

        alignedFile = '/data/reddylab/Revathy/ref_data/' + runname + '_aligned.txt'
        unalignedFile = '/data/reddylab/Revathy/ref_data/' + runname + '_unaligned.fq'
        maxFile = '/data/reddylab/Revathy/ref_data/' + runname + '_max.fq'

        bowtieString = '/nfs/software/helmod/apps/Core/bowtie/1.1.1-fasrc01/bowtie -n 3 -l 5 -e ' + \
            str(btThreshold) + \
            ' -m ' + str(mflag) + \
            ' --nomaqround -a --tryhard -p 16 --chunkmbs 256 ' + \
            bowtieIndex + \
            ' --suppress 5,6,7 --un ' + \
            unalignedFile + \
            ' --max ' + maxFile + \
            ' -q ' + \
            fqFile + ' ' + \
            alignedFile

        # Remove any --un file from a previous round so that names read
        # below can only come from this run.
        if os.path.exists(unalignedFile):
            os.remove(unalignedFile)

        # check_call raises CalledProcessError on a non-zero exit status, so a
        # failed alignment stops the loop instead of being silently ignored.
        subprocess.check_call(bowtieString, shell=True)

        # Collect the read names found in the --un (unaligned) file.
        # NOTE: despite its name, `sgsAligning` holds the reads that did NOT
        # align; those are the candidates we keep.
        sgsAligning = set()
        if os.path.exists(unalignedFile):  # absent => treat as no unaligned reads
            with open(unalignedFile) as infile:
                for i, line in enumerate(infile):
                    if i%4 == 0: #id line
                        sgsAligning.add(line.strip()[1:])

        alignmentColumns.append(pd.Series(negTable.index.isin(sgsAligning), index=negTable.index))

    alignmentTable = pd.concat(alignmentColumns,axis=1, keys=list(zip(*alignmentList))[3])
    # Keep a candidate only if it was unaligned in every run. An explicit
    # per-row mask replaces indexing the frame with a 2-D boolean array.
    passesAllRuns = alignmentTable.all(axis=1)
    nt_final_set.update(negTable.loc[passesAllRuns, 'sequence'])


In [15]:
# Save the accepted sequences as a single-column CSV (header 'nt_sequence',
# no index column). Row order follows set iteration order.
nt_sequence_table = pd.DataFrame(list(nt_final_set), columns=['nt_sequence'])
nt_sequence_table.to_csv(
    '/data/reddylab/Revathy/collabs/Christian/grnas/Enhancer_CM_guides_nt_sequences.txt',
    index=False,
)

#### index for hg38 genome: /data/reddylab/Reference_Data/Genomes/hg38/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.bowtie_index

----