In [2]:
import pandas as pd
import Bio.SeqIO
import Bio.Seq as Seq
import numpy as np

In [3]:
# Load the genome sequence (downloaded from NCBI) from the FASTA file.
# `genome` is reset first so that a file with no records cannot leave it
# undefined, or leave a stale value from an earlier kernel run in place.
genome = None
for record in Bio.SeqIO.parse('sequencev3.fasta', "fasta"):
    # If the file holds several records, the last one wins.
    genome = str(record.seq)
if genome is None:
    raise ValueError("no FASTA records found in 'sequencev3.fasta'")

In [4]:
# Load the table of genes and their start-site locations.
# pd.read_csv is the documented public entry point for this reader.
totdf = pd.read_csv('../manu_genes.csv')

In [5]:
# Display the loaded table to check its columns and contents.
totdf

Unnamed: 0,name,start_site,rev,notes
0,livM,3597755,rev,
1,ygbI,2861256,rev,race over comput
2,deaD,3308086,rev,full operon over compute
3,frlR,3504043,fwd,
4,slyA,1720870,rev,
5,wzxC,2120337,rev,comp
6,ycgB,1237285,rev,comp
7,ymgC,1215752,fwd,operator
8,tff,0,fwd,already done.


In [6]:
# Keep only the relevant columns: gene name, location of the TSS, and the
# strand flag ('rev' or 'fwd') saying whether the gene transcribes in the
# reverse direction.
totdf = totdf.loc[:,['name','start_site','rev']]

In [7]:
def find_seq(s, genome_seq=None, upstream=115, downstream=45):
    '''Return the wild-type sequence window around a transcription start site.

    For a forward gene the window is
    ``genome_seq[tss - upstream:tss + downstream]``. For a reverse gene the
    window is ``genome_seq[tss - downstream:tss + upstream]``, returned as its
    reverse complement.

    Parameters
    ----------
    s : row-like
        Row whose field at position 1 is the start-site coordinate and whose
        field at position 2 is the strand flag, ``'rev'`` or ``'fwd'``. Either
        a pandas Series (as passed by ``DataFrame.apply(..., axis=1)``) or a
        plain sequence such as a list or tuple.
    genome_seq : str, optional
        Sequence to slice. Defaults to the notebook-level ``genome``.
    upstream : int, optional
        Number of bases taken on the upstream side of the start site.
    downstream : int, optional
        Number of bases taken on the downstream side of the start site.

    Returns
    -------
    str
        The sequence window.

    Raises
    ------
    ValueError
        If the strand flag is neither ``'rev'`` nor ``'fwd'``.

    Notes
    -----
    When the start site is smaller than the reach of the window (for example
    a placeholder start site of 0), the slice start is negative and Python
    counts it from the end of the string, so the result is usually an empty
    string rather than an error.

    NOTE(review): whether the start-site coordinate is 0- or 1-based is not
    visible here; confirm the window is positioned as intended on both
    strands.
    '''
    # Read Series rows by position through .iloc: plain integer keys on a
    # label-indexed Series are deprecated positional access in pandas.
    fields = s.iloc if hasattr(s, 'iloc') else s
    ss = int(fields[1])
    strand = fields[2]

    if genome_seq is None:
        # Fall back to the genome loaded earlier in the notebook.
        genome_seq = genome

    if strand == 'fwd':
        return genome_seq[ss - upstream:ss + downstream]
    if strand == 'rev':
        # Upstream of a reverse gene lies at higher coordinates, so the
        # window bounds are mirrored before taking the reverse complement.
        window = Seq.Seq(genome_seq[ss - downstream:ss + upstream])
        return str(window.reverse_complement())

    # Previously an unknown flag fell through to an UnboundLocalError.
    raise ValueError(
        "strand flag must be 'rev' or 'fwd', got {!r}".format(strand))

In [8]:
# Compute the wild-type sequence around each TSS, row by row, and attach the
# result as the 'geneseq' column.
totdf = totdf.assign(geneseq=totdf.apply(find_seq, axis=1))

In [9]:
# Display the table again to check the new 'geneseq' column.
totdf

Unnamed: 0,name,start_site,rev,geneseq
0,livM,3597755,rev,ACAAAATTAAAACATTAGAGAATGAAAAATGTCCAGCATAATCCCC...
1,ygbI,2861256,rev,AAGATAACGGTATGGTGATCTGATTCACATAAATTAACATTGTGTG...
2,deaD,3308086,rev,AAGTACTACCTAAGTCTGGGGGATTTGGACAGCGCCACGGCACTGT...
3,frlR,3504043,fwd,ATTCAGTACCACGGTGCCTGGTAGGTATAACGTTGGCGTGAGCATC...
4,slyA,1720870,rev,TAATAAATATTCTTTAAGTGCGAAAAATTTACGCGCAATTTCTGAA...
5,wzxC,2120337,rev,TCAATGTGCTGACCGGGGGGATGTCGATTGTCGGTCCACGTCCGCA...
6,ycgB,1237285,rev,TATCCAGCATAAAATTCCGTTCAGAAGCGGATTAGTGGCACTCTGA...
7,ymgC,1215752,fwd,ATGATGCAATATGTTTTATCATAACACATTGTTTTATATGCATTAG...
8,tff,0,fwd,


In [11]:
# Save the resulting wild-type sequences. Passing the path instead of an
# open() handle lets pandas open, flush and close the file itself.
totdf.to_csv('manu_wtseqs.csv')