In [62]:
'''memory on server is full, divert to local'''
import pandas as pd
import subprocess
from Bio import SeqIO
import itertools
from operator import add
import time
import numpy as np

In [2]:
# All 64 DNA codons over the alphabet A/T/G/C, in itertools.product order
# ('AAA', 'AAT', 'AAG', 'AAC', 'ATA', ...).
# Built as a real list instead of with map(): the table is iterated many times
# and copied with CODONS[:], which a one-shot Python 3 map object cannot do.
CODONS = [''.join(triplet) for triplet in itertools.product('ATGC', repeat=3)]

In [3]:
# Codons accepted by filter3 as a valid end of a coding sequence.
STOP_CODONS = [
    'TAA',
    'TGA',
    'TAG',
]

In [4]:
# Metadata table read from CSV. It is re-indexed by its `shortName` column so
# rows can be looked up by short name; the column itself is kept as well.
info = pd.read_csv(
    '/home/richard/research/data_small/fullTableInfoGff3GffRNAESwithDensity20151109.csv'
)
info.index = info['shortName']

In [5]:
# Root directory of the CDS files. The cells below read
# <target><shortName>/nuc_seq.fna, built by plain string concatenation, so the
# trailing slash is required.
target = '/storage3/w/richard/meta2016/CDS_cut/'

In [6]:
## Sequence checks used to decide which CDS records are kept.
## Author's stated intent: convert seq to capital letters and filter out
## alternative splicing.
## NOTE: none of these checks changes case; all comparisons are case-sensitive.
def filter1(seq):
    '''True when len(seq) is a multiple of 3 (a whole number of codons).'''
    return not len(seq) % 3


def filter2(seq):
    '''True when the first three characters of seq are the start codon ATG.'''
    first_codon = seq[:3]
    return first_codon == 'ATG'


def filter3(seq):
    '''True when the last three characters of seq are one of STOP_CODONS.'''
    last_codon = seq[-3:]
    return last_codon in STOP_CODONS


def filter4(seq):
    '''True when seq contains no upper-case N (ambiguous base).'''
    return not ('N' in seq)

In [127]:
def process_CDS(filename):
    '''
    Read a FASTA file of coding sequences and return them concatenated.

    A record is kept when its raw sequence passes filter1, filter2 and filter3
    (length is a multiple of 3, starts with ATG, ends with a stop codon).
    The trailing stop codon is then removed and the rest is upper-cased.
    Sequences that still contain N are dropped (filter4), as in
    process_CDS_spark, so that every codon of the result exists in the
    A/T/G/C-only table built by get_condon_freqs.

    input: filename with path
    output: one sequence object, the kept CDS bodies joined end to end with `+`
    raises: TypeError (from reduce) when no record passes the filters
    '''
    # `reduce` is a builtin only on Python 2; functools.reduce works on 2 and 3.
    from functools import reduce

    kept = []
    for record in SeqIO.parse(filename, 'fasta'):
        raw = record.seq
        if not (filter1(raw) and filter2(raw) and filter3(raw)):
            continue
        body = raw[:-3].upper()
        # Checked after upper-casing so a lower-case 'n' is caught as well.
        if filter4(body):
            kept.append(body)
    return reduce(lambda a, b: a + b, kept)

In [129]:
def get_condon_freqs(seq):
    '''
    Relative frequency of every codon in seq.

    seq is read in consecutive, non-overlapping steps of three starting at
    index 0. The result maps each entry of CODONS to its count divided by the
    total number of counted codons.

    Raises KeyError if a slice is not one of CODONS (for example a codon
    containing N, or a shorter slice at the end when len(seq) is not a
    multiple of 3), and ZeroDivisionError when seq is empty.
    '''
    counts = dict.fromkeys(CODONS, 0)
    for offset in range(0, len(seq), 3):
        counts[seq[offset:offset + 3]] += 1
    n_codons = 1. * sum(counts.values())
    return {codon: hits / n_codons for codon, hits in counts.items()}

In [15]:
def process_CDS_spark(filename):
    '''
    Spark variant of process_CDS: parse a FASTA file and return the kept
    coding sequences concatenated into one plain string.

    Records are converted to str and kept only if they pass filter1, filter2,
    filter3 and filter4. Unlike process_CDS, the stop codon is NOT removed and
    the text is NOT upper-cased; that step is commented out below.

    Uses the global name `sc`, which is not created anywhere in the visible
    notebook (presumably a SparkContext; confirm).

    input: filename with path
    output: concatenated sequence (str)
    '''
    parsed_records = list(SeqIO.parse(filename, 'fasta'))
    kept = (
        sc.parallelize(parsed_records)
        .map(lambda rec: str(rec.seq))
        .filter(lambda s: filter1(s) and filter2(s) and filter3(s) and filter4(s))
    )
    # Left disabled, exactly as in the original notebook:
    #     kept = kept.map(lambda x: x[:-3].upper())
    return kept.reduce(lambda left, right: left + right)

In [16]:
def get_condon_freqs_spark(seq):
    '''
    Spark variant of get_condon_freqs: relative frequency of every codon in seq.

    Codon start offsets (0, 3, 6, ...) are distributed through the global
    `sc`, each offset is mapped to a (codon, 1) pair, and the pairs are summed
    per codon with reduceByKey. Codons from CODONS that never occur keep a
    count of 0. A slice that is not in CODONS is added to the table as an
    extra key rather than raising.

    Returns a dict mapping codon -> count / total count.
    '''
    table = dict.fromkeys(CODONS, 0)
    codon_starts = sc.parallelize(xrange(0, len(seq), 3))
    tallies = (
        codon_starts
        .map(lambda pos: (seq[pos:pos + 3], 1))
        .reduceByKey(add)
        .collect()
    )
    table.update(tallies)
    denominator = 1. * sum(table.values())
    return {codon: hits / denominator for codon, hits in table.items()}

In [59]:
codon_freqs = []      # one codon-frequency dict per successfully processed species
problem_species = []  # shortNames whose processing raised an error
start = time.time()
# Processes the species from position 423 of info.index onwards.
# NOTE: codon_freqs only receives an entry on success, so its positions stop
# lining up with info.index[423:] as soon as one species fails.
for i, shortName in enumerate(info.index[423:]):
    filename = "{}{}/nuc_seq.fna".format(target, shortName)
    try:
        seqs = process_CDS_spark(filename)
        codon_freqs.append(get_condon_freqs_spark(seqs))
    except Exception as err:
        # Report the cause instead of hiding it, so a systemic failure (for
        # example a full disk) is visible straight away. Catching Exception
        # rather than everything keeps Ctrl-C / KeyboardInterrupt working.
        print('problem with {}: {!r}'.format(shortName, err))
        problem_species.append(shortName)
    else:
        print('{} {} time spent so far: {}'.format(i, shortName, time.time() - start))

problem with Sisni1
problem with Sissu1
problem with Sodal1
problem with Spapa3
problem with Sphst1
problem with Spofi1
problem with Spoli1
problem with Spore1
problem with Sporo1
problem with Spoth2
problem with Stagr1
problem with Stano2
problem with Stasp1
problem with Stehi1
problem with Suibr1
problem with Suigr1
problem with Suihi1
problem with Suilu1
problem with Symat1
problem with Symko1
problem with Talma1_2
problem with Talst1_2
problem with Tapde1_1
problem with Terbo2
problem with Ternu1
problem with Tescy1
problem with Theau1
problem with Thega1
problem with Themi1
problem with Thian1
problem with Thiap1
problem with Thiar1
problem with Thihy1
problem with Thite2
problem with ThoPMI491_1
problem with Tilan2
problem with Tilwa1
problem with Torde1
problem with Torra1
problem with Totfu1
problem with Trace1
problem with Traci1
problem with Tralj1
problem with Trave1
problem with Treme1
problem with Trepe1
problem with Triab1_1
problem with Trias1
problem with Triat2
problem

In [36]:
## make table
## columns: shortName, gc, then one frequency column per codon (CODONS order)
# NOTE(review): row i of info.index[:30] is paired with codon_freqs[i]. That is
# only correct if codon_freqs was filled for exactly these species, in this
# order, with none skipped. Confirm before trusting the table.
codon_names = list(CODONS)  # a real list, whatever kind of iterable CODONS is
table_entry = []
for i, shortName in enumerate(info.index[:30]):
    freqs = codon_freqs[i]
    # .loc is the label-based indexer; .ix is deprecated and removed in newer pandas.
    row = [shortName, info.loc[shortName, 'gc']]
    row.extend(freqs[codon] for codon in codon_names)
    table_entry.append(row)

cols = ['shortName', 'gc'] + codon_names

condon_freq_table = pd.DataFrame(table_entry, columns=cols)

In [None]:
## save table for future use to avoid intensive calculation


In [54]:
## build regression model from table
## One degree-3 polynomial per codon, fitted with gc as x and that codon's
## frequency column as y; stored in CODONS order.
regression_models = []
for codon in CODONS:
    coefficients = np.polyfit(condon_freq_table.gc, condon_freq_table[codon], 3)
    regression_models.append(np.poly1d(coefficients))

In [55]:
# make table with regression model predicted codon frequency
# Each row is [gc, prediction of model 0, prediction of model 1, ...] for the
# integer gc values 25 through 70.
# The row is built with a list comprehension because map(...).insert() only
# works where map returns a list (Python 2).
vecs = []
for gc in range(25, 71):
    predicted = [model(gc) for model in regression_models]
    vecs.append([gc] + predicted)

In [None]:
###### test script ############

In [None]:
codon_freqs = []      # one codon-frequency dict per successfully processed species
problem_species = []  # shortNames whose processing raised an error
start = time.time()
# Processes the species from position 423 of info.index onwards.
# NOTE: codon_freqs only receives an entry on success, so its positions stop
# lining up with info.index[423:] as soon as one species fails.
for i, shortName in enumerate(info.index[423:]):
    filename = "{}{}/nuc_seq.fna".format(target, shortName)
    try:
        seqs = process_CDS_spark(filename)
        codon_freqs.append(get_condon_freqs_spark(seqs))
    except Exception as err:
        # Report the cause instead of hiding it, so a systemic failure (for
        # example a full disk) is visible straight away. Catching Exception
        # rather than everything keeps Ctrl-C / KeyboardInterrupt working.
        print('problem with {}: {!r}'.format(shortName, err))
        problem_species.append(shortName)
    else:
        print('{} {} time spent so far: {}'.format(i, shortName, time.time() - start))

In [61]:
# Process a single entry (position 423 of info.index) with no try/except
# around it, so that any exception is shown in full.
species = info.index[423]
filename = "{}{}/nuc_seq.fna".format(target, species)
seqs = process_CDS_spark(filename)

IOError: [Errno 28] No space left on device

In [56]:
# For the first 10 names in CDS_shortNames, collect (gc, parsed FASTA records).
# NOTE(review): CDS_shortNames and CDS_number are not defined anywhere in the
# visible notebook; they must come from earlier kernel state.
CDSs = (sc.parallelize(CDS_shortNames[:10])
        .map(lambda name: (info.gc[name], target+name+'/'+CDS_number[name][0][:-3]))
        # Index the (gc, filename) pair instead of unpacking it in the lambda
        # signature: tuple parameters are a SyntaxError on Python 3 (PEP 3113).
        .map(lambda pair: (pair[0], list(SeqIO.parse(pair[1], 'fasta'))))
        .collect())

In [11]:
# Scratch check: parse one FASTA file into a list of records for manual inspection.
test = list(SeqIO.parse('/storage3/w/richard/meta2016/CDS_cut/Aaoar1/nuc_seq.fna','fasta'))

In [12]:
# Sequence of the first parsed record; used by the scratch cells below.
test1 = test[0].seq

In [14]:
# Display the whole sequence as a plain string.
str(test1)

'ATGGCTGCACCGGTCAAGACTGTTGCAATTCTCGGCGCCAGTGGAAATTTGGGAGCTGTACTCCTGGATCATTTCTCTGCGCAGGAGGACTCCCCATTGAAGGTCACAGCCGTCACAAGAGAAGGGTCGGATAGCAAGTTTCCAGACTCTTTAAAAGTAGCCTACACCGACTTCAGCTCTGCATCACTTGAGCAGGTTCTCCGCGGGCAGGATGTGGTCATTGACCTGCTTCCACCAGAGGCCAAGGTGCCTCACGAACGCATCATTGACGCTGCCGTGAAGTCGGGGGTAAAGCGATTCTTTCCCTCCGAATATGGCGTCCGGTCTTATTATCCAGAATTTGCCGAGGTCGTGGCGATAACGAAGAAGAAACGATCCATTGTCAAGTATCTGGAGAAGACACAAGACAAGATGAGCTGGACAGCCTTGCTGTGTAACCCCTGGACTGACTTTTGTGTGGTTGATGGTCTTCTCGGCTTCGACCTGAAGAACAAGAAAGCCCAAATCTACAACGGCGGCGATGTACCCTTTTCTGCTGGCCTCCGCGAACTCGCCGGAAAGGCTTTCTACGCGCTCCTCACCAACGTAGAGCAATTCGAGGAAGCAAAGAACCAGTACATCCACATATTCTCTTATACCACCACGCAGAACGAGATTCTGGCTACGGTGGAGAACATTCTGGGCGAGAAATTCGAAGTTACCCACGTCAGGTCAGAGGAGGTACTTCCCCAGGCCGAGATGGAGGCCAAAGAAGGTAAGAACCGGGGTCTAGCTGCTCAGGTGCAGGCAATCTTTTACAGTCGGGACGCGAAAGGGAACGGAGTAGGCGACTTTCGGCCTCTTGGGGACTGGAGCGAACGCTTGAAGCTGCGACCGACAAACTTGGAGGACGACTTGAAGGGACCCCTTACGGGAAATTGGAGAGGTATTCTGCATTGGCAACCTGAGGAGCTTCCCGACTATAGCTTAACTGTCTAA'

In [38]:
# Does the sequence end with the stop codon TAG? (filter3 accepts any of STOP_CODONS.)
test1[-3:] == 'TAG'

True

In [42]:
# Display the upper-cased form of the sequence.
test1.upper()

Seq('ATGAGATGCTGGCCCGTGTTGTTCGTCGCGGCTGCGGCTGCTATGCCATGGACA...TAG', SingleLetterAlphabet())

In [93]:
# Print test1 codon by codon (consecutive, non-overlapping slices of three).
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# unlike the print statement.
for i in range(0, len(test1), 3):
    print(test1[i:i+3])

ATG
AGA
TGC
TGG
CCC
GTG
TTG
TTC
GTC
GCG
GCT
GCG
GCT
GCT
ATG
CCA
TGG
ACA
AAT
ATC
TTA
TTC
AAC
ACG
GCG
AGT
ACC
AAA
AGT
CTA
GAA
CCT
AGA
GAC
GCC
CCC
AAT
CCA
CCC
GGT
GGT
GAA
AGT
AGC
ATA
ATG
TGT
AGA
TGG
TCC
AAC
TGC
GGA
GAA
CCT
TGC
GAT
GCT
GGA
TTT
GAG
GCG
AAA
ACG
ATA
GCA
GGA
GGA
GAG
CCC
GGT
AAG
ATA
ATG
AGC
AAC
CAC
GAA
CAC
TGC
ATG
GAC
GAA
GGA
TTC
CAA
ACG
TTC
TGT
TGT
CCC
ACA
GGC
CAA
CCC
ACT
CCC
AAC
TGC
CTG
TGG
CGA
GGC
CTG
CAA
AAC
GGT
CAA
AAA
TGC
ACG
CCA
GGC
TGC
GCT
ACC
TCG
GAG
GTG
GAG
GTC
GGA
TCT
ACG
AAA
ACG
AAC
TGT
TCG
AAT
GGT
GGA
CAT
CAG
ACT
GCA
TGT
TGC
TCA
GAT
GGC
CGT
TCC
GTT
AGT
GCA
TAT
TCG
CAA
TGC
AAA
TGG
CAT
GGG
TGT
TCA
TTC
AGT
GGC
AAT
TGG
TGC
AGC
AAG
GAA
TAC
CCT
CAG
GCG
AGG
AGG
CAT
ACT
GTT
GTC
GCG
GAG
AAG
TCA
GAG
GGG
ACC
GCG
ACA
ATT
ATG
TGG
AAA
TAT
TTC
AGA
GGA
CCT
TGG
ACT
CGC
AAG
TTG
GAC
TTG
GTG
TCA
AGG
TTG
ATA
CTT
TGC
ATT
AAC
TGC
CTC
ACA
GAT
ATC
TCA
AGG
ACC
GCC
GGG
AAA
ACA
AAT
GCC
CAC
CTG
AGT
CGC
TTC
CAC
CGA
TTG
AAT
AAT
CGA
ACT
GAT
TAT
ATC
GTA
AAC
AGT
GAC
CAG
GAT
GTT
AAC
TAC
GAC
TCG
GCT
