In [110]:
import itertools
import os
import subprocess
from functools import reduce
from operator import add

import pandas as pd
from Bio import SeqIO

In [80]:
# All 64 nucleotide triplets over A/T/G/C, in itertools.product order
# ('AAA', 'AAT', 'AAG', 'AAC', 'ATA', ...).
# Built with a list comprehension rather than map(): on Python 3 map() returns
# a one-shot iterator, which would leave the codon tables empty after first use.
CODONS = [''.join(triplet) for triplet in itertools.product('ATGC', repeat=3)]

In [30]:
# Triplets accepted by filter3 as a valid end of a coding sequence.
STOP_CODONS = [
    'TAA',
    'TGA',
    'TAG',
]

In [3]:
# Load the main info table and index its rows by the `shortName` column, so
# rows can be looked up by short name (e.g. info.gc[shortName]).
info = pd.read_csv(
    '/home/richard/research/data_small/fullTableInfoGff3GffRNAESwithDensity20151109.csv'
)
info.index = info['shortName']

In [7]:
# source: directory holding one sub-directory of downloaded CDS files per short name.
# target: directory that receives the working copies (one sub-directory per short name).
source, target = (
    '/storage3/w/richard/meta2016/fungi_download_CDS_Dec18/',
    '/storage3/w/richard/meta2016/CDS_Dec18/',
)

In [10]:
# Create one output directory per short name under `target`.
for shortName in info.index:
    species_dir = '{}{}'.format(target, shortName)
    # os.makedirs instead of a shell `mkdir` string: nothing to quote, missing
    # parent directories are created, and an existing directory is left alone
    # so this cell can be re-run without raising.
    if not os.path.isdir(species_dir):
        os.makedirs(species_dir)

In [16]:
# List the files downloaded for each short name and remember their names.
# (The copying/unzipping itself happens in a later cell.)
CDS_number = {}       # shortName -> non-empty list of file names in source/shortName
problem_species = []  # short names whose download directory is missing, unreadable or empty
for shortName in info.index:
    try:
        filename = os.listdir('{}{}'.format(source, shortName))
    except OSError:
        # Only file-system failures mean "no data for this species"; anything
        # else (e.g. a NameError from a missing import) should surface.
        filename = []
    if filename:
        CDS_number[shortName] = filename
    else:
        # An empty directory is a problem too: later cells index [0] into the listing.
        print('problem with {}'.format(shortName))
        problem_species.append(shortName)

problem with Altbr1
problem with Aspac1
problem with Aspca3
problem with Batde5
problem with Cante1
problem with Lacam1
problem with Mycgr3
problem with Necha2
problem with Picst3
problem with PleosPC9_1
problem with Pucgr1
problem with Pyrtr1
problem with Rhior3
problem with Spapa3
problem with Sporo1
problem with Suibr1
problem with Suilu1
problem with Treme1
problem with Trire2


In [17]:
# Copy the first listed (gzipped) CDS file of every species into its target
# directory, then unzip the copy in place.
for shortName in CDS_number:
    gz_name = CDS_number[shortName][0]
    src_path = '{}{}/{}'.format(source, shortName, gz_name)
    dst_dir = '{}{}/'.format(target, shortName)
    # Argument lists (no shell=True): paths are passed verbatim, so spaces or
    # shell metacharacters in a file name cannot break or alter the command.
    subprocess.check_call(['cp', src_path, dst_dir])
    # -f overwrites an already unzipped file, so re-running the cell does not fail.
    subprocess.check_call(['gunzip', '-f', '{}{}'.format(dst_dir, gz_name)])

In [47]:
# Short names that have a CDS listing, in sorted order, and the value of the
# `gc` column of `info` for each of them (same order).
CDS_shortNames = sorted(CDS_number.keys())
gc = []
for shortName in CDS_shortNames:
    gc.append(info.gc[shortName])

In [115]:
## convert seq to capital letters
## filter out alternative splicing

def filter1(seq):
    '''Return True when len(seq) is a multiple of 3 (a whole number of codons).'''
    leftover = len(seq) % 3
    return leftover == 0
def filter2(seq):
    '''Return True when seq begins with the triplet 'ATG' (case-sensitive).'''
    first_codon = seq[:3]
    return first_codon == 'ATG'
def filter3(seq):
    '''Return True when the last three letters of seq are one of STOP_CODONS.'''
    last_codon = seq[-3:]
    return last_codon in STOP_CODONS
def filter4(seq):
    '''Return True when seq contains no upper-case 'N'.'''
    has_n = 'N' in seq
    return not has_n

In [127]:
def process_CDS(filename):
    '''
    Read a FASTA file and return its accepted sequences joined end to end.

    Every sequence is upper-cased first, kept only if it passes filter1,
    filter2, filter3 and filter4, and has its last three letters (the triplet
    checked by filter3) removed before concatenation.

    input: filename with path
    output: a single sequence, the concatenation of all accepted, trimmed
            sequences in file order
    raises: ValueError if no sequence in the file passes the filters
    '''
    # Upper-case BEFORE filtering: the filters compare against upper-case
    # literals ('ATG', STOP_CODONS, 'N'), so filtering first would silently
    # drop lower-case sequences.
    seqs = [rec.seq.upper() for rec in SeqIO.parse(filename, 'fasta')]
    # filter4 is applied here as well, matching process_CDS_spark.
    kept = [s[:-3] for s in seqs
            if filter1(s) and filter2(s) and filter3(s) and filter4(s)]
    if not kept:
        # reduce() without an initial value fails with an opaque TypeError on
        # an empty list; say what actually went wrong instead.
        raise ValueError('no sequence in {} passed the CDS filters'.format(filename))
    return reduce(lambda a, b: a + b, kept)

In [128]:
def process_CDS_spark(filename):
    '''
    Spark variant of process_CDS: read a FASTA file and return its accepted
    sequences joined into one sequence.

    The file is parsed in this process; the resulting records are then
    distributed with sc.parallelize (`sc` is not defined in this function --
    presumably the notebook's SparkContext). Every sequence is upper-cased,
    kept only if it passes filter1..filter4, and has its last three letters
    (the triplet checked by filter3) removed before concatenation.

    input: filename with path
    output: a single sequence, the concatenation of all accepted, trimmed sequences
    '''
    recs = sc.parallelize(list(SeqIO.parse(filename, 'fasta')))
    # Upper-case BEFORE filtering: the filters compare against upper-case
    # literals, so filtering first would silently drop lower-case sequences.
    seqs = recs.map(lambda rec: rec.seq.upper())
    seqs = seqs.filter(lambda s: filter1(s) and filter2(s) and filter3(s) and filter4(s))
    trimmed = seqs.map(lambda s: s[:-3])
    return trimmed.reduce(lambda a, b: a + b)

In [129]:
def get_condon_freqs(seq):
    '''
    Count the non-overlapping triplets of seq, starting at index 0, and return
    their relative frequencies.

    input: seq - a string or string-like sequence supporting len() and slicing
    output: dict mapping every codon in CODONS to its share of all counted
            triplets (every value is 0.0 when seq is empty)
    raises: KeyError if a triplet of seq is not in CODONS (this includes a
            trailing piece shorter than 3 letters)
    '''
    table = {codon: 0 for codon in CODONS}
    for i in range(0, len(seq), 3):
        # str(): the table keys are plain strings; converting the slice keeps
        # the lookup independent of how a sequence object hashes.
        table[str(seq[i:i+3])] += 1
    total = float(sum(table.values()))
    if total == 0:
        # Empty input: return zeros instead of raising ZeroDivisionError.
        return {codon: 0.0 for codon in table}
    return {codon: count / total for codon, count in table.items()}

In [132]:
def get_condon_freqs_spark(seq):
    '''
    Spark variant of get_condon_freqs: count the non-overlapping triplets of
    seq, starting at index 0, and return their relative frequencies.

    The triplet start positions are distributed with sc.parallelize (`sc` is
    not defined in this function -- presumably the notebook's SparkContext)
    and counted with reduceByKey.

    input: seq - a string or string-like sequence supporting len() and str()
    output: dict mapping every codon in CODONS to its share of all counted
            triplets (every value is 0.0 when seq is empty)

    NOTE: unlike get_condon_freqs, a triplet that is not in CODONS does not
    raise; it is added to the returned dict as an extra key.
    '''
    # Work on a plain string: the keys produced below are then ordinary str
    # objects, matching the keys of `table`.
    seq = str(seq)
    table = {codon: 0 for codon in CODONS}
    counts = (sc.parallelize(xrange(0, len(seq), 3))
                .map(lambda i: (seq[i:i+3], 1))
                .reduceByKey(add)
                .collect())
    for (key, count) in counts:
        table[key] = count
    total = float(sum(table.values()))
    if total == 0:
        # Empty input: return zeros instead of raising ZeroDivisionError.
        return {codon: 0.0 for codon in table}
    return {codon: count / total for codon, count in table.items()}

In [133]:
# Compute codon frequencies for the first 5 short names (trial run).
local_target = '/home/richard/largeDataSet/CDS_Dec18/'
# One codon-frequency dict per processed short name, in CDS_shortNames order.
codon_freqs = []
for i, shortName in enumerate(CDS_shortNames[:5]):
    # Drop the last three characters of the listed file name
    # (presumably the '.gz' suffix that gunzip removed).
    unzipped_name = CDS_number[shortName][0][:-3]
    filename = "{}{}/{}".format(local_target, shortName, unzipped_name)
    seqs = process_CDS_spark(filename)
    freqs = get_condon_freqs_spark(seqs)
    codon_freqs.append(freqs)
    # progress: running index and short name
    print('{} {}'.format(i, shortName))

0 Aaoar1
1 Acain1
2 Acema1


KeyboardInterrupt: 

In [134]:
codon_freqs

[{'AAA': 0.01691291222341413,
  'AAC': 0.022486430602222798,
  'AAG': 0.032334694236881706,
  'AAT': 0.01577134627548792,
  'ACA': 0.014713785392373792,
  'ACC': 0.018999333602815992,
  'ACG': 0.013724750713760885,
  'ACT': 0.012992568373431376,
  'AGA': 0.00822572435019825,
  'AGC': 0.015548192706006297,
  'AGG': 0.009708953891861135,
  'AGT': 0.009419531451673627,
  'ATA': 0.008484350307892267,
  'ATC': 0.027272461962716262,
  'ATG': 0.02126440682830574,
  'ATT': 0.015621717292482901,
  'CAA': 0.018314716510316742,
  'CAC': 0.013138811355743392,
  'CAG': 0.02173876940158242,
  'CAT': 0.011480315968685621,
  'CCA': 0.016873731358252257,
  'CCC': 0.016371958303175085,
  'CCG': 0.012022076079565857,
  'CCT': 0.015191695204471714,
  'CGA': 0.011423882623802263,
  'CGC': 0.01470427234280774,
  'CGG': 0.0069869318109444245,
  'CGT': 0.008858422765404812,
  'CTA': 0.008898732297464353,
  'CTC': 0.02605721019018521,
  'CTG': 0.016035293091413796,
  'CTT': 0.016202335792268534,
  'GAA': 0.025

In [56]:
# For the first 10 short names, build (gc value, list of parsed FASTA records)
# pairs on Spark and collect them into a local list.
CDSs = (
    sc.parallelize(CDS_shortNames[:10])
      # short name -> (gc value from `info`, path of the unzipped CDS file under `target`)
      .map(lambda name: (info.gc[name], target+name+'/'+CDS_number[name][0][:-3]))
      # (gc, path) -> (gc, all records parsed from that file)
      .map(lambda pair: (pair[0], list(SeqIO.parse(pair[1], 'fasta'))))
      .collect()
)

In [24]:
# Parse every record of one CDS FASTA file into a list for interactive inspection.
test = list(
    SeqIO.parse(
        '/storage3/w/richard/meta2016/CDS_Dec18/Aaoar1/Aaoar1_GeneCatalog_CDS_20140429.fasta',
        'fasta',
    )
)

In [37]:
# Sequence of the first record parsed into `test`; used by the checks in the following cells.
test1 = test[0].seq

In [38]:
# Spot check: do the last three letters of test1 compare equal to the string 'TAG'?
test1[-3:] == 'TAG'

True

In [42]:
# Display the result of upper-casing test1; the result is not assigned to anything.
test1.upper()

Seq('ATGAGATGCTGGCCCGTGTTGTTCGTCGCGGCTGCGGCTGCTATGCCATGGACA...TAG', SingleLetterAlphabet())

In [93]:
# Print test1 three letters at a time, starting from position 0, one triplet per line.
for start in range(0, len(test1), 3):
    triplet = test1[start:start + 3]
    print(triplet)

ATG
AGA
TGC
TGG
CCC
GTG
TTG
TTC
GTC
GCG
GCT
GCG
GCT
GCT
ATG
CCA
TGG
ACA
AAT
ATC
TTA
TTC
AAC
ACG
GCG
AGT
ACC
AAA
AGT
CTA
GAA
CCT
AGA
GAC
GCC
CCC
AAT
CCA
CCC
GGT
GGT
GAA
AGT
AGC
ATA
ATG
TGT
AGA
TGG
TCC
AAC
TGC
GGA
GAA
CCT
TGC
GAT
GCT
GGA
TTT
GAG
GCG
AAA
ACG
ATA
GCA
GGA
GGA
GAG
CCC
GGT
AAG
ATA
ATG
AGC
AAC
CAC
GAA
CAC
TGC
ATG
GAC
GAA
GGA
TTC
CAA
ACG
TTC
TGT
TGT
CCC
ACA
GGC
CAA
CCC
ACT
CCC
AAC
TGC
CTG
TGG
CGA
GGC
CTG
CAA
AAC
GGT
CAA
AAA
TGC
ACG
CCA
GGC
TGC
GCT
ACC
TCG
GAG
GTG
GAG
GTC
GGA
TCT
ACG
AAA
ACG
AAC
TGT
TCG
AAT
GGT
GGA
CAT
CAG
ACT
GCA
TGT
TGC
TCA
GAT
GGC
CGT
TCC
GTT
AGT
GCA
TAT
TCG
CAA
TGC
AAA
TGG
CAT
GGG
TGT
TCA
TTC
AGT
GGC
AAT
TGG
TGC
AGC
AAG
GAA
TAC
CCT
CAG
GCG
AGG
AGG
CAT
ACT
GTT
GTC
GCG
GAG
AAG
TCA
GAG
GGG
ACC
GCG
ACA
ATT
ATG
TGG
AAA
TAT
TTC
AGA
GGA
CCT
TGG
ACT
CGC
AAG
TTG
GAC
TTG
GTG
TCA
AGG
TTG
ATA
CTT
TGC
ATT
AAC
TGC
CTC
ACA
GAT
ATC
TCA
AGG
ACC
GCC
GGG
AAA
ACA
AAT
GCC
CAC
CTG
AGT
CGC
TTC
CAC
CGA
TTG
AAT
AAT
CGA
ACT
GAT
TAT
ATC
GTA
AAC
AGT
GAC
CAG
GAT
GTT
AAC
TAC
GAC
TCG
GCT
