In [1]:
import os
import json

from Bio import AlignIO, SeqIO

dataDir = "Data"
snpDir = "SNPs"
genomeTreeDir = "Genome_tree"
translatedDir = "Translated"
translatedBatDir = "Translated_bat"

reference = "EPI_ISL_402125"

In [2]:
def translate(cdsSeq):
    aaSeqs = []
    for record in cdsSeq:
        print(record.seq)
        record.seq = record.seq.ungap('-').translate(cds=True)
        aaSeqs.append(record)
    return aaSeqs

In [3]:
if not os.path.exists(translatedDir):
    os.mkdir(translatedDir)
    
if not os.path.exists(translatedBatDir):
    os.mkdir(translatedBatDir)

In [4]:
aligned = AlignIO.read(os.path.join(genomeTreeDir, "aligned.fasta"), "fasta")
with open(os.path.join(snpDir, "cds_info.json")) as f:
    cdsInfo = json.load(f)

In [5]:
ref2aligned = {}

for record in aligned:
    if record.id == reference:
        index = -1
        for aligned_index, nt in enumerate(record):
            if nt != '-':
                index += 1
                ref2aligned[index] = aligned_index

In [6]:
prev_product_id = -1
prev_product_name = None
prev_start = None
prev_end = None

for loc in cdsInfo:
    product_id = loc["product_id"]
    product_name = loc["product_name"]
    start = loc["ref_start"]
    end = loc["ref_end"]
    current_seq = aligned[:, ref2aligned[start]:ref2aligned[end]]
    if product_id == prev_product_id:
        current_seq = prev_seq + current_seq
    else:
        if prev_product_name:
            cdsDir = os.path.join(translatedDir, prev_product_name.replace(' ', '_'))
            print(prev_seq.get_alignment_length())
#             print(prev_product_name, prev_seq)
            if not os.path.exists(cdsDir):
                os.mkdir(cdsDir)
            AlignIO.write(prev_seq, os.path.join(cdsDir, "aligned_cds.fasta"), "fasta")
            aaSeqs = translate(prev_seq)
            SeqIO.write(aaSeqs, os.path.join(cdsDir, "protein.fasta"), "fasta")
    prev_product_id = product_id
    prev_product_name = product_name
    prev_seq = current_seq

print(prev_product_name, prev_seq.get_alignment_length())
AlignIO.write(prev_seq, os.path.join(cdsDir, prev_product_name + ".fasta"), "fasta")

21291
atggagagccttgtccctggtttcaacgagaaaacacacgtccaactcagtttgcctgttttacaggttcgcgacgtgctcgtacgtggctttggagactccgtggaggaggtcttatcagaggcacgtcaacatcttaaagatggcacttgtggcttagtagaagttgaaaaaggcgttttgcctcaacttgaacagccctatgtgttcatcaaacgttcggatgctcgaactgcacctcatggtcatgttatggttgagctggtagcagaactcgaaggcattcagtacggtcgtagtggtgagacacttggtgtccttgtccctcatgtgggcgaaataccagtggcttaccgcaaggttcttcttcgtaagaacggtaataaaggagctggtggccatagttacggcgccgatctaaagtcatttgacttaggcgacgagcttggcactgatccttatgaagattttcaagaaaactggaacactaaacatagcagtggtgttacccgtgaactcatgcgtgagcttaacggaggggcatacactcgctatgtcgataacaacttctgtggccctgatggctaccctcttgagtgcattaaagaccttctagcacgtgctggtaaagcttcatgcactttgtccgaacaactggactttattgacactaagaggggtgtatactgctgccgtgaacatgagcatgaaattgcttggtacacggaacgttctgaaaagagctatgaattgcagacaccttttgaaattaaattggcaaagaaatttgacaccttcaatggggaatgtccaaattttgtatttcccttaaattccataatcaagactattcaaccaagggttgaaaagaaaaagcttgatggctttatgggtagaattcgatctgtctatccagttgcgtcaccaaatgaatgcaaccaaatgtgcctttcaactctcatgaagtgtgatcattgtggtgaaacttcatggc

TranslationError: First codon 'GGC' is not a start codon

In [None]:
referenceBat = SeqIO.read(os.path.join(dataDir, "reference_bat.gb"), "gb")

In [None]:
prev_product_id = -1
prev_product_name = None
prev_start = None
prev_end = None

for f in referenceBat.features:
    if f.type == "CDS":
        product_name = ", ".join(f.qualifiers["product"])
        start = f.location.parts[0].start
        for i in f.location.parts:
            end = i.end
        print(start, end)
        cdsSeq = referenceBat[start:end]
        cdsDir = os.path.join(translatedBatDir, product_name.replace(' ', '_'))
        if not os.path.exists(cdsDir):
            os.mkdir(cdsDir)
        SeqIO.write(cdsSeq, os.path.join(cdsDir, "aligned_cds.fasta"), "fasta")
#         aaSeqs = translate(cdsSeq)
#         SeqIO.write(aaSeqs, os.path.join(cdsDir, "protein.fasta"), "fasta")