## Important

Run `./Scripts/alignment.sh` to generate the alignment before running this notebook.

In [1]:
import os
import json

from Bio import SeqIO, AlignIO
from Bio.Align import MultipleSeqAlignment

# Directory names used to build the input and output file paths.
genomeTreeDir = "Genome_tree"  # aligned FASTA files are read from and written to this directory
snpDir = "SNPs"  # the CDS annotation JSON is written to this directory
dataDir = "Data"  # the annotated reference GenBank file is read from this directory

# Record ID that identifies the reference sequence inside the alignment.
reference = "EPI_ISL_402125"

In [2]:
# Load the multiple sequence alignment (FASTA) that the rest of the notebook slices.
alignedPath = os.path.join(genomeTreeDir, "aligned.fasta")
aligned = AlignIO.read(alignedPath, "fasta")

# Load the annotated reference record (GenBank) whose features are used below.
annoRefPath = os.path.join(dataDir, "reference.gb")
annoRef = SeqIO.read(annoRefPath, "gb")

In [3]:
# Map every 0-based position of the (ungapped) reference sequence to the
# column it occupies in the alignment.
ref2aligned = {}

for record in aligned:
    if record.id != reference:
        continue
    refIndex = -1
    for alignedIndex, n in enumerate(record):
        if n == '-':
            # A gap column holds no reference base. It must not overwrite the
            # column already recorded for the preceding base (nor create a
            # bogus entry for index -1 when the row starts with gaps).
            continue
        refIndex += 1
        ref2aligned[refIndex] = alignedIndex
    # One-past-the-end entry, so that an exclusive end coordinate equal to the
    # reference length can still be resolved to a valid slice bound.
    ref2aligned[refIndex + 1] = len(record)
    break
else:
    # Fail here with a clear message instead of a KeyError on a later lookup.
    raise ValueError(
        "Reference sequence '{}' was not found in the alignment".format(reference)
    )

In [4]:
# Collect, for every part of every CDS feature of the reference annotation,
# the slice bounds in alignment columns together with the reference
# coordinates and the product the part belongs to.
cdsAlignedIndex = []
product_id = 0  # stays 0 when the annotation has no CDS feature

cdsFeatures = [feature for feature in annoRef.features if feature.type == "CDS"]

# Products are numbered 1, 2, ... in the order their CDS features appear.
for product_id, feature in enumerate(cdsFeatures, start=1):
    # A feature location may consist of several parts; each part becomes
    # its own entry but shares the feature's product id and name.
    for part in feature.location.parts:
        entry = {
            "start": ref2aligned[part.start],
            "end": ref2aligned[part.end],
            "product_id": product_id,
            "product_name": ", ".join(feature.qualifiers["product"]),
            "ref_start": part.start,
            "ref_end": part.end
        }
        cdsAlignedIndex.append(entry)

In [5]:
# Concatenate the alignment columns of all CDS segments into one alignment
# and record, for each running CDS position, the reference coordinate and
# the product it came from.
alignedCDS = None
cdsPos = 0
cdsAnno = {}

for segment in cdsAlignedIndex:
    # Columns of the full alignment that belong to this CDS segment.
    segmentColumns = aligned[:, segment["start"]:segment["end"]]
    alignedCDS = segmentColumns if alignedCDS is None else alignedCDS + segmentColumns

    # Reference coordinates ref_start + 1 .. ref_end (inclusive); the running
    # counter cdsPos continues from where the previous segment stopped.
    refPositions = range(segment["ref_start"] + 1, segment["ref_end"] + 1)
    for cdsPos, pos in enumerate(refPositions, start=cdsPos + 1):
        cdsAnno[cdsPos] = {
            "pos": pos,
            "product_id": segment["product_id"],
            "product_name": segment["product_name"]
        }

# Write the concatenated CDS alignment next to the input alignment.
cdsAlignmentPath = os.path.join(genomeTreeDir, "aligned_cds.fasta")
AlignIO.write(alignedCDS, cdsAlignmentPath, "fasta")

# Write the per-position annotation as JSON.
cdsProductPath = os.path.join(snpDir, "cds_product.json")
with open(cdsProductPath, 'w') as jsonFile:
    json.dump(cdsAnno, jsonFile, indent=4)

In [6]:
# Display the number of columns in the concatenated CDS alignment.
alignedCDS.get_alignment_length()

29133

In [7]:
# alignedAA = None

# for record in alignedCDS:
#     aa = record.translate(gap='-', id=record.id, description="")
#     print(record.id)
#     if alignedAA is None:
#         alignedAA = MultipleSeqAlignment([aa])
#     else:
#         alignedAA.add_sequence(record.id, str(aa.seq))