In [1]:
import os
from collections import Counter

import pandas as pd
from Bio import SeqIO, AlignIO
from Bio.Align import MultipleSeqAlignment
from matplotlib import pyplot as plt

genomeTreeDir = "Genome_tree"
snpDir = "SNPs"
dataDir = "Data"

if not os.path.exists(snpDir):
    os.mkdir(snpDir)

ancestral = "EPI_ISL_402123"
reference = "EPI_ISL_402125"

In [2]:
aligned = AlignIO.read(os.path.join(genomeTreeDir, "aligned.fasta"), "fasta")
annoRef = SeqIO.read(os.path.join(dataDir, "reference.gb"), "gb")

In [3]:
ref2aligned = {}

for record in aligned:
    if record.id == reference:
        refIndex = -1
        for alignedIndex, n in enumerate(record):
            if n != '-':
                refIndex += 1
            ref2aligned[refIndex] = alignedIndex
        break

In [4]:
cdsAlignedIndex = []
product = 0

for f in annoRef.features:
    if f.type == "CDS":
        product += 1
        for i in f.location.parts:
            cdsAlignedIndex.append({
                "start": ref2aligned[i.start],
                "end": ref2aligned[i.end],
                "product": product
            })

In [5]:
alignedCDS = None
# alignedAA = None

for loc in cdsAlignedIndex:
    cdsSeq = aligned[:, loc["start"]:loc["end"]]
#     aaSeq = []
#     for record in cdsSeq:
#         print(aa.id)
#         aa = record.translate(gap='-', id=record.id, description="")
#         aaSeq.append(aa)
    if alignedCDS is None:
        alignedCDS = cdsSeq
    else:
        alignedCDS += cdsSeq

AlignIO.write(alignedCDS, os.path.join(genomeTreeDir, "aligned_cds.fasta"), "fasta")

1

In [29]:
snp = pd.DataFrame(columns=("pos", "ref", "A", "T", "G", "C", "gap", "unknown"))

alignedAnc = None
for record in alignedCDS:
    if record.id == ancestral:
        alignedAnc = record
        break

In [30]:
for i in range(alignedCDS.get_alignment_length()):
    aaSum = Counter(alignedCDS[:, i])
    if len(aaSum) > 1 and max(aaSum.values()) < len(alignedCDS) - 1:
        row = pd.Series(data={
            "pos": i + 1, "ref": alignedAnc[i],
            "A": aaSum["a"], "T": aaSum["t"],
            "G": aaSum["g"], "C": aaSum["c"],
            "gap": aaSum["-"], "unknown": aaSum["n"]
        })
        snp = snp.append(row, ignore_index=True)
        print(i + 1, aaSum, aaSum["t"])

113 Counter({'t': 87, '-': 1, 'c': 1}) 87
114 Counter({'c': 87, '-': 1, 't': 1}) 1
349 Counter({'g': 87, 'a': 2}) 0
2397 Counter({'c': 86, 't': 3}) 3
2772 Counter({'c': 87, 't': 2}) 2
4819 Counter({'a': 87, 'g': 2}) 0
8517 Counter({'c': 60, 't': 28, 'y': 1}) 28
10818 Counter({'g': 83, 't': 6}) 6
15060 Counter({'c': 87, 't': 2}) 2
17109 Counter({'c': 85, 't': 4}) 4
17796 Counter({'c': 84, 't': 5}) 5
18224 Counter({'t': 87, 'c': 2}) 87
20035 Counter({'t': 87, '-': 1, 'c': 1}) 87
21360 Counter({'a': 87, 'g': 1, 'n': 1}) 0
21366 Counter({'c': 87, 't': 1, 'n': 1}) 1
21378 Counter({'t': 87, 'c': 1, 'n': 1}) 87
21381 Counter({'t': 87, 'c': 1, 'n': 1}) 87
21384 Counter({'t': 87, 'a': 1, 'n': 1}) 87
21385 Counter({'t': 87, 'a': 1, 'n': 1}) 87
21386 Counter({'t': 87, 'c': 1, 'n': 1}) 87
21390 Counter({'a': 87, 'c': 1, 'n': 1}) 0
21399 Counter({'t': 87, 'c': 1, 'n': 1}) 87
21426 Counter({'c': 87, 't': 1, 'n': 1}) 1
21436 Counter({'c': 84, 't': 4, 'n': 1}) 4
22032 Counter({'t': 87, 'c': 1, 'g': 1}

In [31]:
snp

Unnamed: 0,pos,ref,A,T,G,C,gap,unknown
0,113,t,0,87,0,1,1,0
1,114,c,0,1,0,87,1,0
2,349,g,2,0,87,0,0,0
3,2397,c,0,3,0,86,0,0
4,2772,c,0,2,0,87,0,0
5,4819,a,87,0,2,0,0,0
6,8517,c,0,28,0,60,0,0
7,10818,g,0,6,83,0,0,0
8,15060,c,0,2,0,87,0,0
9,17109,c,0,4,0,85,0,0
