In [2]:
import os
import json

import pandas as pd
from Bio import SeqIO, AlignIO
from Bio.Align import MultipleSeqAlignment

# Directory holding the annotated reference record (reference.gb).
dataDir = "Data"
# Directory holding aligned.fasta and the per-group FASTA files; the visible
# cells also write all of their outputs here.
snpCompareDir = "SNPs_comparison"
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
snpDir = "SNPs"

# ID of the alignment record that serves as the coordinate reference.
# Alternative reference ID, currently disabled:
# reference = "MN908947.3"
reference = "EPI_ISL_402125"

In [3]:
# Load the multiple sequence alignment (FASTA) and the reference record whose
# features are used below to locate the CDS regions.
alignedPath = os.path.join(snpCompareDir, "aligned.fasta")
aligned = AlignIO.read(alignedPath, "fasta")

annoRefPath = os.path.join(dataDir, "reference.gb")
annoRef = SeqIO.read(annoRefPath, "gb")

In [4]:
# Map every 0-based position of the ungapped reference sequence to the column
# of `aligned` that holds that base.
ref2aligned = {}

for record in aligned:
    if record.id != reference:
        continue
    refIndex = -1
    for alignedIndex, n in enumerate(record):
        if n == '-':
            # Gap in the reference row: no reference position lives in this
            # column, so it must not create or overwrite an entry.
            continue
        refIndex += 1
        ref2aligned[refIndex] = alignedIndex
    # Only the first record with the reference ID is used.
    break

In [5]:
# Convert every CDS segment of the reference annotation into alignment column
# bounds, numbering CDS features from 1 in annotation order, and save the
# resulting table as JSON.
cdsAlignedIndex = []
product_id = 0

cdsFeatures = (feature for feature in annoRef.features if feature.type == "CDS")
for product_id, feature in enumerate(cdsFeatures, start=1):
    # A feature may consist of several parts; each part gets its own entry
    # that shares the feature's product_id and product_name.
    for part in feature.location.parts:
        refStart, refEnd = part.start, part.end
        entry = {
            "start": ref2aligned[refStart],
            "end": ref2aligned[refEnd],
            "product_id": product_id,
            "product_name": ", ".join(feature.qualifiers["product"]),
            "ref_start": refStart,
            "ref_end": refEnd
        }
        cdsAlignedIndex.append(entry)

cdsInfoPath = os.path.join(snpCompareDir, "cds_info.json")
with open(cdsInfoPath, 'w') as handle:
    json.dump(cdsAlignedIndex, handle, indent=4)

In [6]:
# NOTE(review): cdsPos and cdsAnno are not used by any cell visible here.
cdsPos = 0
cdsAnno = {}

# Cut each CDS segment out of the full alignment (all rows, the segment's
# columns) and join the pieces left to right in cdsAlignedIndex order.
cdsBlocks = [aligned[:, loc["start"]:loc["end"]] for loc in cdsAlignedIndex]

alignedCDS = None
for block in cdsBlocks:
    alignedCDS = block if alignedCDS is None else alignedCDS + block

alignedCdsPath = os.path.join(snpCompareDir, "aligned_cds.fasta")
AlignIO.write(alignedCDS, alignedCdsPath, "fasta")

1

In [7]:
# Map every 0-based position of the reference row of `alignedCDS` (gaps
# removed) to the column of `alignedCDS` that holds that base.
ref2alignedCDS = {}

for record in alignedCDS:
    if record.id != reference:
        continue
    refIndex = -1
    for alignedIndex, n in enumerate(record):
        if n == '-':
            # Gap in the reference row: no reference position lives in this
            # column, so it must not create or overwrite an entry.
            continue
        refIndex += 1
        ref2alignedCDS[refIndex] = alignedIndex
    # Only the first record with the reference ID is used.
    break

In [8]:
# Row order used for every exported table/FASTA: the reference first, then the
# record IDs of each listed file in the order the files are given.
orderAC = [reference]

for fn in ("gp2_trimmed.fasta", "gp1_trimmed.fasta", "wild.fasta"):
    seqs = SeqIO.index(os.path.join(snpCompareDir, fn), "fasta")
    try:
        # Only the record IDs are needed; the reference is already first.
        orderAC.extend(ac for ac in seqs if ac != reference)
    finally:
        # SeqIO.index keeps the underlying file open until it is closed.
        seqs.close()

In [21]:
snpPos = 8517
pos = ref2alignedCDS[snpPos]
# 27-column window around the SNP column: 15 columns before it, 11 after it.
windowStart, windowEnd = pos - 15, pos + 12
seq = alignedCDS[:, windowStart:windowEnd]

In [22]:
# Nucleotide table for the window: one row per record, one column per
# alignment column; the first column is labelled snpPos - 14.
firstNtLabel = snpPos - 14
nt = pd.DataFrame(seq)
nt.index = [rec.id for rec in seq]
nt.columns = nt.columns.values + firstNtLabel

# Rows are written in orderAC order.
ntCsvPath = os.path.join(snpCompareDir, "nt{}.csv".format(snpPos))
nt.loc[orderAC, :].to_csv(ntCsvPath)

In [24]:
# Collect the window records in orderAC order and write them as FASTA; the
# value returned by SeqIO.write is the cell's output.
seqs = [record for ac in orderAC for record in seq if record.id == ac]
ntFastaPath = os.path.join(snpCompareDir, "nt{}.fasta".format(snpPos))
SeqIO.write(seqs, ntFastaPath, "fasta")

10

In [25]:
# Translate every record of the nucleotide window. The alignment is seeded
# with the translated first record; the remaining rows are added as plain
# strings under their record IDs.
firstRecord = seq[0]
aaSeq = MultipleSeqAlignment([firstRecord.translate(id=firstRecord.id)])
for ntRecord in seq[1:]:
    aaSeq.add_sequence(ntRecord.id, str(ntRecord.translate().seq))

# Amino-acid table: one row per record; the first column is labelled
# int((snpPos - 12) / 3).
firstAaLabel = int((snpPos - 12) / 3)
aa = pd.DataFrame(aaSeq)
aa.index = [rec.id for rec in aaSeq]
aa.columns = aa.columns.values + firstAaLabel

# Rows are written in orderAC order.
aaCsvPath = os.path.join(snpCompareDir, "aa{}.csv".format(snpPos))
aa.loc[orderAC, :].to_csv(aaCsvPath)

In [26]:
# Collect the translated records in orderAC order and write them as FASTA; the
# value returned by SeqIO.write is the cell's output.
seqs = [record for ac in orderAC for record in aaSeq if record.id == ac]
aaFastaPath = os.path.join(snpCompareDir, "aa{}.fasta".format(snpPos))
SeqIO.write(seqs, aaFastaPath, "fasta")

10

In [27]:
snpPos = 27641
pos = ref2alignedCDS[snpPos]
# 27-column window around the SNP column: 14 columns before it, 12 after it.
windowStart, windowEnd = pos - 14, pos + 13
seq = alignedCDS[:, windowStart:windowEnd]

In [28]:
# Nucleotide table for the window: one row per record, one column per
# alignment column; the first column is labelled snpPos - 13.
firstNtLabel = snpPos - 13
nt = pd.DataFrame(seq)
nt.index = [rec.id for rec in seq]
nt.columns = nt.columns.values + firstNtLabel

# Rows are written in orderAC order.
ntCsvPath = os.path.join(snpCompareDir, "nt{}.csv".format(snpPos))
nt.loc[orderAC, :].to_csv(ntCsvPath)

In [29]:
# Collect the window records in orderAC order and write them as FASTA; the
# value returned by SeqIO.write is the cell's output.
seqs = [record for ac in orderAC for record in seq if record.id == ac]
ntFastaPath = os.path.join(snpCompareDir, "nt{}.fasta".format(snpPos))
SeqIO.write(seqs, ntFastaPath, "fasta")

10

In [31]:
# Translate every record of the nucleotide window. The alignment is seeded
# with the translated first record; the remaining rows are added as plain
# strings under their record IDs.
firstRecord = seq[0]
aaSeq = MultipleSeqAlignment([firstRecord.translate(id=firstRecord.id)])
for ntRecord in seq[1:]:
    aaSeq.add_sequence(ntRecord.id, str(ntRecord.translate().seq))

# Amino-acid table: one row per record; the first column is labelled
# int((snpPos - 11) / 3).
firstAaLabel = int((snpPos - 11) / 3)
aa = pd.DataFrame(aaSeq)
aa.index = [rec.id for rec in aaSeq]
aa.columns = aa.columns.values + firstAaLabel

# Rows are written in orderAC order.
aaCsvPath = os.path.join(snpCompareDir, "aa{}.csv".format(snpPos))
aa.loc[orderAC, :].to_csv(aaCsvPath)

In [32]:
# Collect the translated records in orderAC order and write them as FASTA; the
# value returned by SeqIO.write is the cell's output.
seqs = [record for ac in orderAC for record in aaSeq if record.id == ac]
aaFastaPath = os.path.join(snpCompareDir, "aa{}.fasta".format(snpPos))
SeqIO.write(seqs, aaFastaPath, "fasta")

10