In [1]:
import os
import json

import pandas as pd
from Bio import SeqIO, AlignIO
from Bio.Align import MultipleSeqAlignment

# Directory layout, relative to the notebook's working directory.
dataDir = "Data"  # the GenBank reference annotation (reference.gb) is read from here
snpCompareDir = "SNPs_comparison"  # holds the input alignment and receives most outputs
snpDir = "SNPs"  # not referenced by any cell visible in this notebook
genomeTreeEXDir = "Genome_tree_EX"  # receives a second copy of the CDS alignment

# Record id of the reference genome inside the alignment; the cells below look
# this row up to build the reference-to-alignment coordinate maps.
reference = "EPI_ISL_402125"

In [2]:
# Whole-genome multiple sequence alignment (FASTA) that the later cells slice.
alignedPath = os.path.join(snpCompareDir, "aligned.fasta")
aligned = AlignIO.read(alignedPath, "fasta")

# Annotated reference genome (GenBank); its CDS features drive the extraction.
annoRefPath = os.path.join(dataDir, "reference.gb")
annoRef = SeqIO.read(annoRefPath, "gb")

# Genotype groupings are currently not loaded; only a disabled cell further
# down refers to them.
# with open(os.path.join(dataDir, "genotypes.json")) as f:
#     genotypes = json.load(f)

In [3]:
# Map every 0-based position of the ungapped reference genome to the 0-based
# alignment column that holds that reference base.
ref2aligned = {}

for record in aligned:
    if record.id != reference:
        continue
    refIndex = -1
    for alignedIndex, n in enumerate(record):
        if n == '-':
            # A gap in the reference row (insertion in some other sequence)
            # has no reference coordinate. It must not overwrite the column
            # recorded for the preceding reference base, otherwise a CDS that
            # starts on that base would be sliced from the gap column instead.
            continue
        refIndex += 1
        ref2aligned[refIndex] = alignedIndex
    break
else:
    # Without the reference row the map stays empty and every later lookup
    # fails with an uninformative KeyError; report the real cause here.
    raise ValueError(
        "Reference {!r} not found in the alignment".format(reference)
    )

In [4]:
# One entry per contiguous CDS segment of the reference annotation, carrying
# both the reference coordinates and the matching alignment columns.
# Segments of the same CDS feature share one running product_id (1, 2, ...).
cdsAlignedIndex = []
product_id = 0

cdsFeatures = (feature for feature in annoRef.features if feature.type == "CDS")
for product_id, feature in enumerate(cdsFeatures, start=1):
    # A feature's location may consist of several parts; each part becomes
    # its own entry.
    cdsAlignedIndex.extend(
        {
            "start": ref2aligned[part.start],
            "end": ref2aligned[part.end],
            "product_id": product_id,
            "product_name": ", ".join(feature.qualifiers["product"]),
            "ref_start": part.start,
            "ref_end": part.end
        }
        for part in feature.location.parts
    )

In [5]:
# Concatenate the alignment columns of every CDS segment, in the order the
# segments were collected, into a single coding-sequence alignment.
alignedCDS = None

for loc in cdsAlignedIndex:
    cdsSeq = aligned[:, loc["start"]:loc["end"]]
    # Adding two alignments with the same rows appends the columns of the
    # second to the first.
    alignedCDS = cdsSeq if alignedCDS is None else alignedCDS + cdsSeq

if alignedCDS is None:
    # No CDS segment was collected; fail with a clear message instead of
    # handing None to the writer.
    raise ValueError("No CDS segments were collected; nothing to write")

# Make sure both destinations exist before writing (the tree directory is not
# created anywhere else in this notebook).
for cdsOutDir in (snpCompareDir, genomeTreeEXDir):
    os.makedirs(cdsOutDir, exist_ok=True)

AlignIO.write(alignedCDS, os.path.join(snpCompareDir, "aligned_cds.fasta"), "fasta")
AlignIO.write(alignedCDS, os.path.join(genomeTreeEXDir, "aligned_cds.fasta"), "fasta")

1

In [6]:
# Coordinate maps from the ungapped reference CDS to the CDS alignment:
#   ref2alignedCDS - 0-based reference index    -> 0-based alignment column
#   ref2alignedPos - 1-based reference position -> 1-based alignment position
ref2alignedCDS = {}
ref2alignedPos = {}

for record in alignedCDS:
    if record.id != reference:
        continue
    refIndex = -1
    for alignedIndex, n in enumerate(record):
        if n == '-':
            # Gap columns in the reference row carry no reference coordinate;
            # skipping them keeps each position mapped to the column of its
            # own base rather than to a trailing gap column.
            continue
        refIndex += 1
        ref2alignedCDS[refIndex] = alignedIndex
        ref2alignedPos[refIndex + 1] = alignedIndex + 1
    break
else:
    # Do not silently export an empty map when the reference row is missing.
    raise ValueError(
        "Reference {!r} not found in the CDS alignment".format(reference)
    )

# Export the 1-based map. JSON object keys are always strings, so the integer
# positions are written as text keys.
with open(os.path.join(snpCompareDir, "ref2aligned_cds.json"), 'w') as f:
    json.dump(ref2alignedPos, f)

In [7]:
# ref2alignedPos = {}

# for record in SeqIO.parse(os.path.join(snpCompareDir, "wild.fasta"), "fasta"):
#     if record.id == "ZYZ":
#         refPos = 0
#         for alignedPos, n in enumerate(record, start=1):
#             if n != '-':
#                 refPos += 1
#             ref2alignedPos[refPos] = alignedPos

# with open(os.path.join(snpCompareDir, "ref2aligned_cds.json"), 'w') as f:
#     json.dump(ref2alignedPos, f)

In [8]:
# seqsDict = dict(SeqIO.index(os.path.join(snpCompareDir, "aligned_cds.fasta"), "fasta"))

# for gp, accessions in genotypes.items():
#     seqs = []
#     for ac in accessions:
#         record = seqsDict[ac]
#         record.seq = record.seq.ungap('-')
#         seqs.append(record)
#         seqsDict.pop(ac)
#     SeqIO.write(seqs, os.path.join(genomeTreeEX, gp + ".fasta"), "fasta")

# seqs = []
# for record in seqsDict.values():
#     if record.id.startswith("EPI_ISL"):
#         record.seq = record.seq.ungap('-')
#         seqs.append(record)
# SeqIO.write(seqs, os.path.join(genomeTreeEX, "wild.fasta"), "fasta")

In [9]:
# Cut a 27-column window out of the CDS alignment around the site of interest
# and save it as FASTA in a per-site output directory.
snpPos = 8517
pos = ref2alignedCDS[snpPos]
# 15 columns before pos plus 12 columns from pos onwards (27 = 9 codons).
# NOTE(review): 8517 - 15 is a multiple of 3, so the window presumably starts
# on a codon boundary of the concatenated CDS - this holds only when no gap
# columns precede the site; confirm.
seq = alignedCDS[:, pos - 15:pos + 12]

outDir = os.path.join(snpCompareDir, str(snpPos))
# exist_ok avoids the check-then-create race and also creates missing parents.
os.makedirs(outDir, exist_ok=True)

SeqIO.write(seq, os.path.join(outDir, "nt{}.fasta".format(snpPos)), "fasta")

136

In [10]:
# Tabulate the nucleotide window: one row per sequence (labelled by record id),
# columns relabelled from 0..n-1 to site numbers.
# NOTE(review): the offset makes the first column read snpPos - 14, which looks
# like the 1-based position of the window start when pos == snpPos - confirm.
ntFirstLabel = snpPos - 14
ntCsvPath = os.path.join(outDir, "nt{}.csv".format(snpPos))

nt = pd.DataFrame(seq)
nt.columns = nt.columns.values + ntFirstLabel
nt.index = [rec.id for rec in seq]
nt.to_csv(ntCsvPath)

In [11]:
# Translate every nucleotide record of the window and gather the results in a
# new alignment. The first record seeds the alignment; the remaining ones are
# added as plain strings under their original ids.
firstNt = seq[0]
aaSeq = MultipleSeqAlignment([firstNt.translate(id=firstNt.id)])
for ntRecord in seq[1:]:
    aaSeq.add_sequence(ntRecord.id, str(ntRecord.translate().seq))

aaFastaPath = os.path.join(outDir, "aa{}.fasta".format(snpPos))
SeqIO.write(aaSeq, aaFastaPath, "fasta")

# Same table layout as the nucleotide CSV, with columns relabelled by an
# amino-acid offset derived from the site position.
# NOTE(review): presumably the 1-based codon number of the window's first
# codon - confirm.
aaFirstLabel = int((snpPos - 12) / 3)
aa = pd.DataFrame(aaSeq)
aa.columns = aa.columns.values + aaFirstLabel
aa.index = [rec.id for rec in aaSeq]
aaCsvPath = os.path.join(outDir, "aa{}.csv".format(snpPos))
aa.to_csv(aaCsvPath)

In [12]:
# Cut a 27-column window out of the CDS alignment around the site of interest
# and save it as FASTA in a per-site output directory.
snpPos = 27641
pos = ref2alignedCDS[snpPos]
# 14 columns before pos plus 13 columns from pos onwards (27 = 9 codons).
# NOTE(review): 27641 - 14 is a multiple of 3, so the window presumably starts
# on a codon boundary of the concatenated CDS - this holds only when no gap
# columns precede the site; confirm.
seq = alignedCDS[:, pos - 14:pos + 13]

outDir = os.path.join(snpCompareDir, str(snpPos))
# exist_ok avoids the check-then-create race and also creates missing parents.
os.makedirs(outDir, exist_ok=True)

SeqIO.write(seq, os.path.join(outDir, "nt{}.fasta".format(snpPos)), "fasta")

136

In [13]:
# Tabulate the nucleotide window: one row per sequence (labelled by record id),
# columns relabelled from 0..n-1 to site numbers.
# NOTE(review): the offset makes the first column read snpPos - 13, which looks
# like the 1-based position of the window start when pos == snpPos - confirm.
ntFirstLabel = snpPos - 13
ntCsvPath = os.path.join(outDir, "nt{}.csv".format(snpPos))

nt = pd.DataFrame(seq)
nt.columns = nt.columns.values + ntFirstLabel
nt.index = [rec.id for rec in seq]
nt.to_csv(ntCsvPath)

In [14]:
# Translate every nucleotide record of the window and gather the results in a
# new alignment. The first record seeds the alignment; the remaining ones are
# added as plain strings under their original ids.
firstNt = seq[0]
aaSeq = MultipleSeqAlignment([firstNt.translate(id=firstNt.id)])
for ntRecord in seq[1:]:
    aaSeq.add_sequence(ntRecord.id, str(ntRecord.translate().seq))

aaFastaPath = os.path.join(outDir, "aa{}.fasta".format(snpPos))
SeqIO.write(aaSeq, aaFastaPath, "fasta")

# Same table layout as the nucleotide CSV, with columns relabelled by an
# amino-acid offset derived from the site position.
# NOTE(review): presumably the 1-based codon number of the window's first
# codon - confirm.
aaFirstLabel = int((snpPos - 11) / 3)
aa = pd.DataFrame(aaSeq)
aa.columns = aa.columns.values + aaFirstLabel
aa.index = [rec.id for rec in aaSeq]
aaCsvPath = os.path.join(outDir, "aa{}.csv".format(snpPos))
aa.to_csv(aaCsvPath)