In [1]:
import os

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Directory the input GenBank files are read from (joined with each file name below).
dataDir = "Data"
# Output directory; receives "annotation.csv" and "peptides.fasta".
snpCompareDir = "SNPs_comparison"

In [2]:
# Make sure the output directory exists. `exist_ok=True` removes the
# check-then-create race of `os.path.exists` + `os.mkdir`, and `makedirs`
# also creates missing parent directories if the path is ever nested.
os.makedirs(snpCompareDir, exist_ok=True)

In [4]:
# Extract CDS annotations and protein sequences from each GenBank record.
#
# Outputs:
#   * seqAnno - one row per location part of every CDS feature
#               (1-based inclusive Start/End), written to "annotation.csv".
#   * seqs    - one SeqRecord per GenBank file holding the concatenation of
#               all CDS translations, written to "peptides.fasta".
seqs = []
annoColumns = ["Accession ID", "Start", "End", "Product_id", "Product_name"]
# Rows are collected in a plain list and turned into a DataFrame once at the
# end: `DataFrame.append` was removed in pandas 2.0 and growing a frame
# row-by-row is quadratic.
annoRows = []

genBanks = [
    "reference.gb",
    "reference_bat.gb",
    "sequence_bat_1.gb",
    "sequence_bat_2.gb",
    "sequence_bat_3.gb"
]

for fname in genBanks:
    record = SeqIO.read(os.path.join(dataDir, fname), "gb")
    product_id = 0  # running CDS counter within this record
    translations = []
    for f in record.features:
        if f.type != "CDS":
            continue
        product_id += 1
        product_name = ", ".join(f.qualifiers["product"])
        # A CDS with a joined location has several parts; each part gets its
        # own annotation row sharing the same Product_id.
        for i in f.location.parts:
            annoRows.append({
                "Accession ID": record.id,
                "Start": i.start + 1,  # 0-based start -> 1-based coordinate
                "End": i.end,
                "Product_id": product_id,
                "Product_name": product_name,
            })
        # Append the translation exactly once per CDS. Previously this ran
        # inside the per-part loop, so a multi-part CDS had its protein
        # sequence duplicated once per part in the concatenated peptide.
        translations.append("".join(f.qualifiers["translation"]))
    aaSeq = "".join(translations)
    seqs.append(SeqRecord(Seq(aaSeq), record.id, ""))

# dtype=object keeps the cell values as produced above instead of letting
# pandas infer numeric dtypes, matching the object columns of the old
# append-based frame.
seqAnno = pd.DataFrame(annoRows, columns=annoColumns, dtype=object)

seqAnno.to_csv(os.path.join(snpCompareDir, "annotation.csv"), index=False)
# Kept as the last expression so the cell displays the number of records written.
SeqIO.write(seqs, os.path.join(snpCompareDir, "peptides.fasta"), "fasta")

5