In [1]:
import os
import json
from collections import Counter

import pandas as pd
from Bio import SeqIO, AlignIO, Phylo
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Align import MultipleSeqAlignment
from treetime import TreeAnc
from matplotlib import pyplot as plt

# Directory layout used by the rest of the notebook (relative to the
# current working directory).
genomeTreeDir = "Genome_tree"
snpDir = "SNPs"
dataDir = "Data"

# exist_ok=True replaces the check-then-create pattern, which can fail if the
# directory appears between the check and the mkdir, and keeps the cell
# safe to re-run.
os.makedirs(snpDir, exist_ok=True)

# `exclude` is the record ID that is skipped when SNPs are called further
# down.  As before, the last entry in outgroup.txt wins, but blank lines
# (e.g. a trailing newline at the end of the file) are ignored so they can no
# longer overwrite the ID with an empty string.  If the file holds no ID at
# all, `exclude` stays None and no record is skipped.
exclude = None
with open(os.path.join(genomeTreeDir, "outgroup.txt")) as f:
    for row in f:
        outgroupId = row.strip()
        if outgroupId:
            exclude = outgroupId

In [2]:
# IUPAC nucleotide ambiguity codes, each mapped to the list of concrete bases
# it can stand for.  The letters of every string are expanded into a list in
# the order written here.
ambiguity = {
    code: list(bases)
    for code, bases in (
        ("Y", "CT"),
        ("R", "AG"),
        ("W", "AT"),
        ("S", "GC"),
        ("K", "TG"),
        ("M", "CA"),
        ("D", "AGT"),
        ("V", "ACG"),
        ("H", "ACT"),
        ("B", "CGT"),
    )
}

In [3]:
# Newick tree (the file name suggests the outgroup was already removed) ...
tree = Phylo.read(
    os.path.join(genomeTreeDir, "no_outgroup.newick"),
    "newick",
)
# ... and the FASTA alignment that is paired with it for the reconstruction.
alignedCDS = AlignIO.read(
    os.path.join(genomeTreeDir, "aligned_cds.fasta"),
    "fasta",
)

In [4]:
# Pair the tree with the alignment, starting from a Jukes-Cantor model.
treeAnc = TreeAnc(
    tree=tree,
    aln=alignedCDS,
    gtr="Jukes-Cantor",
    verbose=False,
)
# Run the ancestral reconstruction with marginal=True and infer_gtr=True.
# Left as the cell's final bare expression so its return value is displayed.
treeAnc.infer_ancestral_sequences(infer_gtr=True, marginal=True)


    	tips at positions with AMBIGUOUS bases. This resulted in unexpected
    	behavior is some cases and is no longer done by default. If you want to
    	replace those ambiguous sites with their most likely state, rerun with
    	`reconstruct_tip_states=True` or `--reconstruct-tip-states`.


0

In [5]:
# Join the root node's per-site states into one string and wrap it in a
# record whose id is "ancestral".
alignedAnc = SeqRecord(
    Seq("".join(treeAnc.tree.root.sequence)),
    id="ancestral",
)

In [6]:
# Add the reconstructed root sequence to the alignment as a row named
# "ancestral", then write the extended alignment to disk.
#
# * The guard makes the cell idempotent: re-running it no longer stacks up
#   duplicate "ancestral" rows in the in-memory alignment (and in the file).
# * `append` with an explicit SeqRecord replaces the legacy `add_sequence`
#   helper.  id and description are both set to "ancestral", so the FASTA
#   header is written as plain ">ancestral".
# * `append` raises ValueError if the sequence length does not match the
#   alignment length.
if all(record.id != "ancestral" for record in alignedCDS):
    alignedCDS.append(
        SeqRecord(alignedAnc.seq, id="ancestral", description="ancestral")
    )
# Last expression of the cell: the value returned by AlignIO.write is displayed.
AlignIO.write(alignedCDS, os.path.join(snpDir, "ancestral.fasta"), "fasta")

1

In [7]:
# Table from Data/info.csv.  Its "Location" and "Area" columns are renamed
# to "City" and "Province", the names the SNP filtering further down uses.
#
# `rename` returns a frame with a fresh column index.  Writing into
# `info.columns.values` instead mutates the Index's backing array in place,
# which pandas does not support and which can leave label lookups stale.
# errors="raise" keeps a missing source column a hard failure (KeyError).
info = pd.read_csv(os.path.join(dataDir, "info.csv")).rename(
    columns={"Location": "City", "Area": "Province"},
    errors="raise",
)

In [8]:
# Table from Data/original.csv.  "Accession.ID" is renamed to
# "Accession ID" so it matches the key used when merging with the SNP table.
#
# `rename` returns a frame with a fresh column index.  Writing into
# `meta.columns.values` instead mutates the Index's backing array in place,
# which pandas does not support and which can leave label lookups stale.
# errors="raise" keeps a missing source column a hard failure (KeyError).
meta = pd.read_csv(os.path.join(dataDir, "original.csv")).rename(
    columns={"Accession.ID": "Accession ID"},
    errors="raise",
)

In [9]:
# Annotation lookup loaded from SNPs/cds_product.json.  Further down it is
# indexed with the 1-based alignment position as a string, and each entry is
# read for its "pos" and "product_name" fields.
with open(os.path.join(snpDir, "cds_product.json")) as f:
    cdsAnno = json.loads(f.read())

In [10]:
# 0-based alignment columns in which the rows do not all carry the same
# character, i.e. the only columns that can contribute a SNP.
snpPosIndex = [
    col
    for col in range(alignedCDS.get_alignment_length())
    if len(set(alignedCDS[:, col])) > 1
]

In [11]:
# Call SNPs for every record against the reconstructed ancestral sequence,
# attach the sample tables, and write the result to SNPs/all.csv.
#
# Rows are collected in a plain list and turned into a DataFrame once.
# Growing a frame with `DataFrame.append` inside the loop copies the whole
# frame on every row (quadratic), and the method no longer exists in
# pandas 2.x.
snpColumns = ["Pos", "SNP", "Ref", "Genome pos", "Product", "Accession ID"]

# The reference base depends only on the column, so look it up once per
# variable column instead of once per (record, column) pair.
refBases = {index: alignedAnc[index].upper() for index in snpPosIndex}

snpRows = []
for record in alignedCDS:
    if record.id == exclude:
        continue
    for index in snpPosIndex:
        ref = refBases[index]
        n = record[index].upper()
        # Unknown bases, gaps and matches to the reference are not SNPs.
        if n in ("N", "-") or n == ref:
            continue
        # An ambiguity code that still admits the reference base is not
        # counted as a SNP either.
        if ref in ambiguity.get(n, ()):
            continue
        pos = index + 1  # reported positions are 1-based
        posAnno = cdsAnno[str(pos)]
        snpRows.append({
            "Pos": pos, "SNP": n, "Ref": ref,
            "Genome pos": posAnno["pos"],
            "Product": posAnno["product_name"],
            "Accession ID": record.id
        })

# Passing `columns` keeps the expected header even when no SNP was found.
allSNP = pd.DataFrame(snpRows, columns=snpColumns)

# Left joins: every SNP row is kept even when a sample is missing from a table.
allSNP = pd.merge(allSNP, info, on="Accession ID", how="left")
allSNP = pd.merge(allSNP, meta, on="Accession ID", how="left")
allSNP = allSNP.sort_values(["Pos", "SNP"])
allSNP.to_csv(os.path.join(snpDir, "all.csv"), index=False)

In [12]:
# SNPs that are absent from Hubei samples but shared by at least two
# provinces (province.csv) or at least two cities (city.csv).

# Distinct (Pos, SNP) pairs seen in Hubei samples.
hubeiSNP = allSNP[allSNP["Province"] == "Hubei"][["Pos", "SNP"]].drop_duplicates()
hubeiPairs = set(zip(hubeiSNP["Pos"], hubeiSNP["SNP"]))

provinceGroups = []
cityGroups = []

# Group on the derived base ("SNP"), not on "Ref": the group key is compared
# with the Hubei (Pos, SNP) pairs, and a reference base can never equal a
# SNP base at the same position, so keying on "Ref" made the Hubei check
# always False and the filter a no-op.
for (pos, snp), group in allSNP[allSNP["Province"] != "Hubei"].groupby(["Pos", "SNP"]):
    if (pos, snp) in hubeiPairs:
        continue
    # dropna=False counts a missing value as its own category, exactly as
    # len(drop_duplicates()) did.
    if group["Province"].nunique(dropna=False) >= 2:
        provinceGroups.append(group)
    if group["City"].nunique(dropna=False) >= 2:
        cityGroups.append(group)

# Concatenate once rather than appending inside the loop; with no qualifying
# group, fall back to an empty frame so the CSV still carries the header row.
provinceSNP = (
    pd.concat(provinceGroups, ignore_index=True)
    if provinceGroups
    else pd.DataFrame(columns=allSNP.columns)
)
citySNP = (
    pd.concat(cityGroups, ignore_index=True)
    if cityGroups
    else pd.DataFrame(columns=allSNP.columns)
)

provinceSNP.to_csv(os.path.join(snpDir, "province.csv"), index=False)
citySNP.to_csv(os.path.join(snpDir, "city.csv"), index=False)

In [13]:
# Show each distinct value of the "SNP" column once (first occurrence kept).
allSNP.loc[:, "SNP"].drop_duplicates()

335    A
95     T
103    G
71     C
Name: SNP, dtype: object