In [1]:
import os

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

# dataDir holds the metadata/FASTA inputs and the derived tables written back
# next to them; genomeTreeDir receives the files prepared for tree building.
dataDir = "Data"
genomeTreeDir = "Genome_tree"

# exist_ok=True makes re-running this cell a no-op and avoids the window
# between a separate "does it exist?" check and the creation call. It also
# fails loudly if the name is already taken by a regular file.
os.makedirs(genomeTreeDir, exist_ok=True)

In [2]:
# Load the sample metadata table, then turn the "Collection date" text column
# into datetime values so that dates can be compared chronologically.
metaDataPath = os.path.join(dataDir, "meta_data.csv")
df = pd.read_csv(metaDataPath)
df = df.assign(**{"Collection date": pd.to_datetime(df["Collection date"])})

In [3]:
def parseLocation(location, virusName):
    """Derive the ``(area, country)`` pair for one metadata row.

    ``location`` is split on " / "; the second field is the country-level
    name and the optional third field is the area within it.

    * Second field "China": the area is the third field (with "Guandong"
      mapped to "Guangdong"). When there is no third field, the area is
      "Hubei" if ``virusName`` contains "Wuhan".
    * Any other second field: that field is the area. "Taiwan" and
      "Hong Kong" are assigned country "China"; everything else uses the
      field itself as the country.

    Args:
        location: The "Location" string of the row.
        virusName: The "Virus name" string of the row.

    Returns:
        A tuple ``(area, country)`` of strings.

    Raises:
        ValueError: If ``location`` has fewer than two " / "-separated fields,
            or names China without an area and the virus name does not
            mention Wuhan.
    """
    parts = location.split(" / ")
    if len(parts) < 2:
        raise ValueError("Not a valid location: {!r}".format(location))

    if parts[1] == "China":
        if len(parts) > 2:
            area = parts[2]
            # Fold the variant spelling into the one used everywhere else.
            if area == "Guandong":
                area = "Guangdong"
        elif "Wuhan" in virusName:
            # No area given, but the virus name itself places it in Wuhan.
            area = "Hubei"
        else:
            raise ValueError("Not a valid location: {!r}".format(location))
        return area, "China"

    area = parts[1]
    country = "China" if area in ("Taiwan", "Hong Kong") else area
    return area, country


# Collect both columns in plain lists and assign them once. This creates the
# "Area" and "Country" columns even for an empty table and avoids seeding a
# float column that is later overwritten with strings.
areas = []
countries = []
for location, virusName in zip(df["Location"], df["Virus name"]):
    area, country = parseLocation(location, virusName)
    areas.append(area)
    countries.append(country)

df["Area"] = areas
df["Country"] = countries

# Export the subset of columns needed downstream.
df[["Accession ID", "Virus name", "Area", "Country", "Collection date"]].to_csv(
    os.path.join(dataDir, "info.csv"),
    sep=",",
    index=False
)

In [4]:
# outgroup: accession of the kept sequence with the earliest collection date.
# earlies:  that sequence's collection date.
outgroup = None
earlies = None
outputSeqs = []

# Accessions left out of the analysis, as agreed in
# http://virological.org/t/phylodynamic-analysis-90-genomes-12-feb-2020/356/4
excluded = ["EPI_ISL_406592", "EPI_ISL_406595"]

# Accession -> collection date, built once instead of filtering the whole
# frame for every FASTA record. setdefault keeps the first row when an
# accession appears more than once, and .values keeps numpy datetime64 scalars.
collectionDates = {}
for accession, date in zip(df["Accession ID"].values, df["Collection date"].values):
    collectionDates.setdefault(accession, date)

for record in SeqIO.parse(os.path.join(dataDir, "genomes.fasta"), "fasta"):
    if record.id == "ZYZ":
        continue
    # The accession is the last '|'-separated field of the FASTA identifier.
    ac = record.id.split('|')[-1]
    # Skip excluded genomes before choosing the outgroup, so the accession
    # written to outgroup.txt is always present in raw.fasta.
    if ac in excluded:
        continue
    # Raises KeyError naming the accession if it has no metadata row.
    cd = collectionDates[ac]
    if outgroup is None or cd < earlies:
        outgroup = ac
        earlies = cd
    # Reduce the header to the bare accession.
    record.id = ac
    record.description = ""
    outputSeqs.append(record)

if outgroup is None:
    raise ValueError("No usable sequence in genomes.fasta; cannot choose an outgroup")

with open(os.path.join(genomeTreeDir, "outgroup.txt"), 'w') as f:
    f.write(outgroup + "\n")
# Last expression of the cell: shows the number of records written.
SeqIO.write(outputSeqs, os.path.join(genomeTreeDir, "raw.fasta"), "fasta")

89

In [5]:
outputSeqs = []

# Apply the same header clean-up and exclusion list to the amino-acid
# sequences. Only filtering happens here: `outgroup` and `earlies` are left
# untouched so they keep matching the outgroup.txt that was already written.
for record in SeqIO.parse(os.path.join(dataDir, "aa_sequences.fasta"), "fasta"):
    if record.id == "ZYZ":
        continue
    # The accession is the last '|'-separated field of the FASTA identifier.
    ac = record.id.split('|')[-1]
    if ac in excluded:
        continue
    # Reduce the header to the bare accession.
    record.id = ac
    record.description = ""
    outputSeqs.append(record)

# Last expression of the cell: shows the number of records written.
SeqIO.write(outputSeqs, os.path.join(dataDir, "aa_sequences_2.fasta"), "fasta")

89