In [1]:
import os
from collections import Counter

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

# Input directory (metadata + sequences) and output directory for the
# genome-tree inputs produced by this notebook.
dataDir = "Data"
genomeTreeDir = "Genome_tree"

# exist_ok=True makes this cell safe to re-run and avoids the
# check-then-create race of `if not os.path.exists(...): os.mkdir(...)`.
# It also fails loudly if the path exists but is not a directory.
os.makedirs(genomeTreeDir, exist_ok=True)

In [2]:
# Load the per-sample metadata table and parse the collection dates into
# proper datetime values.
metaDataPath = os.path.join(dataDir, "meta_data.csv")
df = pd.read_csv(metaDataPath)
df["Collection date"] = pd.to_datetime(df["Collection date"])

In [3]:
# Countries whose samples are resolved down to province ("Area") and city
# ("Location"); samples from every other country stay at country level.
subdividedCountries = ("China", "USA")
# Known misspellings in the "Province" column.
provinceFixes = {"Guandong": "Guangdong", "Chongqinq": "Chongqing"}


def resolveAreaLocation(country, province, city):
    """Return the ``(area, location)`` pair for one metadata row.

    Parameters
    ----------
    country : str
        Value of the "Country" column. It is compared un-stripped against
        ``subdividedCountries`` and stripped of surrounding spaces when
        used as an area/location name.
    province : str or NaN
        Value of the "Province" column; only used for subdivided countries.
    city : str or NaN
        Value of the "City" column; only used for subdivided countries.

    Returns
    -------
    tuple of (str, str)
        For subdivided countries: area is the typo-corrected province
        (falling back to the country when the province is missing) and
        location is the city (falling back to the area when the city is
        missing). For all other countries both values are the country.
    """
    countryName = country.strip(' ')
    if country not in subdividedCountries:
        return countryName, countryName
    if pd.isna(province):
        # A missing province used to crash on `.strip`; fall back to the
        # country so the row is still assigned to a usable area.
        area = countryName
    else:
        area = province.strip(' ')
        area = provinceFixes.get(area, area)
    location = area if pd.isna(city) else city.strip(' ')
    return area, location


# Compute every row first and assign each column once, instead of creating
# empty placeholder columns and filling them cell by cell with df.loc.
areaLocationPairs = [
    resolveAreaLocation(country, province, city)
    for country, province, city in zip(df["Country"], df["Province"], df["City"])
]
df["Area"] = [pair[0] for pair in areaLocationPairs]
df["Location"] = [pair[1] for pair in areaLocationPairs]

# Compact per-sample summary used by downstream steps.
df[["Accession ID", "Location", "Area", "Country", "Collection date"]].to_csv(
    os.path.join(dataDir, "info.csv"),
    sep=",",
    index=False
)

In [4]:
# Write one metadata CSV per area into Data/ByArea.
metaDir = os.path.join(dataDir, "ByArea")

# Idempotent directory creation (no exists-then-mkdir race on re-runs).
os.makedirs(metaDir, exist_ok=True)

# groupby splits the frame in a single pass instead of re-scanning it with a
# boolean mask for every area. sort=False keeps the areas in first-appearance
# order, and rows with a missing "Area" are skipped rather than crashing on
# `NaN + ".csv"`. The row index is still written, as before.
for area, areaRows in df.groupby("Area", sort=False):
    areaRows.to_csv(os.path.join(metaDir, area + ".csv"))

In [5]:
# Select the sequences that go into the genome tree and write them out
# together with the accession used as outgroup.
earlies = None

# Accessions excluded as agreed in
# http://virological.org/t/phylodynamic-analysis-90-genomes-12-feb-2020/356/4
excluded = [
    "EPI_ISL_406592",
    "EPI_ISL_406595",
    "EPI_ISL_408485",
    "EPI_ISL_408482",
    "MT039890",
]

# Accession written to outgroup.txt for the downstream tree step.
outgroup = "EPI_ISL_402131"

# Every record that is not on the exclusion list.
allSeqs = [
    entry
    for entry in SeqIO.parse(os.path.join(dataDir, "sequences.fasta"), "fasta")
    if entry.id not in excluded
]

# Keep only long records (> 28000 positions) with fewer than 100 ambiguous
# 'N' bases; the count is case-insensitive.
genomeSeqs = [
    entry
    for entry in allSeqs
    if len(entry) > 28000 and str(entry.seq).upper().count('N') < 100
]

with open(os.path.join(genomeTreeDir, "outgroup.txt"), 'w') as outgroupFile:
    outgroupFile.write(outgroup + "\n")
# SeqIO.write(allSeqs, os.path.join(genomeTreeDir, "all.fasta"), "fasta")
# Last expression of the cell: shows the number of records written.
SeqIO.write(genomeSeqs, os.path.join(genomeTreeDir, "genomes.fasta"), "fasta")

106