In [1]:
import os
from collections import Counter

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

# Input directory: holds the metadata CSVs and sequences.fasta read by the
# cells below, and receives the generated info.csv.
dataDir = "Data"

# Output directories written by the filtering cells.
snpCompareDir = "SNPs_comparison"
genomeTreeEXDir = "Genome_tree_EX"
genomeTreeDir = "Genome_tree"

In [2]:
# Create every output directory up front.
# `exist_ok=True` makes the cell safe to re-run and avoids the
# check-then-create race of a separate exists() + mkdir() pair. Unlike the
# exists() check, it also fails loudly if a regular file is sitting at the path.
for outputDir in (genomeTreeDir, snpCompareDir, genomeTreeEXDir):
    os.makedirs(outputDir, exist_ok=True)

In [3]:
# Load both metadata tables from the data directory.
original = pd.read_csv(os.path.join(dataDir, "original.csv"))
meta = pd.read_csv(os.path.join(dataDir, "meta_data.csv"))

# Join the two tables on their shared accession key. Columns of `original`
# whose names clash with `meta` get the "_y" suffix; `meta` keeps its names.
meta = meta.merge(
    original,
    left_on="Accession ID",
    right_on="Accession ID",
    suffixes=("", "_y"),
)

In [4]:
# Known misspellings of Chinese provinces in the raw metadata -> correct name.
provinceFixes = {
    "Guandong": "Guangdong",
    "Chongqinq": "Chongqing",
}


def _areaAndLocation(country, province, city):
    """Return the normalised ``(area, location)`` pair for one metadata row.

    For rows whose country is exactly "China", the area is the stripped
    province (with known typos corrected) and the location is the stripped
    city, falling back to the area when the city is missing.
    For every other row both values are the stripped country name, with the
    "SIngapore" misspelling corrected to "Singapore".
    """
    if country == "China":
        area = province.strip(' ')
        area = provinceFixes.get(area, area)
        location = area if pd.isna(city) else city.strip(' ')
    else:
        area = country.strip(' ')
        if "SIngapore" in area:
            area = "Singapore"
        location = area
    return area, location


meta["Collection date"] = pd.to_datetime(meta["Collection date"])

# Compute both columns in one pass and assign them as whole columns.
# This avoids pre-creating the columns with a dtype-less `pd.Series()` and then
# writing strings into them cell by cell through `.loc`, which is slow and
# triggers dtype warnings on recent pandas versions.
areaLocationPairs = [
    _areaAndLocation(country, province, city)
    for country, province, city in zip(meta["Country"], meta["Province"], meta["City"])
]
meta["Area"] = [area for area, location in areaLocationPairs]
meta["Location"] = [location for area, location in areaLocationPairs]

In [5]:
# Keep only the columns needed downstream, in this order.
infoColumns = [
    "Accession ID",
    "Host",
    "Location",
    "Area",
    "Country",
    "Collection date",
]
meta = meta[infoColumns]

# Persist the trimmed table as a comma-separated file without the index column.
infoPath = os.path.join(dataDir, "info.csv")
meta.to_csv(infoPath, sep=",", index=False)

In [6]:
# Quality thresholds: a sequence is kept only if it is longer than `minLen`
# and contains fewer than `maxN` 'N' characters (case-insensitive).
minLen = 28000
maxN = 100

seqs = []
outgroup = "EPI_ISL_402125"

# Accession IDs of the non-human host samples, which are left out of this tree.
pangolin = meta.loc[meta["Host"] == "Manis javanica", "Accession ID"].values
bat = meta.loc[meta["Host"] == "Rhinolophus affinis", "Accession ID"].values

nonHuman = [*pangolin, *bat]

# Long branches to exclude
exclude = [
    "EPI_ISL_406592",
    "EPI_ISL_406595",
    "EPI_ISL_408485",
    "EPI_ISL_408482",
    "EPI_ISL_411929",
    "EPI_ISL_411951",
    "EPI_ISL_411952",
    "MT039890",
]

# Accession number (kept as a string) of the highest-numbered sequence retained.
maxID = None

for record in SeqIO.parse(os.path.join(dataDir, "sequences.fasta"), "fasta"):
    ac = record.id
    # Only non-excluded "EPI_ISL" accessions are considered at all.
    if ac in exclude or not ac.startswith("EPI_ISL"):
        continue
    # Only the number of N characters is needed, so count it directly.
    nCount = str(record.seq).upper().count('N')
    if len(record) <= minLen or nCount >= maxN or ac in nonHuman:
        continue
    seqs.append(record)
    idNum = ac.split('_')[2]
    # Compare accession numbers numerically: plain string comparison is
    # lexicographic and would rank e.g. "99999" above "412116".
    if idNum.isdigit() and (maxID is None or int(idNum) > int(maxID)):
        maxID = idNum
print(maxID)

with open(os.path.join(genomeTreeDir, "outgroup.txt"), 'w') as f:
    f.write(outgroup + "\n")
SeqIO.write(seqs, os.path.join(genomeTreeDir, "genomes.fasta"), "fasta")

412116


129

In [7]:
# Same length / N-count filter as the previous cell, but without dropping the
# non-human host sequences. Relies on `minLen`, `maxN` and `exclude` from the
# previous cell.
outgroup = "EPI_ISL_402131"
outAllSeqs = []

allRecords = SeqIO.parse(os.path.join(dataDir, "sequences.fasta"), "fasta")
for record in allRecords:
    nTotal = str(record.seq).upper().count('N')
    passesQC = len(record) > minLen and nTotal < maxN
    wanted = record.id.startswith("EPI_ISL") and record.id not in exclude
    if passesQC and wanted:
        outAllSeqs.append(record)

# NOTE(review): the outgroup file goes to `genomeTreeEXDir` while the FASTA
# goes to `snpCompareDir` — confirm that this split is intentional.
outgroupPath = os.path.join(genomeTreeEXDir, "outgroup.txt")
with open(outgroupPath, 'w') as f:
    f.write(outgroup + "\n")
SeqIO.write(outAllSeqs, os.path.join(snpCompareDir, "all.fasta"), "fasta")

136