## Note
The `H3N2 HA` and `H1N1 HA` sequences of human-host influenza virus were downloaded from https://www.fludb.org/. Each sequence file is copied to its destination folder under `Trees`, possibly after a quality check on the sequences.

In [1]:
import os
import re
import shutil
from collections import defaultdict, Counter

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

# Input/output locations, all relative to the notebook's working directory.
# Sequence FASTA files are read from SEQUENCES_DIR, metadata .tsv tables from
# METADATA_DIR, and every result is written somewhere under TREES_DIR.
SEQUENCES_DIR = "Sequences"
METADATA_DIR = "Metadata"
TREES_DIR = "Trees"

# Dataset base names. Each one is used both as the input file stem
# (<name>.fasta / <name>.tsv) and as the output sub-folder under TREES_DIR.
H1N1_HA_NAME = "H1N1_HA"
H3N2_HA_NAME = "H3N2_HA"
H3N2_HA1_PRE2003_NAME = "H3N2_HA1_pre2003"
SARS_COV_2_CDS_NAME = "SARS_CoV_2_cds"
ZIKV_POLYPROTEIN_NAME = "ZIKV_polyprotein"

# Sub-folder (inside a dataset's output folder) that holds the per-year groups.
GROUPING = "Spatiotemporal"

# Earliest collection year (inclusive) a sequence must have to be kept.
H1N1_HA_YEAR = 2009
H3N2_HA_YEAR = 2009

# A sequence is kept only if its number of 'X' characters is strictly below
# MAX_AMBIGUITY, so the value 1 means "no 'X' allowed".
MAX_AMBIGUITY = 1
# Exact sequence length required for HA records
# (presumably counted in amino-acid residues -- confirm against the input files).
HA_STANDARD_LEN = 566
# Regex of characters that are rewritten to 'X' before the ambiguity count.
AMBIGUITY = r"B|J|Z"

# Accessions (the "gb" field of the FASTA header) dropped from the H1N1 set.
H1N1_HA_EXCLUDE = [
    "CY075897",
    "JX875001",
    "JQ348837",
    "KU720432",
    "KU720436",
    "KU720435",
    "KY930507",
    "CY079544",
    "KF057112",
    "HQ908440",
    "MH211234",
    "MH211235",
#     "MK615191",
    "MK615591",
]

# H1N1 records whose "<country>_<state>_<flu season>" label matches one of
# these strings are dropped as well.
H1N1_HA_EXCLUDE_SEASON = ["USA_Unknown_14-15"]

# H3N2_HA_EXCLUDE = [
#     "KY273064",
#     "KY273063",
#     "KY273058",
#     "KY273060",
#     "KY273057",
#     "KU289634",
#     "MH201523",
#     "KP335956",
#     "KF805696",
#     "KP335932",
#     "KP335938",
#     "KP335934",
#     "MK239073",
#     "MK117070",
#     "KJ955515",
#     "KP765772",
#     "KJ623709",
#     "GU937743",
#     "KF805640",
#     "KF805656",
#     "KF805688",
#     "KU182657",
#     "MF993038",
#     "KU182655",
#     "MG856267",
#     "MN571199",
#     "CY189823"
# ]

In [2]:
# Create the output root folder. `exist_ok=True` makes re-running the cell a
# no-op and avoids the race between checking for the folder and creating it.
os.makedirs(TREES_DIR, exist_ok=True)

### 1. H1N1

Remove sequences with ambiguous sites, then group the remaining sequences by year.

In [3]:
# Load the H1N1 HA metadata table and index it by sequence accession.
# "-N/A-" and "Unknown" are read as missing values; "Yes"/"No" as booleans.
# Exact duplicate rows are dropped before indexing (the H3N2 loader does the
# same), so an accession repeated in identical rows still resolves to a single
# row when it is looked up with `meta.loc[...]`.
meta = pd.read_csv(
    os.path.join(METADATA_DIR, H1N1_HA_NAME + ".tsv"),
    sep="\t",
    na_values=["-N/A-", "Unknown"],
    true_values=["Yes"],
    false_values=["No"]
)
meta = meta.drop_duplicates()
meta = meta.set_index("Sequence Accession")

In [4]:
# Filter the H1N1 HA sequences and write the survivors grouped by collection
# year to <TREES_DIR>/<H1N1_HA_NAME>/<GROUPING>/<year>/sequences.fasta.
#
# A record is kept only when, after characters matching AMBIGUITY have been
# rewritten to 'X', it
#   * contains fewer than MAX_AMBIGUITY 'X' characters,
#   * is exactly HA_STANDARD_LEN long,
#   * was collected in H1N1_HA_YEAR or later,
#   * is not listed in H1N1_HA_EXCLUDE, and
#   * does not belong to a season listed in H1N1_HA_EXCLUDE_SEASON.
# Finally the number of kept records is printed.

# Loop invariants: compile the pattern once and use sets for O(1) membership.
ambiguityPattern = re.compile(AMBIGUITY)
excludedAccessions = set(H1N1_HA_EXCLUDE)
excludedSeasons = set(H1N1_HA_EXCLUDE_SEASON)
metaColumns = ["Collection Date", "Country", "State/Province", "Flu Season"]

nQualified = 0
seqs = defaultdict(list)

for record in SeqIO.parse(os.path.join(SEQUENCES_DIR, H1N1_HA_NAME + ".fasta"), "fasta"):
    residues = ambiguityPattern.sub('X', str(record.seq))
    record.seq = Seq(residues)
    # The header is "key:value|key:value|..."; split on the first colon only
    # so that a value which itself contains ':' does not break the parsing.
    info = dict(i.split(':', 1) for i in record.id.split('|'))
    accession = info["gb"]
    time, country, state, season = meta.loc[accession, metaColumns]
    # The year is taken from the last four characters of the collection date;
    # a missing date becomes 0.
    time = time[-4:] if pd.notna(time) else 0
    country = country.replace(' ', '_') if pd.notna(country) else "Unknown"
    state = state.replace(' ', '_') if pd.notna(state) else "Unknown"
    season = season.replace(' ', '_') if pd.notna(season) else "Unknown"
    seasonInfo = '_'.join([country, state, season])
    if (
        residues.count('X') < MAX_AMBIGUITY
        and len(residues) == HA_STANDARD_LEN
        and int(time) >= H1N1_HA_YEAR
        and accession not in excludedAccessions
        and seasonInfo not in excludedSeasons
    ):
        nQualified += 1
        # Keep only the accession as the FASTA identifier.
        record.id = accession
        record.description = ""
        seqs["{}".format(time)].append(record)

groupingDir = os.path.join(TREES_DIR, H1N1_HA_NAME, GROUPING)

# Start from an empty folder so groups written by an earlier run do not linger.
if os.path.exists(groupingDir):
    shutil.rmtree(groupingDir)
os.makedirs(groupingDir)

for st, records in seqs.items():
    outDir = os.path.join(groupingDir, st)
    os.makedirs(outDir, exist_ok=True)
    SeqIO.write(records, os.path.join(outDir, "sequences.fasta"), "fasta")
print(nQualified)

19685


### 2. H3N2

Remove sequences with ambiguous sites, then group the remaining sequences by year (grouping by country is currently disabled in the code below).

In [10]:
# Load the H3N2 HA metadata table: read the .tsv, drop exact duplicate rows,
# then index the table by sequence accession.
# "-N/A-" and "Unknown" are read as missing values; "Yes"/"No" as booleans.
meta = (
    pd.read_csv(
        os.path.join(METADATA_DIR, H3N2_HA_NAME + ".tsv"),
        sep="\t",
        na_values=["-N/A-", "Unknown"],
        true_values=["Yes"],
        false_values=["No"],
    )
    .drop_duplicates()
    .set_index("Sequence Accession")
)

In [13]:
# Filter the H3N2 HA sequences and write the survivors grouped by collection
# year to <TREES_DIR>/<H3N2_HA_NAME>/<GROUPING>/<year>/sequences.fasta.
#
# A record is kept only when, after characters matching AMBIGUITY have been
# rewritten to 'X', it
#   * contains fewer than MAX_AMBIGUITY 'X' characters,
#   * is exactly HA_STANDARD_LEN long, and
#   * was collected in H3N2_HA_YEAR or later.
# Finally the number of kept records is printed.

# Loop invariant: compile the pattern once instead of on every record.
ambiguityPattern = re.compile(AMBIGUITY)

nQualified = 0
seqs = defaultdict(list)

for record in SeqIO.parse(os.path.join(SEQUENCES_DIR, H3N2_HA_NAME + ".fasta"), "fasta"):
    residues = ambiguityPattern.sub('X', str(record.seq))
    record.seq = Seq(residues)
    # The header is "key:value|key:value|..."; split on the first colon only
    # so that a value which itself contains ':' does not break the parsing.
    info = dict(i.split(':', 1) for i in record.id.split('|'))
    accession = info["gb"]
    time = meta.loc[accession, "Collection Date"]
    # The year is taken from the last four characters of the collection date;
    # a missing date becomes 0.
    time = time[-4:] if pd.notna(time) else 0
    # Grouping by country is currently disabled; only the year is used.
#     country = meta.loc[info["gb"], "Country"]
#     country = country.replace(' ', '_') if pd.notna(country) else "Unknown"
    if (
        residues.count('X') < MAX_AMBIGUITY
        and len(residues) == HA_STANDARD_LEN
        and int(time) >= H3N2_HA_YEAR
    ):
        nQualified += 1
        # Keep only the accession as the FASTA identifier.
        record.id = accession
        record.description = ""
        seqs["{}".format(time)].append(record)

groupingDir = os.path.join(TREES_DIR, H3N2_HA_NAME, GROUPING)

# Start from an empty folder so groups written by an earlier run do not linger.
if os.path.exists(groupingDir):
    shutil.rmtree(groupingDir)
os.makedirs(groupingDir)

for st, records in seqs.items():
    outDir = os.path.join(groupingDir, st)
    os.makedirs(outDir, exist_ok=True)
    SeqIO.write(records, os.path.join(outDir, "sequences.fasta"), "fasta")
print(nQualified)

23288


### 3. Others

These datasets (`ZIKV`, `H3N2_HA1` and `SARS-CoV-2`) are copied to their destination folders as-is, without filtering.

In [7]:
# Copy the H3N2_HA1_pre2003 FASTA file unchanged to
# <TREES_DIR>/<H3N2_HA1_PRE2003_NAME>/sequences.fasta.
outDir = os.path.join(TREES_DIR, H3N2_HA1_PRE2003_NAME)

# makedirs also creates TREES_DIR when it is missing, and exist_ok makes the
# cell safe to re-run.
os.makedirs(outDir, exist_ok=True)

shutil.copyfile(
    os.path.join(SEQUENCES_DIR, H3N2_HA1_PRE2003_NAME + ".fasta"),
    os.path.join(outDir, "sequences.fasta")
)

'Trees\\H3N2_HA1_pre2003\\sequences.fasta'

In [8]:
# Copy the SARS_CoV_2_cds FASTA file unchanged to
# <TREES_DIR>/<SARS_COV_2_CDS_NAME>/sequences.fasta.
outDir = os.path.join(TREES_DIR, SARS_COV_2_CDS_NAME)

# makedirs also creates TREES_DIR when it is missing, and exist_ok makes the
# cell safe to re-run.
os.makedirs(outDir, exist_ok=True)

shutil.copyfile(
    os.path.join(SEQUENCES_DIR, SARS_COV_2_CDS_NAME + ".fasta"),
    os.path.join(outDir, "sequences.fasta")
)

'Trees\\SARS_CoV_2_cds\\sequences.fasta'

In [9]:
# Copy the ZIKV_polyprotein FASTA file unchanged to
# <TREES_DIR>/<ZIKV_POLYPROTEIN_NAME>/sequences.fasta.
outDir = os.path.join(TREES_DIR, ZIKV_POLYPROTEIN_NAME)

# makedirs also creates TREES_DIR when it is missing, and exist_ok makes the
# cell safe to re-run.
os.makedirs(outDir, exist_ok=True)

shutil.copyfile(
    os.path.join(SEQUENCES_DIR, ZIKV_POLYPROTEIN_NAME + ".fasta"),
    os.path.join(outDir, "sequences.fasta")
)

'Trees\\ZIKV_polyprotein\\sequences.fasta'