## Note
The `H3N2 HA` and `H1N1 HA` sequences of `Human`-host influenza virus were downloaded from https://www.fludb.org/. This notebook copies the sequences into the destination folders under `Trees`, applying a quality check to each sequence along the way.

In [1]:
import os
import re
import shutil
from collections import defaultdict, Counter

import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq

# Input and output directory names (relative paths).
SEQUENCES_DIR = "Sequences"
METADATA_DIR = "Metadata"
TREES_DIR = "Trees"

# Dataset names: used as the stem of the "<name>.tsv" metadata file and the
# "<name>.fasta" sequence file, and as the per-subtype folder under TREES_DIR.
H1N1_HA_NAME = "H1N1_HA"
H3N2_HA_NAME = "H3N2_HA"
# Name of the sub-folder (inside each subtype folder) that receives the output.
GROUPING = "Spatiotemporal"

# Derived output paths: Trees/<subtype> and Trees/<subtype>/<GROUPING>.
H1N1_HA_TREE_DIR = os.path.join(TREES_DIR, H1N1_HA_NAME)
H3N2_HA_TREE_DIR = os.path.join(TREES_DIR, H3N2_HA_NAME)
H1N1_HA_GROUPING_DIR = os.path.join(H1N1_HA_TREE_DIR, GROUPING)
H3N2_HA_GROUPING_DIR = os.path.join(H3N2_HA_TREE_DIR, GROUPING)

# Earliest collection year (inclusive) a sequence may have to be kept.
H1N1_HA_YEAR = 2009
H3N2_HA_YEAR = 2009

# Exclusive upper bound on the number of 'X' residues: the filters test
# `count < MAX_AMBIGUITY`, so with a value of 1 no 'X' is tolerated at all.
MAX_AMBIGUITY = 1
# Sequences must have exactly this length to be kept.
HA_STANDARD_LEN = 566
# Regex of residue codes that are rewritten to 'X' before counting.
AMBIGUITY = r"B|J|Z"

# Accessions that the H1N1 filter always rejects, whatever their date, length
# or ambiguity count. NOTE(review): the reason each entry was blacklisted is
# not recorded in this notebook.
H1N1_HA_EXCLUDE = [
    "CY075897",
    "JX875001",
    "JQ348837",
    "KU720432",
    "KU720436",
    "KU720435",
    "KY930507",
    "MK615191",
    "CY079544",
    "KF057112",
    "HQ908440",
    "MH211234",
    "MH211235",
]

# Accessions that the H3N2 filter always rejects, whatever their date, length
# or ambiguity count. NOTE(review): the reason each entry was blacklisted is
# not recorded in this notebook.
H3N2_HA_EXCLUDE = [
    "KY273064",
    "KY273063",
    "KY273058",
    "KY273060",
    "KY273057",
    "KU289634",
    "MH201523",
    "KP335956",
    "KF805696",
    "KP335932",
    "KP335938",
    "KP335934",
    "MK239073",
    "MK117070",
    "KJ955515",
    "KP765772",
    "KJ623709",
    "GU937743",
    "KF805640",
    "KF805656",
    "KF805688",
    "KU182657",
    "MF993038",
    "KU182655",
    "MG856267",
    "MN571199",
    "CY189823"
]

In [9]:
# Sanity check: number of blacklisted accessions per subtype (H1N1, H3N2).
nExcludedH1N1 = len(H1N1_HA_EXCLUDE)
nExcludedH3N2 = len(H3N2_HA_EXCLUDE)
print(nExcludedH1N1, nExcludedH3N2)

13 27


In [2]:
# Make sure the top-level output folder is present.
if not os.path.exists(TREES_DIR):
    os.mkdir(TREES_DIR)

# Give each subtype an empty grouping folder: anything already there is
# deleted, then the folder (and any missing parent) is created again.
for groupingDir in (H1N1_HA_GROUPING_DIR, H3N2_HA_GROUPING_DIR):
    if os.path.exists(groupingDir):
        shutil.rmtree(groupingDir)
    os.makedirs(groupingDir)

### 1. H1N1 metadata and quality check

Remove the sequences with ambiguous sites.

In [3]:
# Load the H1N1 HA metadata table (tab-separated). "-N/A-" and "Unknown" are
# read as missing values; "Yes"/"No" are read as booleans.
meta = pd.read_csv(
    os.path.join(METADATA_DIR, H1N1_HA_NAME + ".tsv"),
    sep="\t",
    na_values=["-N/A-", "Unknown"],
    true_values=["Yes"],
    false_values=["No"]
)
# Drop fully duplicated rows *before* indexing by accession, as the H3N2
# section does. Otherwise a repeated row makes `meta.loc[acc, column]` return
# a Series instead of a scalar and the per-record date check raises.
meta = meta.drop_duplicates()
meta = meta.set_index("Sequence Accession")

In [4]:
# Quality-filter the H1N1 HA records, bin the survivors by collection year and
# write one "sequences.fasta" per year under H1N1_HA_GROUPING_DIR.
fastaPath = os.path.join(SEQUENCES_DIR, H1N1_HA_NAME + ".fasta")
nQualified = 0
seqs = defaultdict(list)

for record in SeqIO.parse(fastaPath, "fasta"):
    # The record id is a '|'-separated list of "key:value" fields; the "gb"
    # field is the accession used to look the record up in the metadata.
    info = dict(field.split(':') for field in record.id.split('|'))
    accession = info["gb"]

    # Keep the last four characters of the collection date as the year. A
    # missing date becomes 0, which can never reach the year cutoff.
    year = meta.loc[accession, "Collection Date"]
    if pd.notna(year):
        year = year[-4:]
    else:
        year = 0

    # Rewrite the ambiguity codes to 'X' so they can be counted uniformly.
    record.seq = Seq(re.sub(AMBIGUITY, 'X', str(record.seq)))

    # Reject the record as soon as one criterion fails (same order as the
    # checks are meant to short-circuit in).
    if accession in H1N1_HA_EXCLUDE:
        continue
    if int(year) < H1N1_HA_YEAR:
        continue
    if str(record.seq).count('X') >= MAX_AMBIGUITY:
        continue
    if len(record.seq) != HA_STANDARD_LEN:
        continue

    nQualified += 1
    # Reduce the FASTA header to the bare accession.
    record.id = accession
    record.description = ""
    seqs[str(year)].append(record)

for year, yearRecords in seqs.items():
    outDir = os.path.join(H1N1_HA_GROUPING_DIR, year)
    if not os.path.exists(outDir):
        os.mkdir(outDir)
    SeqIO.write(yearRecords, os.path.join(outDir, "sequences.fasta"), "fasta")

# Report how many records passed every filter.
print(nQualified)

19735


### 2. H3N2 metadata and quality check

Remove the sequences with ambiguous sites.

In [5]:
# Load the H3N2 HA metadata table (tab-separated). "-N/A-" and "Unknown" are
# read as missing values; "Yes"/"No" are read as booleans. Fully duplicated
# rows are dropped before the table is indexed by sequence accession.
metaPath = os.path.join(METADATA_DIR, H3N2_HA_NAME + ".tsv")
meta = (
    pd.read_csv(
        metaPath,
        sep="\t",
        na_values=["-N/A-", "Unknown"],
        true_values=["Yes"],
        false_values=["No"],
    )
    .drop_duplicates()
    .set_index("Sequence Accession")
)

In [6]:
# Quality-filter the H3N2 HA records, bin the survivors by collection year and
# write one "sequences.fasta" per year under H3N2_HA_GROUPING_DIR.
fastaPath = os.path.join(SEQUENCES_DIR, H3N2_HA_NAME + ".fasta")
nQualified = 0
seqs = defaultdict(list)

for record in SeqIO.parse(fastaPath, "fasta"):
    # The record id is a '|'-separated list of "key:value" fields; the "gb"
    # field is the accession used to look the record up in the metadata.
    info = dict(field.split(':') for field in record.id.split('|'))
    accession = info["gb"]

    # Keep the last four characters of the collection date as the year. A
    # missing date becomes 0, which can never reach the year cutoff.
    year = meta.loc[accession, "Collection Date"]
    if pd.notna(year):
        year = year[-4:]
    else:
        year = 0

    # Rewrite the ambiguity codes to 'X' so they can be counted uniformly.
    record.seq = Seq(re.sub(AMBIGUITY, 'X', str(record.seq)))

    # Reject the record as soon as one criterion fails (same order as the
    # checks are meant to short-circuit in).
    if accession in H3N2_HA_EXCLUDE:
        continue
    if int(year) < H3N2_HA_YEAR:
        continue
    if str(record.seq).count('X') >= MAX_AMBIGUITY:
        continue
    if len(record.seq) != HA_STANDARD_LEN:
        continue

    nQualified += 1
    # Reduce the FASTA header to the bare accession.
    record.id = accession
    record.description = ""
    seqs[str(year)].append(record)

for year, yearRecords in seqs.items():
    outDir = os.path.join(H3N2_HA_GROUPING_DIR, year)
    if not os.path.exists(outDir):
        os.mkdir(outDir)
    SeqIO.write(yearRecords, os.path.join(outDir, "sequences.fasta"), "fasta")

# Report how many records passed every filter.
print(nQualified)

23261


In [7]:
# for fname in os.listdir(SEQUENCES_DIR):
#     dstDir = os.path.join(TREES_DIR, os.path.splitext(fname)[0])
#     if not os.path.exists(dstDir):
#         os.mkdir(dstDir)
#     shutil.copyfile(
#         os.path.join(SEQUENCES_DIR, fname),
#         os.path.join(dstDir, "sequences.fasta")
#     )