## Important

Run `run_cdhit.sh` before this notebook. The notebook then gathers all trimmed sequences.

In [1]:
import os
import re
from glob import glob

import pandas as pd
from Bio import SeqIO

# Top-level data directories, given as relative paths.
SEQUENCES_DIR = "Sequences"
TREES_DIR = "Trees"
METADATA_DIR = "Metadata"

# Dataset names; they are reused as file stems and as sub-directory names.
H1N1_HA_NAME = "H1N1_HA"
H3N2_HA_NAME = "H3N2_HA"
# Sub-directory of each tree directory that holds the per-group folders.
GROUPING = "Spatiotemporal"

# Accession used to pick the H3N2 outgroup record.
H3N2_OUTGROUP_ID = "AB284320"

# Per-dataset output locations: <TREES_DIR>/<name> and <TREES_DIR>/<name>/<GROUPING>.
H1N1_HA_TREE_DIR = os.path.join(TREES_DIR, H1N1_HA_NAME)
H1N1_HA_GROUPING_DIR = os.path.join(TREES_DIR, H1N1_HA_NAME, GROUPING)
H3N2_HA_TREE_DIR = os.path.join(TREES_DIR, H3N2_HA_NAME)
H3N2_HA_GROUPING_DIR = os.path.join(TREES_DIR, H3N2_HA_NAME, GROUPING)

### 1. H1N1

Find the outgroup by date

In [2]:
# Load the H1N1 HA metadata table (tab-separated), indexed by sequence accession.
metaPath = os.path.join(METADATA_DIR, H1N1_HA_NAME + ".tsv")
readOptions = dict(
    sep="\t",
    index_col="Sequence Accession",
    # Placeholder strings that should be read as missing values.
    na_values=["-N/A-", "Unknown"],
    # Strings to interpret as True / False.
    true_values=["Yes"],
    false_values=["No"],
)
meta = pd.read_csv(metaPath, **readOptions)

In [3]:
# Collect every record from each group's trimmed FASTA into a single list.
trimmedPaths = glob(os.path.join(H1N1_HA_GROUPING_DIR, "*", "trimmed.fasta"))
seqs = [
    record
    for fastaPath in trimmedPaths
    for record in SeqIO.parse(fastaPath, "fasta")
]

# Write the combined records to one FASTA file; the call is left as the last
# expression so its return value is shown as the cell output.
SeqIO.write(seqs, os.path.join(H1N1_HA_TREE_DIR, "sequences.fasta"), "fasta")

4013

In [4]:
# Summarise the `.clstr` files produced alongside each trimmed FASTA: one row per
# cluster that has more than one member, with the representative accession ("rep")
# and a comma-separated list of every accession in the cluster ("trimmed", which
# includes the representative itself).
clusterRows = []

for clstr in glob(os.path.join(H1N1_HA_GROUPING_DIR, "*", "trimmed.fasta.clstr")):
    with open(clstr) as f:
        repID = None
        accessions = []
        for line in f:
            if line.startswith('>'):
                # A line starting with '>' opens a new cluster: flush the previous one...
                if len(accessions) > 1:
                    clusterRows.append({"rep": repID, "trimmed": ', '.join(accessions)})
                # ...and reset BOTH accumulators, so a cluster without a '*' line
                # can never inherit the previous cluster's representative.
                repID = None
                accessions = []
                continue

            # Member line: the accession is the first run of word characters after '>'.
            m = re.search(r">(\w+)", line)
            if m is None:
                # Blank or unrecognised line: skip it instead of raising IndexError.
                continue
            accession = m.group(1)
            if '*' in line:
                # The member line carrying '*' is the cluster representative.
                repID = accession
            accessions.append(accession)

        # The last cluster of a file has no following header line to flush it.
        if len(accessions) > 1:
            clusterRows.append({"rep": repID, "trimmed": ', '.join(accessions)})

# Fix the columns explicitly so the CSV keeps its header even when no cluster
# had more than one member.
trimmedRow = pd.DataFrame(clusterRows, columns=["rep", "trimmed"])
trimmedRow.to_csv(os.path.join(H1N1_HA_TREE_DIR, "trimmed.csv"), index=False)

### 2. H3N2

In [5]:
# Load the H3N2 HA metadata table (tab-separated), indexed by sequence accession.
metaPath = os.path.join(METADATA_DIR, H3N2_HA_NAME + ".tsv")
readOptions = dict(
    sep="\t",
    index_col="Sequence Accession",
    # Placeholder strings that should be read as missing values.
    na_values=["-N/A-", "Unknown"],
    # Strings to interpret as True / False.
    true_values=["Yes"],
    false_values=["No"],
)
meta = pd.read_csv(metaPath, **readOptions)

In [6]:
# Locate the H3N2 outgroup record in the full (untrimmed) sequence file.
# Start from None so a value left over from an earlier run can never be reused
# silently when the accession is missing from the file.
outgroup = None

for record in SeqIO.parse(os.path.join(SEQUENCES_DIR, H3N2_HA_NAME + ".fasta"), "fasta"):
    # The record id is a '|'-separated list of "key:value" fields. Split on the
    # first ':' only and ignore fields without one, so an unusual header cannot
    # abort the scan with a ValueError.
    info = dict(field.split(':', 1) for field in record.id.split('|') if ':' in field)
    if info.get("gb") == H3N2_OUTGROUP_ID:
        # Reduce the header to the bare accession.
        record.id = info["gb"]
        record.description = ""
        outgroup = record

# Fail here with a clear message rather than with a NameError in a later cell.
if outgroup is None:
    raise ValueError(
        "Outgroup accession {} not found in {}".format(
            H3N2_OUTGROUP_ID, os.path.join(SEQUENCES_DIR, H3N2_HA_NAME + ".fasta")
        )
    )

In [7]:
# Start with the outgroup record, then append every record from each group's
# trimmed FASTA.
seqs = [outgroup]
for fastaPath in glob(os.path.join(H3N2_HA_GROUPING_DIR, "*", "trimmed.fasta")):
    seqs.extend(SeqIO.parse(fastaPath, "fasta"))

# Write the combined records to one FASTA file; the call is left as the last
# expression so its return value is shown as the cell output.
SeqIO.write(seqs, os.path.join(H3N2_HA_TREE_DIR, "sequences.fasta"), "fasta")

4020

In [8]:
# Summarise the `.clstr` files produced alongside each trimmed FASTA: one row per
# cluster that has more than one member, with the representative accession ("rep")
# and a comma-separated list of every accession in the cluster ("trimmed", which
# includes the representative itself).
clusterRows = []

for clstr in glob(os.path.join(H3N2_HA_GROUPING_DIR, "*", "trimmed.fasta.clstr")):
    with open(clstr) as f:
        repID = None
        accessions = []
        for line in f:
            if line.startswith('>'):
                # A line starting with '>' opens a new cluster: flush the previous one...
                if len(accessions) > 1:
                    clusterRows.append({"rep": repID, "trimmed": ', '.join(accessions)})
                # ...and reset BOTH accumulators, so a cluster without a '*' line
                # can never inherit the previous cluster's representative.
                repID = None
                accessions = []
                continue

            # Member line: the accession is the first run of word characters after '>'.
            m = re.search(r">(\w+)", line)
            if m is None:
                # Blank or unrecognised line: skip it instead of raising IndexError.
                continue
            accession = m.group(1)
            if '*' in line:
                # The member line carrying '*' is the cluster representative.
                repID = accession
            accessions.append(accession)

        # The last cluster of a file has no following header line to flush it.
        if len(accessions) > 1:
            clusterRows.append({"rep": repID, "trimmed": ', '.join(accessions)})

# Fix the columns explicitly so the CSV keeps its header even when no cluster
# had more than one member.
trimmedRow = pd.DataFrame(clusterRows, columns=["rep", "trimmed"])
trimmedRow.to_csv(os.path.join(H3N2_HA_TREE_DIR, "trimmed.csv"), index=False)