## Important

Run the `run_cdhit.sh` script in `H1N1_HA` and the `run_cdhit.sh` script(s) in `H3N2_HA` before running this notebook.

In [7]:
import os
import re
from glob import glob

import pandas as pd
from Bio import SeqIO

# Directory holding the input FASTA files (the H3N2 file is read from here below).
SEQUENCES_DIR = "Sequences"
# Root directory under which each dataset has its own tree directory.
TREES_DIR = "Trees"

# Dataset names; each is used as a sub-directory name under TREES_DIR.
H1N1_HA_NAME = "H1N1_HA"
H3N2_HA_NAME = "H3N2_HA"

# Sub-directory of a tree directory whose children each hold a
# `trimmed.fasta` / `trimmed.fasta.clstr` pair.
GROUPING = "Spatiotemporal"

# Value of the `gb` header field that identifies the H3N2 outgroup record.
H3N2_OUTGROUP_ID = "AB284320"

### 1. H1N1

Gather the representative sequences from each cluster.

In [8]:
# Tree directory for the H1N1 HA dataset; the combined outputs of this section are written here.
treeDir = os.path.join(TREES_DIR, H1N1_HA_NAME)
# Parent of the per-group directories that are globbed for trimmed.fasta(.clstr) files.
groupingDir = os.path.join(treeDir, GROUPING)

In [9]:
# Collect every record from each group's trimmed.fasta into one list.
# The files are visited in sorted path order so the combined FASTA is
# reproducible (glob() returns matches in arbitrary order).
seqs = [
    record
    for trimmed in sorted(glob(os.path.join(groupingDir, "*", "trimmed.fasta")))
    for record in SeqIO.parse(trimmed, "fasta")
]

# Write the combined FASTA next to the grouping directory. SeqIO.write returns
# the number of records written, which is shown as the cell output.
SeqIO.write(seqs, os.path.join(treeDir, "sequences.fasta"), "fasta")

4012

In [10]:
# Build a table of the clusters that contain more than one sequence:
# one row per cluster with its representative ("rep") and a comma-separated
# list of every member ("trimmed"). Singleton clusters are skipped.
clusterRows = []

# Sorted so the row order of trimmed.csv is reproducible across runs.
for clstr in sorted(glob(os.path.join(groupingDir, "*", "trimmed.fasta.clstr"))):
    with open(clstr) as f:
        repID = None
        accessions = []
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            if line.startswith('>'):
                # A line starting with '>' opens a new cluster, so flush the previous one.
                if len(accessions) > 1:
                    clusterRows.append({"rep": repID, "trimmed": ', '.join(accessions)})
                # Reset *both* pieces of per-cluster state so a representative
                # can never leak from one cluster into the next.
                repID = None
                accessions = []
                continue
            # Member line: the sequence ID follows a '>' somewhere inside the line.
            match = re.search(r">(\w+)", line)
            if match is None:
                raise ValueError(f"Unrecognised line in {clstr}: {line!r}")
            accession = match.group(1)
            if '*' in line:
                # The line carrying '*' is the cluster's representative.
                repID = accession
            accessions.append(accession)
        # Flush the last cluster of the file (no following header triggers it).
        if len(accessions) > 1:
            clusterRows.append({"rep": repID, "trimmed": ', '.join(accessions)})

# Explicit columns keep the CSV header present even when no cluster qualified.
trimmedRow = pd.DataFrame(clusterRows, columns=["rep", "trimmed"])
trimmedRow.to_csv(os.path.join(treeDir, "trimmed.csv"), index=False)

### 2. H3N2

Gather the representative sequences from each cluster.

In [14]:
# Tree directory for the H3N2 HA dataset (rebinds the name used in the H1N1 section).
treeDir = os.path.join(TREES_DIR, H3N2_HA_NAME)
# Grouping sub-directory inside the H3N2 tree directory.
groupingDir = os.path.join(treeDir, GROUPING)

In [15]:
# Find the outgroup record in the full H3N2 FASTA file.
# Start from None so a missing ID is reported here instead of surfacing later
# as a NameError (or, worse, silently reusing a stale `outgroup` from an
# earlier kernel run).
outgroup = None

for record in SeqIO.parse(os.path.join(SEQUENCES_DIR, H3N2_HA_NAME + ".fasta"), "fasta"):
    # The record ID is parsed as '|'-separated "key:value" fields; maxsplit=1
    # keeps any additional ':' inside the value instead of breaking dict().
    info = dict(field.split(':', 1) for field in record.id.split('|'))
    if info["gb"] == H3N2_OUTGROUP_ID:
        # Reduce the header to the bare accession and drop the description.
        record.id = info["gb"]
        record.description = ""
        outgroup = record

if outgroup is None:
    raise ValueError(
        f"Outgroup {H3N2_OUTGROUP_ID!r} not found in "
        f"{os.path.join(SEQUENCES_DIR, H3N2_HA_NAME + '.fasta')}"
    )

In [16]:
# Start with the outgroup, then append every record from each group's
# trimmed.fasta. Uses `groupingDir` / `treeDir` defined at the top of this
# section: the previously referenced H3N2_HA_GROUPING_DIR / H3N2_HA_TREE_DIR
# are not defined in this notebook and fail on a fresh kernel.
seqs = [outgroup]

# Sorted so the combined FASTA is reproducible (glob() order is arbitrary).
for trimmed in sorted(glob(os.path.join(groupingDir, "*", "trimmed.fasta"))):
    seqs.extend(SeqIO.parse(trimmed, "fasta"))

# SeqIO.write returns the number of records written, shown as the cell output.
SeqIO.write(seqs, os.path.join(treeDir, "sequences.fasta"), "fasta")

4017

In [17]:
# Build a table of the clusters that contain more than one sequence:
# one row per cluster with its representative ("rep") and a comma-separated
# list of every member ("trimmed"). Singleton clusters are skipped.
# Uses `groupingDir` / `treeDir` from the top of this section: the previously
# referenced H3N2_HA_GROUPING_DIR / H3N2_HA_TREE_DIR are not defined in this
# notebook and fail on a fresh kernel.
clusterRows = []

# Sorted so the row order of trimmed.csv is reproducible across runs.
for clstr in sorted(glob(os.path.join(groupingDir, "*", "trimmed.fasta.clstr"))):
    with open(clstr) as f:
        repID = None
        accessions = []
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            if line.startswith('>'):
                # A line starting with '>' opens a new cluster, so flush the previous one.
                if len(accessions) > 1:
                    clusterRows.append({"rep": repID, "trimmed": ', '.join(accessions)})
                # Reset *both* pieces of per-cluster state so a representative
                # can never leak from one cluster into the next.
                repID = None
                accessions = []
                continue
            # Member line: the sequence ID follows a '>' somewhere inside the line.
            match = re.search(r">(\w+)", line)
            if match is None:
                raise ValueError(f"Unrecognised line in {clstr}: {line!r}")
            accession = match.group(1)
            if '*' in line:
                # The line carrying '*' is the cluster's representative.
                repID = accession
            accessions.append(accession)
        # Flush the last cluster of the file (no following header triggers it).
        if len(accessions) > 1:
            clusterRows.append({"rep": repID, "trimmed": ', '.join(accessions)})

# Explicit columns keep the CSV header present even when no cluster qualified.
trimmedRow = pd.DataFrame(clusterRows, columns=["rep", "trimmed"])
trimmedRow.to_csv(os.path.join(treeDir, "trimmed.csv"), index=False)