## Setup

In [1]:
# built-in packages
import os

# third-party packages
from matplotlib import pyplot as plt
import pandas as pd
import seaborn as sns
from tqdm.notebook import tqdm

plt.rcParams["figure.dpi"] = 200
sns.set_palette("deep")
sns.set_context("paper")

## ClermonTyping

Actual ClermonTyping takes place in the `1__annotate_phylogroups.py` script. This generates multiple results files which must be collated.

### Concatenate phylogroup files

In [3]:
# Takes ~7 minutes if assert statement is run, ~3 minutes if skipped
# Here are finding the _phylogroups.txt file from our output dirs (change file path accordingly)

CLERMONT = '.'
DF_PATHS = []


for path in tqdm(os.listdir(CLERMONT)):
    if 'ipynb' in path:
        continue
    full_path = os.path.join(CLERMONT, path)
    
    if os.path.isdir(full_path):
        df_path = os.path.join(full_path, f'{path}_phylogroups.txt')
        #assert os.path.isfile(df_path)
        DF_PATHS.append(df_path)

  0%|          | 0/2367 [00:00<?, ?it/s]

In [4]:
with open('./ecoli_phylogroups.txt', 'w') as outfile:
    for fname in tqdm(DF_PATHS):
        with open(fname) as infile:
            outfile.write(infile.read())

  0%|          | 0/2362 [00:00<?, ?it/s]

### Post-process resultant file

In [5]:
ecoli_phylgroups = pd.read_csv('./ecoli_phylogroups.txt', sep='\t', header=None)

# Drop column listing MASH output filename
ecoli_phylgroups.drop(ecoli_phylgroups.columns[len(ecoli_phylgroups.columns)-1], axis=1, inplace=True)

# Add column names
ecoli_phylgroups.columns = ['genome_id', 'pcr_genes', 'quadruplex', 'CE_alleles', 'phylogroup']

# Remove '.fna' from genome_id name
ecoli_phylgroups['genome_id'] = ecoli_phylgroups.genome_id.apply(lambda x: x[:-4])

ecoli_phylgroups

Unnamed: 0,genome_id,pcr_genes,quadruplex,CE_alleles,phylogroup
0,562.46071,"['ybgD', 'trpA', 'trpBA', 'yjaA', 'arpA']","['+', '-', '+', '-']",[],A
1,562.70513,"['ybgD', 'trpA', 'trpBA', 'chuA', 'TspE4.C2', ...","['+', '+', '-', '+']",['trpAgpC'],D
2,562.67796,"['ybgD', 'trpA', 'trpBA', 'TspE4.C2', 'arpA', ...","['+', '-', '-', '+']",['trpAgpC'],B1
3,83334.637,"['trpA', 'trpBA', 'chuA', 'arpA', 'ArpAgpE', '...","['+', '+', '-', '-']","['ArpAgpE', 'trpAgpC']",E
4,562.102351,"['ybgD', 'trpA', 'trpBA', 'chuA', 'arpA', 'trp...","['+', '+', '-', '-']",['trpAgpC'],D
...,...,...,...,...,...
2357,562.112528,"['ybgD', 'trpA', 'trpBA', 'TspE4.C2', 'arpA', ...","['+', '-', '-', '+']",['trpAgpC'],B1
2358,562.61628,"['ybgD', 'trpA', 'trpBA', 'arpA', 'trpAgpC']","['+', '-', '-', '-']",['trpAgpC'],A
2359,562.94170,"['trpA', 'trpBA', 'yjaA', 'arpA']","['+', '-', '+', '-']",[],A
2360,562.66855,"['ybgD', 'trpA', 'trpBA', 'chuA', 'arpA', 'trp...","['+', '+', '-', '-']",['trpAgpC'],D


In [6]:
# CORRECTION: Rerunning everything has gotten rid of these "non-Escherichia" strains

# Among public genomes, all but '562.78335' are NOT E coli,
# need to remove ALL from pangenome collection (in later notebook)
ecoli_phylgroups[ecoli_phylgroups.phylogroup == 'Non Escherichia']

Unnamed: 0,genome_id,pcr_genes,quadruplex,CE_alleles,phylogroup
270,562.112392,[],"['-', '-', '-', '-']",[],Non Escherichia
655,562.48218,[],"['-', '-', '-', '-']",[],Non Escherichia


In [7]:
# These are mostly Shigella sonnei strains, with some STECs mixed in
ecoli_phylgroups[ecoli_phylgroups.phylogroup == 'Unknown']

Unnamed: 0,genome_id,pcr_genes,quadruplex,CE_alleles,phylogroup
146,562.9623,"['trpA', 'trpBA', 'chuA', 'yjaA', 'TspE4.C2', ...","['+', '+', '+', '+']",[],Unknown
183,2044467.5,['trpBA'],"['-', '-', '-', '-']",[],Unknown
334,562.16414,"['ybgD', 'trpA', 'trpBA']","['-', '-', '-', '-']",[],Unknown
672,562.67833,"['trpA', 'trpBA', 'yjaA', 'trpAgpC']","['-', '-', '+', '-']",['trpAgpC'],Unknown
729,562.60623,"['trpA', 'trpBA', 'trpAgpC']","['-', '-', '-', '-']",['trpAgpC'],Unknown
821,562.51562,"['ybgD', 'trpA', 'trpBA', 'trpAgpC']","['-', '-', '-', '-']",['trpAgpC'],Unknown
1313,562.96055,"['trpA', 'trpBA', 'chuA', 'yjaA', 'TspE4.C2', ...","['+', '+', '+', '+']",[],Unknown
2080,562.10196,"['trpA', 'trpBA', 'chuA', 'yjaA', 'TspE4.C2', ...","['+', '+', '+', '+']",[],Unknown
2242,562.96228,"['trpA', 'trpBA', 'chuA', 'yjaA', 'TspE4.C2', ...","['+', '+', '+', '+']",[],Unknown
2275,562.101959,"['trpA', 'trpBA', 'chuA', 'yjaA', 'TspE4.C2', ...","['+', '+', '+', '+']",[],Unknown


In [9]:
ecoli_phylgroups.to_csv('ecoli_phylogroups.csv')