In [None]:
# This code goes through the flexibility simulation outputs and counts how many of them are correct.
# It then makes Fig. 2.
# As stated below (and repeated here): the outputs, currently in the tar.gz file, need to be extracted into their own directories.

# Note: because of the relatively small number of simulations involved, we ran everything 10 times to see the variability across the 10 runs.


In [1]:
# Dependencies

import os
import subprocess
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Bio import Seq
from Bio import SeqIO
from tqdm import tnrange, tqdm_notebook

In [2]:

# Sequence database keyed by record id.
# This DB just contains unique sequences, and names have been renamed to make things run easily with all programs.
db = {record.id: record.seq for record in SeqIO.parse("db_formatted.fasta", "fasta")}

# The same headerless link table is loaded twice:
# `link` is indexed by its first column, `rlink` by its second.
linkfile = "db_link.csv"
link, rlink = (
    pd.read_csv(linkfile, index_col=key_col, header=None) for key_col in (0, 1)
)

In [3]:
# Map each simulated sequence name (column "sequence") to the entry it is
# expected to be called as (column "correct").
# Built in one pass over the two columns; looking rows up one at a time with
# .iloc constructs a new Series for every access and is needlessly slow.
correct = pd.read_csv("flex_correct.csv")
correct_dict = dict(zip(correct["sequence"], correct["correct"]))


In [4]:
# Note: all the outputs are in the flex_outputs.tar.gz tarball. The subdirectories (flex{n}_output) need to be in the current directory for this code to run.
# Extract the tarball and place the folders in this directory.
%ls

db_formatted.fasta   flex_correct.csv     flex_outputs.tar.gz
db_link.csv          flex_example.ipynb
flex_analysis.ipynb  flex_fastas.tar.gz


In [5]:
# Now writing cleaner code to go to github

def conv_string(b):
    """Return "1" when ``b`` compares equal to True, otherwise "0"."""
    # The explicit equality test is deliberate: only values equal to True
    # (not every truthy value) map to "1".
    return "1" if b == True else "0"

# Defining a simulation class
# For any given flexibility simulation it is defined by the sequence inserted and which run it was in

class simulation:
    """Tool outputs and correctness flags for one flexibility simulation.

    A simulation is identified by the inserted sequence and the run it was
    in. Constructing an instance reads that pair's result files from the
    ``flex{run}_output`` directory and compares the genes each tool reported
    with the expected answer.

    Uses the module-level ``correct_dict`` and ``link`` objects and the
    ``conv_string`` helper.

    Attributes:
        sequence, run: the constructor arguments.
        sim_fl: placeholder, currently always "".
        correct: expected call for ``sequence``, taken from ``correct_dict``.
        origname, origname_seq: entries 1 and 2 of the ``link`` row for
            ``correct``.
        abricate_*, ariba_*, kmerres_*, srst2_*: for each tool the path of
            its result file (``_fl``), the parsed table (``_result``) and
            the list of genes it reported (``_genes``).
        matching: four booleans in the order ABRicate, ARIBA,
            KmerResistance, SRST2; each is True only when that tool's gene
            list equals ``[correct]``.
        matching_string: ``matching`` rendered as e.g. "1:0:1:1".
        any_error: True when at least one entry of ``matching`` is False.
    """

    def __init__(self, sequence, run):
        self.sequence = sequence
        self.sim_fl = ""  # I need to pull these back off analysis 1
        self.run = run
        self.correct = correct_dict[self.sequence]
        link_row = link.loc[self.correct]
        self.origname = link_row[1]
        self.origname_seq = link_row[2]

        # ABRicate
        self.abricate_fl = "flex{0}_output/abricate_summary/{1}_assem.tab".format(self.run, self.sequence)
        self.abricate_result = pd.read_csv(self.abricate_fl, delimiter="\t")
        self.abricate_genes = list(self.abricate_result["GENE"])

        # ARIBA
        self.ariba_fl = "flex{0}_output/ariba_summary/{1}.expandedreport.tsv".format(self.run, self.sequence)
        self.ariba_result = pd.read_csv(self.ariba_fl, delimiter="\t")
        self.ariba_genes = list(self.ariba_result['ref_name'])

        # KmerResistance
        self.kmerres_fl = "flex{0}_output/kmerres_summary/{1}.KmerRes".format(self.run, self.sequence)
        self.kmerres_result = pd.read_csv(self.kmerres_fl, delimiter="\t")
        # The cutoff is applied here by hand because KmerResistance doesn't
        # appear to apply it itself; its files suggest the "template id
        # cutoff" should be 70.0.
        above_cutoff = self.kmerres_result.template_id > 70.0
        self.kmerres_result = self.kmerres_result.loc[above_cutoff]
        # Unique, sorted template names, restricted to "resfindernewid" ones.
        self.kmerres_genes = sorted(
            {name for name in self.kmerres_result['#Template'] if "resfindernewid" in str(name)}
        )

        # SRST2
        # If SRST2 doesn't find anything it doesn't create a results file at
        # its final reporting stage. Those cases are tagged "Empty" so they
        # can be pulled out if you want to look at the original files.
        self.srst2_fl = "flex{0}_output/srst2_summary/{1}_SRST.out__fullgenes__seqs_clustered__results.txt".format(self.run, self.sequence)
        if not os.path.isfile(self.srst2_fl):
            self.srst2_result = "Empty"
            self.srst2_genes = []
        else:
            self.srst2_result = pd.read_csv(self.srst2_fl, delimiter="\t")
            self.srst2_genes = sorted(set(self.srst2_result.allele))

        # Output investigation: a tool matches only if it reports exactly
        # the one expected gene and nothing else.
        expected = [self.correct]
        per_tool_genes = (
            self.abricate_genes,
            self.ariba_genes,
            self.kmerres_genes,
            self.srst2_genes,
        )
        self.matching = [genes == expected for genes in per_tool_genes]
        self.matching_string = ":".join(map(conv_string, self.matching))
        self.any_error = not all(self.matching)
        
        
        

In [6]:
# sims[run][sequence] stores the simulation object for every run/sequence pair.
sims = {}
# simulation_outcome[sequence] is a plain list with one any_error flag per run
# (True means at least one tool did not return exactly the expected gene).
simulation_outcome = {}

# This reads in all the results.
# Note we use range(1, 11) as there were 10 repeats.
# The flex{n}_output directories must be present in the current directory.

for r in range(1, 11):
    sims[r] = {}
    # Iterate over the keys directly instead of indexing into a key list
    # that is rebuilt on every iteration.
    for k in tqdm_notebook(correct_dict):
        x = simulation(k, r)
        sims[r][k] = x
        simulation_outcome.setdefault(k, []).append(x.any_error)

FileNotFoundError: [Errno 2] File flex1_output/abricate_summary/flex_0_assem.tab does not exist: 'flex1_output/abricate_summary/flex_0_assem.tab'

In [12]:
# You can then use this dictionary to query what you want.
# simulation_outcome[k] is a list of per-run any_error booleans, so the list
# itself is searched for True (a list has no .any_error attribute).
errors_list = [k for k in simulation_outcome if True in simulation_outcome[k]]

print(len(errors_list))

1080


In [None]:
# Also we use it to make our fig as seen in the paper

# Report the sequences whose outcome differs between runs: an error in some
# of the repeats but not in others.
for k, outcomes in simulation_outcome.items():
    if True in outcomes and False in outcomes:
        print(k, Counter(outcomes))

# So overall there are 8 which have been variably called.
# Also let's look at the patterns of miscalls.

# For every sequence, collect its matching string from each run.
matching_strings = {}

for r in sims:
    for k in sims[r]:
        matching_strings.setdefault(k, []).append(sims[r][k].matching_string)

# One label per sequence for the figure: its matching string when that is
# identical in every run, otherwise the placeholder "*:*:*:*".
picture_strings = []
m = 0  # number of sequences whose matching string varied across runs
for k in matching_strings:
    if len(set(matching_strings[k])) > 1:
        m += 1
        picture_strings.append("*:*:*:*")
#         print(k, matching_strings[k])
    else:
        picture_strings.append(matching_strings[k][0])

bar_totals = Counter(picture_strings)
# So overall we have 42 with variable calling (8 occasionally all correct, all other 34 always at least one incorrect)
# I think the easiest way to manage this for the picture is to rename them "variable".
# We therefore replace these strings with "*:*:*:*" for working things out.

# Bar order for the figure. Each label is a matching string whose positions
# are ABRicate:ARIBA:KmerResistance:SRST2 (1 = exactly the expected gene).
ax_labs = ['1:1:1:0', '1:0:1:0', '1:0:1:1', '*:*:*:*', '0:1:1:1',
           '0:0:1:1', '0:1:1:0', '0:0:1:0', '0:1:0:1', '0:0:0:0']

# Note "*:*:*:*" means variably classified across the 10 repeats.

fig, ax = plt.subplots(figsize=(8, 6), dpi=300)

xs = list(range(len(ax_labs)))
# bar_totals is a Counter, so a label that never occurred yields 0.
ys = [bar_totals[label] for label in ax_labs]

print(xs, ys)

ax.bar(xs, ys, color="#c51b8a", width=0.75)
ax.set_xticks(xs)
ax.set_xticklabels(ax_labs, size=14, rotation=45)
# Note for the final graph we use those color labels seen in the text
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.set_ylabel("Number of incorrect simulations", size=16)

# This then makes essentially the final image.
# For anyone keen on the numbers, the bar totals dictionary is printed below as well.

print(bar_totals)

plt.show()
