In [1]:
#THIS NOTEBOOK GENERATES ALL FIGURES IN CATALÁN ET AL. (2020)

In [2]:
from functions import *

In [None]:
#PREPARE DATASET

#DOWNLOAD DATASET FROM https://amr.theodi.org/programmes/atlas
#URL: https://s3-eu-west-1.amazonaws.com/amr-prototype-data/Open+Atlas_Reuse_Data.xlsx

#THE ATLAS DATASET HAS TO BE PREPARED TO WORK WITH
#FIRST YOU HAVE TO MANUALLY REMOVE THE FIRST FIVE ROWS FROM THE EXCEL FILE
#NEXT WE READ THE EXCEL FILE
DF=pd.read_excel('data/Atlas_Reuse_Data.xlsx').rename(columns={'In / Out Patient': 'InOutPatient'})
#FOR EACH KEY COLUMN, COUNT HOW MANY ROWS CARRY EACH VALUE AND WRITE ONE
#TAB-SEPARATED FILE PER KEY (value <TAB> number of rows), SORTED BY VALUE
keys=['Year', 'Country', 'Species']
mkeys={}
for key in keys:
    dL=DF[key].tolist()
    counts={}
    for value in dL:
        #values are counted under their string form
        label=str(value)
        counts[label]=counts.get(label,0)+1
    mkeys[key]=counts
    with open('data/key_'+key+'.txt','w') as f:
        #every label is a str, so plain lexicographic order is used
        for label in sorted(counts):
            f.write(label+'\t'+str(counts[label])+'\n')

#WE WANT TO LOG2-TRANSFORM THE MIC VALUES IN THE DATASET
#BUT MIC VALUES APPEAR IN A VARIETY OF WAYS THROUGHOUT THE DATASET
#FILE data/key_MIC_values.txt HOLDS ALL MIC VALUES IN THE DATASET
#IN THE FIRST COLUMN. IN THE SECOND WE HAVE MODIFIED THEM, AND IN THE
#THIRD WE HAVE THE LOG2-TRANSFORMED VALUES
#Each entry of MICdict is [raw MIC string, log2 value as string, length of raw string].
#Only the first and third whitespace-separated columns of the key file are used.
MICdict=[]
with open('data/key_MIC_values.txt', 'r') as f:
    for fields in (line.split() for line in f):
        raw=fields[0]
        log2_value=float(fields[2])
        MICdict.append([raw, str(log2_value), len(raw)])
#longest raw strings first (presumably so that a short MIC string contained in a
#longer one is not substituted before the longer one)
MICdict.sort(key=itemgetter(2), reverse=True)

#WE ASSIGN A RANDOM KEY TO EACH MIC VALUE BEFORE SUBSTITUTING
import string
import random
def randomString(stringLength=10):
    """Return a random string made of `stringLength` lowercase ASCII letters.

    Letters are drawn one at a time with random.choice, so the result depends
    on the state of the global `random` generator.
    """
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(stringLength):
        picked.append(random.choice(alphabet))
    return ''.join(picked)
#one random placeholder (lowercase letters only) per MIC string
eggs=[randomString() for z in MICdict]

#NOW WE SUBSTITUTE THEM IN THE DATASET
#The substitution is done in two passes per drug column:
#  pass 1: raw MIC string -> placeholder
#  pass 2: placeholder    -> log2 value (as string)
#so the numeric log2 strings are only written once every raw MIC string is gone.
#regex=False: MIC strings must be matched literally (they may contain
#characters such as '.' that are special in regular expressions), exactly as
#is already done for the species names below.
drugs=read_key('Drugs', 'data/')
for c1 in drugs:
    DF[c1]=DF[c1].astype('str')
    for x1,y1 in zip(MICdict,eggs):
        DF[c1]=DF[c1].str.replace(x1[0],y1,regex=False)
    for x1,y1 in zip(eggs,MICdict):
        DF[c1]=DF[c1].str.replace(x1,y1[1],regex=False)
    #fails loudly if any cell still holds text that is not a number
    DF[c1]=DF[c1].astype('float64')
DF['Species']=DF['Species'].astype('str')
#FINALLY, WE UNIFORMIZE SOME SPECIES
DF['Species']=DF['Species'].str.replace('Enterobacter aerogenes', 'Klebsiella aerogenes', regex=False)
DF['Species']=DF['Species'].str.replace('Klebsiella (Enterobacter) aerogenes', 'Klebsiella aerogenes', regex=False)
DF['Species']=DF['Species'].str.replace('Klebsiella planticola', 'Raoultella planticola', regex=False)
DF.to_excel('data/atlas_treated.xlsx')
#WRITE NEW SPECIES FILE
#count rows per species name (after the renaming above) and overwrite the
#species key file with one "name <TAB> count" line per species, sorted by name
dL=DF['Species'].tolist()
mkeys={}
for name in dL:
    name=str(name)
    mkeys[name]=mkeys.get(name,0)+1
with open('data/key_Species.txt','w') as f:
    #all keys are str, so plain lexicographic order is used
    f.writelines(name+'\t'+str(mkeys[name])+'\n' for name in sorted(mkeys))

In [None]:
#READ DATASET
#read_dataset is not defined in this notebook (presumably it comes from `functions`).
#How the four results are used further down:
#  DF    - table with one row per isolate (columns Year, Country, Species and one column per drug)
#  pL    - iterable of species names to analyse
#  bP    - nested mapping; bP[sp][c1][0] is used as the resistance threshold of drug c1 for species sp
#  drugs - iterable of drug column names
DF, pL, bP, drugs=read_dataset('data/')

In [3]:
#COMPARE DATASETS

#FIRST WE OBTAIN AVERAGE MIC DATA IN ATLAS
#One output row per (species, drug, country, year) that has at least one
#non-missing MIC value. MIC averages are reported relative to the threshold
#bP[sp][c1][0]; an isolate counts as resistant when its value is above it.
L=[]
pC=get_key(DF,'Country')
for country in pC:#for every country
    in_country=DF[DF.Country==country]
    if in_country.empty:
        continue
    for sp in pL:#for every pathogen
        isolates=in_country[in_country.Species==sp]
        if isolates.empty:
            continue
        for c1 in drugs:#for every drug
            #only drugs that have a threshold for this pathogen
            if c1 not in bP[sp]:
                continue
            tested=pd.DataFrame({'Year': isolates['Year'], c1: isolates[c1]}).dropna()
            if tested.empty:
                continue
            for y in get_key(tested, 'Year'):#for every year
                values=tested.loc[tested.Year==y, c1]
                N=len(values)
                if N==0:
                    continue
                threshold=bP[sp][c1][0]
                #no missing values are left, so counting True entries gives the resistant cases
                R=(values>threshold).sum()
                L.append([sp,c1,country,y,
                          values.mean()-threshold,
                          values.sem(),
                          values.std(),
                          N,R,R/N])
#SAVE ALL IN DATAFRAME AND WRITE TO FILE
columns=['Species', 'Antibiotic', 'Country', 'Year', 'Average MIC', 'Average MIC SEM', 'Average MIC SD', 'Number of cases', 'Number of resistant cases', 'Fraction of resistant cases']
MICdf=pd.DataFrame.from_records(L, columns=columns)
MICdf.to_csv('data/average_mic_per_year.csv', float_format='%.3f', na_rep='NaN', index=False)

#%%NOW WE COMPARE ATLAS VERSUS ECDC DATA

#READ AVERAGE MIC DATA
#The averaged table gets its own name: DF keeps holding the isolate-level
#dataset, whose drug columns are still indexed further down in the notebook.
avgDF=pd.read_csv('data/average_mic_per_year.csv')
#READ ECDC DATA
RD=pd.read_csv('data/europe_resistance_data.csv')
#we want to compare fraction of resistance in both datasets
#(.copy() so that the column assignments below act on an independent frame)
RD=RD[RD.Indicator=='Nonsusceptible proportion'].copy()
RD['Year']=RD['Year'].astype('int64')
RD['Value']=RD['Value'].astype('float64')
#ECDC resistance data comes aggregated for drug classes and for different
#pathogens. For instance, for Acinetobacter it reports resistance to
#'Carbapenems'. We therefore compare with resistance to all drugs tested
#in ATLAS for that pathogen-drug class combination, as specified in the
#ECDC website.
#Also, Acinetobacter is reported as a genus, but we here only compare with
#A.baumannii, which is by far the most important pathogen
RD['Species']=RD['Species'].str.replace('Acinetobacter spp.', 'Acinetobacter baumannii', regex=False)
#the keys are read AFTER the renaming, otherwise the species list would still
#contain 'Acinetobacter spp.' and that pathogen would never match ATLAS
spR=get_key(RD, 'Species')
cR=get_key(RD, 'Country')
dR=get_key(RD, 'Antibiotic')
#Rant[species][ECDC antibiotic label] -> list of ATLAS antibiotic names to average over
Rant={}
Rant['Acinetobacter baumannii']={'Carbapenems':['Imipenem', 'Meropenem']}
Rant['Enterococcus faecalis']={'Highlevel gentamicin':['Gentamicin'],
    'Vancomycin':['Vancomycin']}
Rant['Enterococcus faecium']={'Highlevel gentamicin':['Gentamicin'],
    'Vancomycin':['Vancomycin']}
Rant['Escherichia coli']={'Carbapenems':['Imipenem', 'Meropenem']}
Rant['Klebsiella pneumoniae']={'Carbapenems':['Imipenem', 'Meropenem']}
Rant['Pseudomonas aeruginosa']={'Carbapenems':['Imipenem', 'Meropenem'],
    'Ceftazidime':['Ceftazidime'],
    'PiperacillinTazobactam':['Piperacillin tazobactam']}
Rant['Staphylococcus aureus']={'Meticillin (MRSA)':['Oxacillin']}
Rant['Streptococcus pneumoniae']={'Macrolides':['Erythromycin', 'Clarithromycin', 'Azithromycin'],
    'Penicillins': ['Penicillin', 'Oxacillin']}

#COMPARE RESISTANCE RATES
#One output row per (species, ECDC antibiotic label, year, country) present in
#both datasets. Both values are percentages; Diff = ATLAS - ECDC.
yL=list(range(2004,2018))
L=[]
for sp, country in itertools.product(spR, cR):
    #species without a drug mapping cannot be compared
    if sp not in Rant: continue
    DFsp=avgDF[(avgDF.Species==sp) & (avgDF.Country==country)]
    RDsp=RD[(RD.Species==sp) & (RD.Country==country)]
    if DFsp.empty or RDsp.empty: continue
    for key,value in Rant[sp].items():
        DFc=DFsp[DFsp.Antibiotic.isin(value)]
        RDc=RDsp[RDsp.Antibiotic==key]
        for y in yL:
            DFy=DFc[DFc.Year==y]
            RDy=RDc[RDc.Year==y]
            if DFy.empty or RDy.empty: continue
            x1=RDy.Value.mean()
            if np.isnan(x1): continue
            #ATLAS fraction is averaged over the mapped drugs and turned into a percentage
            y1=DFy['Fraction of resistant cases'].mean()*100
            diff=y1-x1
            L.append([sp,key,y,country,x1,y1,diff])
NewDF=pd.DataFrame.from_records(L, columns=['Species','Antibiotic','Year','Country','ECDCValue','ATLASValue','Diff'])
#NOTE(review): writing the legacy '.xls' format needs an engine that recent pandas
#releases no longer ship - confirm against the pandas version in use
NewDF.to_excel('data/atlas_vs_ecdc.xls', float_format='%.3f', index=False)

#%%NOW WE COMPARE ATLAS VERSUS RESISTANCEMAP DATA

#READ AVERAGE MIC DATA
#The averaged table gets its own name: DF keeps holding the isolate-level
#dataset, whose drug columns are still indexed further down in the notebook.
avgDF=pd.read_csv('data/average_mic_per_year.csv')
#READ RESISTANCEMAP DATA
RD=pd.read_csv('data/resistmap_data.csv')
RD['year']=RD['year'].astype('int64')
RD['value']=RD['value'].astype('float64')
RD['IsolatesTested']=RD['IsolatesTested'].astype('float64')
spR=get_key(RD, 'Organism')
cR=get_key(RD, 'CountryName')
dR=get_key(RD, 'Antimicrobial')
#DATA IN RESISTANCEMAP IS AGGREGATED IN A SIMILAR WAY AS IN ECDC
#Rant[organism][ResistanceMap antimicrobial label] -> list of ATLAS antibiotic names to average over
Rant={}
Rant['Acinetobacter baumannii']={'Amikacin':['Amikacin'], 
    'Ampicillin-sulbactam':['Ampicillin sulbactam'],
    'Carbapenems':['Imipenem', 'Meropenem'],
    'Ceftazidime':['Ceftazidime'],
    'Glycylcyclines':['Tigecycline']}
#NOTE(review): this entry is only used if the ATLAS 'Species' column contains the
#literal name 'Enterobacter aerogenes/cloacae' - confirm, otherwise it is never compared
Rant['Enterobacter aerogenes/cloacae']={'Amoxicillin-clavulanate':['Amoxicillin clavulanate'],
    'Carbapenems':['Imipenem', 'Meropenem'],
    'Glycylcyclines':['Tigecycline'],
    'Piperacillin-tazobactam':['Piperacillin tazobactam']}
Rant['Enterococcus faecalis']={'Aminoglycosides (high-level)':['Gentamicin'],
    'Vancomycin':['Vancomycin']}
Rant['Enterococcus faecium']={'Vancomycin':['Vancomycin']}
Rant['Escherichia coli']={'Amoxicillin-clavulanate':['Amoxicillin clavulanate'],
    'Carbapenems':['Imipenem', 'Meropenem'],
    'Glycylcyclines':['Tigecycline'],
    'Piperacillin-tazobactam':['Piperacillin tazobactam']}
Rant['Klebsiella pneumoniae']={'Amoxicillin-clavulanate':['Amoxicillin clavulanate'],
    'Carbapenems':['Imipenem', 'Meropenem'],
    'Glycylcyclines':['Tigecycline'],
    'Piperacillin-tazobactam':['Piperacillin tazobactam']}
Rant['Pseudomonas aeruginosa']={'Amikacin':['Amikacin'],
    'Carbapenems':['Imipenem', 'Meropenem'],
    'Ceftazidime':['Ceftazidime'],
    'Piperacillin-tazobactam':['Piperacillin tazobactam']}
Rant['Staphylococcus aureus']={'Linezolid':['Linezolid'],
    'Oxacillin (MRSA)':['Oxacillin'],
    'Vancomycin':['Vancomycin']}
Rant['Streptococcus pneumoniae']={'Macrolides':['Erythromycin', 'Clarithromycin', 'Azithromycin']}

#COMPARE RESISTANCE RATES
#One output row per (organism, antimicrobial label, year, country) present in
#both datasets. Values are percentages; Diff = ATLAS - ResistanceMap.
yL=list(range(2004,2018))
L=[]
for sp, country in itertools.product(spR, cR):
    #organisms without a drug mapping cannot be compared
    if sp not in Rant: continue
    DFsp=avgDF[(avgDF.Species==sp) & (avgDF.Country==country)]
    RDsp=RD[(RD.Organism==sp) & (RD.CountryName==country)]
    if DFsp.empty or RDsp.empty: continue
    for key,value in Rant[sp].items():
        DFc=DFsp[DFsp.Antibiotic.isin(value)]
        RDc=RDsp[RDsp.Antimicrobial==key]
        for y in yL:
            DFy=DFc[DFc.Year==y]
            RDy=RDc[RDc.year==y]
            if DFy.empty or RDy.empty: continue
            x1=RDy.value.mean()
            xhigh=RDy.CIHigh.mean()
            xlow=RDy.CILow.mean()
            if np.isnan(x1): continue
            #ATLAS fraction is averaged over the mapped drugs and turned into a percentage
            y1=DFy['Fraction of resistant cases'].mean()*100
            diff=y1-x1
            L.append([sp,key,y,country,x1,xhigh,xlow,y1,diff])

NewDF=pd.DataFrame.from_records(L, columns=['Species','Antibiotic','Year','Country','RMapValue','RMapCIHigh', 'RMapCILow','ATLASValue','Diff'])
#NOTE(review): writing the legacy '.xls' format needs an engine that recent pandas
#releases no longer ship - confirm against the pandas version in use
NewDF.to_excel('data/atlas_vs_resistancemap.xls', float_format='%.3f', index=False)

Staphylococcus aureus 113693
Escherichia coli 80500
Klebsiella pneumoniae 64296
Pseudomonas aeruginosa 61799
Enterobacter cloacae 39391
Streptococcus pneumoniae 36765
Haemophilus influenzae 26209
Acinetobacter baumannii 25932
Enterococcus faecalis 23998
Streptococcus agalactiae 20924
Serratia marcescens 20536
Klebsiella oxytoca 14332
Klebsiella aerogenes 14045
Enterococcus faecium 11960
Streptococcus pyogenes 9357
Staphylococcus epidermidis 8582
Citrobacter freundii 5349
Proteus mirabilis 4898
Citrobacter koseri 4269
Staphylococcus haemolyticus 3819
Bacteroides fragilis 2799
Stenotrophomonas maltophilia 2267
Morganella morganii 2206
Acinetobacter pitii 2172
Proteus vulgaris 2114
Enterobacter asburiae 1961
Acinetobacter lwoffii 1578
Streptococcus dysgalactiae 1491
Clostridioides (Clostridium) difficile 1468
Clostridium perfringens 984
Staphylococcus hominis 834
Moraxella catarrhalis 797
Klebsiella variicola 763
Streptococcus anginosus 748
Enterobacter kobei 729
Parvimonas micra 718
Prev

In [4]:
#ATLAS QUALITY

#%%COMPUTE CORRELATIONS BETWEEN YEARS FOR EACH PA
import os
#For every species-drug pair, build one normalised histogram of the MIC values
#per year and correlate the histograms of different years with each other.
#Two files are written per pair: the correlation matrix and the list of years
#(in the same order as the rows of the matrix).
bins=np.arange(-10.5,11.5,1)
#np.savetxt does not create directories, so make sure the target exists
os.makedirs('results/correlations/', exist_ok=True)
for sp in pL:
    #the species filter does not depend on the drug, so it is done once per species
    DFsp=DF[DF.Species==sp]
    for c1 in drugs:
        DFc=pd.DataFrame({'Year': DFsp['Year'], c1: DFsp[c1]})
        DFc=DFc.dropna()
        if DFc.empty: continue
        Y=get_key(DFc,'Year')
        #H holds the distribution of MIC values each year (one row per year)
        H=np.array([np.histogram(DFc[DFc.Year==y][c1], bins=bins, density=True)[0] for y in Y])
        R=np.corrcoef(H)#correlation between the MIC distributions in different years
        #a single year gives a single number, not a matrix: nothing to save
        if R.size>1:
            np.savetxt('results/correlations/'+sp+'_'+c1+'.txt',R,fmt='%.3f')
            np.savetxt('results/correlations/'+sp+'_'+c1+'_years.txt',Y, fmt='%d')
#%%COMPUTE TAU FOR EVERY PA PAIR  
from os import path
#For every species-drug pair with a saved correlation matrix R (n x n), record
#  TauN = sum of all squared entries of R
#  Tau  = (TauN - n)/(n**2 - n)
#(i.e. the mean squared off-diagonal entry, assuming the diagonal of R is 1)
#and write the pairs to file sorted by decreasing Tau.
Dl=[]
for sp,c1 in itertools.product(pL,drugs):
    if not path.exists('results/correlations/'+sp+'_'+c1+'.txt'):
        continue
    R,Y=read_corr(sp, c1)
    n,_=R.shape
    squared_sum=np.sum(R**2)
    Dl.append([sp,c1,(squared_sum-n)/(n**2-n),squared_sum,n])
Dl.sort(key=itemgetter(2), reverse=True)
DFdist=pd.DataFrame.from_records(Dl, columns=['Species', 'Antibiotic', 'Tau', 'TauN', 'N'])
DFdist.to_csv('results/correlations/tau_test.csv', float_format='%.3f', index=False)

Staphylococcus aureus 113693
Escherichia coli 80500
Klebsiella pneumoniae 64296
Pseudomonas aeruginosa 61799
Enterobacter cloacae 39391
Streptococcus pneumoniae 36765
Haemophilus influenzae 26209
Acinetobacter baumannii 25932
Enterococcus faecalis 23998
Streptococcus agalactiae 20924
Serratia marcescens 20536
Klebsiella oxytoca 14332
Klebsiella aerogenes 14045
Enterococcus faecium 11960
Streptococcus pyogenes 9357
Staphylococcus epidermidis 8582
Citrobacter freundii 5349
Proteus mirabilis 4898
Citrobacter koseri 4269
Staphylococcus haemolyticus 3819
Bacteroides fragilis 2799
Stenotrophomonas maltophilia 2267
Morganella morganii 2206
Acinetobacter pitii 2172
Proteus vulgaris 2114
Enterobacter asburiae 1961
Acinetobacter lwoffii 1578
Streptococcus dysgalactiae 1491
Clostridioides (Clostridium) difficile 1468
Clostridium perfringens 984
Staphylococcus hominis 834
Moraxella catarrhalis 797
Klebsiella variicola 763
Streptococcus anginosus 748
Enterobacter kobei 729
Parvimonas micra 718
Prev

In [5]:
#GLOBAL MIC CHANGE (FOR CLUSTERMAPS)
#Call mic_change for every species-drug pair on all isolates of that species
#and keep the results that are not empty/falsy.
L=[]
for sp, c1 in itertools.product(pL, drugs):
    M=mic_change(sp,c1,DF[DF.Species==sp])
    if M:
        L.append(M)
#one row per pair that produced a result
columns=['Species', 'Antibiotic', 'MIC change', 'MIC change error', 'Intercept']
MIC=pd.DataFrame.from_records(L, columns=columns)
#WRITE TO FILE
MIC.to_csv('data/mic_change_global.csv', na_rep='NaN', float_format='%.3f', index=False)
        
#EUROPEAN MIC CHANGE
RD=pd.read_csv('data/europe_resistance_data.csv')
#GET LIST OF EUROPEAN COUNTRIES FROM ECDC DATA
cR=get_key(RD, 'Country')
#'Slovakia' is replaced by 'Slovak Republic' before matching against DF.Country
#(presumably the spelling used in ATLAS - confirm)
cR=['Slovak Republic' if c=='Slovakia' else c for c in cR]
DFE=DF[DF.Country.isin(cR)]
#start from an empty list so that only European results end up in the European file
L=[]
for sp,c1 in itertools.product(pL, drugs):
    DFsp=DFE[DFE.Species==sp]
    M=mic_change(sp,c1,DFsp)
    #the result of THIS pair is checked (it must be computed before the test)
    if not M: continue
    L.append(M)
MIC=pd.DataFrame.from_records(L, columns=['Species', 'Antibiotic', 'MIC change', 'MIC change error', 'Intercept'])
#WRITE TO FILE
MIC.to_csv('data/mic_change_europe.csv', na_rep='NaN', float_format='%.3f', index=False)