## COMBINING RESULTS FROM mlplasmids AND abricate by isolate and database

This notebook combines the results of mlplasmids (prediction of each contig as plasmid or chromosome) with those of abricate (presence/absence of genes of interest). For each database used, the genes found on plasmid-predicted contigs and on chromosome-predicted contigs are listed per isolate.

In [1]:
import pandas as pd
import os
import json

# Show complete tables in cell outputs: no limit on the number of displayed
# columns or rows, and no truncation of long cell text.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# None is the supported "no truncation" value. The former -1 sentinel was
# deprecated in pandas 1.0 and is rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

In [2]:
# Input tables, resolved relative to the working directory: the mlplasmids
# predictions and one abricate report per database.
mlplasmid = "mlplasmids_07_curated.csv"
abricateR, abricateV, abricateP = (
    "resfinder_results_cov80id90.csv",      # resfinder database
    "vfkp_results_cov80id90.csv",           # vfkp database
    "plasmidfinder_results_cov80id90.csv",  # plasmidfinder database
)

In [3]:
# All four tables are read as comma-separated files. If unsure, open the CSV
# files in a text editor first and adjust `sep` to the separator they use.
dfml, dfabR, dfabV, dfabP = [
    pd.read_csv(csv_path, sep=',')
    for csv_path in (mlplasmid, abricateR, abricateV, abricateP)
]

In [4]:
# Preview the first rows of the mlplasmids table to see which columns were read.
dfml.head()

Unnamed: 0.1,Unnamed: 0,Prob_Chromosome,Prob_Plasmid,Prediction,Isolate_contig,Contig_length
0,1,0.998197,0.001803,Chromosome,544651_contig_1,116416
1,2,0.997998,0.002002,Chromosome,544651_contig_2,461922
2,3,0.995752,0.004248,Chromosome,544651_contig_3,100222
3,4,0.995657,0.004343,Chromosome,544651_contig_4,34295
4,5,0.999379,0.000621,Chromosome,544651_contig_5,162471


In [5]:
# Keep only the contig identifier (the merge key used later), both class
# probabilities and the final call; every other column is discarded.
# Alternative without the probabilities: ['Isolate_contig', 'Prediction'].
dfml = dfml.loc[:, [
    'Isolate_contig',
    'Prob_Chromosome',
    'Prob_Plasmid',
    'Prediction',
]]

In [6]:
# Check that only Isolate_contig, the two probabilities and Prediction remain.
dfml.head()

Unnamed: 0,Isolate_contig,Prob_Chromosome,Prob_Plasmid,Prediction
0,544651_contig_1,0.998197,0.001803,Chromosome
1,544651_contig_2,0.997998,0.002002,Chromosome
2,544651_contig_3,0.995752,0.004248,Chromosome
3,544651_contig_4,0.995657,0.004343,Chromosome
4,544651_contig_5,0.999379,0.000621,Chromosome


In [7]:
# Preview the abricate resfinder table; Isolate_contig is the column it shares
# with the mlplasmids table.
dfabR.head()

Unnamed: 0,Isolate_contig,START,END,STRAND,GENE,COVERAGE,COVERAGE_MAP,GAPS,%COVERAGE,%IDENTITY,DATABASE,ACCESSION,PRODUCT,RESISTANCE
0,544651_contig_2,37029,38204,+,oqxA_1,1-1176/1176,===============,0/0,100.0,98.89,resfinder,EU370913,oqxA,Nalidixic_acid;Ciprofloxacin
1,544651_contig_2,38228,41380,+,oqxB_1,1-3153/3153,===============,0/0,100.0,98.57,resfinder,EU370913,oqxB,Nalidixic_acid;Ciprofloxacin
2,544651_contig_20,22760,23179,+,fosA6_1,1-420/433,===============,0/0,97.0,99.76,resfinder,KU254579,fosA6,Fosfomycin
3,544651_contig_41,4350,5210,-,blaSHV-1_1,1-861/861,===============,0/0,100.0,100.0,resfinder,AF148850,blaSHV-1,Amoxicillin;Ampicillin;Cephalothin;Piperacillin;Ticarcillin
4,544651_contig_59,54,851,+,blaOXA-48_1,1-798/798,===============,0/0,100.0,100.0,resfinder,AY236073,blaOXA-48,Amoxicillin;Amoxicillin+Clavulanic_acid;Ampicillin;Ampicillin+Clavulanic_acid;Imipenem;Meropenem;Piperacillin;Piperacillin+Tazobactam


In [8]:
# Annotate every abricate hit with the mlplasmids result for its contig.
# how='left' keeps all hits: a hit whose Isolate_contig is absent from dfml
# gets NaN in the columns coming from dfml.
# validate='many_to_one' makes pandas raise a MergeError when an Isolate_contig
# occurs more than once in dfml, instead of silently duplicating hit rows.
dfR = pd.merge(dfabR, dfml, how='left', on=['Isolate_contig'], validate='many_to_one')
dfV = pd.merge(dfabV, dfml, how='left', on=['Isolate_contig'], validate='many_to_one')
dfP = pd.merge(dfabP, dfml, how='left', on=['Isolate_contig'], validate='many_to_one')

In [9]:
# Optional: uncomment to save the full merged tables dfR, dfV and dfP as CSV.
#dfR.to_csv('mlplasmids07_resistance_intersect.csv', index=False)
#dfV.to_csv('mlplasmids07_virulenceKp_intersect.csv', index=False)
#dfP.to_csv('mlplasmids07_plasREP_intersect.csv', index=False)

In [10]:
# (rows, columns) of the resfinder table before merging, for comparison with
# the merged table.
dfabR.shape

(1633, 14)

In [11]:
# (rows, columns) after the left merge. The row count equals that of dfabR as
# long as no Isolate_contig is repeated in dfml; the extra columns come from dfml.
dfR.shape

(1633, 17)

In [12]:
# Preview the merged table: each resfinder hit now carries the mlplasmids
# columns of its contig.
dfR.head()

Unnamed: 0,Isolate_contig,START,END,STRAND,GENE,COVERAGE,COVERAGE_MAP,GAPS,%COVERAGE,%IDENTITY,DATABASE,ACCESSION,PRODUCT,RESISTANCE,Prob_Chromosome,Prob_Plasmid,Prediction
0,544651_contig_2,37029,38204,+,oqxA_1,1-1176/1176,===============,0/0,100.0,98.89,resfinder,EU370913,oqxA,Nalidixic_acid;Ciprofloxacin,0.997998,0.002002,Chromosome
1,544651_contig_2,38228,41380,+,oqxB_1,1-3153/3153,===============,0/0,100.0,98.57,resfinder,EU370913,oqxB,Nalidixic_acid;Ciprofloxacin,0.997998,0.002002,Chromosome
2,544651_contig_20,22760,23179,+,fosA6_1,1-420/433,===============,0/0,97.0,99.76,resfinder,KU254579,fosA6,Fosfomycin,0.999436,0.000564,Chromosome
3,544651_contig_41,4350,5210,-,blaSHV-1_1,1-861/861,===============,0/0,100.0,100.0,resfinder,AF148850,blaSHV-1,Amoxicillin;Ampicillin;Cephalothin;Piperacillin;Ticarcillin,0.898761,0.101239,Chromosome
4,544651_contig_59,54,851,+,blaOXA-48_1,1-798/798,===============,0/0,100.0,100.0,resfinder,AY236073,blaOXA-48,Amoxicillin;Amoxicillin+Clavulanic_acid;Ampicillin;Ampicillin+Clavulanic_acid;Imipenem;Meropenem;Piperacillin;Piperacillin+Tazobactam,0.099711,0.900289,Plasmid


In [13]:
# The isolate ID is the text before the first underscore of the contig name
# (e.g. '544651_contig_2' -> '544651'); it is stored in a new 'sample' column.
# NOTE(review): assumes isolate IDs themselves contain no '_' - confirm for new datasets.
dfR['sample'] = dfR['Isolate_contig'].str.split('_', n=1).str[0]
dfV['sample'] = dfV['Isolate_contig'].str.split('_', n=1).str[0]
dfP['sample'] = dfP['Isolate_contig'].str.split('_', n=1).str[0]

In [14]:
# Check the new 'sample' column on the merged resfinder table.
dfR.head()

Unnamed: 0,Isolate_contig,START,END,STRAND,GENE,COVERAGE,COVERAGE_MAP,GAPS,%COVERAGE,%IDENTITY,DATABASE,ACCESSION,PRODUCT,RESISTANCE,Prob_Chromosome,Prob_Plasmid,Prediction,sample
0,544651_contig_2,37029,38204,+,oqxA_1,1-1176/1176,===============,0/0,100.0,98.89,resfinder,EU370913,oqxA,Nalidixic_acid;Ciprofloxacin,0.997998,0.002002,Chromosome,544651
1,544651_contig_2,38228,41380,+,oqxB_1,1-3153/3153,===============,0/0,100.0,98.57,resfinder,EU370913,oqxB,Nalidixic_acid;Ciprofloxacin,0.997998,0.002002,Chromosome,544651
2,544651_contig_20,22760,23179,+,fosA6_1,1-420/433,===============,0/0,97.0,99.76,resfinder,KU254579,fosA6,Fosfomycin,0.999436,0.000564,Chromosome,544651
3,544651_contig_41,4350,5210,-,blaSHV-1_1,1-861/861,===============,0/0,100.0,100.0,resfinder,AF148850,blaSHV-1,Amoxicillin;Ampicillin;Cephalothin;Piperacillin;Ticarcillin,0.898761,0.101239,Chromosome,544651
4,544651_contig_59,54,851,+,blaOXA-48_1,1-798/798,===============,0/0,100.0,100.0,resfinder,AY236073,blaOXA-48,Amoxicillin;Amoxicillin+Clavulanic_acid;Ampicillin;Ampicillin+Clavulanic_acid;Imipenem;Meropenem;Piperacillin;Piperacillin+Tazobactam,0.099711,0.900289,Plasmid,544651


In [15]:
def _genes_per_sample_and_prediction(hits):
    """Collect the GENE values of `hits` into one list per (sample, Prediction).

    Parameters
    ----------
    hits : pandas.DataFrame
        Table with at least the columns 'sample', 'Prediction' and 'GENE'.

    Returns
    -------
    pandas.DataFrame
        Columns 'sample', 'Prediction' and 'genes', where 'genes' holds the
        list of GENE values of the group.

    Notes
    -----
    groupby drops rows whose key is NaN by default. Hits on contigs without an
    mlplasmids prediction have Prediction = NaN after a left merge, so they
    would silently disappear from the summary. dropna=False (pandas >= 1.1)
    keeps them as a separate group with an empty Prediction.
    """
    return (
        hits.groupby(['sample', 'Prediction'], dropna=False)['GENE']
        .apply(list)
        .reset_index(name='genes')
    )


# One summary table per database.
dfR_summary = _genes_per_sample_and_prediction(dfR)
dfV_summary = _genes_per_sample_and_prediction(dfV)
dfP_summary = _genes_per_sample_and_prediction(dfP)

In [16]:
# Display the resfinder summary: one row per (sample, Prediction) pair with the
# list of genes found.
dfR_summary

Unnamed: 0,sample,Prediction,genes
0,544651,Chromosome,"[oqxA_1, oqxB_1, fosA6_1, blaSHV-1_1]"
1,544651,Plasmid,[blaOXA-48_1]
2,544663,Chromosome,"[blaSHV-106_1, fosA_6]"
3,544663,Plasmid,"[aac(3)-IIa_1, blaOXA-1_1, aac(6')-Ib-cr_1, dfrA14_5, blaOXA-48_1]"
4,544680,Chromosome,"[fosA6_1, blaSHV-161_1, oqxA_1, oqxB_1]"
5,544680,Plasmid,"[tet(D)_1, blaOXA-48_1]"
6,544689,Chromosome,"[fosA_3, blaSHV-76_1, oqxA_1, oqxB_1]"
7,544689,Plasmid,"[sul2_2, aph(3'')-Ib_5, aph(6)-Id_1, blaTEM-1B_1, blaCTX-M-15_1, aac(6')-Ib-cr_1, blaOXA-1_1, dfrA14_5, aac(3)-IIa_1, qnrB1_1, blaOXA-48_1, tet(A)_6]"
8,544719,Chromosome,"[fosA_3, oqxB_1, oqxA_1, blaSHV-182_1, blaCTX-M-15_1]"
9,544719,Plasmid,"[aac(3)-IIa_1, catA1_1, qnrS1_1, blaTEM-1B_1, sul1_5, dfrA1_8, tet(A)_6, blaOXA-1_1, aac(6')-Ib-cr_1, blaOXA-48_1]"


In [17]:
# Write one summary table per database to the working directory; the row index
# is not written.
dfR_summary.to_csv(path_or_buf='summaryBySample_ml07_resistance.csv', index=False)
dfV_summary.to_csv(path_or_buf='summaryBySample_ml07_virulenceKp.csv', index=False)
dfP_summary.to_csv(path_or_buf='summaryBySample_ml07_plasREP.csv', index=False)