## Combining results from mlplasmids and abricate - plasmid replicon location

This notebook combines the results of mlplasmids (prediction of each contig as plasmid or chromosome) with the results of abricate (presence/absence of genes of interest). Here the abricate results are based on the PlasmidFinder database, i.e. plasmid replicon genes.

In [1]:
import pandas as pd
import os
import json

# Notebook display options: show every column and every row of a DataFrame,
# and never truncate the text inside a cell.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Use None (not -1) to disable truncation: negative values were deprecated in
# pandas 1.0 and are rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [2]:
# Input tables; both are read below as comma-separated CSV files.

# abricate hits (the file name indicates coverage 80 / identity 90 thresholds).
# If the abricate table only exists as an Excel workbook, the two commented
# lines show how to convert it (equivalent to "save as" CSV UTF-8
# comma-delimited in Excel); note they reference a cov70id90 file name.
#read_file = pd.read_excel('plasmidFinder_cov70id90.xlsx')
#read_file.to_csv('plasmidFinder_cov70id90.csv', index = None, header=True)
abricate = 'plasmidfinder_results_cov80id90.csv'

# mlplasmids per-contig predictions.
mlplasmid = 'mlplasmids_all_curated.csv'

In [3]:
# Load both tables as comma-separated files. If a file uses a different
# delimiter (open it in a text editor to check), change `sep` accordingly.
dfml = pd.read_csv(filepath_or_buffer=mlplasmid, sep=',')
dfab = pd.read_csv(filepath_or_buffer=abricate, sep=',')

In [4]:
# Preview the first rows of the mlplasmids table as loaded (all columns).
dfml.head()

Unnamed: 0.1,Unnamed: 0,Prob_Chromosome,Prob_Plasmid,Prediction,Isolate_contig,Contig_length,Contig_info
0,1,0.998197,0.001803,Chromosome,544651_contig_1,116416,
1,2,0.997998,0.002002,Chromosome,544651_contig_2,461922,
2,3,0.995752,0.004248,Chromosome,544651_contig_3,100222,
3,4,0.995657,0.004343,Chromosome,544651_contig_4,34295,
4,5,0.999379,0.000621,Chromosome,544651_contig_5,162471,


In [5]:
# Keep only the join key and the mlplasmids probability/prediction columns.
# Minimal alternative without the probabilities:
#   dfml = dfml[['Isolate_contig', 'Prediction']]
dfml = dfml[[
    'Isolate_contig',
    'Prob_Chromosome',
    'Prob_Plasmid',
    'Prediction',
]]

In [6]:
# Preview the mlplasmids table after column selection.
dfml.head()

Unnamed: 0,Isolate_contig,Prob_Chromosome,Prob_Plasmid,Prediction
0,544651_contig_1,0.998197,0.001803,Chromosome
1,544651_contig_2,0.997998,0.002002,Chromosome
2,544651_contig_3,0.995752,0.004248,Chromosome
3,544651_contig_4,0.995657,0.004343,Chromosome
4,544651_contig_5,0.999379,0.000621,Chromosome


In [7]:
# Preview the first rows of the abricate table.
dfab.head()

Unnamed: 0,Isolate_contig,START,END,STRAND,GENE,COVERAGE,COVERAGE_MAP,GAPS,%COVERAGE,%IDENTITY,DATABASE,ACCESSION,PRODUCT,RESISTANCE
0,544651_contig_33,15756,16416,+,IncL/M(pOXA-48)_1_pOXA-48,1-661/661,===============,0/0,100.0,100.0,plasmidfinder,JN626286,IncL/M(pOXA-48)_1_pOXA-48_JN626286,
1,544651_contig_54,13614,14173,-,IncFIB(K)_1_Kpn3,1-560/560,===============,0/0,100.0,98.75,plasmidfinder,JN233704,IncFIB(K)_1_Kpn3_JN233704,
2,544663_contig_18,8151,8811,-,IncL/M(pOXA-48)_1_pOXA-48,1-661/661,===============,0/0,100.0,100.0,plasmidfinder,JN626286,IncL/M(pOXA-48)_1_pOXA-48_JN626286,
3,544663_contig_36,2947,3077,-,ColRNAI_1,1-130/130,========/======,01-Jan,100.0,97.71,plasmidfinder,DQ298019,ColRNAI_1__DQ298019,
4,544663_contig_46,12479,13038,+,IncFIB(K)_1_Kpn3,1-560/560,===============,0/0,100.0,98.93,plasmidfinder,JN233704,IncFIB(K)_1_Kpn3_JN233704,


In [8]:
# Annotate every abricate hit with the mlplasmids prediction of its contig.
# how='left' keeps all abricate rows; hits on contigs that are absent from
# dfml get NaN in the mlplasmids columns.
# validate='many_to_one' makes pandas raise MergeError if Isolate_contig is
# duplicated in dfml, which would otherwise silently duplicate abricate hits.
df = pd.merge(
    dfab,
    dfml,
    how='left',
    on=['Isolate_contig'],
    validate='many_to_one',
)

In [9]:
# Save the merged table. This runs before the 'sample' and 'contig' columns
# are added further down, so the file does not contain them.
df.to_csv('mlplasmids_plasREP_intersect.csv', index=False)

In [10]:
# (rows, columns) of the abricate table, for comparison with the merged table.
dfab.shape

(476, 14)

In [11]:
# (rows, columns) of the merged table; compare the row count with dfab.shape
# to check that the left merge did not add or duplicate rows.
df.shape

(476, 17)

In [12]:
# Preview the merged abricate + mlplasmids table.
df.head()

Unnamed: 0,Isolate_contig,START,END,STRAND,GENE,COVERAGE,COVERAGE_MAP,GAPS,%COVERAGE,%IDENTITY,DATABASE,ACCESSION,PRODUCT,RESISTANCE,Prob_Chromosome,Prob_Plasmid,Prediction
0,544651_contig_33,15756,16416,+,IncL/M(pOXA-48)_1_pOXA-48,1-661/661,===============,0/0,100.0,100.0,plasmidfinder,JN626286,IncL/M(pOXA-48)_1_pOXA-48_JN626286,,0.047315,0.952685,Plasmid
1,544651_contig_54,13614,14173,-,IncFIB(K)_1_Kpn3,1-560/560,===============,0/0,100.0,98.75,plasmidfinder,JN233704,IncFIB(K)_1_Kpn3_JN233704,,0.04267,0.95733,Plasmid
2,544663_contig_18,8151,8811,-,IncL/M(pOXA-48)_1_pOXA-48,1-661/661,===============,0/0,100.0,100.0,plasmidfinder,JN626286,IncL/M(pOXA-48)_1_pOXA-48_JN626286,,0.030473,0.969527,Plasmid
3,544663_contig_36,2947,3077,-,ColRNAI_1,1-130/130,========/======,01-Jan,100.0,97.71,plasmidfinder,DQ298019,ColRNAI_1__DQ298019,,0.049174,0.950826,Plasmid
4,544663_contig_46,12479,13038,+,IncFIB(K)_1_Kpn3,1-560/560,===============,0/0,100.0,98.93,plasmidfinder,JN233704,IncFIB(K)_1_Kpn3_JN233704,,0.017651,0.982349,Plasmid


In [13]:
# Isolate_contig values look like '<sample>_contig_<n>'. Split from the right
# (at most two splits) so that a sample ID containing '_' is kept whole
# instead of being cut at its first underscore.
df['sample'] = df.Isolate_contig.str.rsplit('_', n=2).str[0]

In [14]:
# Add a contig column such as 'contig33', built from the last two
# '_'-separated fields of Isolate_contig ('<sample>_contig_<n>').
# Splitting once from the right avoids repeating the split and stays correct
# when the sample ID itself contains '_'.
_contig_parts = df.Isolate_contig.str.rsplit('_', n=2)
df['contig'] = _contig_parts.str[1] + _contig_parts.str[2]

In [15]:
# Preview the merged table with the new 'sample' and 'contig' columns.
df.head()

Unnamed: 0,Isolate_contig,START,END,STRAND,GENE,COVERAGE,COVERAGE_MAP,GAPS,%COVERAGE,%IDENTITY,DATABASE,ACCESSION,PRODUCT,RESISTANCE,Prob_Chromosome,Prob_Plasmid,Prediction,sample,contig
0,544651_contig_33,15756,16416,+,IncL/M(pOXA-48)_1_pOXA-48,1-661/661,===============,0/0,100.0,100.0,plasmidfinder,JN626286,IncL/M(pOXA-48)_1_pOXA-48_JN626286,,0.047315,0.952685,Plasmid,544651,contig33
1,544651_contig_54,13614,14173,-,IncFIB(K)_1_Kpn3,1-560/560,===============,0/0,100.0,98.75,plasmidfinder,JN233704,IncFIB(K)_1_Kpn3_JN233704,,0.04267,0.95733,Plasmid,544651,contig54
2,544663_contig_18,8151,8811,-,IncL/M(pOXA-48)_1_pOXA-48,1-661/661,===============,0/0,100.0,100.0,plasmidfinder,JN626286,IncL/M(pOXA-48)_1_pOXA-48_JN626286,,0.030473,0.969527,Plasmid,544663,contig18
3,544663_contig_36,2947,3077,-,ColRNAI_1,1-130/130,========/======,01-Jan,100.0,97.71,plasmidfinder,DQ298019,ColRNAI_1__DQ298019,,0.049174,0.950826,Plasmid,544663,contig36
4,544663_contig_46,12479,13038,+,IncFIB(K)_1_Kpn3,1-560/560,===============,0/0,100.0,98.93,plasmidfinder,JN233704,IncFIB(K)_1_Kpn3_JN233704,,0.017651,0.982349,Plasmid,544663,contig46


In [16]:
# One row per (sample, Prob_Plasmid, Prediction, contig) with the list of
# replicon genes found on that contig.
# dropna=False: after the left merge, hits on contigs without an mlplasmids
# entry have NaN Prob_Plasmid/Prediction, and groupby's default dropna=True
# would silently leave those genes out of the summary.
df_summary3 = (
    df.groupby(['sample', 'Prob_Plasmid', 'Prediction', 'contig'], dropna=False)['GENE']
    .apply(list)
    .reset_index(name='genes')
)

In [17]:
# Write the per-contig summary to CSV without the index column.
df_summary3.to_csv(
    'summaryByContigProb_ml_plasREP.csv',
    index=False,
)

In [18]:
# Display the per-contig summary (display.max_rows is None, so no rows are hidden).
df_summary3

Unnamed: 0,sample,Prob_Plasmid,Prediction,contig,genes
0,544651,0.952685,Plasmid,contig33,[IncL/M(pOXA-48)_1_pOXA-48]
1,544651,0.95733,Plasmid,contig54,[IncFIB(K)_1_Kpn3]
2,544663,0.839698,Plasmid,contig49,[Col440I_1]
3,544663,0.950826,Plasmid,contig36,[ColRNAI_1]
4,544663,0.969527,Plasmid,contig18,[IncL/M(pOXA-48)_1_pOXA-48]
5,544663,0.982349,Plasmid,contig46,[IncFIB(K)_1_Kpn3]
6,544663,0.988557,Plasmid,contig53,[IncR_1]
7,544680,0.97006,Plasmid,contig41,[IncL/M(pOXA-48)_1_pOXA-48]
8,544680,0.994624,Plasmid,contig18,[IncFIB(K)_1_Kpn3]
9,544689,0.960841,Plasmid,contig46,[IncFIB(K)_1_Kpn3]
