In [72]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
input_file = "Exponential_High_Complexity_Simulation/Exponential_High_Complexity_Simulation.fasta_gt500bp_dvfpred.txt"
df = pd.read_csv(input_file, sep="\t")
df = df.sort_values(by="pvalue")
p_values = list(df['pvalue'])

df['True_Label'] = [x.split("_")[0] for x in df['name']]


y_true = [1 if x == "Phage" else 0 for x in df['True_Label']]
y_pred = [1 if x <= 0.01 else 0 for x in df['pvalue']]


p = precision_score(y_true, y_pred)
r = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
print("Scores if prophages are considered bacteria")
print("Precision: " + str(p))
print("Recall: " + str(r))
print("F1 Score: " + str(f1))
my_dict = {}
my_dict['Prophage_Bacteria'] = [p,r,f1]

df['True_Label'] = df['True_Label'].replace(regex='Prophage', value="Phage")
y_true = [1 if x == "Phage" else 0 for x in df['True_Label']]
y_pred = [1 if x <= 0.01 else 0 for x in df['pvalue']]




Scores if prophages are considered bacteria
Precision: 0.5312273057371096
Recall: 0.6462014134275619
F1 Score: 0.5831008369868473
Scores if prophages are considered bacteria
Precision: 0.611837327523602
Recall: 0.048348684399299874
F1 Score: 0.08961574258742189


### Remove Prophage

In [12]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
input_file = "Uniform_High_Complexity_Simulation/Uniform_High_Complexity_Simulation.fasta_gt500bp_dvfpred.txt"
df = pd.read_csv(input_file, sep="\t")
df = df.sort_values(by="pvalue")
p_values = list(df['pvalue'])

df['True_Label'] = [x.split("_")[0] for x in df['name']]
print(len(df))
df = df[df['True_Label']!="Prophage"]
print(len(df))
y_true = [1 if x == "Phage" else 0 for x in df['True_Label']]
y_pred = [1 if x <= 0.01 else 0 for x in df['pvalue']]


p = precision_score(y_true, y_pred)
r = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
print("Scores if prophages are considered bacteria")
print("Precision: " + str(p))
print("Recall: " + str(r))
print("F1 Score: " + str(f1))
my_dict = {}
my_dict['Prophage_Bacteria'] = [p,r,f1]


w = pd.DataFrame(my_dict.items(), columns=["Prophage Category",2])
w = w.set_index("Prophage Category")
w['Precision'] = [x[0] for x in w[2]]
w['Recall'] = [x[1] for x in w[2]]
w['F1 Score'] = [x[2] for x in w[2]]

w['Simulation'] = "uniform"
w['Complexity'] = "high"
w['Tool'] = "DeepVirFinder"
w['Parameter'] = 0.01

w = w.drop(2, axis=1)
w.to_csv("DeepVirFinder_Prophages_Removed.csv", header=None, mode='a')

293877
240288
Scores if prophages are considered bacteria
Precision: 0.5167348904406454
Recall: 0.692258064516129
F1 Score: 0.5917551358058735


In [None]:
df = pd.read_csv(input_file)
df = df.sort_values(by="pvalue")

### DeepVirFinder Thresholding Analysis 

In [5]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

def DVF_Thresholding(input_file, simulation, complexity):
    df = pd.read_csv(input_file, sep="\t")
    df = df.sort_values(by="pvalue")

    df['True_Label'] = [x.split("_")[0] for x in df['name']]
    
    
    b_dict = {}
    v_dict = {}
    
    for item in np.arange(0.005,0.10, 0.005):
       
        y_true = [1 if x == "Phage" else 0 for x in df['True_Label']]
        y_pred = [1 if x <= item else 0 for x in df['pvalue']]

        f1 = f1_score(y_true, y_pred)
   
        name = 'Prophage_Bacteria_'+ simulation + "_" + complexity + "_" +str(round(item, 4))
        b_dict[name] = [f1]
        
    
    df['True_Label'] = df['True_Label'].replace(regex='Prophage', value="Phage")
    for item in np.arange(0.005,0.10, 0.005):  
        ### Viral
        y_true = [1 if x == "Phage" else 0 for x in df['True_Label']]
        y_pred = [1 if x <= item else 0 for x in df['pvalue']]

        f1 = f1_score(y_true, y_pred)

        name = 'Prophage_Viral_'+ simulation + "_" + complexity + "_" +str(round(item, 4))
        v_dict[name] = [f1]
        
    top_b = max(b_dict, key=b_dict.get)
    top_v = max(v_dict, key=v_dict.get)
        
        
    return(top_b, b_dict[top_b], top_v, v_dict[top_v])

In [7]:
bac = {}
vir = {}
nameb, scoreb, namev, scorev = DVF_Thresholding("Uniform_High_Complexity_Simulation/Uniform_High_Complexity_Simulation.fasta_gt500bp_dvfpred.txt", "uniform", "high")
bac[nameb] = scoreb
vir[namev] = scorev
nameb, scoreb, namev, scorev = DVF_Thresholding("Uniform_Medium_Complexity_Simulation/Uniform_Medium_Complexity_Simulation.fasta_gt500bp_dvfpred.txt", 'uniform', "medium")
bac[nameb] = scoreb
vir[namev] = scorev
nameb, scoreb, namev, scorev = DVF_Thresholding("Uniform_Low_Complexity_Simulation/Uniform_Low_Complexity_Simulation.fasta_gt500bp_dvfpred.txt", "uniform", "low")
bac[nameb] = scoreb
vir[namev] = scorev


nameb, scoreb, namev, scorev = DVF_Thresholding("Lognormal_High_Complexity_Simulation/Lognormal_High_Complexity_Simulation.fasta_gt500bp_dvfpred.txt", "lognormal", "high")
bac[nameb] = scoreb
vir[namev] = scorev
nameb, scoreb, namev, scorev = DVF_Thresholding("Lognormal_Medium_Complexity_Simulation/Lognormal_Medium_Complexity_Simulation.fasta_gt500bp_dvfpred.txt", 'lognormal', "medium")
bac[nameb] = scoreb
vir[namev] = scorev
nameb, scoreb, namev, scorev = DVF_Thresholding("Lognormal_Low_Complexity_Simulation/Lognormal_Low_Complexity_Simulation.fasta_gt500bp_dvfpred.txt", "lognormal", "low")
bac[nameb] = scoreb
vir[namev] = scorev

nameb, scoreb, namev, scorev = DVF_Thresholding("Zero_Inflated_Lognormal_High_Complexity_Simulation/Zero_Inflated_Lognormal_High_Complexity_Simulation.fasta_gt500bp_dvfpred.txt", "zero", "high")
bac[nameb] = scoreb
vir[namev] = scorev
nameb, scoreb, namev, scorev = DVF_Thresholding("Zero_Inflated_Lognormal_Medium_Complexity_Simulation/Zero_Inflated_Lognormal_Medium_Complexity_Simulation.fasta_gt500bp_dvfpred.txt", 'zero', "medium")
bac[nameb] = scoreb
vir[namev] = scorev
nameb, scoreb, namev, scorev = DVF_Thresholding("Zero_Inflated_Lognormal_Low_Complexity_Simulation/Zero_Inflated_Lognormal_Low_Complexity_Simulation.fasta_gt500bp_dvfpred.txt", "zero", "low")
bac[nameb] = scoreb
vir[namev] = scorev


nameb, scoreb, namev, scorev = DVF_Thresholding("Exponential_High_Complexity_Simulation/Exponential_High_Complexity_Simulation.fasta_gt500bp_dvfpred.txt", "exponential", "high")
bac[nameb] = scoreb
vir[namev] = scorev
nameb, scoreb, namev, scorev = DVF_Thresholding("Exponential_Medium_Complexity_Simulation/Exponential_Medium_Complexity_Simulation.fasta_gt500bp_dvfpred.txt", 'exponential', "medium")
bac[nameb] = scoreb
vir[namev] = scorev
nameb, scoreb, namev, scorev = DVF_Thresholding("Exponential_Low_Complexity_Simulation/Exponential_Low_Complexity_Simulation.fasta_gt500bp_dvfpred.txt", "exponential", "low")
bac[nameb] = scoreb
vir[namev] = scorev

In [8]:
bac

{'Prophage_Bacteria_exponential_high_0.01': [0.5831008369868473],
 'Prophage_Bacteria_exponential_low_0.01': [0.8363636363636363],
 'Prophage_Bacteria_exponential_medium_0.005': [0.6090373280943026],
 'Prophage_Bacteria_lognormal_high_0.01': [0.639494026704146],
 'Prophage_Bacteria_lognormal_low_0.01': [0.42105263157894735],
 'Prophage_Bacteria_lognormal_medium_0.005': [0.6080760095011876],
 'Prophage_Bacteria_uniform_high_0.005': [0.5835359499826329],
 'Prophage_Bacteria_uniform_low_0.01': [0.7692307692307693],
 'Prophage_Bacteria_uniform_medium_0.005': [0.7367506516072979],
 'Prophage_Bacteria_zero_high_0.005': [0.5774086378737541],
 'Prophage_Bacteria_zero_low_0.01': [0.6],
 'Prophage_Bacteria_zero_medium_0.005': [0.7048567870485679]}

In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score
input_file = "Exponential_High_Complexity_Simulation/Exponential_High_Complexity_Simulation.fasta_gt500bp_dvfpred.txt"
df = pd.read_csv(input_file, sep="\t")
df = df.sort_values(by="pvalue")
p_values = list(df['pvalue'])

df['True_Label'] = [x.split("_")[0] for x in df['name']]


y_true = [1 if x == "Phage" else 0 for x in df['True_Label']]
y_pred = [1 if x <= 0.01 else 0 for x in df['pvalue']]


p = precision_score(y_true, y_pred)
r = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)
print("Scores if prophages are considered bacteria")
print("Precision: " + str(p))
print("Recall: " + str(r))
print("F1 Score: " + str(f1))
my_dict = {}
my_dict['Prophage_Bacteria'] = [p,r,f1]

df['True_Label'] = df['True_Label'].replace(regex='Prophage', value="Phage")
y_true = [1 if x == "Phage" else 0 for x in df['True_Label']]
y_pred = [1 if x <= 0.01 else 0 for x in df['pvalue']]


w = pd.DataFrame(my_dict.items(), columns=["Prophage Category",2])
w = w.set_index("Prophage Category")
w['Precision'] = [x[0] for x in w[2]]
w['Recall'] = [x[1] for x in w[2]]
w['F1 Score'] = [x[2] for x in w[2]]

w['Simulation'] = "uniform"
w['Complexity'] = "high"
w['Tool'] = "DeepVirFinder"
w['Parameter'] = 0.01

w = w.drop(2, axis=1)
w.to_csv("DeepVirFinder_Prophages_Removed.csv", header=None, mode='a')

In [22]:
from statsmodels.stats.multitest import fdrcorrection
x = fdrcorrection(p_values, alpha=0.05, method="p", is_sorted=True)

In [23]:
df['fdr'] = x[1]

In [11]:
df_phages = df[df['pvalue']<=0.05]

In [12]:
len(df_phages)

11450

In [13]:
df_phages['True_Label'] = [x.split("_")[0] for x in df_phages['name']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [14]:
df_phages.groupby("True_Label").count()

Unnamed: 0_level_0,name,len,score,pvalue
True_Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bacteria,7827,7827,7827,7827
Eukaroyte,123,123,123,123
Phage,2001,2001,2001,2001
Prophage,1499,1499,1499,1499
