In [1]:
import pandas as pd

# Directory prefix for every input file read below. Paths are built by
# appending to this string (`folder_path + ...`), so keep the trailing slash.
folder_path = "../data/"

In [2]:
def get_data(data_name, data_dir=None):
    """Load one parsed SNV table and add consensus-vote prediction columns.

    Reads the tab-separated file ``snv-parse-{data_name}.txt`` and adds four
    boolean columns ``pred_1`` .. ``pred_4``; ``pred_k`` is True for rows where
    the sum of the four ``FILTER_*`` caller columns is greater than ``k - 1``
    (i.e. at least ``k`` callers flagged the row when the columns are boolean).

    Parameters
    ----------
    data_name : str
        Dataset identifier used to build the file name.
    data_dir : str or path-like, optional
        Directory containing the file. When omitted, the notebook-level
        ``folder_path`` is used, which matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        The table as read from disk plus the ``pred_1`` .. ``pred_4`` columns.

    Raises
    ------
    KeyError
        If any of the four ``FILTER_*`` caller columns is missing.
    """
    import os  # local import so the cell does not depend on another cell's imports

    base_dir = folder_path if data_dir is None else data_dir
    data_path = os.path.join(base_dir, f"snv-parse-{data_name}.txt")
    df = pd.read_csv(data_path, sep='\t', low_memory=False)

    caller_cols = ['FILTER_Mutect2', 'FILTER_Freebayes', 'FILTER_Vardict', 'FILTER_Varscan']

    # Only the caller columns take part in the vote, so cast just those
    # (bool -> int) instead of copying and converting the whole table.
    calls = df[caller_cols]
    calls = calls.astype({c: int for c in calls.select_dtypes(include='bool').columns})

    # Compute the per-row vote count once and reuse it for every threshold.
    votes = calls[caller_cols[0]]
    for col in caller_cols[1:]:
        votes = votes + calls[col]

    for k in range(1, len(caller_cols) + 1):
        df[f'pred_{k}'] = votes > (k - 1)
    return df

In [3]:
import pandas as pd

def calc_F1(pred, truth):
    """Score predicted sites against truth sites by (Chr, START_POS_REF).

    A site is identified by its chromosome and start position. The two fields
    are joined with a ``':'`` separator so that different sites cannot produce
    the same key (without one, chromosome "1" at 23456 and chromosome "12" at
    3456 both become "123456").

    Parameters
    ----------
    pred : pandas.DataFrame
        Predicted sites; must contain ``Chr`` and ``START_POS_REF`` columns.
    truth : pandas.DataFrame
        Truth sites; must contain ``Chr`` and ``START_POS_REF`` columns.

    Returns
    -------
    pandas.DataFrame
        A single row (index 0) with columns ``TP``, ``FP``, ``FN``,
        ``Precision``, ``Recall`` and ``F1``. A ratio whose denominator is
        zero is reported as NaN instead of raising.

    Notes
    -----
    Counts are per row: TP/FP count rows of ``pred``, FN counts rows of
    ``truth``. NOTE(review): duplicated sites in either frame are therefore
    counted more than once -- confirm inputs are unique per site.
    """
    def _site_keys(frame):
        # One string key per row, e.g. "1:23456".
        return frame['Chr'].astype(str) + ':' + frame['START_POS_REF'].astype(str)

    def _ratio(numerator, denominator):
        # Undefined ratios become NaN rather than ZeroDivisionError.
        return numerator / denominator if denominator else float('nan')

    pred_keys = _site_keys(pred)
    truth_keys = _site_keys(truth)

    # Vectorised counts; builtin sum() would iterate the Series in Python.
    pred_in_truth = pred_keys.isin(truth_keys)
    tp = int(pred_in_truth.sum())
    fp = int((~pred_in_truth).sum())
    fn = int((~truth_keys.isin(pred_keys)).sum())

    record = {
        'TP': tp,
        'FP': fp,
        'FN': fn,
        'Precision': _ratio(tp, tp + fp),
        'Recall': _ratio(tp, tp + fn),
        # Equivalent to 2PR / (P + R) but stays defined (0.0) when TP == 0
        # and there is at least one FP or FN.
        'F1': _ratio(2 * tp, 2 * tp + fp + fn),
    }
    return pd.DataFrame([record], columns=['TP', 'FP', 'FN', 'Precision', 'Recall', 'F1'])

In [4]:
def get_results(data_list, truth_list, method_list, cols=None):
    """Score every method on every dataset and stack the results.

    For each ``(data_name, truth_name)`` pair the parsed table is loaded with
    ``get_data`` and the truth file ``{data_name}/{truth_name}.bed`` is read
    from under ``folder_path``. For each entry of ``method_list`` the rows
    where that boolean column is True are scored with ``calc_F1``.

    Parameters
    ----------
    data_list : sequence of str
        Dataset identifiers.
    truth_list : sequence of str
        Truth file names (without ``.bed``), paired with ``data_list`` by
        position.
    method_list : sequence of str
        Boolean column names of the loaded table to evaluate.
    cols : list of str, optional
        Columns of the loaded table passed on to ``calc_F1``. Defaults to
        ``['Chr', 'START_POS_REF', 'END_POS_REF']``.

    Returns
    -------
    pandas.DataFrame
        One row per (dataset, method) with the ``calc_F1`` columns plus
        ``method`` and ``data``. Empty (with those columns) when there is
        nothing to evaluate.

    Raises
    ------
    ValueError
        If ``data_list`` and ``truth_list`` have different lengths.
    """
    # None sentinel instead of a mutable list as the default value.
    if cols is None:
        cols = ['Chr', 'START_POS_REF', 'END_POS_REF']

    # Materialise the inputs: lengths are checked below, and method_list is
    # iterated once per dataset.
    data_list = list(data_list)
    truth_list = list(truth_list)
    method_list = list(method_list)

    # zip() would silently drop the unmatched tail of the longer list.
    if len(data_list) != len(truth_list):
        raise ValueError(
            f"data_list and truth_list must have the same length "
            f"(got {len(data_list)} and {len(truth_list)})"
        )

    res_list = []
    for data_name, truth_name in zip(data_list, truth_list):
        df = get_data(data_name)

        truth_path = folder_path + f"{data_name}/{truth_name}.bed"
        # NOTE(review): the BED start column is compared directly with
        # START_POS_REF -- confirm both use the same coordinate convention.
        truth = pd.read_csv(truth_path, sep='\t', header=None, names = ['Chr', 'START_POS_REF', 'END_POS_REF'])
        for m in method_list:
            pred = df.loc[df[m], cols]
            res_df = calc_F1(pred, truth)
            res_df['method'] = m
            res_df['data'] = data_name
            res_list.append(res_df)

    # pd.concat raises on an empty list; return an empty, well-formed frame.
    if not res_list:
        return pd.DataFrame(columns=['TP', 'FP', 'FN', 'Precision', 'Recall', 'F1', 'method', 'data'])

    return pd.concat(res_list, axis=0).reset_index(drop=True)

In [5]:
# Datasets to evaluate. truth_list is paired with data_list by position:
# get_results zips the two lists, so entry i of each belongs together.
data_list = [
    "real1",
    "real2_part1",
    "syn1",
    "syn2",
    "syn3",
    "syn4",
    "syn5",
]
truth_list = [
    "real1_truth",
    "real2_truth_chr1to5",
    "syn1_truth",
    "syn2_truth",
    "syn3_truth",
    "syn4_truth",
    "syn5_truth",
]

# Boolean columns to score: the four individual caller filters, followed by
# the pred_1 .. pred_4 vote-threshold columns added by get_data.
method_list = [
    "FILTER_Mutect2",
    "FILTER_Freebayes",
    "FILTER_Vardict",
    "FILTER_Varscan",
    "pred_1",
    "pred_2",
    "pred_3",
    "pred_4",
]

results = get_results(data_list, truth_list, method_list)

In [6]:
# F1 with one row per method and one column per dataset.
results.pivot_table(values=["F1"], index=['method'], columns='data', aggfunc="mean")

Unnamed: 0_level_0,F1,F1,F1,F1,F1,F1,F1
data,real1,real2_part1,syn1,syn2,syn3,syn4,syn5
method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
FILTER_Freebayes,0.294353,0.213181,0.559954,0.62665,0.727889,0.529218,0.963614
FILTER_Mutect2,0.3573,0.239803,0.536368,0.511815,0.733557,0.752919,0.960262
FILTER_Vardict,0.267087,0.134204,0.506445,0.544754,0.67303,0.729626,0.974785
FILTER_Varscan,0.01391,0.017015,0.011428,0.187138,0.257474,0.494212,0.718457
pred_1,0.050317,0.03889,0.13692,0.173131,0.288591,0.43333,0.975856
pred_2,0.557166,0.299844,0.752102,0.778101,0.886467,0.796039,0.983169
pred_3,0.812779,0.674503,0.919559,0.926835,0.940371,0.801783,0.966615
pred_4,0.226415,0.347403,0.086084,0.786949,0.680819,0.644913,0.673574


In [7]:
# Precision with one row per method and one column per dataset.
results.pivot_table(values=["Precision"], index=['method'], columns='data', aggfunc="mean")

Unnamed: 0_level_0,Precision,Precision,Precision,Precision,Precision,Precision,Precision
data,real1,real2_part1,syn1,syn2,syn3,syn4,syn5
method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
FILTER_Freebayes,0.177414,0.124041,0.393294,0.462826,0.619339,0.504501,0.998865
FILTER_Mutect2,0.221416,0.13852,0.36836,0.346044,0.600371,0.734215,0.969163
FILTER_Vardict,0.157873,0.073473,0.341083,0.377335,0.525144,0.681978,0.999236
FILTER_Varscan,0.007219,0.008804,0.006402,0.108538,0.165481,0.373202,0.995822
pred_1,0.025831,0.019867,0.073517,0.09483,0.169679,0.287527,0.966865
pred_2,0.399934,0.185364,0.607734,0.641957,0.833222,0.798717,0.999567
pred_3,0.796795,0.606504,0.877085,0.885432,0.969497,0.948355,0.999765
pred_4,0.798165,0.856,0.652,0.981041,0.995133,0.991354,0.999957


In [8]:
# Recall with one row per method and one column per dataset.
results.pivot_table(values=["Recall"], index=['method'], columns='data', aggfunc="mean")

Unnamed: 0_level_0,Recall,Recall,Recall,Recall,Recall,Recall,Recall
data,real1,real2_part1,syn1,syn2,syn3,syn4,syn5
method,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
FILTER_Freebayes,0.863533,0.757637,0.971727,0.969991,0.882576,0.556482,0.930767
FILTER_Mutect2,0.924943,0.892057,0.986146,0.982456,0.94268,0.772602,0.951524
FILTER_Vardict,0.866566,0.773931,0.983036,0.979224,0.936859,0.784432,0.951502
FILTER_Varscan,0.190296,0.252546,0.053152,0.67844,0.57978,0.731352,0.561941
pred_1,0.965883,0.91446,0.995194,0.993306,0.96457,0.87913,0.985016
pred_2,0.91812,0.784114,0.986429,0.987535,0.946982,0.79338,0.967301
pred_3,0.829416,0.759674,0.966356,0.972299,0.912944,0.694453,0.935593
pred_4,0.131918,0.217923,0.046084,0.656971,0.517398,0.477904,0.507822


In [9]:
# Unweighted mean F1 per method over all rows of `results` (one per dataset).
results.pivot_table(values=["F1"], index=['method'], aggfunc="mean")

Unnamed: 0_level_0,F1
method,Unnamed: 1_level_1
FILTER_Freebayes,0.559266
FILTER_Mutect2,0.584575
FILTER_Vardict,0.547133
FILTER_Varscan,0.242805
pred_1,0.299576
pred_2,0.721841
pred_3,0.863206
pred_4,0.492308


In [10]:
# Unweighted mean Precision per method over all rows of `results` (one per dataset).
results.pivot_table(values=["Precision"], index=['method'], aggfunc="mean")

Unnamed: 0_level_0,Precision
method,Unnamed: 1_level_1
FILTER_Freebayes,0.468612
FILTER_Mutect2,0.482584
FILTER_Vardict,0.450875
FILTER_Varscan,0.237924
pred_1,0.234017
pred_2,0.638071
pred_3,0.869062
pred_4,0.896236


In [11]:
# Unweighted mean Recall per method over all rows of `results` (one per dataset).
results.pivot_table(values=["Recall"], index=['method'], aggfunc="mean")

Unnamed: 0_level_0,Recall
method,Unnamed: 1_level_1
FILTER_Freebayes,0.847531
FILTER_Mutect2,0.921773
FILTER_Vardict,0.896507
FILTER_Varscan,0.435358
pred_1,0.956794
pred_2,0.91198
pred_3,0.867248
pred_4,0.365146
