In [8]:
import allel
import numpy as np
import pandas as pd
from functools import reduce

import os

from sklearn import metrics

## Finds the two best methods (by F1 score) and gets their intersection

In [6]:
# Function to find ground truths from bed file
# Returns a dataframe with columns [CHROM, POS, truth]

def find_GT(path_to_GT):
    truth_labels = pd.read_csv(path_to_GT, sep = "\t", names = ['Chromo', 'start', 'end'])
    truth_labels = truth_labels[['Chromo', 'start']]
    truth_labels['truth'] = 1
    sub_truth = truth_labels.rename(columns = {'Chromo':'CHROM', 'start':'POS'})
    return sub_truth

In [140]:
# Function to find the best methods for a single dataset
# Returns the names of the two best methods

def find_best_methods(path_to_dataset, dataset_name):
    gt_file = dataset_name + '_truth.bed'
    gt = find_GT(os.path.join(path_to_dataset,gt_file))
    gt['CHROM'] = gt['CHROM'].astype('O')
    
    algos = ['freebayes','mutect2','vardict','varscan']
    f1_scores = dict({}) # score to alg mapping
    
    features = ['CHROM','POS','is_snp']
    for alg in algos:
        df = allel.vcf_to_dataframe(f'{path_to_dataset}/{dataset_name}-{alg}.vcf.gz', fields=features)
        df = df[df['is_snp']]
        combined = df.merge(gt, on=['CHROM','POS'], how ='outer')
        combined['truth'].fillna(0, inplace = True)
        combined['is_snp'].fillna(False, inplace = True)
        f1_score = metrics.f1_score(y_true = combined['truth'], y_pred = combined['is_snp'])
        f1_scores[f1_score] = alg
        print(f'{alg}: {f1_score}')
    
    top_algs = []
    for key in sorted(f1_scores.keys()):
        top_algs.append(f1_scores[key])
    
    return top_algs[2:]

In [141]:
# Trying out the function on syn1 dataset

path_to_dataset = '/Volumes/Samsung_T5/4220 proj/syn1'
dataset_name = 'syn1'

find_best_methods(path_to_dataset, dataset_name)

freebayes: 0.0017405186328159062
mutect2: 0.07192197877544337
vardict: 0.0017321868871819903
varscan: 0.0016516818778240486


['freebayes', 'mutect2']

In [142]:
# Finds the two best methods across all the training data (syn1-5 + real1)

path_to_datasets = '/Volumes/Samsung_T5/4220 proj/'
dataset_names = ['syn1','syn2','syn3','syn4','syn5','real1']

best_methods = []

for dataset in dataset_names:
    path_to_dataset = path_to_datasets + dataset
    best_methods.append(find_best_methods(path_to_dataset, dataset))
    print()
    
for i in range(len(best_methods)):
    print(f'{dataset_names[i]}: {best_methods[i]}')

freebayes: 0.0017405186328159062
mutect2: 0.07192197877544337
vardict: 0.0017321868871819903
varscan: 0.0016516818778240486

freebayes: 0.0020337389892545093
mutect2: 0.08387783605340521
vardict: 0.0020326734313138644
varscan: 0.001969522773394198

freebayes: 0.003501680738362212
mutect2: 0.14931522748375117
vardict: 0.003710212424867112
varscan: 0.003470315461110427

freebayes: 0.004878474954197866
mutect2: 0.2584651988698489
vardict: 0.0073268929943912636
varscan: 0.0066733933352168

freebayes: 0.02175385754214899
mutect2: 0.632614222209598
vardict: 0.0231993655589756
varscan: 0.022083705485152552

freebayes: 0.0005837095485071058
mutect2: 0.02521270782920927
vardict: 0.000585347676593167
varscan: 0.0005751951899784784

syn1: ['freebayes', 'mutect2']
syn2: ['freebayes', 'mutect2']
syn3: ['vardict', 'mutect2']
syn4: ['vardict', 'mutect2']
syn5: ['vardict', 'mutect2']
real1: ['vardict', 'mutect2']


## Tests accuracy of method

In [5]:
# Get ground truths
path_to_truth = '/Volumes/Samsung_T5/4220 proj/real2_part1/real2_truth_chr1to5.bed'

truth_labels = pd.read_csv(path_to_truth, sep = "\t", names = ['Chromo', 'start', 'end'])
print(list(set(truth_labels.start == truth_labels.end) )) # the start and end position are the same 
truth_labels = truth_labels[['Chromo', 'start']]
truth_labels['truth'] = 1
sub_truth= truth_labels.rename(columns = {'Chromo':'CHROM', 'start':'POS'})
sub_truth.head()

[True]


Unnamed: 0,CHROM,POS,truth
0,1,2171787,1
1,1,9414323,1
2,1,13852321,1
3,1,14995104,1
4,1,20818544,1


In [133]:
path_to_dataset = '/Volumes/Samsung_T5/4220 proj/real2_part1'
dataset_name = 'real2'

gt_file = dataset_name + '_truth_chr1to5.bed'
gt = find_GT(os.path.join(path_to_dataset,gt_file))
gt['CHROM'] = gt['CHROM'].astype('O')
gt

Unnamed: 0,CHROM,POS,truth
0,1,2171787,1
1,1,9414323,1
2,1,13852321,1
3,1,14995104,1
4,1,20818544,1
...,...,...,...
486,5,173169047,1
487,5,176198923,1
488,5,178004393,1
489,5,178110086,1


In [135]:
algos = ['mutect2','vardict'] # top two algos

features = ['CHROM','POS','is_snp']
dfs = [allel.vcf_to_dataframe(f'{path_to_dataset}/{dataset_name}_{alg}_chr1to5.vcf.gz', fields=features) for alg in algos]

keep_same = {'CHROM', 'POS'}
for i in range(2):
    dfs[i] = dfs[i][dfs[i]['is_snp']] # obtain only SNPs
    dfs[i].columns = ['{}{}'.format(c, '' if c in keep_same else '_' + algos[i]) for c in dfs[i].columns]

merged = reduce(lambda left, right: pd.merge(left, right, on =['CHROM', 'POS'],
                                                how = 'inner', suffixes = ('', '')), dfs)

merged['is_snp'] = 1
merged

Unnamed: 0,CHROM,POS,is_snp_mutect2,is_snp_vardict,is_snp
0,1,14574,True,True,1
1,1,17020,True,True,1
2,1,17722,True,True,1
3,1,17730,True,True,1
4,1,20166,True,True,1
...,...,...,...,...,...
3982,5,180716496,True,True,1
3983,5,180716531,True,True,1
3984,5,180733023,True,True,1
3985,5,180890210,True,True,1


In [136]:
combined = merged.merge(gt, on=['CHROM','POS'], how='outer')
combined['truth'].fillna(0, inplace = True)
combined['is_snp'].fillna(0, inplace=True)
f1_score = metrics.f1_score(y_true = combined['truth'], y_pred = combined['is_snp'])

print(f1_score)

0.0


In [137]:
combined

Unnamed: 0,CHROM,POS,is_snp_mutect2,is_snp_vardict,is_snp,truth
0,1,14574,True,True,1.0,0.0
1,1,17020,True,True,1.0,0.0
2,1,17722,True,True,1.0,0.0
3,1,17730,True,True,1.0,0.0
4,1,20166,True,True,1.0,0.0
...,...,...,...,...,...,...
4473,5,173169047,,,0.0,1.0
4474,5,176198923,,,0.0,1.0
4475,5,178004393,,,0.0,1.0
4476,5,178110086,,,0.0,1.0


## Tests accuracy for other datasets

In [145]:
# pre: best_methods is a 2D list where each entry contains 2 best algos for the dataset

path_to_datasets = '/Volumes/Samsung_T5/4220 proj/'
dataset_names = ['syn1','syn2','syn3','syn4','syn5','real1']

In [149]:
# Get scores for all datasets

features = ['CHROM','POS','is_snp']
keep_same = {'CHROM', 'POS'}

for i in range(len(dataset_names)):
    path_to_gt = f'{path_to_datasets}{dataset_names[i]}/{dataset_names[i]}_truth.bed'

    gt = find_GT(path_to_gt)
    
    algos = best_methods[i] # top two algos

    dfs = [allel.vcf_to_dataframe(f'{path_to_datasets}{dataset_names[i]}/{dataset_names[i]}-{alg}.vcf.gz', fields=features) for alg in algos]

    for j in range(2):
        dfs[j] = dfs[j][dfs[j]['is_snp']] # obtain only SNPs
        dfs[j].columns = ['{}{}'.format(c, '' if c in keep_same else '_' + algos[j]) for c in dfs[j].columns]

    merged = reduce(lambda left, right: pd.merge(left, right, on =['CHROM', 'POS'],
                                                    how = 'inner', suffixes = ('', '')), dfs)

    merged['is_snp'] = 1
    
    combined = merged.merge(gt, on=['CHROM','POS'], how='outer')
    combined['truth'].fillna(0, inplace = True)
    combined['is_snp'].fillna(0, inplace=True)
    
    precision_score = metrics.precision_score(y_true = combined['truth'], y_pred = combined['is_snp'])
    recall_score = metrics.recall_score(y_true = combined['truth'], y_pred = combined['is_snp'])
    f1_score = metrics.f1_score(y_true = combined['truth'], y_pred = combined['is_snp'])

    print(f'{dataset_names[i]}: {best_methods[i]}')
    print(f'Precision: {precision_score}')
    print(f'Recall: {recall_score}')
    print(f'F1: {f1_score}')

syn1: ['freebayes', 'mutect2']
Precision: 0.38625566235775055
Recall: 0.988408255583828
F1: 0.5554496345726089
syn2: ['freebayes', 'mutect2']
Precision: 0.40939597315436244
Recall: 0.9856879039704525
F1: 0.578512396694215
syn3: ['vardict', 'mutect2']
Precision: 0.43732478974769723
Recall: 0.9672276350752879
F1: 0.6023166023166023
syn4: ['vardict', 'mutect2']
Precision: 0.43521434539494824
Recall: 0.8300950045969966
F1: 0.571037041722008
syn5: ['vardict', 'mutect2']
Precision: 0.9678513981006104
Recall: 0.9678513981006104
F1: 0.9678513981006104
real1: ['vardict', 'mutect2']
Precision: 0.09652477552377785
Recall: 0.8802122820318423
F1: 0.17397167902899527


In [151]:
# Get scores for real2 part1

features = ['CHROM','POS','is_snp']
keep_same = {'CHROM', 'POS'}

path_to_gt = f'{path_to_datasets}real2_part1/real2_truth_chr1to5.bed'

gt = find_GT(path_to_gt)
gt['CHROM'] = gt['CHROM'].astype('O')

algos = ['vardict','mutect2'] # top two algos

dfs = [allel.vcf_to_dataframe(f'{path_to_datasets}real2_part1/real2_{alg}_chr1to5.vcf.gz', fields=features) for alg in algos]

for j in range(2):
    dfs[j] = dfs[j][dfs[j]['is_snp']] # obtain only SNPs
    dfs[j].columns = ['{}{}'.format(c, '' if c in keep_same else '_' + algos[j]) for c in dfs[j].columns]

merged = reduce(lambda left, right: pd.merge(left, right, on =['CHROM', 'POS'],
                                                how = 'inner', suffixes = ('', '')), dfs)

merged['is_snp'] = 1

combined = merged.merge(gt, on=['CHROM','POS'], how='outer')
combined['truth'].fillna(0, inplace = True)
combined['is_snp'].fillna(0, inplace=True)

precision_score = metrics.precision_score(y_true = combined['truth'], y_pred = combined['is_snp'])
recall_score = metrics.recall_score(y_true = combined['truth'], y_pred = combined['is_snp'])
f1_score = metrics.f1_score(y_true = combined['truth'], y_pred = combined['is_snp'])

print(f'{dataset_names[i]}: {best_methods[i]}')
print(f'Precision: {precision_score}')
print(f'Recall: {recall_score}')
print(f'F1: {f1_score}')

real1: ['vardict', 'mutect2']
Precision: 0.0
Recall: 0.0
F1: 0.0


## Checking that the F1 calculation method works

In [143]:
gt_try = pd.read_csv('calculateF1/real1_truth.bed', sep = "\t", names = ['Chromo','start','end'])
gt_try = gt_try[['Chromo','start']]
gt_try['truth'] = 1
gt_try = gt_try.rename(columns = {'Chromo':'CHROM', 'start':'POS'})

gt_pred = pd.read_csv('calculateF1/my-real1-predictions.bed', sep = "\t")
gt_pred = gt_pred[['Chr','START_POS_REF']]
gt_pred['pred'] = 1
gt_pred = gt_pred.rename(columns = {'Chr':'CHROM', 'START_POS_REF':'POS'})

In [144]:
combined = gt_try.merge(gt_pred, on=['CHROM','POS'], how='outer')
combined['truth'].fillna(0, inplace=True)
combined['pred'].fillna(0, inplace=True)

precision_score = metrics.precision_score(y_true = combined['truth'], y_pred = combined['pred'])
recall_score = metrics.recall_score(y_true = combined['truth'], y_pred = combined['pred'])
f1_score = metrics.f1_score(y_true = combined['truth'], y_pred = combined['pred'])

print(f'Precision: {precision_score}')
print(f'Recall: {recall_score}')
print(f'F1: {f1_score}')

Precision: 0.39993394980184943
Recall: 0.9181197877179682
F1: 0.5571658615136876


In [None]:
# Result should be 0.3999339 0.9181198 0.5571659