In [2]:
import allel
import numpy as np
import pandas as pd
from functools import reduce
import matplotlib.pyplot as plt 
import seaborn as sns
from numpy import nan
import glob
from sklearn import metrics

In [3]:
def naive1(folder_path, min_callers=2):
    """Build a consensus SNP call set from the per-caller VCFs in one folder.

    Every ``*vcf.gz`` file in ``folder_path`` is loaded with scikit-allel,
    reduced to its SNP records, and outer-joined with the others on
    ``(CHROM, POS)``. A position is kept when at least ``min_callers`` of the
    files report a SNP there.

    Parameters
    ----------
    folder_path : str
        Directory holding one ``*vcf.gz`` file per tag in the internal tag
        list (``vs``, ``fb``, ``m2``, ``vd``), i.e. exactly four files.
    min_callers : int, default 2
        Minimum number of files that must contain a SNP at a position for it
        to be kept. The default reproduces the original rule "fewer than
        three of the four ``is_snp_*`` columns are null".

    Returns
    -------
    pandas.DataFrame
        Columns ``CHROM``, ``POS``, one ``is_snp_<tag>`` column per file (NaN
        where that file has no SNP at the position) and ``end`` (a copy of
        ``POS``).

    Raises
    ------
    FileNotFoundError
        If no ``*vcf.gz`` file is found in ``folder_path``.
    ValueError
        If the number of files found differs from the number of tags.
    """
    features = ['CHROM', 'POS', 'is_snp']
    suffix = ['vs', 'fb', 'm2', 'vd']
    keep_same = {'CHROM', 'POS'}

    # glob.glob returns matches in arbitrary order; sorting makes the
    # positional tag -> file assignment reproducible between runs/machines.
    vcf_paths = sorted(glob.glob(folder_path + '/*vcf.gz'))
    if not vcf_paths:
        raise FileNotFoundError(
            f"no '*vcf.gz' files found in {folder_path!r}")
    if len(vcf_paths) != len(suffix):
        raise ValueError(
            f"expected {len(suffix)} '*vcf.gz' files in {folder_path!r}, "
            f"found {len(vcf_paths)}")

    lst_df = []
    for filename, tag in zip(vcf_paths, suffix):
        calls = allel.vcf_to_dataframe(filename, fields=features)
        calls = calls[calls['is_snp']]
        # Tag every non-key column so the per-file flags stay distinguishable
        # after the merge (and no merge suffixes are needed).
        calls = calls.rename(
            columns=lambda c, tag=tag: c if c in keep_same else f'{c}_{tag}')
        lst_df.append(calls)

    # NOTE(review): if a VCF lists the same (CHROM, POS) more than once, the
    # outer join multiplies those rows -- confirm whether inputs are unique.
    merged_df = reduce(
        lambda left, right: pd.merge(left, right, on=['CHROM', 'POS'], how='outer'),
        lst_df)

    flag_cols = [f'is_snp_{tag}' for tag in suffix]
    n_supporting = merged_df[flag_cols].notna().sum(axis=1)

    # .copy() so that adding 'end' writes to an independent frame rather than
    # to a filtered view of merged_df (avoids SettingWithCopyWarning).
    df = merged_df[n_supporting >= min_callers].copy()
    df['end'] = df['POS']

    return df

In [4]:
# for bed file input 
def getF1(pred_file, truth_file):
    """Print F1 and precision/recall/F-score/support for two BED files.

    Both files are read as header-less, tab-separated tables with columns
    ``Chromo``, ``start``, ``end``. Rows are matched on ``(Chromo, start)``
    with an outer join; a row missing from one side gets label 0 for that
    side.

    NOTE: a later cell of this notebook defines another ``getF1`` that takes
    a DataFrame as its first argument. Once that cell has run, this version
    is shadowed and no longer reachable under this name.

    Parameters
    ----------
    pred_file : str
        Path of the BED file with predicted positions.
    truth_file : str
        Path of the BED file with ground-truth positions.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    bed_columns = ['Chromo', 'start', 'end']
    # Read chromosome labels as strings in BOTH files: with dtype inference a
    # numeric-only file becomes int64 while one containing e.g. 'X' becomes
    # object, and merging on such mismatched keys raises.
    gt_pred = pd.read_csv(pred_file, sep="\t", names=bed_columns,
                          dtype={'Chromo': str})
    gt_pred['pred'] = 1
    gt_truth = pd.read_csv(truth_file, sep="\t", names=bed_columns,
                           dtype={'Chromo': str})
    gt_truth['truth'] = 1

    combined = gt_truth.merge(gt_pred, on=['Chromo', 'start'], how='outer')
    # Assign the filled columns back instead of `combined[col].fillna(...,
    # inplace=True)`: the chained in-place form acts on a temporary Series
    # and does not update `combined` under pandas copy-on-write.
    combined['truth'] = combined['truth'].fillna(0).astype(int)
    combined['pred'] = combined['pred'].fillna(0).astype(int)

    f1_score = metrics.f1_score(y_true=combined['truth'], y_pred=combined['pred'])
    score = metrics.precision_recall_fscore_support(y_true=combined['truth'],
                                                    y_pred=combined['pred'])
    print(f'F1-score:{f1_score}')
    print(f'Overall:{score}')

In [5]:
def getF1(pred_df, truth_file):
    """Print F1, precision and recall of predicted positions against a truth BED.

    NOTE: this definition replaces the earlier ``getF1(pred_file, truth_file)``
    defined in a previous cell of this notebook.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Predicted positions; must contain the columns ``Chromo`` and
        ``start``. The frame is not modified.
    truth_file : str
        Path of a header-less, tab-separated BED file read with the columns
        ``Chromo``, ``start``, ``end``.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    # Work on a copy: labelling the caller's frame in place was a hidden side
    # effect (and triggered SettingWithCopyWarning when given a slice).
    pred = pred_df.assign(pred=1)
    # Normalise chromosome labels to str on both sides so the merge keys
    # always share a dtype.
    pred['Chromo'] = pred['Chromo'].astype(str)

    gt_truth = pd.read_csv(truth_file, sep="\t", names=['Chromo', 'start', 'end'],
                           dtype={'Chromo': str})
    gt_truth['truth'] = 1

    combined = gt_truth.merge(pred, on=['Chromo', 'start'], how='outer')
    # Assign the filled columns back instead of `combined[col].fillna(...,
    # inplace=True)`: the chained in-place form acts on a temporary Series
    # and does not update `combined` under pandas copy-on-write.
    combined['truth'] = combined['truth'].fillna(0).astype(int)
    combined['pred'] = combined['pred'].fillna(0).astype(int)

    y_true = combined['truth']
    y_pred = combined['pred']
    f1_score = metrics.f1_score(y_true=y_true, y_pred=y_pred)
    precision_score = metrics.precision_score(y_true=y_true, y_pred=y_pred)
    recall_score = metrics.recall_score(y_true=y_true, y_pred=y_pred)

    print(f'F1-score: {f1_score}')
    print(f'Precision: {precision_score}')
    print(f'Recall: {recall_score}')

In [6]:
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the session.
pd.options.mode.chained_assignment = None

# Each entry is a folder name, resolved relative to the working directory,
# that holds the caller VCFs and the truth BED of one dataset.
dataset_lst = ['real1', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5', 'real2_part1']

for dataset in dataset_lst:
    # Consensus calls, reduced to the two key columns and renamed to the
    # column names getF1 merges on.
    df = naive1(dataset)[['CHROM', 'POS']]
    df = df.rename(columns={'CHROM': 'Chromo', 'POS': 'start'})
    print(dataset)
    # real2_part1 is the only dataset whose truth file does not follow the
    # '<dataset>_truth.bed' naming scheme.
    truth_name = ('real2_truth_chr1to5.bed' if dataset == 'real2_part1'
                  else f'{dataset}_truth.bed')
    getF1(df, f'{dataset}/{truth_name}')
    

real1
F1-score: 0.0006200159752311822
Precision: 0.00031010967487475003
Recall: 0.9454131918119788
syn1
F1-score: 0.001740502966811346
Precision: 0.0008710127117900053
Recall: 0.9957591178965225
syn2
F1-score: 0.0020651890382280746
Precision: 0.0010336690735966135
Recall: 0.9933056325023084
syn3
F1-score: 0.003766522400280735
Precision: 0.0018868951857558946
Recall: 0.9778565101860053
syn4
F1-score: 0.0074706839623875625
Precision: 0.003751354831081303
Recall: 0.8750842782715292
syn5
F1-score: 0.023130953237531105
Precision: 0.011702940938450937
Recall: 0.9846197915519027
real2_part1
F1-score: 0.0005291859015030521
Precision: 0.00026468180391250314
Recall: 0.7881873727087576
