## Variant Calling Report v1.2.2
## Input Parameters

In [35]:
import pandas as pd
import h5py
import matplotlib.pyplot as plt
import numpy as np
import nexusplt as nxp
from configparser import ConfigParser

# Show floats as two-decimal percentages in the tables displayed below.
pd.options.display.float_format = '{:,.2%}'.format

# Report settings come from the [VarReport] section of this config file.
configFile='var_report.config'
parser = ConfigParser()
parser.read(configFile)

# Options read without a fallback: parser.get() raises if one is missing.
prmNames=['run_id',
          'h5_concordance_file', 'h5_model_file'
         ]

prm={}
for name in prmNames:
    prm[name]=parser.get('VarReport', name)

# Optional settings with defaults.
h5outfile = parser.get('VarReport', 'h5_output', fallback='var_report.h5')
imgpref = parser.get('VarReport', 'image_output_prefix', fallback=prm['run_id']+'.vars')+'.'
imgdir = 'plots'

# Source label -> (HDF5 file, key of the table inside that file).
sources = {'Default':(prm['h5_concordance_file'],"concordance"),
           'Trained':(prm['h5_model_file'],"scored_concordance")
          }

# One table per source, keyed by the source label.
data = {}
for s in sources:
    data[s]=pd.read_hdf(sources[s][0], key=sources[s][1], mode='r')


prm['mean_var_depth']='{:.2f}'.format(data['Default']['well_mapped_coverage'].mean())
prmNames.append('mean_var_depth')

# Prefer the truth sample name stored next to the concordance table; fall back
# to the config file (or 'NA') when it cannot be read.  This stays best-effort,
# but no longer swallows KeyboardInterrupt/SystemExit as the bare except did.
try:
    args=pd.read_hdf(sources['Default'][0], 'input_args', mode='r')
    prm['truth_sample_name']=args['truth_sample_name'][0]
except Exception:
    prm['truth_sample_name']=parser.get('VarReport', 'truth_sample_name', fallback='NA')
prmNames.append('truth_sample_name')


# Persist the parameter table (rows in prmNames order) and display it.
prmdf = pd.DataFrame.from_dict(prm, orient='index',columns=['value']).reindex(prmNames)
prmdf.to_hdf(h5outfile, key="parameters")
prmdf

Unnamed: 0,value
run_id,140479-BC21
h5_concordance_file,/home/ec2-user/proj/BioinfoResearch/VariantCal...
h5_model_file,/home/ec2-user/proj/BioinfoResearch/VariantCal...
mean_var_depth,35.49
truth_sample_name,


In [36]:
def filterByCategory(data,cat):
    """Return the rows of ``data`` that belong to variant category ``cat``.

    Recognised categories:
      * 'SNP'                      -- indel is False
      * 'non-hmer Indel'           -- indel, hmer_indel_length == 0, indel_length > 0
      * 'non-hmer Indel w/o LCR'   -- as above, and 'LCR-hs38' is False
      * 'hmer Indel <=4', 'hmer Indel >4,<=8', 'hmer Indel >8,<=10'
                                   -- indel with hmer_indel_length in the (lo, hi] range
      * 'hmer Indel 1' ... 'hmer Indel 9'
                                   -- indel with exactly that hmer_indel_length

    Any other value of ``cat`` yields None.  Only the columns needed by the
    requested category are accessed.
    """
    def nonHmerIndel():
        # Indels that are not homopolymer indels and have a positive length.
        return (data['indel']==True) & (data['hmer_indel_length']==0) & (data['indel_length']>0)

    if cat=='SNP':
        return data[data['indel']==False]
    if cat=='non-hmer Indel':
        return data[nonHmerIndel()]
    if cat=='non-hmer Indel w/o LCR':
        return data[nonHmerIndel() &
                    (~data['LCR-hs38'])]

    # Homopolymer length ranges: lower bound exclusive, upper bound inclusive.
    hmerRanges=(('hmer Indel <=4',0,4),
                ('hmer Indel >4,<=8',4,8),
                ('hmer Indel >8,<=10',8,10))
    for label,lo,hi in hmerRanges:
        if cat==label:
            return data[(data['indel']==True) & (data['hmer_indel_length']>lo) & (data['hmer_indel_length']<=hi)]

    # Exact homopolymer lengths 1..9.
    for hmerLen in range(1,10):
        if cat=='hmer Indel {0:d}'.format(hmerLen):
            return data[(data['indel']==True) & (data['hmer_indel_length']==hmerLen)]
    return None


In [37]:
def calcPerformance(data, gt=False):
    """Build a precision/recall curve over ``tree_score`` thresholds.

    Rows are labelled from the 'classify' column (or 'classify_gt' when ``gt``
    is true): 'fp' rows are negatives, everything else is a positive.  Rows
    classified 'fn' get a score of -1 so they sort first.  After sorting by
    score, each row is a candidate threshold at which that row and all rows
    before it count as filtered out.

    Returns a tuple ``(curve, optimum, numPos, numNeg)``:
      * curve   -- DataFrame with 'recall' and 'precision' for the usable rows
      * optimum -- DataFrame with 'recall', 'precision', 'f1' for the usable
                   row(s) reaching the highest F1
      * numPos, numNeg -- number of positive / negative rows
    ``curve`` is an empty DataFrame and ``optimum`` is None when there are
    fewer than 10 rows or no row is usable.
    """
    labelCol='classify_gt' if gt else 'classify'

    # Work on a copy so the caller's frame is left untouched.
    tab=data.copy()
    tab['tree_score'] = np.where(tab[labelCol]=='fn',-1,tab['tree_score'])
    tab=tab[[labelCol,'tree_score']].sort_values(by=['tree_score'])
    tab['label'] = np.where(tab[labelCol]=='fp',0,1)

    total=len(tab)
    numPos=int(tab['label'].sum())
    numNeg=total-numPos
    if total<10:
        return (pd.DataFrame(),None,numPos,numNeg)

    # Running counts in score order: positives filtered so far become false
    # negatives, negatives filtered so far stop being false positives.
    tab['fn']=tab['label'].cumsum()
    tab['tp']=numPos-tab['fn']
    tab['fp']=numNeg-(1-tab['label']).cumsum()

    tab['recall']=tab['tp']/(tab['tp']+tab['fn'])
    tab['precision']=tab['tp']/(tab['tp']+tab['fp'])

    tab['f1']=tab['tp']/(tab['tp']+0.5*tab['fn']+0.5*tab['fp'])

    # A row is usable only with at least 20 positives, at least 20 remaining
    # calls, and a non-negative score (which excludes the 'fn' rows).
    tab['mask']=((tab['tp']+tab['fn'])>=20) & ((tab['tp']+tab['fp'])>=20) & (tab['tree_score']>=0)
    usable=tab['mask']
    if not usable.any():
        return (pd.DataFrame(),None,numPos,numNeg)

    bestF1=tab.loc[usable,'f1'].max()
    tab['opt']=(tab['f1']==bestF1)

    curve=tab.loc[usable,['recall','precision']]
    optimum=tab.loc[tab['opt'] & usable,['recall','precision','f1']]
    return (curve,optimum,numPos,numNeg)



In [38]:
def plotPerformance(perfCurve,optRes,categories,ext=None,img=None):
    """Plot precision vs. recall, one panel per category, one curve per source.

    perfCurve  -- perfCurve[source][category]: DataFrame with 'recall' and
                  'precision' columns (empty when there is nothing to plot)
    optRes     -- optRes[source][category]: optimum point(s), drawn as 'o' markers
    categories -- category names; each gets its own panel, in this order
    ext        -- optional suffix appended to each panel title as '(ext)'
    img        -- optional image name; when given the figure is saved through
                  nxp.save as imgpref+img in directory imgdir

    Sources are taken from the module-level ``sources`` mapping.
    """
    n=len(categories)
    if n==0:
        # Nothing to draw; plt.subplots cannot create zero columns.
        return

    # squeeze=False keeps the axes array 2-D, so a single category works too.
    fig, axes = plt.subplots(1,n,figsize=(4*n,4),squeeze=False)
    ax=axes[0]
    col=['r','b','g','m','k']

    for i,cat in enumerate(categories):
        for j,s in enumerate(sources):
            perf=perfCurve[s][cat]
            opt=optRes[s][cat]
            if not perf.empty:
                # Cycle through the palette if there are more sources than colours.
                color=col[j%len(col)]
                ax[i].plot(perf.recall,perf.precision,'-',label=s,color=color)
                ax[i].plot(opt.recall,opt.precision,'o',color=color)

        # Panel styling does not depend on the source, so apply it once per panel.
        title=cat if ext is None else '{0} ({1})'.format(cat,ext)
        ax[i].set_title(title)
        ax[i].set_xlabel("Recall")
        ax[i].set_xlim([0.4,1])
        ax[i].set_ylim([0.4,1])
        ax[i].grid(True)

    ax[0].set_ylabel("Precision")
    # Only add the legend when the first panel has curves; otherwise
    # matplotlib warns that there are no labelled handles.
    handles,_=ax[0].get_legend_handles_labels()
    if handles:
        ax[0].legend(loc='lower left')

    if img:
        nxp.save(fig,imgpref+img,'png',outdir=imgdir)
    
    
def getPerformance(data,categories,gt=False):
    """Evaluate every source on every category.

    data       -- data[source]: DataFrame of calls for that source
    categories -- category names understood by filterByCategory
    gt         -- forwarded to calcPerformance (selects the classification column)

    Returns ``(optTab, optRes, perfCurve)``, each keyed by source:
      * optTab[source]              -- summary DataFrame, one row per category,
                                       with columns '# pos', '# neg',
                                       'max recall', 'recall', 'precision', 'F1'
                                       (NaN where no curve could be computed)
      * optRes[source][category]    -- optimum returned by calcPerformance
      * perfCurve[source][category] -- curve returned by calcPerformance

    Sources are taken from the module-level ``sources`` mapping.
    """
    optTab={}
    optRes={}
    perfCurve={}
    for s in sources:
        optRes[s]={}
        perfCurve[s]={}
        rows=[]

        for cat in categories:
            d=filterByCategory(data[s],cat)
            perf,opt,pos,neg=calcPerformance(d,gt)
            perfCurve[s][cat]=perf
            optRes[s][cat]=opt

            if perf.empty:
                maxRecall=recall=precision=f1=np.nan
            else:
                maxRecall=perf['recall'].max()
                # .iloc[0] takes the first optimum row by position; a plain [0]
                # would be a label lookup on an integer-indexed frame.
                recall=opt['recall'].iloc[0]
                precision=opt['precision'].iloc[0]
                f1=opt['f1'].iloc[0]

            rows.append(pd.DataFrame({'# pos':pos,
                                      '# neg':neg,
                                      'max recall':maxRecall,
                                      'recall':recall,
                                      'precision':precision,
                                      'F1':f1
                                     },index=[cat]))

        # Concatenate once per source instead of growing the table row by row.
        optTab[s]=pd.concat(rows) if rows else pd.DataFrame()

    return optTab,optRes,perfCurve


## Performance: all Data

In [39]:
# Primary categories, evaluated on the unfiltered data of every source.
categories=[
    'SNP',
    'non-hmer Indel',
    'non-hmer Indel w/o LCR',
    'hmer Indel <=4',
    'hmer Indel >4,<=8',
]
optTab1,optRes,perfCurve=getPerformance(data,categories,gt=False)
plotPerformance(perfCurve,optRes,categories,ext=None,img='all.primary')

In [40]:
# Per-length homopolymer categories, evaluated on the unfiltered data.
categories=[
    'hmer Indel 4',
    'hmer Indel 5',
    'hmer Indel 6',
    'hmer Indel 7',
    'hmer Indel 8',
    'hmer Indel >8,<=10',
]
optTab2,optRes,perfCurve=getPerformance(data,categories,gt=False)
plotPerformance(perfCurve,optRes,categories,ext=None,img='all.hmers')

No handles with labels found to put in legend.


In [41]:
# Display floats as two-decimal percentages.
pd.options.display.float_format = '{:,.2%}'.format

# Stack the primary and per-length tables of each source, then place the
# sources side by side under a top-level column key.
optTab={s:pd.concat([optTab1[s], optTab2[s]]) for s in sources}
df=pd.concat([optTab[s] for s in sources], axis=1, keys=list(sources))
df.to_hdf(h5outfile, key="all_data")
df

Unnamed: 0_level_0,Default,Default,Default,Default,Default,Default,Trained,Trained,Trained,Trained,Trained,Trained
Unnamed: 0_level_1,# pos,# neg,max recall,recall,precision,F1,# pos,# neg,max recall,recall,precision,F1
SNP,7804,221,98.97%,98.83%,99.54%,99.18%,7804,221,98.97%,98.83%,99.55%,99.19%
non-hmer Indel,333,234,84.38%,79.88%,93.99%,86.36%,333,234,84.38%,79.88%,93.66%,86.22%
non-hmer Indel w/o LCR,240,196,89.17%,87.08%,95.00%,90.87%,240,196,89.17%,86.67%,96.30%,91.23%
hmer Indel <=4,112,792,98.21%,95.54%,96.40%,95.96%,112,792,98.21%,94.64%,96.36%,95.50%
"hmer Indel >4,<=8",85,50,97.65%,92.94%,98.75%,95.76%,85,50,97.65%,94.12%,96.39%,95.24%
hmer Indel 4,15,55,nan%,nan%,nan%,nan%,15,55,nan%,nan%,nan%,nan%
hmer Indel 5,11,27,nan%,nan%,nan%,nan%,11,27,nan%,nan%,nan%,nan%
hmer Indel 6,19,12,nan%,nan%,nan%,nan%,19,12,nan%,nan%,nan%,nan%
hmer Indel 7,24,7,100.00%,100.00%,100.00%,100.00%,24,7,100.00%,100.00%,100.00%,100.00%
hmer Indel 8,31,4,96.77%,96.77%,93.75%,95.24%,31,4,96.77%,96.77%,93.75%,95.24%


### Including genotype

In [42]:
# Same primary categories, scored with the genotype-aware classification.
categories=[
    'SNP',
    'non-hmer Indel',
    'non-hmer Indel w/o LCR',
    'hmer Indel <=4',
    'hmer Indel >4,<=8',
]
optTab,optRes,perfCurve=getPerformance(data,categories,gt=True)
plotPerformance(perfCurve,optRes,categories,ext=None,img=None)

In [43]:
# Put the per-source summary tables side by side and store the result.
df=pd.concat(
    [optTab[s] for s in sources],
    axis=1,
    keys=list(sources),
)
df.to_hdf(h5outfile, key="all_data_gt")
df

Unnamed: 0_level_0,Default,Default,Default,Default,Default,Default,Trained,Trained,Trained,Trained,Trained,Trained
Unnamed: 0_level_1,# pos,# neg,max recall,recall,precision,F1,# pos,# neg,max recall,recall,precision,F1
SNP,7801,224,98.90%,98.72%,99.61%,99.16%,7801,224,98.90%,98.72%,99.55%,99.13%
non-hmer Indel,332,235,78.31%,72.29%,96.39%,82.62%,332,235,78.31%,72.29%,96.77%,82.76%
non-hmer Indel w/o LCR,240,196,85.42%,82.92%,95.67%,88.84%,240,196,85.42%,82.92%,95.67%,88.84%
hmer Indel <=4,112,792,98.21%,95.54%,96.40%,95.96%,112,792,98.21%,94.64%,96.36%,95.50%
"hmer Indel >4,<=8",85,50,94.12%,92.94%,97.53%,95.18%,85,50,94.12%,90.59%,98.72%,94.48%


### homozygous genotyping accuracy

In [44]:
categories=['SNP','non-hmer Indel','non-hmer Indel w/o LCR','hmer Indel <=4','hmer Indel >4,<=8']

# Keep rows whose truth genotype is (1,1) and that are not classified 'fn'.
hmzData={}
for s in sources:
    d=data[s]
    hmzData[s]=d[(d['gt_ground_truth']==(1,1)) & (d['classify']!='fn')]
optTab,optRes,perfCurve=getPerformance(hmzData,categories,gt=True)
df=pd.concat([optTab[s] for s in sources], axis=1, keys=[s for s in sources])
# This table is built from the unfiltered data, so it gets its own key; the
# previous key "good_cvg_data_homozygous" is also written by the
# coverage-filtered homozygous table, which overwrote this one.
df.to_hdf(h5outfile, key="all_data_homozygous")
df

Unnamed: 0_level_0,Default,Default,Default,Default,Default,Default,Trained,Trained,Trained,Trained,Trained,Trained
Unnamed: 0_level_1,# pos,# neg,max recall,recall,precision,F1,# pos,# neg,max recall,recall,precision,F1
SNP,3232,0,99.88%,99.88%,100.00%,99.94%,3232,0,99.88%,99.88%,100.00%,99.94%
non-hmer Indel,109,0,83.49%,83.49%,100.00%,91.00%,109,0,83.49%,83.49%,100.00%,91.00%
non-hmer Indel w/o LCR,86,0,88.37%,88.37%,100.00%,93.83%,86,0,88.37%,88.37%,100.00%,93.83%
hmer Indel <=4,50,0,98.00%,98.00%,100.00%,98.99%,50,0,98.00%,98.00%,100.00%,98.99%
"hmer Indel >4,<=8",27,0,85.19%,85.19%,100.00%,92.00%,27,0,85.19%,85.19%,100.00%,92.00%


## Performance: cvg>=20, mappability.0

In [45]:
# Restrict every source to rows with well_mapped_coverage >= 20 whose
# 'mappability.0' flag is set.
filtData={}
for s in sources:
    d=data[s]
    enoughCoverage=(d['well_mapped_coverage']>=20)
    mappabilityFlag=(d['mappability.0'])
    filtData[s]=d[enoughCoverage & mappabilityFlag]

categories=[
    'SNP',
    'non-hmer Indel',
    'non-hmer Indel w/o LCR',
    'hmer Indel <=4',
    'hmer Indel >4,<=8',
]
optTab1,optRes,perfCurve=getPerformance(filtData,categories,gt=False)
plotPerformance(perfCurve,optRes,categories,ext=None,img='hicvg.primary')


In [46]:
# Per-length homopolymer categories, evaluated on the filtered data.
categories=[
    'hmer Indel 4',
    'hmer Indel 5',
    'hmer Indel 6',
    'hmer Indel 7',
    'hmer Indel 8',
    'hmer Indel >8,<=10',
]
optTab2,optRes,perfCurve=getPerformance(filtData,categories,gt=False)
plotPerformance(perfCurve,optRes,categories,ext=None,img='hicvg.hmers')


No handles with labels found to put in legend.


In [47]:
# Stack the primary and per-length tables of each source, then place the
# sources side by side under a top-level column key.
optTab={s:pd.concat([optTab1[s], optTab2[s]]) for s in sources}
df=pd.concat([optTab[s] for s in sources], axis=1, keys=list(sources))
df.to_hdf(h5outfile, key="good_cvg_data")
# Keep an independent copy: df is reassigned by the cells that follow.
defTable=df.copy()
df

Unnamed: 0_level_0,Default,Default,Default,Default,Default,Default,Trained,Trained,Trained,Trained,Trained,Trained
Unnamed: 0_level_1,# pos,# neg,max recall,recall,precision,F1,# pos,# neg,max recall,recall,precision,F1
SNP,7566,215,99.79%,99.51%,99.89%,99.70%,7566,215,99.79%,99.55%,99.84%,99.70%
non-hmer Indel,284,233,94.72%,88.38%,96.91%,92.45%,284,233,94.72%,89.08%,95.83%,92.34%
non-hmer Indel w/o LCR,218,195,95.41%,91.28%,98.51%,94.76%,218,195,95.41%,91.28%,98.03%,94.54%
hmer Indel <=4,107,782,99.07%,97.20%,96.30%,96.74%,107,782,99.07%,96.26%,96.26%,96.26%
"hmer Indel >4,<=8",83,50,98.80%,96.39%,96.39%,96.39%,83,50,98.80%,93.98%,98.73%,96.30%
hmer Indel 4,14,55,nan%,nan%,nan%,nan%,14,55,nan%,nan%,nan%,nan%
hmer Indel 5,10,27,nan%,nan%,nan%,nan%,10,27,nan%,nan%,nan%,nan%
hmer Indel 6,18,12,nan%,nan%,nan%,nan%,18,12,nan%,nan%,nan%,nan%
hmer Indel 7,24,7,100.00%,100.00%,100.00%,100.00%,24,7,100.00%,100.00%,100.00%,100.00%
hmer Indel 8,31,4,96.77%,96.77%,93.75%,95.24%,31,4,96.77%,96.77%,93.75%,95.24%


### Including genotype

In [48]:
# Primary categories on the filtered data, scored with the genotype-aware
# classification.
categories=[
    'SNP',
    'non-hmer Indel',
    'non-hmer Indel w/o LCR',
    'hmer Indel <=4',
    'hmer Indel >4,<=8',
]
optTab,optRes,perfCurve=getPerformance(filtData,categories,gt=True)
plotPerformance(perfCurve,optRes,categories,ext=None,img=None)

In [49]:
# Put the per-source summary tables side by side and store the result.
df=pd.concat(
    [optTab[s] for s in sources],
    axis=1,
    keys=list(sources),
)
df.to_hdf(h5outfile, key="good_cvg_data_gt")
df

Unnamed: 0_level_0,Default,Default,Default,Default,Default,Default,Trained,Trained,Trained,Trained,Trained,Trained
Unnamed: 0_level_1,# pos,# neg,max recall,recall,precision,F1,# pos,# neg,max recall,recall,precision,F1
SNP,7566,215,99.72%,99.46%,99.88%,99.67%,7566,215,99.72%,99.46%,99.87%,99.66%
non-hmer Indel,283,234,87.99%,83.39%,95.55%,89.06%,283,234,87.99%,82.69%,96.69%,89.14%
non-hmer Indel w/o LCR,218,195,91.28%,87.61%,98.45%,92.72%,218,195,91.28%,87.61%,98.45%,92.72%
hmer Indel <=4,107,782,99.07%,97.20%,96.30%,96.74%,107,782,99.07%,96.26%,96.26%,96.26%
"hmer Indel >4,<=8",83,50,95.18%,93.98%,96.30%,95.12%,83,50,95.18%,92.77%,96.25%,94.48%


### homozygous calls

In [50]:
categories=[
    'SNP',
    'non-hmer Indel',
    'non-hmer Indel w/o LCR',
    'hmer Indel <=4',
    'hmer Indel >4,<=8',
]

# From the filtered data keep rows whose truth genotype is (1,1) and that are
# not classified 'fn'.
hmzData={}
for s in sources:
    d=filtData[s]
    truthIsHom=(d['gt_ground_truth']==(1,1))
    notMissed=(d['classify']!='fn')
    hmzData[s]=d[truthIsHom & notMissed]
optTab,optRes,perfCurve=getPerformance(hmzData,categories,gt=True)
df=pd.concat([optTab[s] for s in sources], axis=1, keys=list(sources))
df.to_hdf(h5outfile, key="good_cvg_data_homozygous")
df

Unnamed: 0_level_0,Default,Default,Default,Default,Default,Default,Trained,Trained,Trained,Trained,Trained,Trained
Unnamed: 0_level_1,# pos,# neg,max recall,recall,precision,F1,# pos,# neg,max recall,recall,precision,F1
SNP,3142,0,99.90%,99.90%,100.00%,99.95%,3142,0,99.90%,99.90%,100.00%,99.95%
non-hmer Indel,103,0,83.50%,83.50%,100.00%,91.01%,103,0,83.50%,83.50%,100.00%,91.01%
non-hmer Indel w/o LCR,84,0,88.10%,88.10%,100.00%,93.67%,84,0,88.10%,88.10%,100.00%,93.67%
hmer Indel <=4,48,0,97.92%,97.92%,100.00%,98.95%,48,0,97.92%,97.92%,100.00%,98.95%
"hmer Indel >4,<=8",26,0,84.62%,84.62%,100.00%,91.67%,26,0,84.62%,84.62%,100.00%,91.67%


### per base

In [51]:
categories=[
    'SNP',
    'hmer Indel <=4',
    'hmer Indel >4,<=8',
    'hmer Indel >8,<=10',
    'hmer Indel 8',
]

# Keep non-indel rows whose 'ref' is one of the two bases, plus rows with a
# positive hmer_indel_length whose 'hmer_indel_nuc' is one of the two bases.
baseData={}
b =('A','T')
for s in sources:
    d=filtData[s]
    notIndel=(d['indel']==False)
    refInPair=(d['ref']==b[0]) | (d['ref']==b[1])
    hasHmer=(d['hmer_indel_length']>0)
    nucInPair=(d['hmer_indel_nuc']==b[0]) | (d['hmer_indel_nuc']==b[1])
    baseData[s]=d[(notIndel & refInPair) | (hasHmer & nucInPair)]
optTab1,optRes,perfCurve=getPerformance(baseData,categories,gt=False)
# Tag every row label with the base pair so both tables can be stacked later.
for s in sources:
    optTab1[s].rename(index=lambda a: '{0} ({1}/{2})'.format(a,b[0],b[1]), inplace=True)
plotPerformance(perfCurve,optRes,categories,ext='A/T')


In [52]:
categories=[
    'SNP',
    'hmer Indel <=4',
    'hmer Indel >4,<=8',
    'hmer Indel >8,<=10',
    'hmer Indel 6',
]
# Keep non-indel rows whose 'ref' is one of the two bases, plus rows with a
# positive hmer_indel_length whose 'hmer_indel_nuc' is one of the two bases.
baseData={}
b =('C','G')
for s in sources:
    d=filtData[s]
    notIndel=(d['indel']==False)
    refInPair=(d['ref']==b[0]) | (d['ref']==b[1])
    hasHmer=(d['hmer_indel_length']>0)
    nucInPair=(d['hmer_indel_nuc']==b[0]) | (d['hmer_indel_nuc']==b[1])
    baseData[s]=d[(notIndel & refInPair) | (hasHmer & nucInPair)]
optTab2,optRes,perfCurve=getPerformance(baseData,categories,gt=False)
# Tag every row label with the base pair so both tables can be stacked later.
for s in sources:
    optTab2[s].rename(index=lambda a: '{0} ({1}/{2})'.format(a,b[0],b[1]), inplace=True)
plotPerformance(perfCurve,optRes,categories,ext='C/G')


In [53]:
# Stack the two per-base tables of each source, then place the sources side
# by side under a top-level column key.
optTab={s:pd.concat([optTab1[s], optTab2[s]]) for s in sources}
df=pd.concat([optTab[s] for s in sources], axis=1, keys=list(sources))
df.to_hdf(h5outfile, key="per_base_data")
df

Unnamed: 0_level_0,Default,Default,Default,Default,Default,Default,Trained,Trained,Trained,Trained,Trained,Trained
Unnamed: 0_level_1,# pos,# neg,max recall,recall,precision,F1,# pos,# neg,max recall,recall,precision,F1
SNP (A/T),3537,79,99.72%,99.32%,99.89%,99.60%,3537,79,99.72%,99.46%,99.74%,99.60%
hmer Indel <=4 (A/T),67,97,100.00%,98.51%,100.00%,99.25%,67,97,100.00%,98.51%,100.00%,99.25%
"hmer Indel >4,<=8 (A/T)",66,35,100.00%,96.97%,98.46%,97.71%,66,35,100.00%,96.97%,98.46%,97.71%
"hmer Indel >8,<=10 (A/T)",68,8,67.65%,67.65%,92.00%,77.97%,68,8,67.65%,67.65%,92.00%,77.97%
hmer Indel 8 (A/T),26,4,100.00%,100.00%,92.86%,96.30%,26,4,100.00%,100.00%,92.86%,96.30%
SNP (C/G),4029,136,99.85%,99.73%,99.88%,99.80%,4029,136,99.85%,99.75%,99.83%,99.79%
hmer Indel <=4 (C/G),40,680,97.50%,97.50%,92.86%,95.12%,40,680,97.50%,97.50%,92.86%,95.12%
"hmer Indel >4,<=8 (C/G)",17,14,nan%,nan%,nan%,nan%,17,14,nan%,nan%,nan%,nan%
"hmer Indel >8,<=10 (C/G)",2,4,nan%,nan%,nan%,nan%,2,4,nan%,nan%,nan%,nan%
hmer Indel 6 (C/G),3,2,nan%,nan%,nan%,nan%,3,2,nan%,nan%,nan%,nan%


In [56]:
%matplotlib agg
# Summary line plot of the 'Trained' columns of defTable.
d=defTable['Trained'][['max recall','recall','precision']]
# Tick labels are applied by position.
# NOTE(review): assumes defTable has one row per label, in this order - confirm.
labels=['SNP','nhmer','nhmer w/o LCR','hmer 2-4','hmer 5-8','hmer 4','hmer 5','hmer 6','hmer 7','hmer 8','hmer 9-10']
# Create the figure and axes together and draw on that axes: DataFrame.plot()
# without ``ax`` opens a figure of its own, which left ``fig`` empty when it
# was saved and closed below.
fig,ax=plt.subplots()
d.plot(ax=ax)
plt.xticks(np.arange(len(d.index)), rotation=30, ha='right')
ax.set_xticklabels(labels)
plt.ylim([0.4,1.05])
plt.grid()
plt.title('Cvg>20X, Trained variant calls')
plt.tight_layout()
nxp.save(fig,imgpref+'summary','png',outdir=imgdir)
plt.close(fig)