# Detailed Variant Report v0.3

## Data Input

In [None]:
import pandas as pd
import h5py
import matplotlib.pyplot as plt
import numpy as np
from configparser import ConfigParser

import warnings
warnings.filterwarnings('ignore')

# Until a later cell overrides it, every displayed float is rendered as a percentage.
pd.options.display.float_format = '{:,.2%}'.format

# Report settings live in an INI-style file next to the notebook.
configFile = 'detailed_var_report.config'
parser = ConfigParser()
parser.read(configFile)

# Parameters copied verbatim from the [VarReport] section ('NA' when absent).
prmNames = ['run_id', 'pipeline_version', 'h5_concordance_file']
prm = {key: parser.get('VarReport', key, fallback='NA') for key in prmNames}

h5outfile = parser.get('VarReport', 'h5_output', fallback='detailed_var_report.h5')

source = prm['h5_concordance_file']

# One HDF5 key per autosome: /chr1 ... /chr22.
chrKeys = ['/chr{:d}'.format(n) for n in range(1, 23)]
chromData = [pd.read_hdf(source, key=chrKey, mode='r') for chrKey in chrKeys]

data = pd.concat(chromData)
# Rows classified 'fn' get the sentinel score -1; all other rows keep their tree_score.
data['tree_score'] = np.where(data['classify'] == 'fn', -1, data['tree_score'])
# Drop the references to the per-chromosome frames.
chromData = []

prm['mean_var_depth'] = '{:.2f}'.format(data['well_mapped_coverage'].mean())
prmNames.append('mean_var_depth')
   
# Prefer the truth sample name stored under 'input_args' in the concordance
# HDF5 file; fall back to the config file when it cannot be read.
try:
    # `source` is the concordance file path set in the loading code above.
    args = pd.read_hdf(source, 'input_args', mode='r')
    prm['truth_sample_name'] = args['truth_sample_name'][0]
except Exception:
    # Best-effort lookup: a missing file, key or column just means the value
    # has to come from the [VarReport] section instead ('NA' when absent).
    prm['truth_sample_name'] = parser.get('VarReport', 'truth_sample_name', fallback='NA')
prmNames.append('truth_sample_name')

# Comma-separated list of region columns; no fallback, so a missing option raises.
prm['regions'] = parser.get('DetailedReport', 'regions').split(',')
prmNames.append('regions')


# One row per parameter, in the order the names were registered in prmNames.
prmdf = pd.DataFrame.from_dict(prm, orient='index', columns=['value']).reindex(prmNames)
prmdf.to_hdf(h5outfile, key="det_parameters")
prmdf

In [None]:
# label: 0 for rows classified 'fp', 1 for everything else.
data['label'] = np.where(data['classify'] == 'fp', 0, 1)

# Assign through .loc so the write always lands in `data`; chained indexing
# (data['filter'][mask] = ...) may write to a temporary copy instead.
# The 'HPOL_RUN' filter value is counted as passing.
data.loc[data['filter'] == 'HPOL_RUN', 'filter'] = 'PASS'
# Rows classified 'fn' are marked as missed.
data.loc[data['classify'] == 'fn', 'filter'] = 'MISS'

# class: start from FP (label 0) / FN (label 1), then promote
#   label 1 with filter PASS     -> TP
#   label 0 with filter not PASS -> TN
data['class'] = np.where(data['label'] == 0, 'FP', 'FN')
data.loc[(data['label'] == 1) & (data['filter'] == 'PASS'), 'class'] = 'TP'
data.loc[(data['label'] == 0) & (data['filter'] != 'PASS'), 'class'] = 'TN'

In [None]:
def filterByRegion(data,region):
    """Return the rows of `data` belonging to `region`.

    'All' returns `data` itself. A name starting with 'Non-' selects rows
    where the column named by the rest of the string is False; any other
    name selects rows where that column is True.
    """
    if region == 'All':
        return data

    complement = region.startswith('Non-')
    column = region[4:] if complement else region
    wanted = not complement
    return data[data[column] == wanted]

def filterByRange(data,prm,mn,mx):
    """Return the rows of `data` whose column `prm` lies in [mn, mx)."""
    values = data[prm]
    inRange = (values >= mn) & (values < mx)
    return data[inRange]

def filterByVal(data,prm,val):
    """Return the rows of `data` whose column `prm` equals `val`."""
    matches = data[prm] == val
    return data[matches]

def filterByVarCategory(data,cat):
    """Return the rows of `data` that belong to variant category `cat`.

    Supported categories and the selection each one applies:
      'All'            every row (returns `data` itself)
      'SNP'            indel == False
      'cycleskip SNP'  indel == False and cycleskip_status == True
      'Indel'          indel == True
      'non-hmer'       indel, hmer_indel_length == 0, indel_length > 1
      'hmer 0-1'       indel, hmer_indel_length == 0, indel_length <= 1
      'hmer 2-4'       indel, 0 < hmer_indel_length <= 4
      'hmer 5-8'       indel, 4 < hmer_indel_length <= 8
      'hmer 9-10'      indel, 8 < hmer_indel_length <= 10
      'hmer 11+'       indel, hmer_indel_length > 10

    Returns None for an unrecognised category.
    """
    if cat=='All':
        return data
    if cat=='SNP':
        return data[data['indel']==False]
    if cat=='cycleskip SNP':
        # The comparison must sit inside its own parentheses: `&` binds tighter
        # than `==`, so without them the raw status column is and-ed first.
        return data[(data['indel']==False) & (data['cycleskip_status']==True)]
    if cat=='Indel':
        return data[data['indel']==True]
    if cat=='non-hmer':
        return data[(data['indel']==True) & (data['hmer_indel_length']==0) & (data['indel_length']>1)]
    if cat=='hmer 0-1':
        return data[(data['indel']==True) & (data['hmer_indel_length']==0) & (data['indel_length']<=1)]
    if cat=='hmer 2-4':
        return data[(data['indel']==True) & (data['hmer_indel_length']>0) & (data['hmer_indel_length']<=4)]
    if cat=='hmer 5-8':
        return data[(data['indel']==True) & (data['hmer_indel_length']>4) & (data['hmer_indel_length']<=8)]
    if cat=='hmer 9-10':
        return data[(data['indel']==True) & (data['hmer_indel_length']>8) & (data['hmer_indel_length']<=10)]
    if cat=='hmer 11+':
        return data[(data['indel']==True) & (data['hmer_indel_length']>10)]
    return None


In [None]:
def calcPerformanceStatic(d):
    """Compute performance metrics from the precomputed 'class' column.

    Reads the columns 'label' (1 = positive, 0 = negative), 'class'
    ('TP'/'FP'/'FN'/...) and 'filter' of `d`.

    Returns a tuple (metrics, numPos, numNeg). `metrics` is None when `d` is
    empty or contains no positives; otherwise it is a dict with the keys
    'max_recall', 'recall', 'precision', 'f1', 'tp', 'fp' and 'fn'. A ratio
    whose denominator is zero is reported as NaN.
    """
    num=len(d)
    # Vectorised sum; the builtin sum() would iterate the Series in Python.
    numPos=d['label'].sum()
    numNeg=num-numPos
    if num==0 or numPos==0:
        return (None,numPos,numNeg)

    # Count each class with a boolean sum instead of materialising filtered frames.
    cls=d['class']
    fn=int((cls=='FN').sum())
    tp=int((cls=='TP').sum())
    fp=int((cls=='FP').sum())

    recall=tp/(tp+fn) if (tp+fn>0) else np.nan
    precision=tp/(tp+fp) if (tp+fp>0) else np.nan
    # Share of positives that are not filtered as 'MISS'.
    max_recall=1-int((d['filter']=='MISS').sum())/numPos

    # Guarded like recall/precision so an unexpected 'class' value cannot
    # trigger a ZeroDivisionError.
    f1=tp/(tp+0.5*fn+0.5*fp) if (tp+fn+fp>0) else np.nan

    return ({'max_recall':max_recall,'recall':recall,'precision':precision,'f1':f1,'tp':tp,'fp':fp,'fn':fn},numPos,numNeg)


def calcPerformanceOptimized(d1):
    """Find the tree_score cut-off that maximises F1.

    Uses the columns 'classify', 'tree_score' and 'label' of `d1`. Rows are
    sorted by ascending tree_score; row i then describes the operating point
    where rows 0..i are rejected and the remaining rows are accepted:
      fn = positives among rows 0..i, tp = remaining positives,
      fp = negatives among the remaining rows.

    Only rows with tree_score >= 0 and non-zero recall/precision denominators
    are candidates. Returns (row, numPos, numNeg) where `row` is the first
    candidate (in sort order) with the highest F1, as a dict of all working
    columns ('classify', 'tree_score', 'label', 'fn', 'tp', 'fp', 'recall',
    'precision', 'f1', 'mask', 'opt'), or None when `d1` is empty or has no
    candidate row.
    """
    d=d1[['classify','tree_score','label']].sort_values(by=['tree_score']).copy()

    num=len(d)
    # Vectorised sum; the builtin sum() would iterate the Series in Python.
    numPos=d['label'].sum()
    numNeg=num-numPos
    if num==0:
        return (None,numPos,numNeg)

    d['fn']=np.cumsum(d['label'])
    d['tp']=numPos-(d['fn'])
    d['fp']=numNeg-np.cumsum(1-d['label'])

    d['recall']=d['tp']/(d['tp']+d['fn'])
    d['precision']=d['tp']/(d['tp']+d['fp'])

    d['f1']=d['tp']/(d['tp']+0.5*d['fn']+0.5*d['fp'])

    d['mask']=((d['tp']+d['fn'])>=1) & ((d['tp']+d['fp'])>=1) & (d['tree_score']>=0)
    candidates=d[d['mask']]
    if len(candidates)==0:
        return (None,numPos,numNeg)

    maxF1=candidates['f1'].max()
    # Restrict the pick to candidate rows: an excluded row (for instance one
    # carrying a negative tree_score) can share the same F1 and would
    # otherwise be returned because it sorts first.
    d['opt']=d['mask'] & (d['f1']==maxF1)

    r=d[d['opt']]
    return (r.iloc[0,:].to_dict(),numPos,numNeg)


def renameReg(reg):
    """Map a region column name to its display name; unknown names pass through."""
    shortNames = {
        "LCR-hs38": "LCR",
        "Non-LCR-hs38": "Non-LCR",
        "exome.twist": "Exome",
    }
    return shortNames.get(reg, reg)
    

def getStats(d,reg,cat,var):
    """Build a one-row DataFrame of performance statistics for the subset `d`.

    The row is indexed by (Region, Variant, Category) = (renameReg(reg), var, cat).
    It holds the static metrics from calcPerformanceStatic, the re-optimised
    metrics from calcPerformanceOptimized (NaN wherever the respective result
    is missing), and the means of the 'coverage' and 'gc_content' columns.
    """
    resStat, pos, neg = calcPerformanceStatic(d)
    resOpt = calcPerformanceOptimized(d)[0]

    def pick(result, key):
        # A missing (None/empty) result yields NaN for every metric taken from it.
        return np.nan if not result else result[key]

    staticCols = [('TP', 'tp'), ('FP', 'fp'), ('FN', 'fn'),
                  ('max recall', 'max_recall'),
                  ('recall-stat', 'recall'),
                  ('precision-stat', 'precision'),
                  ('F1-stat', 'f1')]
    optCols = [('recall-opt', 'recall'),
               ('precision-opt', 'precision'),
               ('F1-opt', 'f1')]

    # Insertion order defines the column order of the resulting frame.
    values = {'# pos': pos, '# neg': neg}
    for column, key in staticCols:
        values[column] = pick(resStat, key)
    for column, key in optCols:
        values[column] = pick(resOpt, key)
    values['avg cvg'] = d['coverage'].mean()
    values['avg gc'] = d['gc_content'].mean()

    rowIndex = pd.MultiIndex.from_tuples([(renameReg(reg), var, cat)],
                                         names=['Region', 'Variant', 'Category'])
    return pd.DataFrame(values, index=rowIndex)


## Summary Performance - Genome

In [None]:
def _reportFloatFormat(x):
    """Format a float for display.

    Values in (0, 1] are shown as percentages with two decimals, other
    whole numbers without decimals, and everything else with one decimal.
    """
    if 0 < x <= 1:
        return '{:,.2%}'.format(x)
    if x.is_integer():
        return '{:,.0f}'.format(x)
    return '{:,.1f}'.format(x)


pd.options.display.float_format = _reportFloatFormat

# Variant categories understood by filterByVarCategory; one table column each.
varCats=['All','SNP','Indel','non-hmer','hmer 0-1','hmer 2-4','hmer 5-8','hmer 9-10','hmer 11+']

# Collect the one-row frames and concatenate once at the end. This avoids
# re-copying the growing table on every iteration and does not depend on how
# pd.concat treats an empty seed frame.
statRows=[]
for region in ['All']+prm['regions']:
    d1=filterByRegion(data,region)

    for var in varCats:
        d2=filterByVarCategory(d1,var)
        statRows.append(getStats(d2,region,'All',var))

        # GC-content bins; filterByRange treats the upper bound as exclusive.
        for gc in [(0,0.2),(0.2,0.8),(0.8,1)]:
            d3=filterByRange(d2,'gc_content',gc[0],gc[1])
            statRows.append(getStats(d3,region,f'GC {gc[0]*100:.0f}-{gc[1]*100:.0f}',var))
        # Coverage bins, upper bound exclusive as well.
        for cvg in [(0,20),(20,40),(40,100)]:
            d3=filterByRange(d2,'coverage',cvg[0],cvg[1])
            statRows.append(getStats(d3,region,'CVG {0}-{1}'.format(cvg[0],cvg[1]),var))
        for lcr in [True,False]:
            d3=filterByVal(d2,'LCR-hs38',lcr)
            statRows.append(getStats(d3,region,'LCR' if lcr else 'Non-LCR',var))
        for mappab in [True,False]:
            d3=filterByVal(d2,'mappability.0',mappab)
            statRows.append(getStats(d3,region,'Mappable' if mappab else 'Non-Mappable',var))

res=pd.concat(statRows)
       

In [None]:
# Persist the full statistics table in the report's HDF5 output file.
res.to_hdf(h5outfile, key="detailed_vars")

# Re-read the config file before looking up the CSV output path.
# The 'csv' option has no fallback, so a missing option raises.
parser.read(configFile)
csvFile=parser.get('DetailedReport', 'csv')

# Flatten the (Region, Variant, Category) index into ordinary columns;
# `out` is also the frame that getRow() queries when the tables are drawn.
out=res.reset_index()
out.to_csv(csvFile,index=False)
#out

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Table layout for the genome-wide summaries: one column per variant
# category and one row per stratification (the last row is the exome region).
columns = varCats
rows = [
    'All',
    'GC 0-20', 'GC 20-80', 'GC 80-100',
    'LCR', 'Non-LCR',
    'CVG 0-20', 'CVG 20-40', 'CVG 40-100',
    'Mappable', 'Non-Mappable',
    'Exome',
]

def getRow(cat,var,exome=False):
    """Select the rows of the global `out` table for stratification `cat` and variant `var`.

    The base value is 'Exome' when `exome` is set, otherwise 'All'. A row
    matches when (Category, Region) equals (cat, base) or (base, cat), so
    `cat` may name either a category or a region.
    """
    base = 'Exome' if exome else 'All'
    category = out['Category']
    region = out['Region']

    catIsRegion = (category == base) & (region == cat)
    catIsCategory = (category == cat) & (region == base)
    sameVariant = out['Variant'] == var
    return out[(catIsRegion | catIsCategory) & sameVariant]

def getTabTxt(cat,var,prm,exome=False):
    """Return the text of one table cell: '<metric %>' on one line, '(<# pos>,<avg cvg>)' below.

    `prm` names the metric column to show. Positive counts above 1000 are
    abbreviated to whole thousands with a 'k' suffix.
    """
    row = getRow(cat, var, exome)
    nPos = row['# pos'].iloc[0]
    if nPos > 1000:
        countTxt = '{:d}k'.format(int(nPos / 1000))
    else:
        countTxt = '{:d}'.format(nPos)
    metric = row[prm].iloc[0]
    coverage = row['avg cvg'].iloc[0]
    return '{:.1%}\n({:s},{:.1f})'.format(metric, countTxt, coverage)

def getTabCol(cat,var,prm,exome=False):
    """Return the background colour of one table cell.

    Cells backed by fewer than 30 positives are left white; otherwise the
    metric value in column `prm` is mapped through the RdYlGn colormap.
    """
    row = getRow(cat, var, exome)
    tooFew = row['# pos'].iloc[0] < 30
    return 'white' if tooFew else plt.cm.RdYlGn(row[prm].iloc[0])


# Genome-wide table of 'max recall': build cell text and colours row by row.
tabl = []
tabcol = []
for r in rows:
    tabl.append([getTabTxt(r, c, 'max recall') for c in columns])
    tabcol.append([getTabCol(r, c, 'max recall') for c in columns])

fig, ax = plt.subplots(figsize=(20, 10))
ax.set_axis_off()  # only the table is drawn, no axes
table = ax.table(cellText=tabl,
                 rowLabels=rows,
                 colLabels=columns,
                 cellColours=tabcol,
                 cellLoc='center',
                 loc='upper left')
table.set_fontsize(30)
table.scale(1, 4)
# Keep the return value so the notebook does not echo it.
x = plt.title('Genome - VC performance per category: max recall (n,cvg)', fontsize=30)

In [None]:

# Genome-wide table of 'F1-stat': build cell text and colours row by row.
tabl = []
tabcol = []
for r in rows:
    tabl.append([getTabTxt(r, c, 'F1-stat') for c in columns])
    tabcol.append([getTabCol(r, c, 'F1-stat') for c in columns])

fig, ax = plt.subplots(figsize=(20, 10))
ax.set_axis_off()  # only the table is drawn, no axes
table = ax.table(cellText=tabl,
                 rowLabels=rows,
                 colLabels=columns,
                 cellColours=tabcol,
                 cellLoc='center',
                 loc='upper left')
table.set_fontsize(30)
table.scale(1, 4)
# Keep the return value so the notebook does not echo it.
x = plt.title('Genome - VC performance per category: F1 (n,cvg)', fontsize=30)


In [None]:

# Genome-wide table of 'F1-opt': build cell text and colours row by row.
tabl = []
tabcol = []
for r in rows:
    tabl.append([getTabTxt(r, c, 'F1-opt') for c in columns])
    tabcol.append([getTabCol(r, c, 'F1-opt') for c in columns])

fig, ax = plt.subplots(figsize=(20, 10))
ax.set_axis_off()  # only the table is drawn, no axes
table = ax.table(cellText=tabl,
                 rowLabels=rows,
                 colLabels=columns,
                 cellColours=tabcol,
                 cellLoc='center',
                 loc='upper left')
table.set_fontsize(30)
table.scale(1, 4)
# Keep the return value so the notebook does not echo it.
x = plt.title('Genome - VC performance per category: Reoptimized F1 (n,cvg)', fontsize=30)


## Summary Performance - Exome

In [None]:
# Table layout for the exome summaries: same columns as the genome tables,
# but without a separate 'Exome' row.
columns = varCats
rows = [
    'All',
    'GC 0-20', 'GC 20-80', 'GC 80-100',
    'LCR', 'Non-LCR',
    'CVG 0-20', 'CVG 20-40', 'CVG 40-100',
    'Mappable', 'Non-Mappable',
]

# Exome table of 'max recall': build cell text and colours row by row.
tabl = []
tabcol = []
for r in rows:
    tabl.append([getTabTxt(r, c, 'max recall', True) for c in columns])
    tabcol.append([getTabCol(r, c, 'max recall', True) for c in columns])

fig, ax = plt.subplots(figsize=(20, 10))
ax.set_axis_off()  # only the table is drawn, no axes
table = ax.table(cellText=tabl,
                 rowLabels=rows,
                 colLabels=columns,
                 cellColours=tabcol,
                 cellLoc='center',
                 loc='upper left')
table.set_fontsize(30)
table.scale(1, 4)
# Keep the return value so the notebook does not echo it.
x = plt.title('Exome - VC performance per category: max recall (n,cvg)', fontsize=30)

In [None]:
# Exome table of 'F1-stat': build cell text and colours row by row.
tabl = []
tabcol = []
for r in rows:
    tabl.append([getTabTxt(r, c, 'F1-stat', True) for c in columns])
    tabcol.append([getTabCol(r, c, 'F1-stat', True) for c in columns])

fig, ax = plt.subplots(figsize=(20, 10))
ax.set_axis_off()  # only the table is drawn, no axes
table = ax.table(cellText=tabl,
                 rowLabels=rows,
                 colLabels=columns,
                 cellColours=tabcol,
                 cellLoc='center',
                 loc='upper left')
table.set_fontsize(30)
table.scale(1, 4)
# Keep the return value so the notebook does not echo it.
x = plt.title('Exome - VC performance per category: F1 (n,cvg)', fontsize=30)


In [None]:
# Exome table of 'F1-opt': build cell text and colours row by row.
tabl = []
tabcol = []
for r in rows:
    tabl.append([getTabTxt(r, c, 'F1-opt', True) for c in columns])
    tabcol.append([getTabCol(r, c, 'F1-opt', True) for c in columns])

fig, ax = plt.subplots(figsize=(20, 10))
ax.set_axis_off()  # only the table is drawn, no axes
table = ax.table(cellText=tabl,
                 rowLabels=rows,
                 colLabels=columns,
                 cellColours=tabcol,
                 cellLoc='center',
                 loc='upper left')
table.set_fontsize(30)
table.scale(1, 4)
# Keep the return value so the notebook does not echo it.
x = plt.title('Exome - VC performance per category: Optimized F1 (n,cvg)', fontsize=30)


## Detailed Performance

In [None]:
# Raise the row display limit so the full statistics table is shown, then display it.
pd.options.display.max_rows = 1000
res