In [1]:
import allel
import numpy as np
import pandas as pd
from functools import reduce
import glob

import xgboost as xgb
from xgboost import XGBClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
import matplotlib.pyplot as plt 
import seaborn as sns
from numpy import nan
from sklearn.preprocessing import OrdinalEncoder

### Generation of combinatorial dataset

In [3]:
final_target_features = {'freebayes': ['SOMATIC', 'QUAL', 'ID', 'QR', 'DPB', 'CHROM', 'POS', 'is_snp'],
 'mutect2': ['FILTER_PASS','ID','ECNT','MQ','FILTER_t_lod_fstar','HCNT','DP','CHROM','POS','is_snp',],
 'vardict': ['FILTER_PASS','STATUS','ID','SSF','VD','SOMATIC','CHROM','POS','is_snp'],
 'varscan': ['SOMATIC', 'ID', 'SPV', 'FILTER_PASS', 'CHROM', 'POS', 'is_snp']}

In [11]:
# Save all merged_dfs 
folders = ['real1', 'real2_part1'] + [f'syn{i}' for i in range(1,6)]

In [5]:
# function to get merged_df
def getmerged(f, final_target_features= final_target_features):
    lst_df = []
    suffix_list = []
    v = glob.glob(f + '/*vcf.gz')
    v.sort()
    
    for filename in v:
        if f != 'real2_part1':
            snv_caller = filename.split("/")[1].split(".")[0].split("-")[1]
        else:
            snv_caller = filename.split("/")[1].split("_")[1]
            if snv_caller == 'mutect': snv_caller += '2'
            
        df = allel.vcf_to_dataframe(filename, fields = final_target_features[snv_caller])
        df = df[df.is_snp == True]
        df.drop(['is_snp'],axis=1, inplace = True)
        df['ID_binary'] = df['ID'] != '.'
        df.drop(['ID'],axis=1, inplace = True)
        lst_df.append(df)
        
    suffix = ['fb','m2','vd','vs']
    keep_same = {'CHROM', 'POS'}
    i =0 
    for dfs in lst_df:
        dfs.columns = ['{}{}'.format(c, '' if c in keep_same else '_'+suffix[i]) for c in dfs.columns]
        i += 1
        
    merged_df = reduce(lambda left, right: pd.merge(left, right,on =['CHROM', 'POS'],
                                            how = 'outer', suffixes = ('', '')),lst_df)
    if f != 'real2_part1':
        truth_labels = pd.read_csv(f'{f}/{f}_truth.bed', sep = '\t', names = ['Chromo', 'start', 'end'])
    else:
        truth_labels = pd.read_csv(f'{f}/real2_truth_chr1to5.bed', sep = '\t', names = ['Chromo', 'start', 'end'])
        truth_labels = truth_labels.astype({'Chromo': 'object', 'start': 'int32', 'end': 'int32'})
    if sum(set(truth_labels.start == truth_labels.end)): #check the start and end pos are the same
        truth_labels = truth_labels[['Chromo', 'start']]
        truth_labels['truth'] = 1
        sub_truth = truth_labels.rename(columns = {'Chromo': 'CHROM', 'start': 'POS'})
        
        #what if all 4 callers did not call the pos in truth......
        final_df = merged_df.merge(sub_truth, on=['CHROM','POS'], how = 'outer')
        final_df['truth'].fillna(0, inplace = True)
    
    return final_df

### Obtain best combination of datasets

In [12]:
# Get optimal combination of datasets 
random_states = 1 
name_lists = ['real1', 'real2_part1', 'syn3', 'syn4', 'syn5']
ratio = 0.2 
concat_lst = []

for i in name_lists:
    df = getmerged(i)
    if 'real' in i:
        concat_lst.append(df)
    elif ratio == 1:
        concat_lst.append(df)
    else:
        stratified = df.groupby('truth', group_keys=False).apply(lambda x: x.sample(frac = ratio, random_state = random_states))
        concat_lst.append(stratified)

df = pd.concat(concat_lst, ignore_index = True)
#concatenation migh overwrite some category dtypes
for col in df.columns:
    if df[col].dtype == 'O':
        df[col] = df[col].astype('category')
        
# save dataset 
df.to_csv("combined_dataset.csv")

In [15]:
df.columns

Index(['SOMATIC_fb', 'QUAL_fb', 'QR_fb', 'DPB_fb', 'CHROM', 'POS',
       'ID_binary_fb', 'FILTER_PASS_m2', 'ECNT_m2', 'MQ_m2',
       'FILTER_t_lod_fstar_m2', 'HCNT_m2', 'DP_m2', 'ID_binary_m2',
       'FILTER_PASS_vd', 'STATUS_vd', 'SSF_vd', 'VD_vd', 'SOMATIC_vd',
       'ID_binary_vd', 'SOMATIC_vs', 'SPV_vs', 'FILTER_PASS_vs',
       'ID_binary_vs', 'truth'],
      dtype='object')

"combined_dataset.csv" can then be used as input for 'optimizer.py' which runs bayesian optimisation

In [16]:
df = pd.read_csv("combined_dataset.csv")

  df = pd.read_csv("combined_dataset.csv")


In [26]:
df = df.iloc[: , 1:]
df

Unnamed: 0,SOMATIC_fb,ID_fb,SPV_fb,FILTER_PASS_fb,CHROM,POS,is_snp_fb,ID_binary_fb,FILTER_PASS_m2,ID_m2,...,SSF_vd,VD_vd,QUAL_vs,QR_vs,DPB_vs,ECNT_fb,MQ_fb,FILTER_t_lod_fstar_fb,HCNT_fb,DP_fb
0,False,rs199706086,0.52291,False,1,10250,True,True,,,...,,,,,,,,,,
1,False,rs145427775,0.60839,False,1,10291,True,True,,,...,,,,,,,,,,
2,False,rs58108140,0.68908,False,1,10583,True,True,,,...,,,,,,,,,,
3,False,rs62635284,0.75242,False,1,12783,True,True,,,...,,,,,,,,,,
4,False,.,0.33043,False,1,12817,True,False,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8887028,,.,,True,6,14779529,True,False,,,...,,,,,,1.0,60.0,False,1.0,46.0
8887029,,.,,True,1,35073418,True,False,False,.,...,,,,,,1.0,60.0,False,1.0,46.0
8887030,,.,,True,3,10758905,True,False,True,.,...,,,,,,1.0,60.0,False,1.0,56.0
8887031,,.,,True,13,45580457,True,False,True,.,...,,,,,,1.0,60.0,False,1.0,70.0


In [27]:
for col in df.columns:
    if df[col].dtype == 'O':
        df[col] = df[col].astype('category')

In [28]:
df

Unnamed: 0,SOMATIC_fb,ID_fb,SPV_fb,FILTER_PASS_fb,CHROM,POS,is_snp_fb,ID_binary_fb,FILTER_PASS_m2,ID_m2,...,SSF_vd,VD_vd,QUAL_vs,QR_vs,DPB_vs,ECNT_fb,MQ_fb,FILTER_t_lod_fstar_fb,HCNT_fb,DP_fb
0,False,rs199706086,0.52291,False,1,10250,True,True,,,...,,,,,,,,,,
1,False,rs145427775,0.60839,False,1,10291,True,True,,,...,,,,,,,,,,
2,False,rs58108140,0.68908,False,1,10583,True,True,,,...,,,,,,,,,,
3,False,rs62635284,0.75242,False,1,12783,True,True,,,...,,,,,,,,,,
4,False,.,0.33043,False,1,12817,True,False,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8887028,,.,,True,6,14779529,True,False,,,...,,,,,,1.0,60.0,False,1.0,46.0
8887029,,.,,True,1,35073418,True,False,False,.,...,,,,,,1.0,60.0,False,1.0,46.0
8887030,,.,,True,3,10758905,True,False,True,.,...,,,,,,1.0,60.0,False,1.0,56.0
8887031,,.,,True,13,45580457,True,False,True,.,...,,,,,,1.0,60.0,False,1.0,70.0


### Loading and testing of models 

In [15]:
# Classifier/Regressor
from xgboost import XGBClassifier, DMatrix
from functools import partial
# Model selection
from sklearn.model_selection import KFold, StratifiedKFold
import pprint
# Metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# Skopt functions
from skopt import BayesSearchCV
from skopt.callbacks import DeadlineStopper, DeltaYStopper
from skopt.space import Real, Categorical, Integer

# Data processing
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# from bayes_opt import BayesianOptimization
from sklearn.model_selection import cross_val_score

In [13]:
folders

['real1', 'real2_part1', 'syn1', 'syn2', 'syn3', 'syn4', 'syn5']

#### Load Datasets

In [36]:
df_real1 = getmerged('real1')
df_real2Part1 = getmerged('real2_part1')
df_syn1 = getmerged('syn1')
df_syn2 = getmerged('syn2')
df_syn3 = getmerged('syn3')
df_syn4 = getmerged('syn4')
df_syn5 = getmerged('syn5')

#### Load Models

In [17]:
# load model 
xgb_tuned1 = xgb.XGBClassifier()
xgb_tuned1.load_model('bayesian_opt/tuned1.model')

In [18]:
xgb_tuned2 = xgb.XGBClassifier()
xgb_tuned2.load_model('bayesian_opt_encoded/tuned2.model')

In [57]:
xgb_tuned3 = xgb.XGBClassifier()
xgb_tuned3.load_model('bayesian_opt_encoded/tuned3.model')

#### Make Prediction

In [92]:
df = df_syn5
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
df[['STATUS_vd']] = enc.fit_transform(df[['STATUS_vd']])
X = df.drop(['truth', 'POS', 'CHROM'], axis=1).values
y = df.truth.values

In [93]:
from sklearn import metrics
stats = [metrics.precision_score, metrics.recall_score, metrics.f1_score]
y_pred = xgb_tuned3.predict(X)
[g(y, y_pred) for g in stats]

[0.9997521294335031, 0.9776127624881563, 0.9885585053642452]

In [94]:
y_pred = xgb_tuned1.predict(X)
[g(y, y_pred) for g in stats]

[0.9996850747947362, 0.9792433289998458, 0.9893586232997172]

In [95]:
y_pred = xgb_tuned2.predict(X)
[g(y, y_pred) for g in stats]

[0.9995715413237118, 0.97670934050195, 0.9880082025677603]