In [55]:
import allel
import numpy as np
import pandas as pd
from functools import reduce

import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
import seaborn as sns
from numpy import nan
from sklearn.preprocessing import OrdinalEncoder

In [2]:
#place the python file in the same folder containing all the real1, real2_part1, ... folders

def parse_to_df(folder='test', features='*', algos=['freebayes', 'mutect2', 'vardict', 'varscan']):
    '''
    reads all vcf.gz files corresponding to algos in the specified folder with the specified list of features
    and combines the read files into one dataframe with (CHROM, POS, REF) as index.
    '''
    if folder == 'test':
        dfs = [allel.vcf_to_dataframe(f'{folder}/{i}.vcf.gz', fields = features) for i in algos]
    else:
        dfs = [allel.vcf_to_dataframe(f'{folder}/{folder}-{i}.vcf.gz', fields = features) for i in algos]
    algo_dicts = dict(zip(algos, dfs))
    
    #some manipulations
    for i in algo_dicts:
        algo_dicts[i].set_index(keys=['CHROM', 'POS', 'REF'], inplace = True) #will be use as keys for later merging
        algo_dicts[i] = algo_dicts[i][algo_dicts[i]['is_snp']]    #obtain only SNPs
        algo_dicts[i].columns = [j + '_' + i for j in algo_dicts[i].columns]

    #combining the dfs
    edited_dfs = [algo_dicts[i] for i in algos]

    merged = reduce(lambda left, right: pd.merge(left, right,
                                            how = 'outer',
                                            left_index=True, right_index=True,
                                            suffixes = ('', '')), edited_dfs)

    merged.columns = sorted(merged.columns)

    return merged


In [51]:
folders = ['real1'] + [f'syn{i}' for i in range(1,6)]

folder_files = []
varscan_features = ['CHROM','POS','REF','ALT', 'SSC','SPV','is_snp']
freebayes_features = ['CHROM','POS','REF','ALT', 'MQMR','is_snp']
mutect2_features = ['CHROM','POS','REF','ALT', 'MQ','is_snp']
vardict_features = ['CHROM','POS','REF','ALT', 'SSF','MSI','is_snp']

In [52]:
#generate the big dataframe with every data

for f in folders:
    print(f'generating df of {f}')
    #read files
    varscan_sub = allel.vcf_to_dataframe(f"{f}/{f}-varscan.vcf.gz", fields = varscan_features)
    freebayes_sub = allel.vcf_to_dataframe(f"{f}/{f}-freebayes.vcf.gz", fields = freebayes_features)
    mutect2_sub = allel.vcf_to_dataframe(f"{f}/{f}-mutect2.vcf.gz", fields = mutect2_features)
    vardict_sub = allel.vcf_to_dataframe(f"{f}/{f}-vardict.vcf.gz", fields = vardict_features)

    #isolate snp only
    varscan = varscan_sub[varscan_sub.is_snp == True]
    freebayes = freebayes_sub[freebayes_sub.is_snp == True]
    mutect2 = mutect2_sub[mutect2_sub.is_snp == True]
    vardict = vardict_sub[vardict_sub.is_snp == True]

    #edit column lables of each df
    lst_dfs = [varscan_sub,freebayes_sub,mutect2_sub,vardict_sub]
    suffix = ['vs','fb','m2','vd']
    keep_same = {'CHROM', 'POS'}
    
    for i in range(len(lst_dfs)):
        df = lst_dfs[i]
        df.columns = ['{}{}'.format(c, '' if c in keep_same else '_'+suffix[i]) for c in df.columns]
    
    #merge the dfs together
    merged_df = reduce(lambda left, right: pd.merge(left, right,on =['CHROM', 'POS'],
                                            how = 'outer', suffixes = ('', '')),lst_dfs)
    merged_df = merged_df.drop(['is_snp_vd','is_snp_fb','is_snp_m2','is_snp_vs'], axis=1)

    #add in the trute values in *truth.bed files
    truth_labels = pd.read_csv(f'{f}/{f}_truth.bed', sep = '\t', names = ['Chromo', 'start', 'end'])
    if sum(set(truth_labels.start == truth_labels.end)): #check the start and end pos are the same
        truth_labels = truth_labels[['Chromo', 'start']]
        truth_labels['truth'] = 1
        sub_truth = truth_labels.rename(columns = {'Chromo': 'CHROM', 'start': 'POS'})
        
        #what if all 4 callers did not call the pos in truth......
        final_df = merged_df.merge(sub_truth, on=['CHROM','POS'], how = 'left' )
        final_df['truth'].fillna(0, inplace = True)
    else:
        print('Not an SNP in truth.bed file')
        continue

    #save the files
    print(f'generated {f} df')
    #print(merged_df.info(verbose=True))
    folder_files.append(final_df)
    final_df.to_csv(f"{f}_merged_df.csv")

generating df of real1
generated real1 df
generating df of syn1
generated syn1 df
generating df of syn2
generated syn2 df
generating df of syn3
generated syn3 df
generating df of syn4
generated syn4 df
generating df of syn5
generated syn5 df


In [53]:
#concatenating the generated dfs together

combined = pd.concat(folder_files, ignore_index = True)

In [54]:
#checking concatenation
sum_rows = 0

for df in folder_files:
    sum_rows += len(df)
    print(df.shape)

print(sum_rows)
print(combined.shape)

(5790219, 25)
(5539098, 25)
(5783210, 25)
(5633087, 25)
(5045774, 25)
(4977710, 25)
32769098
(32769098, 25)


In [56]:
#defining X and Y for Xgboost
X = combined[combined.columns[~combined.columns.isin(['truth','POS','CHROM'])]]

y = combined['truth'] 

# ordinal encoding for REF and ALT, required for categorical variable
enc = OrdinalEncoder()
enc.fit(X)
new_X = enc.transform(X)

In [57]:
#split data set into test and train, 20% of data will be used as a test set
X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.2)

In [58]:
#create a model using X_train and y_train data with XGBoost
model = XGBClassifier(eval_metric='rmse')
model.fit(X_train, y_train)

In [59]:
#what does this do? dose it update the model?
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)



In [60]:
#prediction
y_pred = model.predict(X_test)

In [61]:
#check the results
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[6537589     777]
 [   1259   14195]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   6538366
         1.0       0.95      0.92      0.93     15454

    accuracy                           1.00   6553820
   macro avg       0.97      0.96      0.97   6553820
weighted avg       1.00      1.00      1.00   6553820

