In [1]:
import allel
import numpy as np
import pandas as pd
from functools import reduce

## Parsing functions

In [2]:
#place the python file in the same folder containing all the real1, real2_part1, ... folders

def parse_to_df(folder='test', features='*', algos=['freebayes', 'mutect2', 'vardict', 'varscan']):
    '''
    reads all vcf.gz files corresponding to algos in the specified folder with the specified list of features
    and combines the read files into one dataframe with (CHROM, POS, REF) as index.
    '''
    if folder == 'test':
        dfs = [allel.vcf_to_dataframe(f'{folder}/{i}.vcf.gz', fields = features) for i in algos]
    else:
        dfs = [allel.vcf_to_dataframe(f'{folder}/{folder}-{i}.vcf.gz', fields = features) for i in algos]
    algo_dicts = dict(zip(algos, dfs))
    
    #some manipulations
    for i in algo_dicts:
        algo_dicts[i].set_index(keys=['CHROM', 'POS', 'REF'], inplace = True) #will be use as keys for later merging
        algo_dicts[i] = algo_dicts[i][algo_dicts[i]['is_snp']]    #obtain only SNPs
        algo_dicts[i].columns = [j + '_' + i for j in algo_dicts[i].columns]

    #combining the dfs
    edited_dfs = [algo_dicts[i] for i in algos]

    merged = reduce(lambda left, right: pd.merge(left, right,
                                            how = 'outer',
                                            left_index=True, right_index=True,
                                            suffixes = ('', '')), edited_dfs)

    merged.columns = sorted(merged.columns)

    return merged


In [26]:
def parse_with_drop_na(folder='test', features='*', drop_threshold=50):
    
    # Parse VCF files to DataFrames
    algos = ['freebayes', 'mutect2', 'vardict', 'varscan']
    if folder == 'test':
        dfs = [allel.vcf_to_dataframe(f'{folder}/{i}.vcf.gz', fields=features) for i in algos]
    else:
        dfs = [allel.vcf_to_dataframe(f'{folder}/{folder}-{i}.vcf.gz', fields=features) for i in algos]
    algo_dicts = dict(zip(algos, dfs)) # a dictionary of key = algo, value = algo results
    
    
    # Rename columns
    keep_same = {'CHROM', 'POS'}
    for i in algos:
        algo_dicts[i] = algo_dicts[i][algo_dicts[i]['is_snp']] # obtain only SNPs
        algo_dicts[i].columns = ['{}{}'.format(c, '' if c in keep_same else '_' + i) for c in algo_dicts[i].columns]
        
        
    # Drop columns with >50% missing values
    for alg in algo_dicts:
        features_to_drop = []
        for feature in algo_dicts[alg].columns:
            
            # Calculate percentage of rows with missing values
            n_miss = algo_dicts[alg][[feature]].isnull().sum() 
            perc = n_miss / algo_dicts[alg].shape[0] * 100
            
            # If >50% of values are missing, drop feature
            if (float)(perc) > drop_threshold:
                features_to_drop.append(feature)

        algo_dicts[alg].drop(labels = features_to_drop, axis=1, inplace=True)
        
        
    # Merge DataFrames from diff algos
    edited_dfs = [algo_dicts[i] for i in algos]
    merged = reduce(lambda left, right: pd.merge(left, right, on =['CHROM', 'POS'],
                                                how = 'outer', suffixes = ('', '')), edited_dfs)
    
    
    return merged

In [None]:
# Takes some time to run

target_folder = 'syn1'

df = parse_with_drop_na(folder=target_folder)
df.to_csv(f'{target_folder}_merged_df.csv')

## Meanings of VCF file headers
To help with feature choice

#### Freebayes
* FILTER=<ID=PASS,Description="All filters passed">
* FILTER=<ID=REJECT,Description="Not somatic due to normal call frequency or phred likelihoods: tumor: 35, normal 35.">
* FORMAT=<ID=AD,Number=R,Type=Integer,Description="Number of observation for each allele">
* FORMAT=<ID=AO,Number=A,Type=Integer,Description="Alternate allele observation count">
* FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
* FORMAT=<ID=GL,Number=G,Type=Float,Description="Genotype Likelihood, log10-scaled likelihoods of the data given the called genotype for each possible genotype generated from the reference and alternate alleles given the sample ploidy">
* FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality, the Phred-scaled marginal (or unconditional) probability of the called genotype">
* FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
* FORMAT=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum depth in gVCF output block.">
* FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
* FORMAT=<ID=QA,Number=A,Type=Integer,Description="Sum of quality of the alternate observations">
* FORMAT=<ID=QR,Number=1,Type=Integer,Description="Sum of quality of the reference observations">
* FORMAT=<ID=RO,Number=1,Type=Integer,Description="Reference allele observation count">
* INFO=<ID=AB,Number=A,Type=Float,Description="Allele balance at heterozygous sites: a number between 0 and 1 representing the ratio of reads showing the reference allele to all reads, considering only reads from individuals called as heterozygous">
* INFO=<ID=ABP,Number=A,Type=Float,Description="Allele balance probability at heterozygous sites: Phred-scaled upper-bounds estimate of the probability of observing the deviation between ABR and ABA given E(ABR/ABA) ~ 0.5, derived using Hoeffding's inequality">
* INFO=<ID=AC,Number=A,Type=Integer,Description="Total number of alternate alleles in called genotypes">
* INFO=<ID=AF,Number=A,Type=Float,Description="Estimated allele frequency in the range (0,1]">
* INFO=<ID=AN,Number=1,Type=Integer,Description="Total number of alleles in called genotypes">
* INFO=<ID=AO,Number=A,Type=Integer,Description="Count of full observations of this alternate haplotype.">
* INFO=<ID=CIGAR,Number=A,Type=String,Description="The extended CIGAR representation of each alternate allele, with the exception that '=' is replaced by 'M' to ease VCF parsing.  Note that INDEL alleles do not have the first matched base (which is provided by default, per the spec) referred to by the CIGAR.">
* INFO=<ID=DECOMPOSED,Number=0,Type=Flag,Description="The allele was parsed using vcfallelicprimitives.">
* INFO=<ID=DP,Number=1,Type=Integer,Description="Total read depth at the locus">
* INFO=<ID=DPB,Number=1,Type=Float,Description="Total read depth per bp at the locus; bases in reads overlapping / bases in haplotype">
* INFO=<ID=DPRA,Number=A,Type=Float,Description="Alternate allele depth ratio.  Ratio between depth in samples with each called alternate allele and those without.">
* INFO=<ID=END,Number=1,Type=Integer,Description="Last position (inclusive) in gVCF output record.">
* INFO=<ID=EPP,Number=A,Type=Float,Description="End Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
* INFO=<ID=EPPR,Number=1,Type=Float,Description="End Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between EL and ER given E(EL/ER) ~ 0.5, derived using Hoeffding's inequality">
* INFO=<ID=GTI,Number=1,Type=Integer,Description="Number of genotyping iterations required to reach convergence or bailout.">
* INFO=<ID=LEN,Number=A,Type=Integer,Description="allele length">
* INFO=<ID=MEANALT,Number=A,Type=Float,Description="Mean number of unique non-reference allele observations per sample with the corresponding alternate alleles.">
* INFO=<ID=MIN_DP,Number=1,Type=Integer,Description="Minimum depth in gVCF output block.">
* INFO=<ID=MQM,Number=A,Type=Float,Description="Mean mapping quality of observed alternate alleles">
* INFO=<ID=MQMR,Number=1,Type=Float,Description="Mean mapping quality of observed reference alleles">
* INFO=<ID=NS,Number=1,Type=Integer,Description="Number of samples with data">
* INFO=<ID=NUMALT,Number=1,Type=Integer,Description="Number of unique non-reference alleles in called genotypes at this position.">
* INFO=<ID=ODDS,Number=1,Type=Float,Description="The log odds ratio of the best genotype combination to the second-best.">
* INFO=<ID=OLD_VARIANT,Number=.,Type=String,Description="Original chr:pos:ref:alt encoding">
* INFO=<ID=PAIRED,Number=A,Type=Float,Description="Proportion of observed alternate alleles which are supported by properly paired read fragments">
* INFO=<ID=PAIREDR,Number=1,Type=Float,Description="Proportion of observed reference alleles which are supported by properly paired read fragments">
* INFO=<ID=PAO,Number=A,Type=Float,Description="Alternate allele observations, with partial observations recorded fractionally">
* INFO=<ID=PQA,Number=A,Type=Float,Description="Alternate allele quality sum in phred for partial observations">
* INFO=<ID=PQR,Number=1,Type=Float,Description="Reference allele quality sum in phred for partial observations">
* INFO=<ID=PRO,Number=1,Type=Float,Description="Reference allele observation count, with partial observations recorded fractionally">
* INFO=<ID=QA,Number=A,Type=Integer,Description="Alternate allele quality sum in phred">
* INFO=<ID=QR,Number=1,Type=Integer,Description="Reference allele quality sum in phred">
* INFO=<ID=RO,Number=1,Type=Integer,Description="Count of full observations of the reference haplotype.">
* INFO=<ID=RPL,Number=A,Type=Float,Description="Reads Placed Left: number of reads supporting the alternate balanced to the left (5') of the alternate allele">
* INFO=<ID=RPP,Number=A,Type=Float,Description="Read Placement Probability: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
* INFO=<ID=RPPR,Number=1,Type=Float,Description="Read Placement Probability for reference observations: Phred-scaled upper-bounds estimate of the probability of observing the deviation between RPL and RPR given E(RPL/RPR) ~ 0.5, derived using Hoeffding's inequality">
* INFO=<ID=RPR,Number=A,Type=Float,Description="Reads Placed Right: number of reads supporting the alternate balanced to the right (3') of the alternate allele">
* INFO=<ID=RUN,Number=A,Type=Integer,Description="Run length: the number of consecutive repeats of the alternate allele in the reference genome">
* INFO=<ID=SAF,Number=A,Type=Integer,Description="Number of alternate observations on the forward strand">
* INFO=<ID=SAP,Number=A,Type=Float,Description="Strand balance probability for the alternate allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SAF and SAR given E(SAF/SAR) ~ 0.5, derived using Hoeffding's inequality">
* INFO=<ID=SAR,Number=A,Type=Integer,Description="Number of alternate observations on the reverse strand">
* INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic event">
* INFO=<ID=SRF,Number=1,Type=Integer,Description="Number of reference observations on the forward strand">
* INFO=<ID=SRP,Number=1,Type=Float,Description="Strand balance probability for the reference allele: Phred-scaled upper-bounds estimate of the probability of observing the deviation between SRF and SRR given E(SRF/SRR) ~ 0.5, derived using Hoeffding's inequality">
* INFO=<ID=SRR,Number=1,Type=Integer,Description="Number of reference observations on the reverse strand">
* INFO=<ID=TYPE,Number=A,Type=String,Description="The type of allele, either snp, mnp, ins, del, or complex.">
* INFO=<ID=technology.illumina,Number=A,Type=Float,Description="Fraction of observations supporting the alternate observed in reads from illumina">

#### Mutect2
* FILTER=<ID=PASS,Description="All filters passed">
* FILTER=<ID=MinAF,Description="Allele frequency is lower than 10.0% (default threshold in bcbio; override with min_allele_fraction in the algorithm section)">
* FILTER=<ID=alt_allele_in_normal,Description="Evidence seen in the normal sample">
* FILTER=<ID=clustered_events,Description="Clustered events observed in the tumor">
* FILTER=<ID=clustered_read_position,Description="Evidence for somatic variant clusters near the ends of reads">
* FILTER=<ID=germline_risk,Description="Evidence indicates this site is germline, not somatic">
* FILTER=<ID=homologous_mapping_event,Description="More than three events were observed in the tumor">
* FILTER=<ID=multi_event_alt_allele_in_normal,Description="Multiple events observed in tumor and normal">
* FILTER=<ID=panel_of_normals,Description="Seen in at least 2 samples in the panel of normals">
* FILTER=<ID=str_contraction,Description="Site filtered due to contraction of short tandem repeat region">
* FILTER=<ID=strand_artifact,Description="Evidence for alt allele comes from one read direction only">
* FILTER=<ID=t_lod_fstar,Description="Tumor does not meet likelihood threshold">
* FILTER=<ID=triallelic_site,Description="Site filtered because more than two alt alleles pass tumor LOD">
* FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
* FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele fraction of the event in the tumor">
* FORMAT=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth (reads with MQ=255 or with bad mates are filtered)">
* FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
* FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
* FORMAT=<ID=PGT,Number=1,Type=String,Description="Physical phasing haplotype information, describing how the alternate alleles are phased in relation to one another">
* FORMAT=<ID=PID,Number=1,Type=String,Description="Physical phasing ID information, where each unique ID within a given sample (but not across samples) connects records within a phasing group">
* FORMAT=<ID=PL,Number=G,Type=Integer,Description="Normalized, Phred-scaled likelihoods for genotypes as defined in the VCF specification">
* INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
* INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
* INFO=<ID=ECNT,Number=1,Type=String,Description="Number of events in this haplotype">
* INFO=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
* INFO=<ID=GC,Number=1,Type=Float,Description="GC content around the variant (see docs for window size details)">
* INFO=<ID=HCNT,Number=1,Type=String,Description="Number of haplotypes that support this variant">
* INFO=<ID=HRun,Number=1,Type=Integer,Description="Largest Contiguous Homopolymer Run of Variant Allele In Either Direction">
* INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
* INFO=<ID=MAX_ED,Number=1,Type=Integer,Description="Maximum distance between events in this active region">
* INFO=<ID=MIN_ED,Number=1,Type=Integer,Description="Minimum distance between events in this active region">
* INFO=<ID=MQ,Number=1,Type=Float,Description="RMS Mapping Quality">
* INFO=<ID=MQ0,Number=1,Type=Integer,Description="Total Mapping Quality Zero Reads">
* INFO=<ID=MQRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref read mapping qualities">
* INFO=<ID=NLOD,Number=1,Type=String,Description="Normal LOD score">
* INFO=<ID=PON,Number=1,Type=String,Description="Count from Panel of Normals">
* INFO=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
* INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">
* INFO=<ID=TLOD,Number=1,Type=String,Description="Tumor LOD score">

#### Vardict
* FILTER=<ID=PASS,Description="All filters passed">
* FILTER=<ID=Bias,Description="Strand Bias">
* FILTER=<ID=Cluster0bp,Description="Two somatic variants are within 0 bp">
* FILTER=<ID=DIFF0.2,Description="Non-somatic or LOH and allele frequency difference < 0.2">
* FILTER=<ID=InDelLikely,Description="Likely Indels are not considered somatic">
* FILTER=<ID=InGap,Description="The somatic variant is in the deletion gap, thus likely false positive">
* FILTER=<ID=InIns,Description="The somatic variant is adjacent to an insertion variant">
* FILTER=<ID=LongAT,Description="The somatic variant is flanked by long A/T (>=14)">
* FILTER=<ID=LowAlleleDepth,Description="Low depth per allele frequency along with poor depth, quality, mapping quality and read mismatches.">
* FILTER=<ID=LowFreqQuality,Description="Low frequency read with poor quality and p-value (SSF).">
* FILTER=<ID=MAF0.05,Description="Matched sample has AF > 0.05, thus not somatic">
* FILTER=<ID=MSI12,Description="Variant in MSI region with 12 non-monomer MSI or 12 monomer MSI">
* FILTER=<ID=NM4.25,Description="Mean mismatches in reads >= 4.25, thus likely false positive">
* FILTER=<ID=P0.01Likely,Description="Likely candidate but p-value > 0.01/5**vd2">
* FILTER=<ID=P0.9,Description="Not significant with p-value > 0.9">
* FILTER=<ID=Q0,Description="Mean Mapping Quality Below 0">
* FILTER=<ID=REJECT,Description="Not Somatic via VarDict">
* FILTER=<ID=SN1.5,Description="Signal to Noise Less than 1.5">
* FILTER=<ID=d5,Description="Total Depth < 5">
* FILTER=<ID=f0.1,Description="Allele frequency < 0.1">
* FILTER=<ID=p8,Description="Mean Position in Reads Less than 8">
* FILTER=<ID=pSTD,Description="Position in Reads has STD of 0">
* FILTER=<ID=q22.5,Description="Mean Base Quality Below 22.5">
* FILTER=<ID=v3,Description="Var Depth < 3">
* FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
* FORMAT=<ID=ADJAF,Number=1,Type=Float,Description="Adjusted AF for indels due to local realignment">
* FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele Frequency">
* FORMAT=<ID=ALD,Number=2,Type=Integer,Description="Variant forward, reverse reads">
* FORMAT=<ID=BIAS,Number=1,Type=String,Description="Strand Bias Info">
* FORMAT=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
* FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
* FORMAT=<ID=HIAF,Number=1,Type=Float,Description="Allele frequency using only high quality bases">
* FORMAT=<ID=MQ,Number=1,Type=Float,Description="Mean Mapping Quality">
* FORMAT=<ID=NM,Number=1,Type=Float,Description="Mean mismatches in reads">
* FORMAT=<ID=ODDRATIO,Number=1,Type=Float,Description="Strand Bias Odds ratio">
* FORMAT=<ID=PMEAN,Number=1,Type=Float,Description="Mean position in reads">
* FORMAT=<ID=PSTD,Number=1,Type=Float,Description="Position STD in reads">
* FORMAT=<ID=QSTD,Number=1,Type=Float,Description="Quality score STD in reads">
* FORMAT=<ID=QUAL,Number=1,Type=Float,Description="Mean quality score in reads">
* FORMAT=<ID=RD,Number=2,Type=Integer,Description="Reference forward, reverse reads">
* FORMAT=<ID=SBF,Number=1,Type=Float,Description="Strand Bias Fisher p-value">
* FORMAT=<ID=SN,Number=1,Type=Float,Description="Signal to noise">
* FORMAT=<ID=VD,Number=1,Type=Integer,Description="Variant Depth">
* INFO=<ID=AF,Number=A,Type=Float,Description="Allele Frequency, for each ALT allele, in the same order as listed">
* INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
* INFO=<ID=END,Number=1,Type=Integer,Description="Chr End Position">
* INFO=<ID=LSEQ,Number=1,Type=String,Description="5' flanking seq">
* INFO=<ID=MSI,Number=1,Type=Float,Description="MicroSatellite. > 1 indicates MSI">
* INFO=<ID=MSILEN,Number=1,Type=Float,Description="MSI unit repeat length in bp">
* INFO=<ID=RSEQ,Number=1,Type=String,Description="3' flanking seq">
* INFO=<ID=SAMPLE,Number=1,Type=String,Description="Sample name (with whitespace translated to underscores)">
* INFO=<ID=SHIFT3,Number=1,Type=Integer,Description="No. of bases to be shifted to 3 prime for deletions due to alternative alignment">
* INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Somatic event">
* INFO=<ID=SOR,Number=1,Type=Float,Description="Odds ratio">
* INFO=<ID=SSF,Number=1,Type=Float,Description="P-value">
* INFO=<ID=STATUS,Number=1,Type=String,Description="Somatic or germline status">
* INFO=<ID=TYPE,Number=1,Type=String,Description="Variant Type: SNV Insertion Deletion Complex">
* INFO=<ID=VD,Number=1,Type=Integer,Description="Variant Depth">

#### Varscan
* FILTER=<ID=PASS,Description="All filters passed">
* FILTER=<ID=REJECT,Description="Set if true: SS != '.' && SS != '2'">
* FILTER=<ID=SpvFreq,Description="High frequency (tumor FREQ > 0.35) and low p-value for somatic (SPV < 0.05)">
* FILTER=<ID=indelError,Description="Likely artifact due to indel reads at this position">
* FILTER=<ID=str10,Description="Less than 10% or more than 90% of variant supporting reads on one strand">
* FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">
* FORMAT=<ID=DP,Number=1,Type=Integer,Description="Read Depth">
* FORMAT=<ID=DP4,Number=1,Type=String,Description="Strand read counts: ref/fwd, ref/rev, var/fwd, var/rev">
* FORMAT=<ID=FREQ,Number=1,Type=Float,Description="Variant allele frequency">
* FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
* FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
* FORMAT=<ID=RD,Number=1,Type=Integer,Description="Depth of reference-supporting bases (reads1)">
* INFO=<ID=DP,Number=1,Type=Integer,Description="Total depth of quality bases">
* INFO=<ID=GPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor+normal versus no variant for Germline calls">
* INFO=<ID=SOMATIC,Number=0,Type=Flag,Description="Indicates if record is a somatic mutation">
* INFO=<ID=SPV,Number=1,Type=Float,Description="Fisher's Exact Test P-value of tumor versus normal for Somatic/LOH calls">
* INFO=<ID=SS,Number=1,Type=String,Description="Somatic status of variant (0=Reference,1=Germline,2=Somatic,3=LOH, or 5=Unknown)">
* INFO=<ID=SSC,Number=1,Type=Float,Description="Somatic score in Phred scale (0-255) derived from somatic p-value">

In [None]:
cols_to_drop = ['ID_freebayes','ID_mutect2','ID_vardict','ID_varscan', # id
                 'REF_freebayes','REF_mutect2','REF_vardict','REF_varscan', # ref genome
                 'ALT_1_freebayes','ALT_1_mutect2','ALT_1_vardict','ALT_1_varscan',
                 'ANN_freebayes','ANN_mutect2','ANN_vardict','ANN_varscan',
                 'LSEQ_vardict','RSEQ_vardict','SAMPLE_vardict','STATUS_vardict',
                 'TYPE_1_freebayes','TYPE_vardict', # type of mutation
                 'CIGAR_1_freebayes'] # only one value ("1X") found

## Gradient Boosting Decision Trees
* popular algorithms like XGboost and Catboost are examples of using the gradient boosting framework 
* unlike random forests, the decision trees in gradient boosting are built additively; each decision tree is built one after another
* each new tree is built to improve on deficiencies of the previous trees and this concept is called boosting 
* gradient of gradient boosting comes from minimising the gradient of the loss function 

In [8]:
import xgboost as xgb
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
import seaborn as sns
from numpy import nan

In [5]:
'''This part has been incorporated into parsing code
lst_dfs = [varscan_sub,freebayes_sub,mutect2_sub,vardict_sub]
suffix = ['vs','fb','m2','vd']
keep_same = {'CHROM', 'POS'}
i =0 
for df in lst_dfs:
    df.columns = ['{}{}'.format(c, '' if c in keep_same else '_'+suffix[i]) for c in df.columns]
    i += 1
lst_dfs
'''

[          CHROM     POS REF_vs  ALT_1_vs  SSC_vs    SPV_vs  is_snp_vs
 0             1   10146     AC       NaN     2.0  0.568070      False
 1             1   10177      A       NaN     8.0  0.129110      False
 2             1   10230     AC       NaN     4.0  0.361360      False
 3             1   10247      T       NaN    11.0  0.070191       True
 4             1   10248      A       NaN     8.0  0.152890       True
 ..          ...     ...    ...       ...     ...       ...        ...
 229  GL000192.1  546648      C       NaN     0.0  0.896130       True
 230  GL000192.1  547087      T       NaN     4.0  0.376560       True
 231  GL000192.1  547102      C       NaN     1.0  0.728090       True
 232  GL000192.1  547218      C       NaN     3.0  0.469670       True
 233  GL000192.1  547406      G       NaN     2.0  0.520770       True
 
 [4718826 rows x 7 columns],
             CHROM     POS REF_fb  ALT_1_fb    MQMR_fb  is_snp_fb
 0               1   10352      T       NaN        

In [9]:
'''This part has been incorporated into parsing code
merged_df = reduce(lambda left, right: pd.merge(left, right,on =['CHROM', 'POS'],
                                            how = 'outer', suffixes = ('', '')),lst_dfs)
'''

algos = ['freebayes', 'mutect2', 'vardict', 'varscan']
merged_df = merged_df.drop([f'is_snp_{algo}' for algo in algos], axis=1)
#merged_df.to_csv("syn1_mergered_df.csv")

In [12]:
from sklearn import datasets
import xgboost as xgb 
from xgboost import XGBClassifier
from sklearn.impute import KNNImputer

In [262]:
##  function to get y labels
truth_labels = pd.read_csv("syn1/syn1_truth.bed", sep = "\t", names = ['Chromo', 'start', 'end'])
print(list(set(truth_labels.start == truth_labels.end) )) # the start and end position are the same 
truth_labels = truth_labels[['Chromo', 'start']]
truth_labels['truth'] = 1
sub_truth= truth_labels.rename(columns = {'Chromo':'CHROM', 'start':'POS'})

[True]


In [263]:
# combine dataset 
combined = merged_df.merge(sub_truth, on=['CHROM','POS'], how = 'left' )
combined['truth'].fillna(0, inplace = True)

In [264]:
X = combined[combined.columns[~combined.columns.isin(['truth','POS','CHROM'])]]

y = combined['truth'] 

In [265]:
# ordinal encoding for REF and ALT
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
enc.fit(X)
new_X = enc.transform(X)

In [266]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(new_X, y, test_size=0.2)

In [267]:
model = XGBClassifier(eval_metric='rmse')

In [268]:
model.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='rmse', feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

In [269]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='mlogloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)



XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False,
              eval_metric='mlogloss', feature_types=None, gamma=0, gpu_id=-1,
              grow_policy=None, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_bin=None, max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=0, max_depth=6, max_leaves=None,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=16, num_parallel_tree=1,
              objective='multi:softprob', predictor=None, ...)

In [270]:
y_pred = model.predict(X_test)

In [271]:
# evaluate model performance # for syn1 dataset 
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[926684     45]
 [    45    664]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    926729
         1.0       0.94      0.94      0.94       709

    accuracy                           1.00    927438
   macro avg       0.97      0.97      0.97    927438
weighted avg       1.00      1.00      1.00    927438



In [246]:
# evaluate model performance # for real1 dataset 
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[930908     27]
 [    42    227]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    930935
         1.0       0.89      0.84      0.87       269

    accuracy                           1.00    931204
   macro avg       0.95      0.92      0.93    931204
weighted avg       1.00      1.00      1.00    931204

