In [1]:
import allel
import numpy as np
import pandas as pd
from functools import reduce

In [2]:
#place the python file in the same folder containing all the real1, real2_part1, ... folders

def parse_to_df(folder='test', features='*', algos=['freebayes', 'mutect2', 'vardict', 'varscan']):
    '''
    reads all vcf.gz files corresponding to algos in the specified folder with the specified list of features
    and combines the read files into one dataframe with (CHROM, POS, REF) as index.
    '''
    if folder == 'test':
        dfs = [allel.vcf_to_dataframe(f'{folder}/{i}.vcf.gz', fields = features) for i in algos]
    else:
        dfs = [allel.vcf_to_dataframe(f'{folder}/{folder}-{i}.vcf.gz', fields = features) for i in algos]
    algo_dicts = dict(zip(algos, dfs))
    
    #some manipulations
    for i in algo_dicts:
        algo_dicts[i].set_index(keys=['CHROM', 'POS', 'REF'], inplace = True) #will be use as keys for later merging
        algo_dicts[i] = algo_dicts[i][algo_dicts[i]['is_snp']]    #obtain only SNPs
        algo_dicts[i].columns = [j + '_' + i for j in algo_dicts[i].columns]

    #combining the dfs
    edited_dfs = [algo_dicts[i] for i in algos]

    merged = reduce(lambda left, right: pd.merge(left, right,
                                            how = 'outer',
                                            left_index=True, right_index=True,
                                            suffixes = ('', '')), edited_dfs)

    merged.columns = sorted(merged.columns)

    return merged


In [3]:
test = parse_to_df(folder='real1')

test

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ABP_1_freebayes,ABP_2_freebayes,ABP_3_freebayes,AB_1_freebayes,AB_2_freebayes,AB_3_freebayes,AC_1_freebayes,AC_2_freebayes,AC_3_freebayes,AF_1_freebayes,...,is_snp_mutect2,is_snp_vardict,is_snp_varscan,numalt_freebayes,numalt_mutect2,numalt_vardict,numalt_varscan,technology.illumina_1_freebayes,technology.illumina_2_freebayes,technology.illumina_3_freebayes
CHROM,POS,REF,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,10231,C,,,,,,,,,,,...,,,,,,,,,,
1,10250,A,,,,,,,,,,,...,False,True,False,False,False,1.0,0.0,0.0,0.0,True
1,10291,C,,,,,,,,,,,...,False,True,False,False,False,1.0,0.0,0.0,0.0,True
1,10583,G,rs58108140,A,,,271.100006,0.431034,,,5.406410,,...,False,True,False,False,False,1.0,0.0,0.0,0.0,True
1,12783,G,rs62635284,A,,,2044.599976,0.710801,,,113.786003,,...,False,True,False,False,False,1.0,0.0,0.0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Y,59033300,C,rs62604356,T,,,1367.199951,0.000000,,,0.000000,,...,,,,,,,,,,
Y,59033357,A,,,,,,,,,,,...,,,,,,,,,,
Y,59033361,G,,,,,,,,,,,...,,,,,,,,,,
Y,59033375,T,,,,,,,,,,,...,,,,,,,,,,
