In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
# Columns to load from each per-chromosome CSV (passed to pandas.read_csv as `usecols`).
usecols = [
    "Chr",
    "Start",
    "End",
    "Ref",
    "Alt",
    "Func.refGene",
    "Otherinfo10",
    "Otherinfo11",
]

# Per-column dtypes (passed to pandas.read_csv as `dtype`).
dtype = {
    "Chr": "category",
    "Start": np.int32,
    "End": np.int32,
    "Ref": "category",
    "Alt": "category",
    "Func.refGene": "category",
    # Compared against 'PASS' by the pipeline in main().
    "Otherinfo10": "category",
    # Kept as text: num_filter() parses the DP= and AF= fields out of it.
    "Otherinfo11": "string",
}

In [5]:
def category_filter(df, **kwargs):
    """Keep only the rows whose columns equal the requested values.

    Each keyword names a column and the value that column must equal; the
    conditions are applied one after another, so a row has to satisfy all of
    them. Because ``Func.refGene`` is not a valid Python keyword, it is
    requested with the keyword ``FuncrefGene``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter. It is not modified.
    **kwargs
        ``column=value`` pairs to match exactly.

    Returns
    -------
    pandas.DataFrame
        The matching rows (``df`` itself when no keywords are given).
    """
    keyword_to_column = {"FuncrefGene": "Func.refGene"}
    for keyword, wanted in kwargs.items():
        column = keyword_to_column.get(keyword, keyword)
        df = df[df[column] == wanted]
    return df

In [7]:
def num_filter(df, **kwargs):
    """Add numeric ``DP`` and ``AF`` columns and keep rows above the given thresholds.

    ``DP`` and ``AF`` are parsed from the semicolon-separated ``key=value``
    text in ``df["Otherinfo11"]``. Each field is located by its tag
    (``DP=`` / ``AF=``) instead of by its position in the string, so the
    order of the fields does not matter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Otherinfo11`` text column. It is not modified.
    **kwargs
        ``column=threshold`` pairs; a row is kept only if ``column > threshold``
        holds for every pair (strictly greater). Typically ``DP=...``/``AF=...``.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``DP`` (int32) and ``AF`` (float) columns added,
        restricted to the rows that pass every threshold.

    Raises
    ------
    ValueError or TypeError
        If a row has no parsable ``DP``/``AF`` value (raised by ``astype``).
    KeyError
        If a keyword names a column that does not exist.
    """
    info = df["Otherinfo11"]
    # (?:^|;) anchors the tag to the start of a field so e.g. "XDP=" is not
    # mistaken for "DP="; the capture stops at the next ';'.
    depth = info.str.extract(r"(?:^|;)DP=([^;]+)", expand=False).astype("int32")
    allele_freq = info.str.extract(r"(?:^|;)AF=([^;]+)", expand=False).astype("float")
    annotated = df.assign(DP=depth, AF=allele_freq)

    # Combine all thresholds into one positional boolean mask. This avoids
    # in-place edits on temporary Series and does not drop rows merely because
    # an unrelated column contains NaN.
    keep = np.ones(len(annotated), dtype=bool)
    for key, value in kwargs.items():
        keep &= (annotated[key] > value).to_numpy(dtype=bool, na_value=False)
    return annotated[keep]

In [3]:
def _variant_keys(frame, key_cols=None):
    """Build one string key per row of ``frame`` from its identifying columns."""
    cols = frame.columns[1:5] if key_cols is None else list(key_cols)
    as_text = frame[cols].astype(str)
    # Join with a separator so adjacent fields cannot run together
    # (without it Start=1/End=123 and Start=11/End=23 give the same key).
    return pd.Series(
        ["|".join(row) for row in as_text.itertuples(index=False, name=None)],
        index=frame.index,
        dtype=object,
    )


def subtract(df1, df2, export=False, key_cols=None):
    """Split two frames into the rows that are unique to each (minuend - subtrahend).

    Rows are identified by a key built from ``key_cols``. A row is kept only
    if its key occurs exactly once across ``df1`` and ``df2`` combined, so
    rows shared by both frames are removed, and so are rows duplicated
    within a single frame.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Minuend.
    df2 : pandas.DataFrame
        Subtrahend.
    export : bool, default False
        When true, write the two results to ``sub_df1.csv`` and
        ``sub_df2.csv`` in the working directory.
    key_cols : sequence of column labels, optional
        Columns that identify a row. Defaults to the columns at positions
        1-4 of each frame.
        NOTE(review): the default skips column 0; if that column is the
        chromosome and the frames span several chromosomes, equal positions
        on different chromosomes are treated as the same row. Pass
        ``key_cols`` explicitly in that case; confirm intended default.

    Returns
    -------
    list of pandas.DataFrame
        ``[rows only in df1, rows only in df2]``; the inputs are not modified.
    """
    keys1 = _variant_keys(df1, key_cols)
    keys2 = _variant_keys(df2, key_cols)

    # Keys seen exactly once overall belong to exactly one row of one frame.
    counts = pd.concat([keys1, keys2], ignore_index=True).value_counts()
    singletons = counts[counts == 1].index

    # Select positionally (plain boolean arrays) so repeated index labels
    # cannot cause extra rows to be removed.
    df1 = df1[keys1.isin(singletons).to_numpy()]
    df2 = df2[keys2.isin(singletons).to_numpy()]

    if export:
        df1.to_csv('sub_df1.csv', index=False, header=True)
        df2.to_csv('sub_df2.csv', index=False, header=True)
    return [df1, df2]

In [8]:
def init_df(chromomosomes_arg, sample_id, dtype_arg, usecols_arg):
    """Load one sample's per-chromosome CSV files into a single DataFrame.

    Reads ``chr{chrom}_{sample_id}.csv`` from the working directory for every
    entry of ``chromomosomes_arg`` and concatenates the results in that order
    with a fresh 0..n-1 index.

    Parameters
    ----------
    chromomosomes_arg : iterable
        Chromosome labels used in the file names (parameter name kept as-is,
        misspelling included, so existing callers keep working).
    sample_id : str
        Sample identifier used in the file names.
    dtype_arg : dict
        Column dtypes forwarded to ``pandas.read_csv(dtype=...)``.
    usecols_arg : list
        Columns forwarded to ``pandas.read_csv(usecols=...)``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    FileNotFoundError
        If one of the expected CSV files is missing.
    ValueError
        If ``chromomosomes_arg`` is empty (nothing to concatenate).
    """
    paths = ["chr{}_{}.csv".format(chrom, sample_id) for chrom in chromomosomes_arg]
    # map() cannot forward keyword arguments, so call read_csv explicitly per file.
    frames = [pd.read_csv(path, dtype=dtype_arg, usecols=usecols_arg) for path in paths]
    return pd.concat(frames, ignore_index=True)

In [None]:
def main():
    """Run both subtraction directions for every parent/variant sample pair.

    For each parent sample and each of its variant samples, writes two CSVs
    to the working directory:

    * ``{parent}-{variant}.csv`` -- filtered parent rows not found in the
      unfiltered variant (p - v).
    * ``{variant}-{parent}.csv`` -- filtered variant rows not found in the
      unfiltered parent (v - p).

    "Filtered" means ``Otherinfo10 == 'PASS'``, ``DP > 15`` and ``AF > 0.15``.
    Uses the module-level ``dtype`` and ``usecols`` settings when loading.
    """
    ids = {'22241': ['22334', '22243']}
    chromosomes = ['1', '2']

    def passing(frame):
        # The filters return new frames, so their results must be captured.
        return num_filter(category_filter(frame, Otherinfo10='PASS'), DP=15, AF=.15)

    for key, value in ids.items():
        # Load and filter the parent once; it is reused for every variant.
        parent_df = init_df(chromosomes, key, dtype, usecols)
        parent_filtered = passing(parent_df)
        for val in value:
            variant_df = init_df(chromosomes, val, dtype, usecols)
            variant_filtered = passing(variant_df)

            # subtract() returns [minuend-only rows, subtrahend-only rows];
            # element 0 is the "minuend - subtrahend" result.
            # p - v, p filtered v unfiltered
            subtract(parent_filtered, variant_df)[0].to_csv("{}-{}.csv".format(key, val))
            # v - p, v filtered p unfiltered (distinct file name so it does not
            # overwrite the p - v result)
            subtract(variant_filtered, parent_df)[0].to_csv("{}-{}.csv".format(val, key))
    return None

In [None]:
"""
mass subtraction and filtration to speed up

TEMPLATES:
minuend - subtrahend

* all chromosomes and sample ids
1)
minuend: DP>15, AF>0.15, Otherinfo10: PASS
subtrahend: none
sometimes also filter the minuend on FuncrefGene: exonic, exonic splicing

2) 
minuend, subtrahend: SAME AS 1)
remove condition for FuncrefGene
"""

In [None]:
# Run the full pipeline; main() reads the per-chromosome CSVs and writes its
# result CSVs using relative file names (i.e. in the working directory).
main()