In [3]:
import pandas as pd
import numpy as np
import os

In [237]:
usecols = ["Chr", "Start", "End", "Ref", "Alt", "Func.refGene", "Otherinfo10", "Otherinfo11"]
dtype = {"Chr":"category",
          "Start":np.int32,
          "End":np.int32,
          "Ref":"category",
          "Alt":"category",
          "Func.refGene":"category",
          "Otherinfo10":"category",
          "Otherinfo11":"string"}

In [238]:
df = pd.read_csv("anno1.fixed.chrY.genes.VE_22251.hg38_multianno.csv", usecols=usecols, dtype=dtype)
df = pd.eval("DP = df.Otherinfo11.str.split(';',3).str[2].str.split('DP=').str[1].astype('int32')", target=df)
df = pd.eval("AF = df.Otherinfo11.str.split(';',5).str[4].str.split('AF=').str[1].astype('float')", target=df)

In [60]:
# this works on Chr, Ref, Alt, Func.refGene, Otherinfo10
def category_filters(**kwargs):
    new_df = df
    for key, value in kwargs.items():
        if key == "FuncrefGene":
            new_df = new_df[new_df["Func.refGene"] == value]
        else:
            new_df = new_df[new_df[key] == value]
    return new_df

In [7]:
# for Func.refGene, enter 'FuncrefGene' instead 
display(category_filters(FuncrefGene="exonic", Otherinfo10="PASS"))

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Otherinfo10,Otherinfo11,DP,AF
0,chrY,2787119,2787119,C,A,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=415;VD=7;AF=0.0169...,415,0.0169
1,chrY,2787445,2787445,T,C,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=317;VD=4;AF=0.0126...,317,0.0126
15,chrY,2865200,2865200,C,A,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=173;VD=2;AF=0.0116...,173,0.0116
16,chrY,2866887,2866887,G,T,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=232;VD=4;AF=0.0172...,232,0.0172
28,chrY,2961350,2961350,G,A,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=152;VD=2;AF=0.0132...,152,0.0132
33,chrY,2978846,2978846,T,C,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=182;VD=2;AF=0.011;...,182,0.011
34,chrY,2979004,2979004,A,T,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=148;VD=4;AF=0.027;...,148,0.027
37,chrY,2979454,2979454,C,A,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=232;VD=10;AF=0.043...,232,0.0431
43,chrY,5100327,5100327,G,T,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=88;VD=88;AF=1;BIAS...,88,1.0
44,chrY,5100614,5100614,T,G,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=189;VD=189;AF=1;BI...,189,1.0


In [8]:
# this assumes greater than i.e 'DP = 15' means keep DP vals > 15
def num_filter(**kwargs):
    new_df = df
    for key, value in kwargs.items():
        new_df.loc[:,key].where(new_df.loc[:,key] > value,inplace=True)
    return new_df.dropna()

In [9]:
display(num_filter(DP=150,AF=.15))

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Otherinfo10,Otherinfo11,DP,AF
44,chrY,5100614,5100614,T,G,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=189;VD=189;AF=1;BI...,189.0,1.0


In [415]:
def subtraction(df1, df2):
    # joins cols 1 - start -> 4 - alt, concat df2 col at btm of df1 col
    uqid_df1 = df1[df1.columns[1:4]].apply(lambda x: ''.join(x.astype(str)),axis=1).squeeze()
    uqid_df2 = df2[df2.columns[1:4]].apply(lambda x: ''.join(x.astype(str)),axis=1).squeeze()  
    uqid = pd.concat([uqid_df1, uqid_df2], ignore_index=True)
    
    uqid_df = pd.DataFrame({'idx':np.arange(0, len(uqid.index))}, index=uqid)
    uqid_df.reset_index(inplace=True)
    uqid_df = uqid_df.sort_index().groupby('index').filter(lambda x: len(x) == 1)
    
    idx_arr = uqid_df['idx'].to_numpy()
    sep_dfs = np.searchsorted(idx_arr, len(uqid_df1.index), side='right')

    # [1,3,5] -> [2,4], gets index of values that should be dropped
    invert_arr = lambda arr: np.array(sorted(set(range(arr[0], arr[-1] + 1)).difference(arr)))
    
    df1_drop_idxs = invert_arr(idx_arr[:sep_dfs])
    df2_drop_idxs = invert_arr(idx_arr[sep_dfs:]) - len(uqid_df1.index)
    df1.drop(df1.index[[df1_drop_idxs]])
    df2.drop(df2.index[[df2_drop_idxs]])
    
    return df2

In [416]:
df2_ = pd.read_csv("chrY_22243.csv", usecols=["Chr", "Start", "End", "Ref", "Alt"], sep='\t')
df2_ = df2_.append(df.iloc[-1])
sub_df1 = subtraction(df, df2_)
display(sub_df1[:60])

Unnamed: 0,Chr,Start,End,Ref,Alt,AF,DP,Func.refGene,Otherinfo10,Otherinfo11
0,chrY,2786930,2786930,A,G,,,,,
1,chrY,2787647,2787647,G,T,,,,,
2,chrY,2841923,2841923,C,G,,,,,
3,chrY,2841942,2841942,G,A,,,,,
4,chrY,2841962,2841962,A,T,,,,,
5,chrY,2842106,2842106,G,T,,,,,
6,chrY,2844103,2844103,C,A,,,,,
7,chrY,2845886,2845886,G,A,,,,,
8,chrY,2854465,2854465,A,G,,,,,
9,chrY,2854484,2854484,C,A,,,,,


In [210]:
#TODO: make option to see what is NA and what is not NA
#TODO: make program and functions run on a class
# possible error of pd.to_numeric int out of range may come from long ref/alt strings