In [3]:
import pandas as pd
import numpy as np
import os

In [4]:
usecol_ = ["Chr", "Start", "End", "Ref", "Alt", "Func.refGene", "Otherinfo10", "Otherinfo11"]
dtype_ = {"Chr":"category",
          "Start":np.int32,
          "End":np.int32,
          "Ref":"category",
          "Alt":"category",
          "Func.refGene":"category",
          "Otherinfo10":"category",
          "Otherinfo11":"string"}

In [5]:
df = pd.read_csv("anno1.fixed.chrY.genes.VE_22251.hg38_multianno.csv", usecols=usecol_, dtype=dtype_)

# add two columns DP and AF int and float
# split otherinfo11 into 4 columns (,3) and take the 3rd column (str[2]) which has DP=415
# run .str.split again to take DP=415 to 415 only
df = pd.eval("DP = df.Otherinfo11.str.split(';',3).str[2].str.split('DP=').str[1].astype('int32')", target=df)
df = pd.eval("AF = df.Otherinfo11.str.split(';',5).str[4].str.split('AF=').str[1].astype('float')", target=df)

In [6]:
# category filters works on Chr, Ref, Alt, Func.refGene, Otherinfo10
def category_filters(**kwargs):
    new_df = df
    for key, value in kwargs.items():
        if key == "FuncrefGene":
            new_df = new_df[new_df["Func.refGene"] == value]
        else:
            new_df = new_df[new_df[key] == value]
    return new_df

In [7]:
# if you want to use Func.refGene, enter 'FuncrefGene' instead 
display(category_filters(FuncrefGene="exonic", Otherinfo10="PASS"))

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Otherinfo10,Otherinfo11,DP,AF
0,chrY,2787119,2787119,C,A,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=415;VD=7;AF=0.0169...,415,0.0169
1,chrY,2787445,2787445,T,C,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=317;VD=4;AF=0.0126...,317,0.0126
15,chrY,2865200,2865200,C,A,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=173;VD=2;AF=0.0116...,173,0.0116
16,chrY,2866887,2866887,G,T,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=232;VD=4;AF=0.0172...,232,0.0172
28,chrY,2961350,2961350,G,A,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=152;VD=2;AF=0.0132...,152,0.0132
33,chrY,2978846,2978846,T,C,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=182;VD=2;AF=0.011;...,182,0.011
34,chrY,2979004,2979004,A,T,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=148;VD=4;AF=0.027;...,148,0.027
37,chrY,2979454,2979454,C,A,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=232;VD=10;AF=0.043...,232,0.0431
43,chrY,5100327,5100327,G,T,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=88;VD=88;AF=1;BIAS...,88,1.0
44,chrY,5100614,5100614,T,G,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=189;VD=189;AF=1;BI...,189,1.0


In [8]:
# filter keeps keys (DP or AF) greater than values (ie. 'DP = 15'; keeps DP vals > 15)
def num_filter(**kwargs):
    new_df = df
    for key, value in kwargs.items():
        # new_df.loc[:,key].loc[lambda x: x > value]
        # alternate, slightly less efficient vectorized method line below
        new_df.loc[:,key].where(new_df.loc[:,key] > value,inplace=True)
    return new_df.dropna()

In [9]:
display(num_filter(DP=150,AF=.15))

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Otherinfo10,Otherinfo11,DP,AF
44,chrY,5100614,5100614,T,G,exonic,PASS,SAMPLE=VE_22251;TYPE=SNV;DP=189;VD=189;AF=1;BI...,189.0,1.0


In [30]:
# category of refs: ["A","G","T"] -> [145, 166, 175]
def alpha_to_num(categories):
    ord_cats = []
    for category in categories:
        ord_total = ""
        for letter in category:
            ord_total += str(ord(letter))
        ord_cats.append(int(ord_total))
    return ord_cats

In [51]:
# subtraction algorithm
# first find all the same starts index in df1 and df2
def sub_start(df1, df2):
    df1 = df1[["Start", "End", "Ref", "Alt"]]
    df2 = df2[["Start", "End", "Ref", "Alt"]]
    se1 = pd.to_numeric(df1["Start"].astype("string") + df1["End"].astype("string"))
    se2 = pd.to_numeric(df2["Start"].astype("string") + df2["End"].astype("string"))
    # compare = np.isin(ndf1.Start.to_numpy(), ndf2.Start.to_numpy(), invert=True)
    ref_cats = np.array(pd.Categorical(df1["Ref"]).categories)
    alt_cats = np.array(pd.Categorical(df1["Alt"]).categories)
    # ref_num_cats = alpha_to_num()
    # alt__num_cats = alpha_to_num()
    np_ref = np.array(df1["Ref"])
    np_alt = np.array(df["Alt"])
    for cat in ref_cats:
        # category "A" -> 145
        cat_num = int(''.join([str(ord(x)) for x in cat]))
        np_ref = np.where(np_ref==cat, cat_num, np_ref)

    return np_ref

In [52]:
# if sub algorithm works, return df should have length = len(df)//2 - 3
print(sub_start(df[len(df)//2 + 3:], df[:len(df)//2]))
# output is df1 w indices of where it is equal

[67 67 84 67 84 67 65 67 67 65 84 67 65 67 67 71 84 67 71 71 67 84 71 71
 67 65 71 67 71 71 71 65 71 67 71 71 67 67 84 65 45 65 71 71 67 67 71 65
 71 71 67 65 45 67 65 71 67 71 71 65 71 71 65 45 65 67 71 71 65 71 65 71
 67 67 65 45 67 65 67 71 71 84 71 65 71 71 65 45 65 84 71 71 67 67 84 67
 84846784 84 84 67 67 71 71 84 84 67 67 67 65 71 71 65]


In [53]:
print(np.array(pd.Categorical(df["Ref"]).categories))
# print(np.array(ref.to_numpy()).decode('ascii'))
# np.array(['a','b','c']).tostring().encode("ascii")
# print([''.join([ord(i) for i in x]) for x in list(ref)])

['-' 'A' 'AA' 'C' 'G' 'GCA' 'GGGC' 'T' 'TG' 'TT' 'TTCT']


In [277]:
# Extract values and keys
x = np.array([[1,2],
     [3,4],
     [5,6]])
y = np.array([1,2,3,4,5,6])
d = {1:10,2:20,3:30,4:40,5:50,6:60}
dv = np.array(list(d.values()))
dk = np.array(list(d.keys()))

# Get positions of keys in first column of x and thus change the first column
_,C = np.where(x[:,0][:,None] == dk)
x[:,0] = dv[C]

_,C = np.where(x[:,1][:,None] == dk)
x[:,1] = dv[C]

_,C = np.where(y[:][:,None] == dk)
y[:] = dv[C]

print(y)

[10 20 30 40 50 60]


In [None]:
#TODO: make option to see what is NA and what is not NA
#TODO: make program and functions run on a class