In [1]:
import numpy as np
import pandas as pd
from numba import jit
from itertools import compress
import os

In [2]:
"""
run extract_uq_id twice, for each file, then run subtraction function both ways
extract_uq_id is slow when put in subtraction function (~3 min for 157000 rows)
this is because it runs in1d on strings not ints
could be changed in future (ref alt to ints or subtracted after start end subtraction)
for reference start end subtraction took .035 seconds for same length of arrays
"""

'\nrun extract_uq_id twice, for each file, then run subtraction function both ways\nextract_uq_id is slow when put in subtraction function (~3 min for 157000 rows)\nthis is because it runs in1d on strings not ints\ncould be changed in future (ref alt to ints or subtracted after start end subtraction)\nfor reference start end subtraction took .035 seconds for same length of arrays\n'

In [46]:
# ONLY DOING SUBTRACTION HERE
# takes file - parent or variant - extracts only first 5 columns as list of unqiue ids i.e 'chr123471CA'
# if header is true, skip first row
# not using chromosome column because it is assumed the same respective per parent-variant pair
def extract_uq_id(filename, header):
    if header:
        df = pd.read_csv(filename, sep='\t', skiprows=1, engine='c', usecols=[1,2,3,4], names=['start', 'end', 'ref', 'alt'], dtype={'start':'string','end':'string','ref':'string','alt':'string'})
    else:
        df = pd.read_csv(filename, sep='\t', engine='c', usecols=[1,2,3,4], names=['start', 'end', 'ref', 'alt'], dtype={'start':'string','end':'string','ref':'string','alt':'string'})
    uq_id = pd.Series(df.start + df.end + df.ref + df.alt).to_numpy()
    return uq_id

In [35]:
# testing in1d function for subtraction
a = np.array([1,2,3,4,5,6])
b = np.array([2,3])
print(np.in1d(a, b))

c = np.array(['a','b','c'])
d = np.array(['a'])
print(np.in1d(c, d))

[False  True  True False False False]
[ True False False]


In [5]:
# runs only one starts ends to run subtraction faster (only ints)
# returns start int followed by end int - start int as int type
# i.e. start 234314, end 234316 --> 2343142
# must shorten int since numba njit requires 10 digit max,  c long

# read two columns separately squeeze then subtract
def extract_uq_id_start_end(filename, header):
    if header:
        start = pd.read_csv(filename, sep='\t', skiprows=1, engine='c', usecols=[1], names=['start'], squeeze=True, dtype={'start':'int'}).to_numpy()
        end = pd.read_csv(filename, sep='\t', skiprows=1, engine='c', usecols=[2], names=['end'], squeeze=True, dtype={'end':'int'}).to_numpy()
    else:
        start = pd.read_csv(filename, sep='\t', engine='c', usecols=[1], names=['start'], squeeze=True, dtype={'start':'int'}).to_numpy()
        end = pd.read_csv(filename, sep='\t', engine='c', usecols=[2], names=['end'], squeeze=True, dtype={'end':'int'}).to_numpy()
    
    # string of end - starts appended to starts
    subtraction = np.subtract(end, start)
    del end
    
    df = pd.DataFrame({'start':start, 'subtraction':subtraction})
    uq_id = pd.Series(df.start + df.subtraction).to_numpy()
    
    return uq_id # list(map(''.join, zip(starts_str, subtraction)))

In [6]:
unique_ids2 = extract_uq_id_start_end('test_anno.txt', True)
print(unique_ids2)

[    65433     65999     69569 ... 248918112 248918129 248918250]


In [47]:
# testing extract_uq_id function
unique_ids = extract_uq_id('test_anno.txt', True)
print(len(unique_ids))
unique_ids

            start        end ref alt
0           65433      65433   G   T
1           65999      65999   G   T
2           69569      69569   T   A
3           69907      69907   C   T
4           70250      70250   A   G
...           ...        ...  ..  ..
157593  248917519  248917519   C   A
157594  248917675  248917675   G   T
157595  248918112  248918112   C   A
157596  248918129  248918129   C   A
157597  248918250  248918250   C   A

[157598 rows x 4 columns]
157598


array(['6543365433GT', '6599965999GT', '6956969569TA', ...,
       '248918112248918112CA', '248918129248918129CA',
       '248918250248918250CA'], dtype=object)

In [27]:
# two arrays - first second - return second subtracted from first
# i.e. first [1,2,3,4,5] second [1,2,3], subtraction [4,5]
@jit
def subtraction(first, second):
    return np.in1d(first, second)
    # return np.setdiff1d(first.astype(int, copy=False), second.astype(int, copy=False))

In [64]:
# should return ['d', 'e']
arr_a = ['a', 'b', 'c', 'd', 'e']
arr_b = ['a', 'b', 'c']
subtraction(arr_a, arr_b)

array(['d', 'e'], dtype='<U1')

In [41]:
arr_c = unique_ids
arr_d = unique_ids[:len(unique_ids)//2]
sub_uq_ids = subtraction(arr_c, arr_d)

Compilation is falling back to object mode WITH looplifting enabled because Function "subtraction" failed type inference due to: [1m[1mnon-precise type array(pyobject, 1d, C)[0m
[0m[1mDuring: typing of argument at <ipython-input-27-0e840c8605ed> (5)[0m
[1m
File "<ipython-input-27-0e840c8605ed>", line 5:[0m
[1mdef subtraction(first, second):
[1m    return np.in1d(first, second)
[0m    [1m^[0m[0m
[0m
  @jit
[1m
File "<ipython-input-27-0e840c8605ed>", line 4:[0m
[1m@jit
[1mdef subtraction(first, second):
[0m[1m^[0m[0m
[0m
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
[1m
File "<ipython-input-27-0e840c8605ed>", line 4:[0m
[1m@jit
[1mdef subtraction(first, second):
[0m[1m^[0m[0m
[0m


In [39]:
arr_e = unique_ids2
arr_f = unique_ids2[:len(unique_ids2)//2]
sub_uq_ids2 = subtraction(arr_e, arr_f)

In [43]:
print(sub_uq_ids2)
print(len(sub_uq_ids2))
print(len(sub_uq_ids2)//2)
print(np.count_nonzero(sub_uq_ids2))

[ True  True  True ... False False False]
157598
78799
78799


In [57]:
print(list(sub_uq_ids))
print(len(sub_uq_ids))
print(len(sub_uq_ids)//2)
print(np.count_nonzero(sub_uq_ids))

[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, Tru