In [19]:
import numpy as np
import pandas as pd
from numba import jit
from itertools import compress
import os

In [20]:
"""
run extract_uq_id twice, for each file, then run subtraction function both ways
extract_uq_id is slow when put in subtraction function (~3 min for 157000 rows)
this is because it runs in1d on strings not ints
could be changed in future (ref alt to ints or subtracted after start end subtraction)
for reference start end subtraction took .035 seconds for same length of arrays
"""

'\nrun extract_uq_id twice, for each file, then run subtraction function both ways\nextract_uq_id is slow when put in subtraction function (~3 min for 157000 rows)\nthis is because it runs in1d on strings not ints\ncould be changed in future (ref alt to ints or subtracted after start end subtraction)\nfor reference start end subtraction took .035 seconds for same length of arrays\n'

In [21]:
# ONLY DOING SUBTRACTION HERE
# takes file - parent or variant - extracts only first 5 columns as list of unqiue ids i.e 'chr123471CA'
# if header is true, skip first row
# not using chromosome column because it is assumed the same respective per parent-variant pair
def extract_uq_id(filename, header):
    if header:
        df = pd.read_csv(filename, sep='\t', skiprows=1, engine='c', usecols=[1,2,3,4], names=['start', 'end', 'ref', 'alt'], dtype={'start':'string','end':'string','ref':'string','alt':'string'})
    else:
        df = pd.read_csv(filename, sep='\t', engine='c', usecols=[1,2,3,4], names=['start', 'end', 'ref', 'alt'], dtype={'start':'string','end':'string','ref':'string','alt':'string'})
    uq_id = pd.Series(df.start + df.end + df.ref + df.alt).to_numpy()
    return uq_id

In [22]:
# testing in1d function for subtraction
a = np.array([1,2,3,4,5,6])
b = np.array([2,3])
print(np.in1d(a, b))

c = np.array(['a','b','c'])
d = np.array(['a'])
print(np.in1d(c, d))

[False  True  True False False False]
[ True False False]


In [23]:
# runs only one starts ends to run subtraction faster (only ints)
# returns start int followed by end int - start int as int type
# i.e. start 234314, end 234316 --> 2343142
# must shorten int since numba njit requires 10 digit max,  c long

# read two columns separately squeeze then subtract
def extract_uq_id_start_end(filename, header):
    if header:
        start = pd.read_csv(filename, sep='\t', skiprows=1, engine='c', usecols=[1], names=['start'], squeeze=True, dtype={'start':'int'}).to_numpy()
        end = pd.read_csv(filename, sep='\t', skiprows=1, engine='c', usecols=[2], names=['end'], squeeze=True, dtype={'end':'int'}).to_numpy()
    else:
        start = pd.read_csv(filename, sep='\t', engine='c', usecols=[1], names=['start'], squeeze=True, dtype={'start':'int'}).to_numpy()
        end = pd.read_csv(filename, sep='\t', engine='c', usecols=[2], names=['end'], squeeze=True, dtype={'end':'int'}).to_numpy()
    
    # string of end - starts appended to starts
    subtraction = np.subtract(end, start)
    
    df = pd.DataFrame({'start':start, 'subtraction':subtraction})
    uq_id = pd.Series(df.start.astype(str) + df.subtraction.astype(str)).astype(np.int64).to_numpy()
    
    return uq_id

In [24]:
def extract_uq_id_ref_alt(filename, header):
    if header:
        ref_alt = pd.read_csv(filename, sep='\t', skiprows=1, engine='c', usecols=[3,4], names=['ref', 'alt'], dtype={'ref':'string','alt':'string'})
    else:
        ref_alt = pd.read_csv(filename, sep='\t', engine='c', usecols=[3,4], names=['ref', 'alt'], dtype={'ref':'string','alt':'string'})
    
    ref_alt['ref'] = ref_alt['ref'].apply(lambda x: sum([ord(i) for i in x]))
    ref_alt['alt'] = ref_alt['alt'].apply(lambda x: sum([ord(i) for i in x]))
    
    uq_id = pd.Series(ref_alt.ref.astype(str) + ref_alt.alt.astype(str)).astype(int).to_numpy()
    return uq_id

In [25]:
unique_ids2 = extract_uq_id_start_end('test_anno.txt', True)
print(unique_ids2)

[    654330     659990     695690 ... 2489181120 2489181290 2489182500]


In [26]:
unique_ids3 = extract_uq_id_ref_alt('test_anno.txt', True)
print(unique_ids3)

[7184 7184 8465 ... 6765 6765 6765]


In [27]:
# testing extract_uq_id function
unique_ids = extract_uq_id('test_anno.txt', True)
print(len(unique_ids))
unique_ids

157598


array(['6543365433GT', '6599965999GT', '6956969569TA', ...,
       '248918112248918112CA', '248918129248918129CA',
       '248918250248918250CA'], dtype=object)

In [28]:
# two arrays - first second - return second subtracted from first
# i.e. first [1,2,3,4,5] second [1,2,3], subtraction [4,5]
@jit
def subtraction(first, second):
    return np.in1d(first, second)
    # return np.setdiff1d(first.astype(int, copy=False), second.astype(int, copy=False))

In [29]:
arr_x = unique_ids3
arr_y = unique_ids3[len(unique_ids3)//2:]
sub_uq_ids3 = subtraction(arr_x, arr_y)

Compilation is falling back to object mode WITH looplifting enabled because Function "subtraction" failed type inference due to: [1m[1mUse of unsupported NumPy function 'numpy.in1d' or unsupported use of the function.
[1m
File "<ipython-input-28-0e840c8605ed>", line 5:[0m
[1mdef subtraction(first, second):
[1m    return np.in1d(first, second)
[0m    [1m^[0m[0m
[0m
[0m[1mDuring: typing of get attribute at <ipython-input-28-0e840c8605ed> (5)[0m
[1m
File "<ipython-input-28-0e840c8605ed>", line 5:[0m
[1mdef subtraction(first, second):
[1m    return np.in1d(first, second)
[0m    [1m^[0m[0m
[0m
  @jit
[1m
File "<ipython-input-28-0e840c8605ed>", line 4:[0m
[1m@jit
[1mdef subtraction(first, second):
[0m[1m^[0m[0m
[0m
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-

In [30]:
# should return ['d', 'e']
arr_a = ['a', 'b', 'c', 'd', 'e']
arr_b = ['a', 'b', 'c']
subtraction(arr_a, arr_b)

Compilation is falling back to object mode WITH looplifting enabled because Function "subtraction" failed type inference due to: [1m[1mUse of unsupported NumPy function 'numpy.in1d' or unsupported use of the function.
[1m
File "<ipython-input-28-0e840c8605ed>", line 5:[0m
[1mdef subtraction(first, second):
[1m    return np.in1d(first, second)
[0m    [1m^[0m[0m
[0m
[0m[1mDuring: typing of get attribute at <ipython-input-28-0e840c8605ed> (5)[0m
[1m
File "<ipython-input-28-0e840c8605ed>", line 5:[0m
[1mdef subtraction(first, second):
[1m    return np.in1d(first, second)
[0m    [1m^[0m[0m
[0m
  @jit
[1m
File "<ipython-input-28-0e840c8605ed>", line 4:[0m
[1m@jit
[1mdef subtraction(first, second):
[0m[1m^[0m[0m
[0m
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-

array([ True,  True,  True, False, False])

In [31]:
arr_c = unique_ids
arr_d = unique_ids[:len(unique_ids)//2]
sub_uq_ids = subtraction(arr_c, arr_d)

Compilation is falling back to object mode WITH looplifting enabled because Function "subtraction" failed type inference due to: [1m[1mnon-precise type array(pyobject, 1d, C)[0m
[0m[1mDuring: typing of argument at <ipython-input-28-0e840c8605ed> (5)[0m
[1m
File "<ipython-input-28-0e840c8605ed>", line 5:[0m
[1mdef subtraction(first, second):
[1m    return np.in1d(first, second)
[0m    [1m^[0m[0m
[0m
  @jit
[1m
File "<ipython-input-28-0e840c8605ed>", line 4:[0m
[1m@jit
[1mdef subtraction(first, second):
[0m[1m^[0m[0m
[0m
Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.pydata.org/numba-doc/latest/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit
[1m
File "<ipython-input-28-0e840c8605ed>", line 4:[0m
[1m@jit
[1mdef subtraction(first, second):
[0m[1m^[0m[0m
[0m


KeyboardInterrupt: 

In [None]:
arr_e = unique_ids2
arr_f = unique_ids2[:len(unique_ids2)//2]
sub_uq_ids2 = subtraction(arr_e, arr_f)

In [None]:
print(sub_uq_ids2)
print(len(sub_uq_ids2))
print(len(sub_uq_ids2)//2)
print(np.count_nonzero(sub_uq_ids2))

In [None]:
print(list(sub_uq_ids))
print(len(sub_uq_ids))
print(len(sub_uq_ids)//2)
print(np.count_nonzero(sub_uq_ids))