In [59]:
import numpy as np
import tensorflow as tf
from tensorflow.contrib.memory_stats.python.ops.memory_stats_ops import BytesInUse

import itertools

import pandas as pd

# sorting results
from collections import defaultdict
from operator import itemgetter
import timeit #measure runtime
import random #masking
import operator #remove entire columns from 2d arrays

import multiprocessing #parallel tasks
from functools import partial # pool.map with multiple args



do_parallel_MAF = True
ncores = multiprocessing.cpu_count() #for parallel processing


In [2]:
def process_data(posfile, infile, categorical="False"):
    
    start = timeit.default_timer()
    
    #Header and column names start with hastag, skip those
    #posfile should contain 2 columns separated by tab: 1st = chromosome ID, 2nd = position
    #vcf can be imported as posfile as well, but will take much longer to read and process
    refpos = pd.read_csv(posfile, sep='\t', comment='#',header=None)
    
    #0      22065657
    #1      22065697
    #2      22065904
    #3      22065908
    #4      22065974
    #5      22065977
    
    refpos = pd.Series(refpos[1], index=range(len(refpos[1])))

    #print(refpos[1])
    
    #infile is the input file: genotype data set to be imputed
    df = pd.read_csv(infile, sep='\t', comment='#',header=None)
    
    #0      22065657
    #1      22066211
    #2      22066363
    #3      22066572
    #4      22067004
    #5      22067276
    
    inpos = pd.Series(range(len(df[1])), index=df[1])
    
    #print(inpos[2])
    
    #genetic variants are rows and samples are columns
    #let's transpose so the variants become columns and samples are rows
    df_T = df.transpose()
    
    new_df = 0
    
    if(categorical=="False"):
        new_df = np.zeros((len(df_T)-9,len(refpos),2)) #subjects, variants, Allele counts
    else:
        new_df = np.zeros((len(df_T)-9,len(refpos)))  #subjects, variants
    #print(new_df.shape)
    i = 9 #RR column index
    j = 0 #RR row index
    idx = 0
    print("Processing input data.")
    #print(categorical)
    myidx = 0
    
    while i < len(df_T):
        j = 0
        while j < len(refpos): #"|" is present when phased data is proved, "/" is usually unphased
            if(refpos[j] in inpos.keys()):
                myidx = inpos[refpos[j]]
                #print(j)
                #print(inpos[refpos[j]])
                #print(refpos[j])
                #print(df[i][myidx])
                #print(df[i+1][myidx])
                if(df[i][myidx].startswith('1|1') or df[i][myidx].startswith('1/1')):
                    if(categorical=="True"):
                        new_df[idx][j] = 2
                    else:
                        #new_df[idx][j] = np.array([0,2])
                        new_df[idx][j][0] = 0
                        new_df[idx][j][1] = 2
                elif(df[i][myidx].startswith('1|0') or df[i][myidx].startswith('0|1') or df[i][myidx].startswith('1/0') or df[i][myidx].startswith('0/1')):
                    if(categorical=="True"):
                        new_df[idx][j] = 1
                    else:
                        #new_df[idx][j] = np.array([1,1])
                        new_df[idx][j][0] = 1
                        new_df[idx][j][1] = 1
                elif(df[i][myidx].startswith('0|0') or df[i][myidx].startswith('0/0')):
                    if(categorical=="True"): 
                        new_df[idx][j] = 0
                    else:
                        #new_df[idx][j] = np.array([2,0])
                        new_df[idx][j][0] = 2
                        new_df[idx][j][1] = 0
                else:
                    if(categorical=="True"):
                        new_df[idx][j] = -1
                    else:
                        #new_df[idx][j] = np.array([0,0]) 
                        new_df[idx][j][0] = 0 
                        new_df[idx][j][1] = 0 
            else:
                if(categorical=="True"):
                    new_df[idx][j] = -1
                else:
                    #new_df[idx][j] = np.array([0,0]) 
                    new_df[idx][j][0] = 0 
                    new_df[idx][j][1] = 0 
                #if(idx==0):
                #    print(j)
                #RR I forgot to mention that we have to take into account possible missing data
                #RR in case there is missing data (NA, .|., -|-, or anything different from 0|0, 1|1, 0|1, 1|0) = 3
            j += 1
        i += 1
        #pbar.update(1)
        idx += 1

    #print("processed_data")
    #for i in range(10):
    #    print(new_df[i][0])

    #the data needs to be flattened because the matrix multiplication step (x*W) 
    #doesn't support features with subfeatures (matrix of vectors)
    #new_df = np.reshape(new_df, (new_df.shape[0],new_df.shape[1]*2))
    #print(new_df.shape)
    #pbar.close()
    stop = timeit.default_timer()
    print('Time to load the data (sec): ', stop - start)
    
    return new_df


In [3]:
def mask_data(mydata, mask_rate=0.9, categorical="False"):
    start = timeit.default_timer()
    # random matrix the same shape of your data
    #print(len(mydata))
    nmask = int(round(len(mydata[0])*mask_rate))
    # random boolean mask for which values will be changed
    maskindex = random.sample(range(0, len(mydata[0]-1)), nmask)
    maskindex = [211, 654, 274, 410, 236, 731, 678, 490, 710, 650, 377, 826, 483, 828, 283, 237, 625, 844, 796, 428, 235, 238, 185, 790, 630, 89, 51, 760, 134, 421, 217, 138, 513, 116, 824, 623, 266, 142, 286, 158, 485, 309, 794, 546, 203, 809, 333, 593, 258, 172, 605, 293, 466, 693, 97, 234, 326, 764, 628, 372, 426, 46, 52, 752, 680, 493, 798, 507, 98, 243, 87, 31, 477, 510, 28, 264, 606, 166, 154, 63, 374, 547, 57, 91, 543, 818, 363, 122, 218, 219, 365, 624, 703, 821, 707, 191, 423, 61, 399, 738, 222, 540, 685, 801, 737, 104, 587, 793, 661, 486, 782, 37, 516, 772, 814, 305, 588, 103, 859, 762, 22, 567, 487, 223, 658, 541, 523, 33, 296, 465, 130, 626, 361, 761, 419, 111, 226, 622, 402, 163, 132, 691, 39, 648, 495, 287, 492, 40, 190, 501, 520, 827, 231, 770, 713, 659, 702, 340, 517, 24, 277, 749, 221, 524, 167, 504, 126, 531, 617, 527, 371, 742, 108, 719, 164, 392, 739, 602, 298, 41, 640, 674, 313, 646, 422, 566, 594, 290, 383, 187, 837, 112, 450, 603, 457, 805, 500, 148, 629, 225, 389, 21, 591, 627, 378, 3, 512, 449, 570, 140, 855, 263, 468, 143, 322, 544, 604, 451, 754, 242, 95, 10, 413, 49, 864, 347, 461, 141, 375, 689, 692, 706, 345, 542, 849, 145, 107, 803, 584, 780, 632, 385, 579, 503, 69, 415, 788, 81, 330, 643, 559, 56, 284, 438, 571, 718, 241, 34, 316, 532, 817, 775, 665, 58, 511, 787, 268, 536, 147, 379, 533, 637, 740, 14, 783, 118, 171, 168, 366, 84, 842, 853, 357, 319, 292, 88, 406, 324, 829, 765, 489, 408, 435, 444, 156, 215, 109, 644, 808, 607, 744, 663, 845, 4, 12, 220, 865, 356, 331, 418, 813, 315, 447, 454, 539, 779, 170, 251, 83, 631, 189, 73, 262, 105, 858, 615, 188, 353, 301, 581, 700, 350, 456, 730, 555, 664, 196, 70, 458, 529, 723, 725, 840, 32, 247, 60, 306, 439, 38, 573, 834, 822, 387, 48, 701, 216, 564, 810, 317, 442, 54, 327, 437, 254, 386, 240, 230, 734, 590, 151, 136, 472, 328, 836, 611, 515, 820, 460, 250, 589, 425, 5, 786, 497, 655, 666, 568, 657, 535, 561, 233, 835, 71, 125, 756, 131, 769, 161, 578, 862, 577, 653, 123, 720, 272, 181, 528, 552, 178, 807, 208, 152, 96, 434, 209, 785, 621, 619, 176, 462, 295, 610, 649, 745, 232, 514, 861, 667, 239, 229, 433, 260, 159, 113, 481, 360, 35, 436, 494, 397, 11, 416, 797, 72, 25, 213, 595, 320, 198, 348, 560, 395, 153, 771, 727, 318, 66, 139, 92, 99, 694, 411, 179, 502, 304, 804, 120, 252, 101, 750, 403, 9, 204, 480, 741, 811, 42, 592, 129, 598, 599, 124, 843, 329, 574, 224, 19, 420, 396, 310, 43, 370, 812, 792, 777, 521, 846, 635, 115, 393, 645, 609, 368, 616, 519, 160, 755, 380, 362, 711, 94, 311, 746, 409, 802, 776, 312, 68, 400, 816, 735, 668, 280, 373, 67, 30, 612, 660, 302, 509, 259, 173, 841, 390, 582, 79, 369, 200, 192, 26, 681, 733, 384, 526, 597, 585, 795, 246, 647, 554, 271, 833, 55, 257, 850, 478, 565, 417, 498, 269, 430, 613, 401, 128, 227, 537, 114, 0, 854, 248, 194, 699, 747, 407, 851, 228, 342, 86, 183, 155, 278, 508, 636, 614, 135, 459, 212, 424, 496, 452, 100, 273, 245, 684, 671, 766, 337, 484, 59, 376, 297, 106, 341, 1, 839, 344, 332, 146, 732, 714, 580, 180, 29, 355, 856, 276, 78, 767, 548, 214, 819, 18, 2, 62, 557, 76, 751, 275, 562, 499, 728, 525, 279, 15, 432, 45, 823, 119, 85, 475, 675, 569, 255, 300, 608, 690, 708, 784, 253, 6, 715, 736, 551, 729, 448, 717, 20, 102, 367, 679, 47, 575, 641, 455, 336, 832, 670, 323, 398, 556, 726, 64, 427, 202, 186, 530, 93, 848, 748, 695, 482, 36, 77, 82, 620, 388, 23, 474, 314, 470, 338, 669, 759, 860, 800, 696, 852, 743, 583, 618, 506, 601, 53, 65, 463, 633, 716, 169, 288, 414, 774, 291, 193, 596, 261, 768, 704, 453, 440, 359, 550, 381, 90, 473, 758, 303, 150, 294, 697, 799, 174, 197, 445, 265, 335, 639, 572, 446, 863, 825, 80, 299, 781, 133, 538, 144, 270, 206, 7, 207, 471, 175, 74, 753, 44, 712, 382, 722, 600, 334, 13, 182, 349, 476, 488, 165, 687, 709, 210, 351, 431, 576, 682, 321, 549, 308, 289, 50, 121]
    #np.random.randint(0,len(mydata[0]),size=nmask)
    print("Masking markers...")
    print(maskindex)
    #mydata = np.transpose(mydata)
    print(mydata.shape)
    #mydata
    
    #pbar = tqdm(total = len(maskindex))
    #for i in range(10):
    #    print(mydata[i][0:11])


    for i in maskindex:
        #print(len(mydata[i]))
        j = 0
        while j < len(mydata):
            if(categorical=="True"):
                mydata[j][i]=-1
            else:
                mydata[j][i]=[0,0]
            j=j+1
        #pbar.update(1)
        #print(mydata[i])
    #mydata = np.transpose(mydata)
    #print(mydata.shape)
    #pbar.close()

    #print("after masking:")
    #for i in range(10):
    #    print(mydata[i][0:11])

    stop = timeit.default_timer()
    print('Time to mask the data (sec): ', stop - start)  
    return mydata

In [80]:
def accuracy_maf_threshold(x, y, threshold1, threshold2, categorical=False):
    
    colsum=np.sum(y, axis=0)
    indexes_to_keep = []
    i = 0
    j = 0
    k = 0  
    #print(len(MAF_all_var))
    
    if(threshold1!=0 or threshold2!=max(MAF_all_var)):
          
        #MAFs = calculate_MAF(y, categorical)
    
        while i < len(MAF_all_var):
            if(MAF_all_var[i]>=threshold1 and MAF_all_var[i]<=threshold2):
                if(categorical==True):
                    if(colsum[j]!=0 or colsum[j+1]!=0 or colsum[j+2]!=0):
                        indexes_to_keep.append(j)
                        indexes_to_keep.append(j+1)
                        indexes_to_keep.append(j+2)
                elif(categorical==False):
                    if(colsum[k]!=0 or colsum[k+1]!=0):
                        indexes_to_keep.append(k)
                        indexes_to_keep.append(k+1)            
            i += 1
            j += 3
            k += 2
        
        #print(indexes_to_keep)
        #print(len(x[0]))
        #print(len(y[0]))
        #print(len(filtered_data_x))
        #print(len(filtered_data_y[0]))
    else:

        while i < len(MAF_all_var):
            if(categorical==True):
                if(colsum[j]!=0 or colsum[j+1]!=0 or colsum[j+2]!=0):
                    indexes_to_keep.append(j)
                    indexes_to_keep.append(j+1)
                    indexes_to_keep.append(j+2)
            elif(categorical==False):
                if(colsum[k]!=0 or colsum[k+1]!=0):
                    indexes_to_keep.append(k)
                    indexes_to_keep.append(k+1)            
            i += 1
            j += 3
            k += 2   
            
    getter = operator.itemgetter(indexes_to_keep)
    filtered_data_x = list(map(list, map(getter, np.copy(x))))
    filtered_data_y = list(map(list, map(getter, np.copy(y))))
    
    correct_prediction = np.equal( np.round( filtered_data_x ), np.round( filtered_data_y ) )
    accuracy_per_marker = np.mean(correct_prediction.astype(float), 0)
    accuracy = np.mean(accuracy_per_marker)

    #correct_prediction = sess.run(tf.equal( tf.round( filtered_data_x ), tf.round( filtered_data_y ) ))
    #accuracy_per_marker = sess.run(tf.reduce_mean(tf.cast(correct_prediction, tf.float32), 0))
    #accuracy = sess.run(tf.reduce_mean(accuracy_per_marker))

    return accuracy, accuracy_per_marker

In [71]:
def calculate_MAF_global_GPU(indexes, inx, categorical=False):
    

    j=0
    if(do_parallel_MAF==True):
        getter = operator.itemgetter(indexes)
        x = list(map(list, map(getter, np.copy(inx))))
    else:
        x = inx
    MAF_list = []
        
    #tf.reset_default_graph()
    
    with tf.Session(config=config) as sess:        
       
        #print("LENGTH", len(x[0]))
        if(categorical==True):
            while j < (len(x[0])):
                ref = 0
                alt = 0
                MAF = 0        
                for i in range(len(x)):
                    if(i == 0):
                        ref = sess.run(tf.add(ref,2))
                    elif(i == 1):
                        ref = sess.run(tf.add(ref,1))
                        alt = sess.run(tf.add(alt,1))
                    elif(i == 2):
                        alt = sess.run(tf.add(alt,2))
                if(alt<=ref):
                    MAF=sess.run(tf.div(alt,tf.add(ref,alt)))
                    #major=ref/len(y)
                else:
                    MAF=sess.run(tf.div(ref,tf.add(ref,alt)))
                    #major=alt/len(y)
                    #print(MAF)
                MAF_list.append(MAF)
                j+=1          
        elif(categorical==False):
            while j < (len(x[0])):
                ref = 0
                alt = 0
                MAF = 0        
                for i in range(len(x)):
                    ref = sess.run(tf.add(ref,x[i][j][0]))
                    alt = sess.run(tf.add(alt,x[i][j][1]))  
                if(alt<=ref):
                    MAF=sess.run(tf.div(alt,tf.add(ref,alt)))
                    #major=ref/len(y)
                else:
                    MAF=sess.run(tf.div(ref,tf.add(ref,alt)))
                MAF_list.append(MAF)    
                j+=1
    
    #reset tensorflow session
    #tf.reset_default_graph()
    sess.close()
    return MAF_list

In [74]:
def calculate_MAF_global(indexes, inx, categorical=False):
    print("processing variants with indexes:", indexes)
    j=0
    if(do_parallel_MAF==True):
        getter = operator.itemgetter(indexes)
        x = list(map(list, map(getter, np.copy(inx))))
    else:
        x = inx
    MAF_list = []
    #print("LENGTH", len(x[0]))
    if(categorical==True):
        while j < (len(x[0])):
            ref = 0
            alt = 0
            MAF = 0        
            for i in range(len(x)):
                if(i == 0):
                    ref+=2
                elif(i == 1):
                    ref+=1
                    alt+=1
                elif(i == 2):
                    alt+=2
            if(alt<=ref):
                MAF=alt/(ref+alt)
                #major=ref/len(y)
            else:
                MAF=ref/(ref+alt)
                #major=alt/len(y)
                #print(MAF)
            MAF_list.append(MAF)    
            j+=1          
    elif(categorical==False):
        while j < (len(x[0])):
            ref = 0
            alt = 0
            MAF = 0        
            for i in range(len(x)):
                ref+=x[i][j][0]
                alt+=x[i][j][1]   
            if(alt<=ref):
                MAF=alt/(ref+alt)
                #major=ref/len(y)
            else:
                MAF=ref/(ref+alt)
            MAF_list.append(MAF)    
            j+=1
    
    print("processing variants done for indexes:", indexes)

    return MAF_list

In [54]:

#split inut data into chunks so we can prepare batches in parallel
def chunk(L,nchunks):
    L2 = list()
    j = round(len(L)/nchunks)
    chunk_size = j
    i = 0
    while i < len(L):
        chunk = L[i:j]
        L2.append(chunk)
        i = j
        j += chunk_size
        if(j>len(L)):
            j = len(L)
    return L2

In [4]:
def flatten(mydata):
    #subjects, SNP, REF/ALT counts
    if(len(mydata.shape) == 3):
        mydata = np.reshape(mydata, (mydata.shape[0],-1))
    else:#do one hot encoding, depth=3 because missing (-1) is encoded to all zeroes
        mydata = tf.one_hot(indices=mydata, depth=3)
        mydata = tf.layers.flatten(mydata)#flattening to simplify calculations later (matmul, add, etc)
    return mydata

In [5]:
new_df = process_data("HRC.r1-1.EGA.GRCh37.chr9.haplotypes.9p21.3.vcf.pos.clean4", "ARIC_PLINK_flagged_chromosomal_abnormalities_zeroed_out_bed.lifted_NCBI36_to_GRCh37.GH.ancestry-1.chr9_intersect1.vcf.gz.9p21.3.recode.vcf", categorical="False")


Processing input data.
Time to load the data (sec):  27.102002907002316


In [6]:
new_df_obs = process_data("HRC.r1-1.EGA.GRCh37.chr9.haplotypes.9p21.3.vcf.pos.clean4", "c1_ARIC_WGS_Freeze3.lifted_already_GRCh37_intersect1.vcf.gz.9p21.3.recode.vcf", categorical="False")


Processing input data.
Time to load the data (sec):  181.9635181800004


In [7]:
orig_new_df_obs = np.copy(new_df_obs)

In [8]:
#new_df = mask_data(np.copy(orig_new_df_obs))

In [9]:
#new_df_obs2 = process_data("HRC.r1-1.EGA.GRCh37.chr9.haplotypes.9p21.3.vcf.pos.clean3", "HRC.r1-1.EGA.GRCh37.chr9.haplotypes.9p21.3.vcf.clean3", categorical="False")


In [10]:
new_df = flatten(new_df.copy())


In [11]:
(new_df.shape)

(1456, 1692)

In [12]:
new_df_obs = flatten(new_df_obs.copy())

In [13]:
(new_df_obs.shape)

(1456, 1692)

In [14]:
#new_df_obs2 = flatten(new_df_obs2.copy())

In [15]:
#(new_df_obs2.shape)

In [26]:


# define layer size
#n_input = len(new_df[0])     # input features N_variants
#n_hidden_1 = n_input  # hidden layer for encoder, equal to input number of features for now
#print(n_input)
#tf input
#X = tf.placeholder("float", [None, n_input])
#Y = tf.placeholder("float", [None, n_input])

    
#biases = {
#    'encoder_b1': tf.Variable(tf.random_normal([n_hidden_1])),
#    'decoder_b1': tf.Variable(tf.random_normal([n_input])),
#}

    #print(X.get_shape())

config = tf.ConfigProto(log_device_placement=False)
config.intra_op_parallelism_threads = 4
config.inter_op_parallelism_threads = 4
config.gpu_options.per_process_gpu_memory_fraction = 0.15
#config.gpu_options.allow_growth=True
#with tf.device('/device:GPU:0'):  # Replace with device you are interested in
bytes_in_use = BytesInUse()
    
sess=tf.Session(config=config) 

#First let's load meta graph and restore weights

tf.reset_default_graph()

with tf.Session(config=config) as sess:
    #saver.restore(sess,tf.train.latest_checkpoint('/home/rdias/myscripts/raqueld/Autoencoder_tensorflow/10-fold_CV_F_new_backup2/'))
    saver = tf.train.import_meta_graph('/home/rdias/myscripts/raqueld/Autoencoder_tensorflow/test_data_augmentation_100b/inference_model-1.ckpt.meta')

    saver.restore(sess,'/home/rdias/myscripts/raqueld/Autoencoder_tensorflow/test_data_augmentation_100b/inference_model-1.ckpt')
    #with tf.device('/device:GPU:0'): 
    # Access saved Variables directly
    graph = sess.graph
    for g_var in tf.global_variables():
        print(g_var)
    for g_var in tf.local_variables():
        print(g_var)

    #optimizer = graph.get_operation_by_name( "optimizer" )
    print(sess.run('Y:0', feed_dict={"X:0": new_df, "Y:0": new_df_obs}))
    print("\n****\n")
    #print(new_df_obs)
    #print("\n****\n")
    print(sess.run('X:0', feed_dict={"X:0": new_df, "Y:0": new_df_obs}))
    print("\n****\n")
    #print(new_df)
    y_pred = (sess.run('y_pred:0', feed_dict={"X:0": new_df, "Y:0": new_df_obs}))
    print("\n****\n")
    print(y_pred)
    
    j=0
    '''    while j < (len(y_pred[0])-1):
        test1 = np.zeros(shape=(len(y_pred),2))
        test2 = np.zeros(shape=(len(y_pred),2))
        for i in range(len(y_pred)):
            test1[i][0] = y_pred[i][j]
            test2[i][0] =  new_df_obs[i][j]
            j +=1
            test1[i][1] = y_pred[i][j]
            test2[i][1] =  new_df_obs[i][j]       
        
        correct_prediction = sess.run( tf.equal( tf.round( test1 ), tf.round( test2 ) ) )
        accuracy = sess.run(tf.reduce_mean(tf.cast(correct_prediction, tf.float32)))
        print(accuracy)
        j += 1
        
       '''
 
    correct_prediction = sess.run( tf.equal( tf.round( tf.cast(y_pred, tf.float64) ), tf.round( new_df_obs ) ) )

            
    correct_prediction = sess.run( tf.equal( tf.round( tf.cast(y_pred, tf.float64) ), tf.round( new_df_obs ) ) )
    accuracy = sess.run(tf.reduce_mean(tf.cast(correct_prediction, tf.float32)))


INFO:tensorflow:Restoring parameters from /home/rdias/myscripts/raqueld/Autoencoder_tensorflow/test_data_augmentation_100b/inference_model-1.ckpt
<tf.Variable 'weights/w_encoder_h1:0' shape=(1692, 1692) dtype=float32_ref>
<tf.Variable 'weights/w_decoder_h1:0' shape=(1692, 1692) dtype=float32_ref>
<tf.Variable 'biases/b_encoder_b1:0' shape=(1692,) dtype=float32_ref>
<tf.Variable 'biases/b_decoder_b1:0' shape=(1692,) dtype=float32_ref>
<tf.Variable 'dense/kernel:0' shape=(1692, 1692) dtype=float32_ref>
<tf.Variable 'dense/bias:0' shape=(1692,) dtype=float32_ref>
<tf.Variable 'weights/w_encoder_h1/optimizer:0' shape=(1692, 1692) dtype=float32_ref>
<tf.Variable 'weights/w_encoder_h1/optimizer_1:0' shape=(1692, 1692) dtype=float32_ref>
<tf.Variable 'weights/w_decoder_h1/optimizer:0' shape=(1692, 1692) dtype=float32_ref>
<tf.Variable 'weights/w_decoder_h1/optimizer_1:0' shape=(1692, 1692) dtype=float32_ref>
<tf.Variable 'biases/b_encoder_b1/optimizer:0' shape=(1692,) dtype=float32_ref>
<tf.V

In [27]:
#chkp.print_tensors_in_checkpoint_file("/tmp/model.ckpt", tensor_name='v2', all_tensors=False)
print(accuracy)

0.7947135


In [23]:
tf.reset_default_graph()
sess.close()

In [76]:
global MAF_all_var

indexes=list(range(len(orig_new_df_obs[0])))

if(do_parallel_MAF == False):        
        
    MAF_all_var = calculate_MAF_global_GPU(indexes, results, categorical)
    
else:
    chunks = chunk(indexes, int(round(len(indexes)/ncores)) )
        
    pool = multiprocessing.Pool(ncores)

    MAF_all_var = pool.map(partial(calculate_MAF_global, inx=orig_new_df_obs, categorical=False),chunks)

    pool.close()

    pool.join()
    print("merging results...")
        #merge outputs from all processes, reshaping nested list
    MAF_all_var = [item for sublist in MAF_all_var for item in sublist]

#print(indexes

processing variants with indexes: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]
processing variants with indexes: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]




processing variants with indexes: [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]
processing variants done for indexes: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]




processing variants with indexes: [72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95]
processing variants done for indexes: [24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47]




processing variants with indexes: [96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119]
processing variants done for indexes: [48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]




processing variants with indexes: [120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143]
processing variants done for indexes: [72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95]




processing variants with indexes: [144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167]
processing variants done for indexes: [96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119]




processing variants with indexes: [168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191]




processing variants done for indexes: [120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143]
processing variants with indexes: [192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215]
processing variants done for indexes: [144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167]




processing variants with indexes: [216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239]




processing variants done for indexes: [168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191]
processing variants done for indexes: [192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215]
processing variants with indexes: [240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263]




processing variants with indexes: [264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287]
processing variants done for indexes: [216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239]




processing variants with indexes: [288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311]




processing variants done for indexes: [240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263]
processing variants with indexes: [312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335]
processing variants done for indexes: [264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287]




processing variants with indexes: [336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359]
processing variants done for indexes: [288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311]




processing variants with indexes: [360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383]




processing variants done for indexes: [312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335]
processing variants with indexes: [384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407]
processing variants done for indexes: [336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359]




processing variants with indexes: [408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431]




processing variants done for indexes: [360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383]
processing variants with indexes: [432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455]
processing variants done for indexes: [384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407]




processing variants with indexes: [456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479]
processing variants done for indexes: [408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431]




processing variants with indexes: [480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503]




processing variants done for indexes: [432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455]
processing variants with indexes: [504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527]
processing variants done for indexes: [456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479]




processing variants with indexes: [528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551]




processing variants done for indexes: [480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503]
processing variants done for indexes: [504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527]
processing variants with indexes: [552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575]




processing variants with indexes: [576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599]
processing variants done for indexes: [528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551]
processing variants with indexes: [600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623]




processing variants with indexes: [624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647]
processing variants done for indexes: [576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599]
processing variants done for indexes: [552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575]
processing variants with indexes: [648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671]
processing variants done for indexes: [600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623]
processing variants with indexes: [672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695]
processing variants done for indexes: [624, 625, 626, 627,

In [81]:
accuracy2, accuracy_per_marker = accuracy_maf_threshold(y_pred, new_df_obs, 0, 1, categorical=False)

In [82]:
print(accuracy2)

0.876449938949939


In [87]:
j=0
for i in range(len(MAF_all_var)):
    if(MAF_all_var[i]>0):
        print("MAF:",MAF_all_var[i],"Accuracy:",accuracy_per_marker[j])
        j=j+1


MAF: 0.39629120879120877 Accuracy: 0.1614010989010989
MAF: 0.00034340659340659343 Accuracy: 0.1614010989010989
MAF: 0.00034340659340659343 Accuracy: 1.0
MAF: 0.0010302197802197802 Accuracy: 1.0
MAF: 0.0006868131868131869 Accuracy: 1.0
MAF: 0.0037774725274725275 Accuracy: 1.0
MAF: 0.0006868131868131869 Accuracy: 0.9993131868131868
MAF: 0.00034340659340659343 Accuracy: 0.9993131868131868
MAF: 0.07177197802197802 Accuracy: 0.9993131868131868
MAF: 0.39629120879120877 Accuracy: 0.9993131868131868
MAF: 0.0006868131868131869 Accuracy: 0.9979395604395604
MAF: 0.014766483516483516 Accuracy: 0.9979395604395604
MAF: 0.395260989010989 Accuracy: 0.9986263736263736
MAF: 0.0006868131868131869 Accuracy: 0.9986263736263736
MAF: 0.002403846153846154 Accuracy: 0.992445054945055
MAF: 0.39732142857142855 Accuracy: 0.992445054945055
MAF: 0.39732142857142855 Accuracy: 0.9986263736263736
MAF: 0.4223901098901099 Accuracy: 0.9986263736263736
MAF: 0.0027472527472527475 Accuracy: 0.9993131868131868
MAF: 0.3962912

In [88]:
accuracy2, accuracy_per_marker = accuracy_maf_threshold(y_pred, new_df_obs, 0, 0.005, categorical=False)

In [89]:
print(accuracy2)

0.9545521185545224


In [90]:
accuracy2, accuracy_per_marker = accuracy_maf_threshold(y_pred, new_df_obs, 0.005, 1, categorical=False)

In [91]:
print(accuracy2)

0.5800037097938558


In [92]:
accuracy2, accuracy_per_marker = accuracy_maf_threshold(y_pred, new_df_obs, 0, 0.1, categorical=False)

In [93]:
print(accuracy2)

0.948202838827839


In [94]:
accuracy2, accuracy_per_marker = accuracy_maf_threshold(y_pred, new_df_obs, 0.1, 1, categorical=False)

In [95]:
print(accuracy2)

0.2649904443382704
