In [1]:
import numpy as np
import pickle, os, math
import pandas as pd
import statsmodels.api as sm

In [2]:
# Every relative path used further down ('../ref_genome_GC.npy', 'Original/...',
# 'Proposed_Corrected/...') is resolved against this working directory.
DATA_DIR = '350bp_Length_101_GC_content/'
# Guard the chdir so that re-running this cell in a live kernel does not try
# to descend into a nested folder of the same name (which would raise).
if os.path.basename(os.getcwd()) != os.path.basename(os.path.normpath(DATA_DIR)):
    os.chdir(DATA_DIR)

In [3]:
# Inclusive index range of the GC bins kept by construct_GC_bias; the same
# indices divided by 100 form the x-axis handed to LOWESS.
lower_cnt = 10
upper_cnt = 90
# Percentile cut (taken from each tail) used by remove_outlier when clipping.
thr = 3
# Number of bins trimmed from each end of the LOWESS-smoothed curve.
lag = 10

def interpolate_zero_elements(array):
    """Fill interior zeros of a 1-D array by linear interpolation, in place.

    Zeros located before the first non-zero entry or after the last non-zero
    entry are left untouched; only zeros between those two positions are
    replaced, using ``np.interp`` over the neighbouring non-zero values.

    Parameters
    ----------
    array : numpy.ndarray
        1-D array to repair. It is modified in place (the working slice is a
        view), and interpolated values are cast to ``array``'s dtype on
        assignment.

    Returns
    -------
    numpy.ndarray
        The same ``array`` object that was passed in.
    """
    non_zero_positions = np.nonzero(array)[0]
    if non_zero_positions.size == 0:
        # Nothing to anchor the interpolation on: np.interp would raise
        # ValueError for an empty set of sample points, so return unchanged.
        return array

    first_non_zero_index = non_zero_positions[0]
    last_non_zero_index = non_zero_positions[-1]

    inner_array = array[first_non_zero_index:last_non_zero_index + 1]
    non_zero_indices = np.nonzero(inner_array)[0]
    zero_indices = np.where(inner_array == 0)[0]
    inner_array[zero_indices] = np.interp(zero_indices, non_zero_indices, inner_array[non_zero_indices])
    # inner_array is normally a view of array; the explicit write-back is kept
    # as a safeguard for inputs whose slicing returns a copy.
    array[first_non_zero_index:last_non_zero_index + 1] = inner_array
    return array

def process_length_level(len_sample):
    """Apply interpolate_zero_elements to every row of a 2-D array.

    Returns a new float array (created with ``np.zeros``) of the same shape
    holding the repaired rows.

    Note: each row is passed to interpolate_zero_elements as a view, and that
    function writes its result in place, so ``len_sample`` itself is modified
    as a side effect of this call.
    """
    n_rows = len_sample.shape[0]
    filled = np.zeros(len_sample.shape)
    for row_idx in range(n_rows):
        repaired_row = interpolate_zero_elements(len_sample[row_idx, :])
        filled[row_idx, :] = repaired_row
    return filled

def remove_outlier(arr):
    """Clip ``arr`` to its [thr, 100 - thr] percentile range.

    Values above the upper percentile are replaced by that percentile and
    values below the lower percentile by the lower one. ``thr`` is the
    module-level percentile cut. A new array is returned; ``arr`` is not
    modified.
    """
    lower_bound = np.percentile(arr, thr)
    upper_bound = np.percentile(arr, 100 - thr)
    capped = np.where(arr > upper_bound, upper_bound, arr)
    return np.where(capped < lower_bound, lower_bound, capped)

def construct_GC_bias(sample_lengths, ref_lengths, X):
    """Build a smoothed, mean-normalised sample/reference ratio curve.

    Steps:
      1. Sum both inputs over axis 0 and divide sample by (reference + 1).
      2. Keep bins ``lower_cnt..upper_cnt`` (inclusive), clip the tails with
         remove_outlier and normalise to mean 1.
      3. Smooth with LOWESS (``frac=0.1``) against ``X``.
      4. Drop ``lag`` bins from each end and normalise to mean 1 again.

    Parameters
    ----------
    sample_lengths, ref_lengths : array-like
        Stacks of rows that are summed over axis 0 before taking the ratio.
    X : sequence of float
        x-coordinates for LOWESS; must have ``upper_cnt - lower_cnt + 1``
        entries to match the sliced curve.

    Returns
    -------
    numpy.ndarray
        1-D curve with ``upper_cnt - lower_cnt + 1 - 2*lag`` entries.
    """
    sample = np.sum(sample_lengths, axis=0)
    ref = np.sum(ref_lengths, axis=0)
    # +1 pseudo-count keeps the ratio finite where the reference sum is zero.
    safe_ref = ref + 1
    GC_array = sample / safe_ref

    GC_array = GC_array[lower_cnt: upper_cnt+1]
    GC_array = remove_outlier(GC_array)
    GC_array = GC_array / np.mean(GC_array)

    # lowess returns (x, fitted) pairs; column 1 holds the smoothed values.
    lowess = sm.nonparametric.lowess(GC_array, X, frac=0.1)
    GC_array = lowess[:, 1]
    # Use an explicit end index: ``GC_array[lag:-lag]`` would yield an empty
    # array when lag == 0 instead of keeping the whole curve.
    GC_array = GC_array[lag: len(GC_array) - lag]
    GC_array = GC_array / np.mean(GC_array)
    return GC_array

def post_process_outlier(arr):
    """Raise every entry of ``arr`` below a positive floor up to that floor.

    The floor is ``max(min(1/20, 1st percentile of arr), smallest positive
    entry of arr)``. ``arr`` is modified in place and nothing is returned.
    """
    first_percentile = np.percentile(arr, 1)
    # Candidate floor; it may still be zero or negative at this point.
    candidate_floor = min(1/20, first_percentile)
    smallest_positive = np.min(arr[arr > 0])
    # Taking the larger of the two guarantees a strictly positive floor.
    floor_value = max(candidate_floor, smallest_positive)
    below_floor = arr < floor_value
    arr[below_floor] = floor_value

def _build_length_groups():
    """Return ``{length: [first, last]}`` pooling windows for lengths 51..400."""
    len_group_dic = {51: [51, 55],
                     52: [51, 55],
                     399: [396, 400],
                     400: [396, 400]}
    for length in range(53, 399):
        # Lengths 101..350 pool +/-4 neighbouring lengths, all others +/-2.
        half_width = 4 if 101 <= length <= 350 else 2
        len_group_dic[length] = [length - half_width, length + half_width]
    return len_group_dic


def correct_GC_bias(Input, Output, corr_path):
    """Compute per-length correction factors for one sample and save the result.

    For every length 51..400 a bias curve is built by construct_GC_bias from
    the rows of the sample and reference arrays that fall in that length's
    pooling window (row index = length - 51). The curves are floored by
    post_process_outlier, inverted to obtain correction factors, multiplied
    into the sample array and written to disk.

    Parameters
    ----------
    Input : str
        Path of the sample ``.npy`` array. It must broadcast against a
        (350, 101) array -- presumably it is (350, 101) itself; confirm.
    Output : str
        Path where the corrected array is saved with ``np.save``.
    corr_path : str
        Path where the (350, 101) correction-factor array is saved.

    Side effects
    ------------
    Reads '../ref_genome_GC.npy' relative to the current working directory,
    prints one summary line (file name, smallest positive factor, 99th
    percentile over all factors including the zero-filled columns, largest
    factor) and writes the two ``.npy`` files.
    """
    ref_GC_array = np.load('../ref_genome_GC.npy')
    ref_len_GC_array = process_length_level(ref_GC_array)

    len_group_dic = _build_length_groups()

    sample_GC_array = np.load(Input)
    # NOTE(review): process_length_level fills interior zeros of its argument
    # in place, so from here on sample_GC_array already holds the interpolated
    # values and the product below is taken on those, not on the raw file
    # contents. Confirm this is intended.
    sample_len_GC_array = process_length_level(sample_GC_array)
    GC_bin_no = upper_cnt - lower_cnt - (2*lag - 1)
    final_GC_array = np.zeros((350, GC_bin_no))
    correction_factors = np.zeros((350, 101))

    X = [float(i/100) for i in range(lower_cnt, upper_cnt+1)]
    for length in range(51, 401):
        start_len_ind = len_group_dic[length][0] - 51
        end_len_ind = len_group_dic[length][1] - 51
        final_GC_array[length-51, :] = construct_GC_bias(sample_len_GC_array[start_len_ind: end_len_ind+1],
                              ref_len_GC_array[start_len_ind: end_len_ind+1], X)
    # The floor is computed over all lengths at once, not per row.
    post_process_outlier(final_GC_array)
    # Columns outside the trimmed window keep a factor of 0.
    correction_factors[:, lower_cnt+lag: upper_cnt-lag+1] = 1.0/final_GC_array

    corrected_GC_array = sample_GC_array * correction_factors
    print(Input.split('/')[-1], np.min(correction_factors[correction_factors>0]),
          np.percentile(correction_factors, 99), np.max(correction_factors))
    np.save(Output, corrected_GC_array)
    np.save(corr_path, correction_factors)

In [4]:
# Run the correction for every sample file of every cohort.
cohorts = ['BRCA_gis', 'CRC_gis', 'healthy_c2i', 'healthy_invitro']

for cohort in cohorts:
    print(cohort)
    input_folder, output_folder = f'Original/{cohort}', f'Proposed_Corrected/{cohort}'
    corr_factor_folder = f'Proposed_Correction_Factors/{cohort}'
    # np.save does not create missing directories, so make sure they exist.
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs(corr_factor_folder, exist_ok=True)
    # os.listdir returns entries in arbitrary order; sort for a reproducible log.
    for file_ in sorted(os.listdir(input_folder)):
        if not file_.endswith('.npy'):
            # Skip stray entries (e.g. .ipynb_checkpoints) that np.load cannot read.
            continue
        Input = f'{input_folder}/{file_}'
        Output = f'{output_folder}/{file_}'
        corr_path = f'{corr_factor_folder}/{file_}'
        correct_GC_bias(Input, Output, corr_path)
    print()

BRCA_gis
D14.npy 0.2881421146984038 2.7675110631135285 20.0
D19.npy 0.3145413391011722 11.78286778128382 20.0
D23.npy 0.3735747818070048 2.783962146600474 9.14684234949594
D7.npy 0.22836623602013165 13.559881831951719 20.0
D9.npy 0.4435698652706418 5.11911796478412 18.130633304184602
E10c.npy 0.29927891431776793 3.0648631916973 14.785796887770484
E2c.npy 0.3989752485720005 46.06354889553275 56.029128586307046
E6c.npy 0.2782620340881262 12.409333361958739 20.0
E7c.npy 0.319344660394268 4.776543311136358 20.0
E8c.npy 0.3874984341792456 8.805539547142056 20.0

CRC_gis
NCC_CRC-1014_180816-CW-T-ready.npy 0.24403124361483022 2.8690802605471517 20.0
NCC_CRC-1279_221015_CW-T-ready.npy 0.3620209333149655 2.80888217284518 20.0
NCC_CRC-1279_241116_CW-T-ready.npy 0.39739035429157044 3.297243223491643 12.587375728558353
NCC_CRC-1531_180119_CW-T-ready.npy 0.41498833942503754 2.7464459504989325 20.0
NCC_CRC-512_051015-CW-T-ready.npy 0.434799810650459 3.317788597493994 20.0
NCC_CRC-512_130114-CW-T-rea

In [5]:
# Report, per sample, the relative change in the array total after correction.
cohorts = ['healthy_c2i', 'healthy_invitro', 'BRCA_gis', 'CRC_gis']

for cohort in cohorts:
    print(cohort)
    input_folder, output_folder = f'Original/{cohort}', f'Proposed_Corrected/{cohort}'
    # os.listdir returns entries in arbitrary order; sort for a reproducible report.
    for file_ in sorted(os.listdir(input_folder)):
        if not file_.endswith('.npy'):
            # Skip stray entries (e.g. .ipynb_checkpoints) that np.load cannot read.
            continue
        original = np.sum( np.load( f'{input_folder}/{file_}' ) )
        corrected = np.sum( np.load( f'{output_folder}/{file_}' ) )
        diff = round ( (corrected - original)/original * 100, 2 )
        # file_[:-4] strips the '.npy' extension.
        print(f'{file_[:-4]}: {diff}%')
    print()

healthy_c2i
WHC556.sort: 6.22%
WHC557.sort: 3.18%
WHC558.sort: 5.7%
WHC559.sort: -2.69%
WHC560.sort: 4.84%
WHC561.sort: 7.64%
WHC562.sort: 0.33%
WHC563.sort: 0.71%
WHC564.sort: 3.63%
WHC565.sort: 4.75%

healthy_invitro
WHC1384: -2.3%
WHC1385: -3.28%
WHC1386: -5.35%
WHC1387: -1.45%
WHC1388: -5.24%
WHC1389: 0.33%
WHC1390: -0.67%
WHC1391: -1.93%
WHC1392: -0.17%
WHC1393: -0.89%
WHC1394: -2.74%
WHC1395: -1.95%

BRCA_gis
D14: -2.3%
D19: -3.54%
D23: 3.21%
D7: -0.23%
D9: -5.37%
E10c: -4.58%
E2c: 9.06%
E6c: 3.58%
E7c: -6.24%
E8c: -2.1%

CRC_gis
NCC_CRC-1014_180816-CW-T-ready: 7.79%
NCC_CRC-1279_221015_CW-T-ready: 7.68%
NCC_CRC-1279_241116_CW-T-ready: 7.99%
NCC_CRC-1531_180119_CW-T-ready: 3.68%
NCC_CRC-512_051015-CW-T-ready: 7.07%
NCC_CRC-512_130114-CW-T-ready: 10.97%
NCC_CRC-519_210114_CW-T-ready: 6.0%
NCC_CRC-809_110914-CW-T-ready: 17.45%
NCC_CRC-986_100215-CW-T-ready: 3.09%

