This script contains 2 parts:
- process whole features
    - generate label
    - generate training data
    - output filtered data
- process only evs features **(don't need to remove the missing values in prediction scores)**
    - generate label
    - generate training data
    - output orig data

In [1]:
import gc
import pathlib


import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)


import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.impute import KNNImputer


from myLibs.loader import *
from myLibs.dummy import *

# Column selector for the continuous prediction-score features; used below
# to pick those columns out of the input table.
# NOTE(review): the helper arrives through a myLibs star-import — confirm which module defines it.
features_with_continuous_val = get_continuous_feature()
# Passed to convert_evidence() further down; its contents are not visible here — TODO confirm.
evs_val = get_evs_val()

# GET full features for training
## organize input data

In [2]:
# Name of this run; doubles as the output directory name and as the value of
# the 'label' column that marks the input rows later on.
dataname = 'Marilyn-std0.02'
# Tab-separated input table read by the next cell.
datapath = "../Marilyn_data/Marilyn.test.sim2.av.hg19.test.DP.new_training.txt"

# Create the output directory (and any missing parents); no error if it exists.
outdir = pathlib.Path(dataname)
outdir.mkdir(parents=True, exist_ok=True)

In [3]:
# Read the tab-separated input and turn the "." placeholder into NaN.
data = pd.read_csv(datapath, sep='\t', low_memory=False).replace(".", np.nan)

# Count missing continuous scores per row and keep rows with at most 13 missing.
missing_per_row = data.loc[:, features_with_continuous_val].isna().sum(axis=1)
keep_row = missing_per_row <= 13
filterd_df = data.loc[keep_row, :]
# filterd_df.to_csv("%s/filtered_dat.csv" % dataname, index=False)
print(filterd_df.shape)
# del(data)
# gc.collect()

(5234, 37)


In [4]:
# Binary ground truth: 0 when 'New_test' equals "benign", 1 for anything else
# (missing values included).
true_label_df = (
    filterd_df.loc[:, 'New_test']
    .ne("benign")
    .astype("int64")
    .to_frame("true_label")
)
# true_label_df.to_csv("%s/true_label.csv" % dataname, index=False)

# NOTE(review): convert_evidence comes from myLibs; only EVS_features_df is
# used in the cells visible here — confirm what the other two frames hold.
EVS_features_df, EVS_label_df, EVS_tail_df = convert_evidence(filterd_df, evs_val)

# Fresh 0..n-1 index so the column-wise concat below lines up with
# EVS_features_df (presumably also 0..n-1 indexed — verify in convert_evidence).
continous_df = filterd_df.loc[:, features_with_continuous_val].reset_index(drop=True)

features_df = pd.concat([continous_df, EVS_features_df], axis=1).astype(float)
# Bookkeeping columns: which dataset a row came from and its position in it.
features_df['label'] = dataname
features_df['id'] = range(len(true_label_df))
print(features_df.shape)
# features_df.head(2)
# del(continous_df, EVS_features_df)
# gc.collect()

features_df.head(3)

(5234, 37)


Unnamed: 0,SIFT_score,Polyphen2_HDIV_score,Polyphen2_HVAR_score,LRT_score,MutationTaster_score,MutationAssessor_score,FATHMM_score,PROVEAN_score,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,MetaSVM_score,MetaLR_score,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds,EVS_1,EVS_2,EVS_3,EVS_4,EVS_5,EVS_6,EVS_7,EVS_8,EVS_9,EVS_10,EVS_11,EVS_12,label,id
0,0.012,0.996,0.877,0.0,0.967,2.14,0.65,-2.67,0.861,3.623,23.2,0.999,0.94,-0.718,0.181,0.487,0.0,4.09,0.871,0.935,0.998,0.964,12.971,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0,Marilyn-std0.02,0
1,0.003,0.79,0.365,0.694,1.0,0.695,1.97,-1.64,0.068,3.988,23.6,0.989,0.195,-1.032,0.019,0.707,0.0,2.04,0.868,0.884,0.017,0.176,4.107,1.0,1.0,1.0,1.0,0.0,0.0,0.0,-1.0,1.0,1.0,1.0,1.0,Marilyn-std0.02,1
2,0.048,0.545,0.061,0.001,1.0,0.095,-1.59,-3.42,0.336,5.858,27.3,0.999,0.94,-0.587,0.201,0.707,0.0,3.87,0.917,0.953,1.0,1.0,11.988,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,Marilyn-std0.02,2


## add in database data

In [5]:
# Draw a reproducible sample of 1,000,000 rows (without replacement) from the
# database table. The full table is never bound to a name, so it can be
# reclaimed as soon as the sample has been taken.
db_dat = (
    pd.read_csv("../data_process/mutations_db_continous_evs_nonmissing.csv")
    .sample(n=1000000, replace=False, axis=0, random_state=1)
)

gc.collect()

0

In [6]:
def func_combine_data(db_dat, inpu_dat):
    """Stack two frames row-wise after checking that their columns match.

    Parameters
    ----------
    db_dat : pandas.DataFrame
        Rows placed first in the result.
    inpu_dat : pandas.DataFrame
        Rows placed after ``db_dat``.

    Returns
    -------
    pandas.DataFrame or bool
        The row-wise concatenation (original index values are kept), or
        ``False`` when the column labels or their order differ.
    """
    # Imported locally: the notebook never imports ``sys`` itself, so without
    # this the mismatch branch would raise NameError instead of reporting.
    import sys

    if db_dat.columns.to_list() != inpu_dat.columns.to_list():
        sys.stdout.write("## The columns isn't same")
        return False
    return pd.concat([db_dat, inpu_dat], axis=0)

# Database rows first, input rows last, renumbered 0..n-1.
total_df_raw = func_combine_data(db_dat, features_df).reset_index(drop=True)

# Set the bookkeeping columns aside so only feature columns remain.
total_index = total_df_raw.loc[:, ['label', 'id']]
total_df_raw = total_df_raw.drop(columns=['label', 'id'])

# del(db_dat)
# gc.collect()
print(total_df_raw.shape)
total_df_raw.tail(5)

(1005234, 35)


Unnamed: 0,SIFT_score,Polyphen2_HDIV_score,Polyphen2_HVAR_score,LRT_score,MutationTaster_score,MutationAssessor_score,FATHMM_score,PROVEAN_score,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,MetaSVM_score,MetaLR_score,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds,EVS_1,EVS_2,EVS_3,EVS_4,EVS_5,EVS_6,EVS_7,EVS_8,EVS_9,EVS_10,EVS_11,EVS_12
1005229,0.001,1.0,0.937,0.0,1.0,1.235,-1.76,-7.57,0.959,7.076,33.0,0.995,0.993,0.318,0.629,0.707,0.0,5.62,0.917,0.998,1.0,1.0,19.648,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,2.0,1.0,1.0,1.0
1005230,0.178,0.001,0.003,0.037,0.97,1.04,-1.59,-1.86,0.017,0.638,8.418,0.92,0.914,-0.845,0.301,0.651,0.0,2.98,0.991,1.061,0.997,0.999,7.129,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,-1.0,1.0,1.0
1005231,1.0,0.0,0.001,0.013,1.0,1.175,0.55,-0.37,0.69,0.054,3.12,0.556,0.265,-1.027,0.068,0.707,0.0,1.13,0.046,-0.228,0.997,0.977,9.764,1.0,0.0,1.0,1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,1.0,1.0
1005232,1.0,0.0,0.0,,1.0,-1.445,1.08,-0.2,0.019,-2.56,0.001,0.318,0.002,-0.988,0.023,,,-0.014,-0.41,-0.136,0.034,0.002,4.028,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.0,1.0,0.0
1005233,0.457,0.0,0.0,0.518,,0.0,3.39,-0.81,0.068,0.068,3.267,0.763,0.338,-0.931,0.015,0.598,0.0,-1.84,-0.058,-0.243,0.02,0.451,7.942,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,1.0,-1.0,1.0,1.0


## transformation

1. **dummy-encode**

In [7]:
# One output column per (EVS feature 1..12, level in -1, 0, 1, 2): 48 in total.
evs_dummy_columns = ['evs_%s_%s' % (evs_no, level)
                     for evs_no in range(1, 13)
                     for level in (-1, 0, 1, 2)]

# Encode every row with the myLibs helper.
# NOTE(review): presumably a noisy one-hot encoding of the EVS columns — confirm in myLibs.dummy.
encoded_rows = total_df_raw.apply(myGetDummy_with_gaussian, axis=1)
evs_dummy_df = pd.DataFrame(encoded_rows.tolist(), columns=evs_dummy_columns)
evs_dummy_df.tail(5)

Unnamed: 0,evs_1_-1,evs_1_0,evs_1_1,evs_1_2,evs_2_-1,evs_2_0,evs_2_1,evs_2_2,evs_3_-1,evs_3_0,evs_3_1,evs_3_2,evs_4_-1,evs_4_0,evs_4_1,evs_4_2,evs_5_-1,evs_5_0,evs_5_1,evs_5_2,evs_6_-1,evs_6_0,evs_6_1,evs_6_2,evs_7_-1,evs_7_0,evs_7_1,evs_7_2,evs_8_-1,evs_8_0,evs_8_1,evs_8_2,evs_9_-1,evs_9_0,evs_9_1,evs_9_2,evs_10_-1,evs_10_0,evs_10_1,evs_10_2,evs_11_-1,evs_11_0,evs_11_1,evs_11_2,evs_12_-1,evs_12_0,evs_12_1,evs_12_2
1005229,0.003051,0.009311,0.961428,-0.022565,0.003051,0.009311,0.961428,-0.022565,0.003051,1.009311,-0.038572,-0.022565,0.003051,0.009311,0.961428,-0.022565,0.003051,1.009311,-0.038572,-0.022565,0.003051,1.009311,-0.038572,-0.022565,0.003051,0.009311,0.961428,-0.022565,0.003051,0.009311,0.961428,-0.022565,0.003051,0.009311,-0.038572,0.977435,0.003051,0.009311,0.961428,-0.022565,0.003051,0.009311,0.961428,-0.022565,0.003051,0.009311,0.961428,-0.022565
1005230,-0.001057,1.029723,0.001071,-0.000655,-0.001057,1.029723,0.001071,-0.000655,-0.001057,0.029723,1.001071,-0.000655,-0.001057,0.029723,1.001071,-0.000655,-0.001057,1.029723,0.001071,-0.000655,-0.001057,1.029723,0.001071,-0.000655,-0.001057,0.029723,1.001071,-0.000655,-0.001057,1.029723,0.001071,-0.000655,-0.001057,1.029723,0.001071,-0.000655,0.998943,0.029723,0.001071,-0.000655,-0.001057,0.029723,1.001071,-0.000655,-0.001057,0.029723,1.001071,-0.000655
1005231,-0.027948,-0.010919,0.964981,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,-0.027948,-0.010919,0.964981,-0.018478,-0.027948,-0.010919,0.964981,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,0.972052,-0.010919,-0.035019,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,0.972052,-0.010919,-0.035019,-0.018478,-0.027948,-0.010919,0.964981,-0.018478,-0.027948,-0.010919,0.964981,-0.018478
1005232,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,0.001848,0.972264,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,0.001848,0.972264,0.023947,1.033489,0.001848,-0.027736,0.023947,0.033489,0.001848,0.972264,0.023947,0.033489,1.001848,-0.027736,0.023947
1005233,0.003605,0.0004,1.010294,0.020786,0.003605,1.0004,0.010294,0.020786,0.003605,1.0004,0.010294,0.020786,0.003605,0.0004,1.010294,0.020786,0.003605,1.0004,0.010294,0.020786,0.003605,1.0004,0.010294,0.020786,0.003605,1.0004,0.010294,0.020786,1.003605,0.0004,0.010294,0.020786,0.003605,0.0004,1.010294,0.020786,1.003605,0.0004,0.010294,0.020786,0.003605,0.0004,1.010294,0.020786,0.003605,0.0004,1.010294,0.020786


In [8]:
# Two sentinel rows (-0.1 everywhere, then 1.1 everywhere) that are meant to
# pin each column's min and max for the scaling done in a later cell.
minmaxSeries = pd.DataFrame([[-0.1] * evs_dummy_df.shape[1],
                             [1.1] * evs_dummy_df.shape[1]],
                            columns=evs_dummy_df.columns)

# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# gives the same frame with a fresh 0..n-1 index.
evs_dummy_df = pd.concat([evs_dummy_df, minmaxSeries], ignore_index=True)
evs_dummy_df.tail(5)

Unnamed: 0,evs_1_-1,evs_1_0,evs_1_1,evs_1_2,evs_2_-1,evs_2_0,evs_2_1,evs_2_2,evs_3_-1,evs_3_0,evs_3_1,evs_3_2,evs_4_-1,evs_4_0,evs_4_1,evs_4_2,evs_5_-1,evs_5_0,evs_5_1,evs_5_2,evs_6_-1,evs_6_0,evs_6_1,evs_6_2,evs_7_-1,evs_7_0,evs_7_1,evs_7_2,evs_8_-1,evs_8_0,evs_8_1,evs_8_2,evs_9_-1,evs_9_0,evs_9_1,evs_9_2,evs_10_-1,evs_10_0,evs_10_1,evs_10_2,evs_11_-1,evs_11_0,evs_11_1,evs_11_2,evs_12_-1,evs_12_0,evs_12_1,evs_12_2
1005231,-0.027948,-0.010919,0.964981,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,-0.027948,-0.010919,0.964981,-0.018478,-0.027948,-0.010919,0.964981,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,0.972052,-0.010919,-0.035019,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,0.972052,-0.010919,-0.035019,-0.018478,-0.027948,-0.010919,0.964981,-0.018478,-0.027948,-0.010919,0.964981,-0.018478
1005232,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,0.001848,0.972264,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,0.001848,0.972264,0.023947,1.033489,0.001848,-0.027736,0.023947,0.033489,0.001848,0.972264,0.023947,0.033489,1.001848,-0.027736,0.023947
1005233,0.003605,0.0004,1.010294,0.020786,0.003605,1.0004,0.010294,0.020786,0.003605,1.0004,0.010294,0.020786,0.003605,0.0004,1.010294,0.020786,0.003605,1.0004,0.010294,0.020786,0.003605,1.0004,0.010294,0.020786,0.003605,1.0004,0.010294,0.020786,1.003605,0.0004,0.010294,0.020786,0.003605,0.0004,1.010294,0.020786,1.003605,0.0004,0.010294,0.020786,0.003605,0.0004,1.010294,0.020786,0.003605,0.0004,1.010294,0.020786
1005234,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1,-0.1
1005235,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1,1.1


In [9]:
# Display the EVS dummy frame, which now ends with the two appended -0.1 / 1.1 rows.
evs_dummy_df

Unnamed: 0,evs_1_-1,evs_1_0,evs_1_1,evs_1_2,evs_2_-1,evs_2_0,evs_2_1,evs_2_2,evs_3_-1,evs_3_0,evs_3_1,evs_3_2,evs_4_-1,evs_4_0,evs_4_1,evs_4_2,evs_5_-1,evs_5_0,evs_5_1,evs_5_2,evs_6_-1,evs_6_0,evs_6_1,evs_6_2,evs_7_-1,evs_7_0,evs_7_1,evs_7_2,evs_8_-1,evs_8_0,evs_8_1,evs_8_2,evs_9_-1,evs_9_0,evs_9_1,evs_9_2,evs_10_-1,evs_10_0,evs_10_1,evs_10_2,evs_11_-1,evs_11_0,evs_11_1,evs_11_2,evs_12_-1,evs_12_0,evs_12_1,evs_12_2
0,0.000669,1.017445,0.009256,-0.009880,0.000669,1.017445,0.009256,-0.009880,0.000669,1.017445,0.009256,-0.009880,0.000669,0.017445,1.009256,-0.009880,0.000669,1.017445,0.009256,-0.009880,0.000669,1.017445,0.009256,-0.009880,0.000669,0.017445,1.009256,-0.009880,0.000669,1.017445,0.009256,-0.009880,0.000669,1.017445,0.009256,-0.009880,1.000669,0.017445,0.009256,-0.009880,0.000669,1.017445,0.009256,-0.009880,0.000669,1.017445,0.009256,-0.009880
1,-0.007478,1.002452,0.006950,-0.005856,-0.007478,1.002452,0.006950,-0.005856,-0.007478,1.002452,0.006950,-0.005856,-0.007478,0.002452,1.006950,-0.005856,-0.007478,1.002452,0.006950,-0.005856,-0.007478,1.002452,0.006950,-0.005856,-0.007478,0.002452,1.006950,-0.005856,-0.007478,1.002452,0.006950,-0.005856,-0.007478,1.002452,0.006950,-0.005856,-0.007478,0.002452,1.006950,-0.005856,-0.007478,0.002452,1.006950,-0.005856,-0.007478,1.002452,0.006950,-0.005856
2,-0.008294,1.002143,0.001682,-0.027680,-0.008294,1.002143,0.001682,-0.027680,-0.008294,1.002143,0.001682,-0.027680,-0.008294,0.002143,1.001682,-0.027680,-0.008294,1.002143,0.001682,-0.027680,-0.008294,1.002143,0.001682,-0.027680,-0.008294,0.002143,1.001682,-0.027680,-0.008294,1.002143,0.001682,-0.027680,-0.008294,1.002143,0.001682,-0.027680,0.991706,0.002143,0.001682,-0.027680,-0.008294,1.002143,0.001682,-0.027680,-0.008294,1.002143,0.001682,-0.027680
3,-0.008351,1.007990,-0.008077,0.012542,-0.008351,1.007990,-0.008077,0.012542,-0.008351,1.007990,-0.008077,0.012542,-0.008351,1.007990,-0.008077,0.012542,-0.008351,1.007990,-0.008077,0.012542,-0.008351,1.007990,-0.008077,0.012542,-0.008351,0.007990,0.991923,0.012542,-0.008351,1.007990,-0.008077,0.012542,-0.008351,1.007990,-0.008077,0.012542,0.991649,0.007990,-0.008077,0.012542,-0.008351,1.007990,-0.008077,0.012542,-0.008351,1.007990,-0.008077,0.012542
4,0.028219,1.014096,-0.012969,0.007388,0.028219,1.014096,-0.012969,0.007388,0.028219,1.014096,-0.012969,0.007388,0.028219,0.014096,0.987031,0.007388,0.028219,1.014096,-0.012969,0.007388,0.028219,1.014096,-0.012969,0.007388,0.028219,0.014096,0.987031,0.007388,0.028219,1.014096,-0.012969,0.007388,0.028219,1.014096,-0.012969,0.007388,0.028219,0.014096,0.987031,0.007388,0.028219,1.014096,-0.012969,0.007388,0.028219,1.014096,-0.012969,0.007388
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1005231,-0.027948,-0.010919,0.964981,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,-0.027948,-0.010919,0.964981,-0.018478,-0.027948,-0.010919,0.964981,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,0.972052,-0.010919,-0.035019,-0.018478,-0.027948,0.989081,-0.035019,-0.018478,0.972052,-0.010919,-0.035019,-0.018478,-0.027948,-0.010919,0.964981,-0.018478,-0.027948,-0.010919,0.964981,-0.018478
1005232,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,0.001848,0.972264,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,1.001848,-0.027736,0.023947,0.033489,0.001848,0.972264,0.023947,1.033489,0.001848,-0.027736,0.023947,0.033489,0.001848,0.972264,0.023947,0.033489,1.001848,-0.027736,0.023947
1005233,0.003605,0.000400,1.010294,0.020786,0.003605,1.000400,0.010294,0.020786,0.003605,1.000400,0.010294,0.020786,0.003605,0.000400,1.010294,0.020786,0.003605,1.000400,0.010294,0.020786,0.003605,1.000400,0.010294,0.020786,0.003605,1.000400,0.010294,0.020786,1.003605,0.000400,0.010294,0.020786,0.003605,0.000400,1.010294,0.020786,1.003605,0.000400,0.010294,0.020786,0.003605,0.000400,1.010294,0.020786,0.003605,0.000400,1.010294,0.020786
1005234,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000,-0.100000


In [11]:
# Column-wise min-max scaling; the last two rows are the appended -0.1 / 1.1
# sentinel rows and are dropped again once the scaling has been applied.
evs_col_min = evs_dummy_df.min()
evs_col_span = evs_dummy_df.max() - evs_col_min
scaled_evs_dummy_df = ((evs_dummy_df - evs_col_min) / evs_col_span).iloc[:-2]

2. **fill NA**

In [12]:
# The first 23 columns of the combined frame are the continuous scores.
raw_scores = total_df_raw.iloc[:, :23]

# Min-max scale each column; min()/max() skip NaN, so missing cells stay NaN.
score_min = raw_scores.min()
score_span = raw_scores.max() - score_min
continous_df = (raw_scores - score_min) / score_span

## fill NAs
# Impute the remaining NaNs from the 40 nearest rows.
imputer = KNNImputer(n_neighbors=40)
filled_continous_arr = imputer.fit_transform(continous_df.values)
filled_continous_df = pd.DataFrame(
    filled_continous_arr,
    columns=continous_df.columns,
)
filled_continous_df.tail(5)

Unnamed: 0,SIFT_score,Polyphen2_HDIV_score,Polyphen2_HVAR_score,LRT_score,MutationTaster_score,MutationAssessor_score,FATHMM_score,PROVEAN_score,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,MetaSVM_score,MetaLR_score,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds
1005229,0.001,1.0,0.937,0.0,1.0,0.543774,0.517485,0.282265,0.958959,0.577877,0.589278,0.994872,0.993,0.580036,0.629,0.841667,0.0,0.970222,0.97526,0.983816,1.0,1.0,0.642068
1005230,0.178,0.001,0.003,0.037,0.97,0.524805,0.524563,0.532924,0.016016,0.295174,0.150306,0.917949,0.914,0.231415,0.301,0.775,0.0,0.827287,0.987886,0.988889,0.997,0.999,0.232924
1005231,1.0,0.0,0.001,0.013,1.0,0.537938,0.613655,0.598332,0.68969,0.26953,0.055697,0.544615,0.265,0.176859,0.068,0.841667,0.0,0.727125,0.826651,0.885105,0.997,0.977,0.31904
1005232,1.0,0.0,0.0,0.32795,1.0,0.283074,0.63572,0.605795,0.018018,0.154745,0.0,0.300513,0.002,0.188549,0.023,0.780238,0.0,0.665187,0.748848,0.892512,0.034,0.002,0.131577
1005233,0.457,0.0,0.0,0.518,0.99775,0.423638,0.73189,0.579017,0.067067,0.270144,0.058322,0.756923,0.338,0.205635,0.015,0.711905,0.0,0.566324,0.808906,0.883897,0.02,0.451,0.259494


3. **combined**

In [13]:
# Feature matrix: imputed continuous scores next to the scaled EVS dummies.
tot_df = pd.concat(
    [filled_continous_df, scaled_evs_dummy_df],
    axis="columns",
)
# Same matrix with the 'label' / 'id' bookkeeping columns put back on the right.
output_tot_df = pd.concat(
    [tot_df, total_index],
    axis="columns",
)

In [14]:
# Only feature columns are written; 'label' and 'id' are used for the split
# but left out of the files.
feature_name = tot_df.columns
from_input = output_tot_df['label'] == dataname

# Rows from the input dataset -> labeled.csv; sampled database rows -> unlabel.csv.
labeled_part = output_tot_df.loc[from_input, feature_name]
unlabeled_part = output_tot_df.loc[~from_input, feature_name]
labeled_part.to_csv("%s/labeled.csv" % dataname, index=False)
unlabeled_part.to_csv("%s/unlabel.csv" % dataname, index=False)

## GET EVS FOR TRAINING

In [2]:
import gc
import pathlib


import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)


import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.impute import KNNImputer


from myLibs.loader import *
from myLibs.dummy import *

# Column selector for the continuous prediction-score features; used below
# to pick those columns out of the input table.
# NOTE(review): the helper arrives through a myLibs star-import — confirm which module defines it.
features_with_continuous_val = get_continuous_feature()
# Passed to convert_evidence() further down; its contents are not visible here — TODO confirm.
evs_val = get_evs_val()

In [3]:
# Name of this run; doubles as the output directory name and as the value of
# the 'label' column that marks the input rows later on.
dataname = 'Marilyn-std0.02'
# Tab-separated input table read by the next cell.
datapath = "../Marilyn_data/Marilyn.test.sim2.av.hg19.test.DP.new_training.txt"

# Create the output directory (and any missing parents); no error if it exists.
outdir = pathlib.Path(dataname)
outdir.mkdir(parents=True, exist_ok=True)

In [4]:
# Read the tab-separated input and turn the "." placeholder into NaN.
# No row filtering in this part: every input row is kept.
data = pd.read_csv(datapath, sep='\t', low_memory=False).replace(".", np.nan)
print(data.shape)

(6326, 37)


In [5]:
# Binary ground truth: 0 when 'New_test' equals "benign", 1 for anything else
# (missing values included).
true_label_df = (
    data.loc[:, 'New_test']
    .ne("benign")
    .astype("int64")
    .to_frame("true_label")
)
true_label_df.to_csv("%s/true_label_for_evs.csv" % dataname, index=False)

# NOTE(review): convert_evidence comes from myLibs; only EVS_features_df is
# used in the cells visible here — confirm what the other two frames hold.
EVS_features_df, EVS_label_df, EVS_tail_df = convert_evidence(data, evs_val)

# Fresh 0..n-1 index so the column-wise concat below lines up with
# EVS_features_df (presumably also 0..n-1 indexed — verify in convert_evidence).
continous_df = data.loc[:, features_with_continuous_val].reset_index(drop=True)

features_df = pd.concat([continous_df, EVS_features_df], axis=1).astype(float)
# Bookkeeping columns: which dataset a row came from and its position in it.
features_df['label'] = dataname
features_df['id'] = range(len(true_label_df))
print(features_df.shape)
# features_df.head(2)
# del(continous_df, EVS_features_df)
# gc.collect()

features_df.head(3)

(6326, 37)


Unnamed: 0,SIFT_score,Polyphen2_HDIV_score,Polyphen2_HVAR_score,LRT_score,MutationTaster_score,MutationAssessor_score,FATHMM_score,PROVEAN_score,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,MetaSVM_score,MetaLR_score,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds,EVS_1,EVS_2,EVS_3,EVS_4,EVS_5,EVS_6,EVS_7,EVS_8,EVS_9,EVS_10,EVS_11,EVS_12,label,id
0,0.012,0.996,0.877,0.0,0.967,2.14,0.65,-2.67,0.861,3.623,23.2,0.999,0.94,-0.718,0.181,0.487,0.0,4.09,0.871,0.935,0.998,0.964,12.971,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,1.0,1.0,0.0,0.0,Marilyn-std0.02,0
1,0.003,0.79,0.365,0.694,1.0,0.695,1.97,-1.64,0.068,3.988,23.6,0.989,0.195,-1.032,0.019,0.707,0.0,2.04,0.868,0.884,0.017,0.176,4.107,1.0,1.0,1.0,1.0,0.0,0.0,0.0,-1.0,1.0,1.0,1.0,1.0,Marilyn-std0.02,1
2,0.048,0.545,0.061,0.001,1.0,0.095,-1.59,-3.42,0.336,5.858,27.3,0.999,0.94,-0.587,0.201,0.707,0.0,3.87,0.917,0.953,1.0,1.0,11.988,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,Marilyn-std0.02,2


## add in database data

In [6]:
# Draw a reproducible sample of 1,000,000 rows (without replacement) from the
# database table. The full table is never bound to a name, so it can be
# reclaimed as soon as the sample has been taken.
db_dat = (
    pd.read_csv("../data_process/mutations_db_continous_evs_nonmissing.csv")
    .sample(n=1000000, replace=False, axis=0, random_state=1)
)

gc.collect()

4

In [9]:
def func_combine_data(db_dat, inpu_dat):
    """Stack two frames row-wise after checking that their columns match.

    Parameters
    ----------
    db_dat : pandas.DataFrame
        Rows placed first in the result.
    inpu_dat : pandas.DataFrame
        Rows placed after ``db_dat``.

    Returns
    -------
    pandas.DataFrame or bool
        The row-wise concatenation (original index values are kept), or
        ``False`` when the column labels or their order differ.
    """
    # Imported locally: the notebook never imports ``sys`` itself, so without
    # this the mismatch branch would raise NameError instead of reporting.
    import sys

    if db_dat.columns.to_list() != inpu_dat.columns.to_list():
        sys.stdout.write("## The columns isn't same")
        return False
    return pd.concat([db_dat, inpu_dat], axis=0)

# Database rows first, input rows last, renumbered 0..n-1.
total_df_raw = func_combine_data(db_dat, features_df).reset_index(drop=True)

# Set the bookkeeping columns aside so only feature columns remain.
total_index = total_df_raw.loc[:, ['label', 'id']]
total_df_raw = total_df_raw.drop(columns=['label', 'id'])

# del(db_dat)
# gc.collect()
print(total_df_raw.shape)
total_df_raw.tail(5)

(1006326, 35)


Unnamed: 0,SIFT_score,Polyphen2_HDIV_score,Polyphen2_HVAR_score,LRT_score,MutationTaster_score,MutationAssessor_score,FATHMM_score,PROVEAN_score,VEST3_score,CADD_raw,CADD_phred,DANN_score,fathmm-MKL_coding_score,MetaSVM_score,MetaLR_score,integrated_fitCons_score,integrated_confidence_value,GERP++_RS,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds,EVS_1,EVS_2,EVS_3,EVS_4,EVS_5,EVS_6,EVS_7,EVS_8,EVS_9,EVS_10,EVS_11,EVS_12
1006321,,,,,,,,,,,,,,,,,,,,,,,,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1006322,0.178,0.001,0.003,0.037,0.97,1.04,-1.59,-1.86,0.017,0.638,8.418,0.92,0.914,-0.845,0.301,0.651,0.0,2.98,0.991,1.061,0.997,0.999,7.129,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,-1.0,1.0,1.0
1006323,1.0,0.0,0.001,0.013,1.0,1.175,0.55,-0.37,0.69,0.054,3.12,0.556,0.265,-1.027,0.068,0.707,0.0,1.13,0.046,-0.228,0.997,0.977,9.764,1.0,0.0,1.0,1.0,0.0,0.0,0.0,-1.0,0.0,-1.0,1.0,1.0
1006324,1.0,0.0,0.0,,1.0,-1.445,1.08,-0.2,0.019,-2.56,0.001,0.318,0.002,-0.988,0.023,,,-0.014,-0.41,-0.136,0.034,0.002,4.028,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-1.0,1.0,0.0
1006325,0.457,0.0,0.0,0.518,,0.0,3.39,-0.81,0.068,0.068,3.267,0.763,0.338,-0.931,0.015,0.598,0.0,-1.84,-0.058,-0.243,0.02,0.451,7.942,1.0,0.0,0.0,1.0,0.0,0.0,0.0,-1.0,1.0,-1.0,1.0,1.0


## transformation

1. **dummy-encode**

In [10]:
# One output column per (EVS feature 1..12, level in -1, 0, 1, 2): 48 in total.
evs_dummy_columns = ['evs_%s_%s' % (evs_no, level)
                     for evs_no in range(1, 13)
                     for level in (-1, 0, 1, 2)]

# Encode every row with the myLibs helper.
# NOTE(review): presumably a noisy one-hot encoding of the EVS columns — confirm in myLibs.dummy.
encoded_rows = total_df_raw.apply(myGetDummy_with_gaussian, axis=1)
evs_dummy_df = pd.DataFrame(encoded_rows.tolist(), columns=evs_dummy_columns)
evs_dummy_df.tail(5)

Unnamed: 0,evs_1_-1,evs_1_0,evs_1_1,evs_1_2,evs_2_-1,evs_2_0,evs_2_1,evs_2_2,evs_3_-1,evs_3_0,evs_3_1,evs_3_2,evs_4_-1,evs_4_0,evs_4_1,evs_4_2,evs_5_-1,evs_5_0,evs_5_1,evs_5_2,evs_6_-1,evs_6_0,evs_6_1,evs_6_2,evs_7_-1,evs_7_0,evs_7_1,evs_7_2,evs_8_-1,evs_8_0,evs_8_1,evs_8_2,evs_9_-1,evs_9_0,evs_9_1,evs_9_2,evs_10_-1,evs_10_0,evs_10_1,evs_10_2,evs_11_-1,evs_11_0,evs_11_1,evs_11_2,evs_12_-1,evs_12_0,evs_12_1,evs_12_2
1006321,-0.012907,-0.014202,0.995476,-0.000811,-0.012907,-0.014202,0.995476,-0.000811,-0.012907,0.985798,-0.004524,-0.000811,-0.012907,0.985798,-0.004524,-0.000811,-0.012907,0.985798,-0.004524,-0.000811,-0.012907,0.985798,-0.004524,-0.000811,-0.012907,-0.014202,0.995476,-0.000811,-0.012907,0.985798,-0.004524,-0.000811,-0.012907,0.985798,-0.004524,-0.000811,-0.012907,0.985798,-0.004524,-0.000811,-0.012907,-0.014202,0.995476,-0.000811,-0.012907,0.985798,-0.004524,-0.000811
1006322,-0.018371,1.011319,0.004743,-0.030608,-0.018371,1.011319,0.004743,-0.030608,-0.018371,0.011319,1.004743,-0.030608,-0.018371,0.011319,1.004743,-0.030608,-0.018371,1.011319,0.004743,-0.030608,-0.018371,1.011319,0.004743,-0.030608,-0.018371,0.011319,1.004743,-0.030608,-0.018371,1.011319,0.004743,-0.030608,-0.018371,1.011319,0.004743,-0.030608,0.981629,0.011319,0.004743,-0.030608,-0.018371,0.011319,1.004743,-0.030608,-0.018371,0.011319,1.004743,-0.030608
1006323,0.002265,-0.029797,1.007068,-0.01264,0.002265,0.970203,0.007068,-0.01264,0.002265,-0.029797,1.007068,-0.01264,0.002265,-0.029797,1.007068,-0.01264,0.002265,0.970203,0.007068,-0.01264,0.002265,0.970203,0.007068,-0.01264,0.002265,0.970203,0.007068,-0.01264,1.002265,-0.029797,0.007068,-0.01264,0.002265,0.970203,0.007068,-0.01264,1.002265,-0.029797,0.007068,-0.01264,0.002265,-0.029797,1.007068,-0.01264,0.002265,-0.029797,1.007068,-0.01264
1006324,-0.011323,1.004449,0.014777,-0.024836,-0.011323,1.004449,0.014777,-0.024836,-0.011323,1.004449,0.014777,-0.024836,-0.011323,0.004449,1.014777,-0.024836,-0.011323,1.004449,0.014777,-0.024836,-0.011323,1.004449,0.014777,-0.024836,-0.011323,1.004449,0.014777,-0.024836,-0.011323,1.004449,0.014777,-0.024836,-0.011323,0.004449,1.014777,-0.024836,0.988677,0.004449,0.014777,-0.024836,-0.011323,0.004449,1.014777,-0.024836,-0.011323,1.004449,0.014777,-0.024836
1006325,0.021432,-0.033092,0.994587,0.014122,0.021432,0.966908,-0.005413,0.014122,0.021432,0.966908,-0.005413,0.014122,0.021432,-0.033092,0.994587,0.014122,0.021432,0.966908,-0.005413,0.014122,0.021432,0.966908,-0.005413,0.014122,0.021432,0.966908,-0.005413,0.014122,1.021432,-0.033092,-0.005413,0.014122,0.021432,-0.033092,0.994587,0.014122,1.021432,-0.033092,-0.005413,0.014122,0.021432,-0.033092,0.994587,0.014122,0.021432,-0.033092,0.994587,0.014122


In [11]:
# Two sentinel rows (-0.1 everywhere, then 1.1 everywhere) that are meant to
# pin each column's min and max for the scaling below.
minmaxSeries = pd.DataFrame([[-0.1] * evs_dummy_df.shape[1],
                             [1.1] * evs_dummy_df.shape[1]],
                            columns=evs_dummy_df.columns)

# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# gives the same frame with a fresh 0..n-1 index.
evs_dummy_df = pd.concat([evs_dummy_df, minmaxSeries], ignore_index=True)

# Column-wise min-max scaling, then drop the two sentinel rows again.
scaled_evs_dummy_df = (evs_dummy_df - evs_dummy_df.min())/(evs_dummy_df.max() - evs_dummy_df.min())
scaled_evs_dummy_df = scaled_evs_dummy_df.iloc[:-2, ]

2. **combined**

In [12]:
# Scaled EVS dummies with the 'label' / 'id' bookkeeping columns on the right.
output_tot_df = pd.concat(
    [scaled_evs_dummy_df, total_index],
    axis="columns",
)

In [13]:
# Only the EVS dummy columns are written; 'label' and 'id' are used for the
# split but left out of the files.
feature_name = evs_dummy_df.columns
from_input = output_tot_df['label'] == dataname

# Rows from the input dataset -> labeled_for_evs.csv; database rows -> unlabel_for_evs.csv.
labeled_part = output_tot_df.loc[from_input, feature_name]
unlabeled_part = output_tot_df.loc[~from_input, feature_name]
labeled_part.to_csv("%s/labeled_for_evs.csv" % dataname, index=False)
unlabeled_part.to_csv("%s/unlabel_for_evs.csv" % dataname, index=False)