In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy

In [17]:
# KL Divergence for each feature
def kl_divergence(feature_name, dataset):
    neg_feature_data = dataset[dataset['label'] == 0][feature_name] # 负样本数据
    pos_feature_data = dataset[dataset['label'] == 1][feature_name] # 正样本数据

    min_value = min(neg_feature_data.min(), pos_feature_data.min())
    max_value = max(neg_feature_data.max(), pos_feature_data.max())
    # min_value = pos_feature_data.min() if neg_feature_data.min() > pos_feature_data.min() else neg_feature_data.min()
    # max_value = neg_feature_data.max() if neg_feature_data.max() > pos_feature_data.max() else pos_feature_data.max()
    
    neg_hist, neg_bin = np.histogram(neg_feature_data, bins = 10, range=(min_value, max_value), density = True)
    pos_hist, pos_bin = np.histogram(pos_feature_data, bins = 10, range=(min_value, max_value), density = True)
    pos_hist[pos_hist == 0] = 1e-7
    neg_hist[neg_hist == 0] = 1e-7
    return entropy(neg_hist, pos_hist, base = 2)


In [18]:
def get_all_kl_divergence(df):
    features =[]
    feature_kl = []
    size = len(df.columns)
    i = 0
    # print(df[df.columns[19944]])
    for feature in df.columns: 
        if feature == 'label' or feature == 'file_id':
            continue
        features.append(feature)
        feature_kl.append(kl_divergence(feature, df))
        i = i + 1
        print('process:{} / {}'.format(i, feature))
    kl_feature_dataset = pd.DataFrame()
    kl_feature_dataset['feature_name'] = features
    kl_feature_dataset['kl_divergence'] = feature_kl
    final = kl_feature_dataset.sort_values('kl_divergence', ascending=False)
    return final

In [19]:
# Read the preprocessed data and calculate the KL divergence for each feature
df = pd.read_csv('TCGA_Labeled.csv', index_col = 0)
dd = get_all_kl_divergence(df)
choosed_features = dd[dd['kl_divergence'] > 3].reset_index(drop = True)
choosed_features


process:1 / OR4F5
process:2 / OR4F29
process:3 / OR4F16
process:4 / SAMD11
process:5 / NOC2L
process:6 / KLHL17
process:7 / PLEKHN1
process:8 / PERM1
process:9 / HES4
process:10 / ISG15
process:11 / AGRN
process:12 / RNF223
process:13 / C1orf159
process:14 / TTLL10
process:15 / TNFRSF18
process:16 / TNFRSF4
process:17 / SDF4
process:18 / B3GALT6
process:19 / C1QTNF12
process:20 / UBE2J2
process:21 / SCNN1D
process:22 / ACAP3
process:23 / PUSL1
process:24 / INTS11
process:25 / CPTP
process:26 / TAS1R3
process:27 / DVL1
process:28 / MXRA8
process:29 / AURKAIP1
process:30 / CCNL2
process:31 / MRPL20
process:32 / ANKRD65
process:33 / TMEM88B
process:34 / VWA1
process:35 / ATAD3C
process:36 / ATAD3B
process:37 / ATAD3A
process:38 / TMEM240
process:39 / SSU72
process:40 / FNDC10
process:41 / MIB2
process:42 / MMP23B
process:43 / CDK11B
process:44 / SLC35E2B
process:45 / CDK11A
process:46 / NADK
process:47 / GNB1
process:48 / CALML6
process:49 / TMEM52
process:50 / CFAP74
process:51 / GABRD
p

Unnamed: 0,feature_name,kl_divergence
0,STX11,5.989149
1,GYPE,5.867323
2,ADRB2,5.825714
3,CD5L,5.746667
4,ANGPT4,5.723198
...,...,...
129,ADRB1,3.013379
130,ARHGEF15,3.011971
131,ACTN2,3.004149
132,ARHGAP31,3.003712


In [22]:
# Filtering the choosed 134 features from the original dataset
index_columns = np.append(choosed_features['feature_name'].values, ['label'])
df_choosed = df[index_columns]
df_choosed.to_csv('TCGA_Labeled_Final_Features.csv')

df_choosed.columns


Index(['STX11', 'GYPE', 'ADRB2', 'CD5L', 'ANGPT4', 'GPM6A', 'NCKAP5',
       'CD300LG', 'SGCG', 'KCNA4',
       ...
       'FRMD3', 'EPAS1', 'ARHGAP6', 'PIP5K1B', 'ADRB1', 'ARHGEF15', 'ACTN2',
       'ARHGAP31', 'ERG', 'label'],
      dtype='object', length=135)