In [None]:
import pandas as pd
import numpy as np
import itertools
from sklearn import preprocessing

In [None]:
%%time
# Read the KEGG data.
# hsa: tab-separated, no header row; columns are named keggId / geneName here.
hsa = pd.read_csv(
    'data/KEGG/KEGG_gene_hsa.csv',
    names=['keggId', 'geneName'],
    header=None,
    index_col=None,
    sep='\t',
    engine='python',
)
# hsa_brite: tab-separated, no header row; columns are named keggBriteId / keggId here.
hsa_brite = pd.read_csv(
    'data/KEGG/KEGG_hsa_brite.csv',
    names=['keggBriteId', 'keggId'],
    header=None,
    index_col=None,
    sep='\t',
    engine='python',
)
# brite: default separator; first row is the header and first column becomes the index.
brite = pd.read_csv(
    'data/KEGG/KEGG_brite.csv',
    header=0,
    index_col=0,
    engine='python',
)

In [None]:
%%time
# Read the gene expression data (one HDF5 file per frame, key equals the variable name).
pan_gene_filtered = pd.read_hdf(
    "data/Preprocessed_Data/pan_gene_filtered_1k.h5",
    key="pan_gene_filtered",
)
lung_gene_filtered = pd.read_hdf(
    "data/Preprocessed_Data/lung_gene_filtered_1k.h5",
    key="lung_gene_filtered",
)

In [None]:
# Recover TPM values
def tpm_recover(df):
    """Return ``2**df - 0.001`` computed element-wise.

    The input is not modified; a new object is returned.
    NOTE(review): presumably inverts a ``log2(tpm + 0.001)`` transform applied
    during preprocessing -- confirm against the preprocessing notebook.
    """
    offset = 0.001
    exponentiated = 2 ** df
    return exponentiated - offset

In [None]:
# Replace negative values
def replace_negative(df):
    """Set every entry of ``df`` that is below zero to 0, in place.

    Mutates ``df`` and returns ``None``. Entries that do not compare as
    ``< 0`` (including NaN) are left as they are.
    """
    below_zero = df.lt(0)
    df[below_zero] = 0

In [None]:
%%time
# Apply tpm_recover (2**x - 0.001) to both expression frames.
# NOTE(review): the recovered pan frame is stored as `non_lung_gene_filtered`, while
# `pan_gene_filtered` itself keeps its original values -- confirm this is intended.
non_lung_gene_filtered=tpm_recover(pan_gene_filtered)
# NOTE(review): this overwrites `lung_gene_filtered`, so running the cell a second time
# applies tpm_recover to already-recovered values.
lung_gene_filtered=tpm_recover(lung_gene_filtered)

In [None]:
%%time
# Zero out negative entries of both recovered frames.
# replace_negative mutates its argument and returns None, so nothing is displayed.
replace_negative(non_lung_gene_filtered)
replace_negative(lung_gene_filtered)

In [None]:
# Min-max scale every column of pan_gene_filtered: (x - min) / (max - min).
# The column minima and maxima are computed once for the whole frame instead of
# looping over the columns in Python and assigning them back one by one.
# A column whose max equals its min divides 0 by 0 and therefore becomes NaN,
# exactly as in the per-column formulation.
pan_col_min = pan_gene_filtered.min()
pan_col_span = pan_gene_filtered.max() - pan_col_min
pan_gene_filtered = (pan_gene_filtered - pan_col_min) / pan_col_span

In [None]:
# Replace missing values with 0 (e.g. NaN left by the scaling step when a column's
# maximum equals its minimum).
pan_gene_filtered=pan_gene_filtered.fillna(0)

In [None]:
# Min-max scale every column of lung_gene_filtered: (x - min) / (max - min).
# The column minima and maxima are computed once for the whole frame instead of
# looping over the columns in Python and assigning them back one by one.
# A column whose max equals its min divides 0 by 0 and therefore becomes NaN,
# exactly as in the per-column formulation.
lung_col_min = lung_gene_filtered.min()
lung_col_span = lung_gene_filtered.max() - lung_col_min
lung_gene_filtered = (lung_gene_filtered - lung_col_min) / lung_col_span

In [None]:
# Display the scaled lung frame (missing values have not been filled yet at this point).
lung_gene_filtered

In [None]:
# Replace missing values with 0 (e.g. NaN left by the scaling step when a column's
# maximum equals its minimum).
lung_gene_filtered=lung_gene_filtered.fillna(0)

In [None]:
%%time
# Load the clinical tables stored under the 'clinical' key of each HDF5 file.
pan_clinical=pd.read_hdf('data/Preprocessed_Data/PanCancer.h5',key='clinical')
lung_clinical=pd.read_hdf('data/Preprocessed_Data/Lung.h5',key='clinical')

In [None]:
# Display the pan clinical table.
pan_clinical

In [None]:
# Count clinical index entries that match an expression row label after the label is
# cut at its first '_' (only the part before the underscore is compared).
len(pan_clinical.index.intersection(list(map(lambda x: x.split('_')[0], pan_gene_filtered.index))))

In [None]:
lung_clinical.shape

In [None]:
# Same overlap count for the lung clinical table and the lung expression frame.
len(lung_clinical.index.intersection(list(map(lambda x: x.split('_')[0], lung_gene_filtered.index))))

In [None]:
# The gene identifiers are taken from the column labels of the lung expression frame.
ens_ids=lung_gene_filtered.columns

In [None]:
print("The number of genes: {}".format(len(ens_ids)))

In [None]:
# The row count of the pan expression frame is reported as the number of samples.
print("Total number of samples (patients): {}".format(pan_gene_filtered.shape[0]))

In [None]:
%%time
# Load the Ensembl-to-HUGO lookup table (tab-separated, header taken from the file).
hugo_to_ens = pd.read_csv(
    'data/PanCancer/hugo_to_ens.tsv',
    index_col=None,
    sep='\t',
    engine='python',
)

In [None]:
%%time
# Keep only the first row for every (geneId, geneName) pair and print the table
# shape before and after.
# NOTE(review): the original comment described this as removing many-to-one cases;
# the code only drops repeated (geneId, geneName) pairs -- confirm that is sufficient.
print(hugo_to_ens.shape)
hugo_to_ens = hugo_to_ens.drop_duplicates(subset=['geneId', 'geneName'], keep='first')
print(hugo_to_ens.shape)

In [None]:
# Restrict the lookup table to rows whose geneId occurs in ens_ids.
# Original author's note (figures not re-verified here): the 20k selected genes are all
# contained in the 65670 entries that remain after de-duplication, i.e. those entries
# cover every Ensembl id of the expression data.
hugo_to_ens = hugo_to_ens.loc[hugo_to_ens['geneId'].isin(ens_ids)]
hugo_to_ens.shape

In [None]:
# Start the mapping table from the two id columns of the lookup table.
gene_mapping = hugo_to_ens[['geneId', 'geneName']]
gene_mapping.head()

In [None]:
# Number of rows that repeat an earlier row.
sum(gene_mapping.duplicated())

In [None]:
# Check how many genes are present in both HUGO and the gene expression data
# (counts distinct geneName values).
len(set(gene_mapping['geneName']))

In [None]:
# Check how many Ensembl ids of the expression data overlap with the lookup table
# (counts distinct geneId values).
len(set(gene_mapping['geneId']))

In [None]:
# Number of gene names that contain a ',' or a ';'.
sum(map(lambda x: ',' in x or ';' in x, gene_mapping.geneName))

In [None]:
%%time
# Expand hsa into one row per (keggId, name) pair.
# Each geneName string has its ';' turned into ',' and is then split on ', '; every
# resulting piece gets its own row next to the keggId it came from.
# This is done with vectorised string methods and explode() rather than a row-wise
# apply that indexed the row with x[0] / x[1]: integer keys on a label-indexed row
# rely on pandas' deprecated positional fallback, and a missing geneName made the
# row-wise version raise (here it simply stays missing).
hsa_hugo = (
    hsa.assign(
        geneName=hsa['geneName'].str.replace(';', ',', regex=False).str.split(', ')
    )
    .explode('geneName')
    .reset_index(drop=True)
    .loc[:, ['keggId', 'geneName']]
)

In [None]:
# Shape and first rows of the expanded KEGG id / name table.
print(hsa_hugo.shape)
hsa_hugo.head()

In [None]:
# Number of rows that repeat an earlier row.
sum(hsa_hugo.duplicated())

In [None]:
# Drop the repeated rows, keeping the first occurrence of each.
hsa_hugo = hsa_hugo[~hsa_hugo.duplicated()]

In [None]:
# Re-count repeated rows after the drop.
sum(hsa_hugo.duplicated())

In [None]:
print(hsa_hugo.shape)

In [None]:
# Distinct gene names in gene_mapping at this point.
len(set(gene_mapping['geneName']))

In [None]:
# Distinct gene ids in gene_mapping at this point.
len(set(gene_mapping['geneId']))

In [None]:
# Attach KEGG ids by joining on geneName. No `how` is passed, so pandas' default join
# type applies.
# NOTE(review): gene_mapping is overwritten with the merged result, so this cell is
# not safe to run twice without re-running the cells that built gene_mapping.
gene_mapping = pd.merge(gene_mapping, hsa_hugo, on='geneName')

In [None]:
# Distinct gene names remaining after the join.
len(set(gene_mapping['geneName']))

In [None]:
# Distinct gene ids remaining after the join.
len(set(gene_mapping['geneId']))

## KEGG gene id to KEGG BRITE id

In [None]:
# Attach KEGG BRITE ids by joining on keggId. No `how` is passed, so pandas' default
# join type applies.
# NOTE(review): gene_mapping is overwritten with the merged result, so this cell is
# not safe to run twice without re-running the cells that built gene_mapping.
gene_mapping = pd.merge(gene_mapping, hsa_brite, on='keggId')

In [None]:
gene_mapping.head()

In [None]:
# Number of rows that repeat an earlier row.
sum(gene_mapping.duplicated())

In [None]:
# Distinct gene names remaining after the join.
len(set(gene_mapping['geneName']))

In [None]:
# Distinct gene ids remaining after the join.
len(set(gene_mapping['geneId']))

In [None]:
# Check that every keggBriteId in gene_mapping has the same length as the first one.
# The reference value is taken by position (.iloc[0]) rather than with [0], which is
# a label lookup and depends on what the index happens to contain.
id_len = len(gene_mapping['keggBriteId'].iloc[0])
all(gene_mapping['keggBriteId'].str.len() == id_len)

In [None]:
# Check that, for every keggBriteId in brite, the part before the first '_' has the
# same length as the first keggBriteId in the table.
# brite is loaded with its first CSV column as the index, so the reference value is
# taken by position (.iloc[0]) rather than with [0], which is a label lookup.
brite_id_len = len(brite['keggBriteId'].iloc[0])
all(brite['keggBriteId'].str.split('_').str[0].str.len() == brite_id_len)

In [None]:
# Keep only the last four characters of each keggBriteId; the shortened value is the
# join key used against brite below.
gene_mapping['keggBriteId'] = gene_mapping['keggBriteId'].apply(lambda x: x[-4:])

In [None]:
# Distinct four-character suffixes present in hsa_brite (display only; hsa_brite is
# not modified).
hsa_brite['keggBriteId'].apply(lambda x: x[-4:]).drop_duplicates()

In [None]:
# Attach the brite columns by joining on the shortened keggBriteId. No `how` is
# passed, so pandas' default join type applies.
gene_mapping = pd.merge(gene_mapping, brite, on='keggBriteId')

In [None]:
# Number of rows that repeat an earlier row.
sum(gene_mapping.duplicated())

In [None]:
# Distinct gene names remaining after the join with brite.
len(set(gene_mapping['geneName']))

In [None]:
# Distinct gene ids remaining after the join with brite.
len(set(gene_mapping['geneId']))


In [None]:
gene_mapping.head()

In [None]:
# Row counts per value of each annotation column brought in from brite.
gene_mapping['Functional Annotation Group'].value_counts()

In [None]:
gene_mapping['Functional Annotation Subgroup'].value_counts()

In [None]:
gene_mapping['Functional Annotation'].value_counts()

In [None]:
%%time
# Build the treemap table for the pan-cancer frame.
# The expression frame is transposed (its column labels become the row index), the
# index is copied into a 'geneId' column, and an 'order' column holds the column-wise
# mean of pan_gene_filtered. The result is joined onto gene_mapping by geneId.
pan_gene_map = pd.merge(
    gene_mapping,
    pan_gene_filtered.T.assign(
        geneId=lambda transposed: transposed.index,
        order=pan_gene_filtered.mean(axis=0),
    ),
    on='geneId',
)

In [None]:
pan_gene_map.head()

In [None]:
%%time
# Sanity check: number of rows that repeat an earlier row.
sum(pan_gene_map.duplicated())

In [None]:
# Distinct gene ids present in the joined table.
len(set(pan_gene_map['geneId']))

In [None]:
pan_gene_map.shape

In [None]:
%%time
# Build the treemap table for the non_lung frame.
# The expression frame is transposed (its column labels become the row index), the
# index is copied into a 'geneId' column, and an 'order' column holds the column-wise
# mean of non_lung_gene_filtered. The result is joined onto gene_mapping by geneId.
non_lung_tree_map = pd.merge(
    gene_mapping,
    non_lung_gene_filtered.T.assign(
        geneId=lambda transposed: transposed.index,
        order=non_lung_gene_filtered.mean(axis=0),
    ),
    on='geneId',
)

In [None]:
non_lung_tree_map.head()

In [None]:
%%time
# Sanity check: number of rows that repeat an earlier row.
sum(non_lung_tree_map.duplicated())

In [None]:
# Distinct gene ids present in the joined table.
len(set(non_lung_tree_map['geneId']))

In [None]:
non_lung_tree_map.shape

In [None]:
%%time
# Build the treemap table for the lung cancer frame.
# The expression frame is transposed (its column labels become the row index), the
# index is copied into a 'geneId' column, and an 'order' column holds the column-wise
# mean of lung_gene_filtered. The result is joined onto gene_mapping by geneId.
lung_gene_map = pd.merge(
    gene_mapping,
    lung_gene_filtered.T.assign(
        geneId=lambda transposed: transposed.index,
        order=lung_gene_filtered.mean(axis=0),
    ),
    on='geneId',
)

In [None]:
lung_gene_map.head()

In [None]:
%%time
# Sanity check: number of rows that repeat an earlier row.
sum(lung_gene_map.duplicated())

In [None]:
# Distinct gene ids present in the joined table.
len(set(lung_gene_map['geneId']))

In [None]:
lung_gene_map.shape

In [None]:
%%time
# Save dataset: write the pan and lung gene maps into a freshly created HDF5 file
# (mode 'w' replaces any existing file at that path).
with pd.HDFStore('data/KEGG/KEGG_gene_map_log2tpm_20k.h5', mode='w') as h5:
    h5.put('pan_gene_map_log2tpm', pan_gene_map)
    h5.put('lung_gene_map_log2tpm', lung_gene_map)

In [None]:
%%time
# Save dataset: write the three treemap tables into a freshly created HDF5 file
# (mode 'w' replaces any existing file at that path).
# The cell previously referenced `pan_tree_map` and `lung_tree_map`, which are not
# defined anywhere in this notebook, so it raised NameError after the output file had
# already been truncated. The frames that the notebook actually builds are used
# instead; the store keys are unchanged so existing readers keep working.
# NOTE(review): assumes pan_gene_map / lung_gene_map are the intended successors of
# the old *_tree_map names -- confirm before relying on this file.
with pd.HDFStore('data/KEGG/KEGG_tree_map_tpm_MinMax.h5','w') as store:
    store['pan_tree_map']=pan_gene_map
    store['non_lung_tree_map']=non_lung_tree_map
    store['lung_tree_map']=lung_gene_map

In [None]:
%%time
# Save dataset: write the pan and lung gene maps into a freshly created HDF5 file
# (mode 'w' replaces any existing file at that path).
with pd.HDFStore('data/KEGG/KEGG_gene_map_log2tpm_MinMax_5k.h5', mode='w') as h5:
    h5.put('pan_gene_map', pan_gene_map)
    h5.put('lung_gene_map', lung_gene_map)

In [None]:
 %%time
# Save dataset: export both gene maps as CSV files without the row index.
for gene_map_frame, csv_path in (
    (pan_gene_map, "data/KEGG/KEGG_pan_gene_map.csv"),
    (lung_gene_map, "data/KEGG/KEGG_lung_gene_map.csv"),
):
    gene_map_frame.to_csv(csv_path, index=False)