Kidney Cancer

In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler


# Input/output locations: raw miRNA matrix, preprocessed clinical table,
# and the destination for the processed miRNA features.
raw_mirna_path = './data/pancanMiRs_EBadjOnProtocolPlatformWithoutRepsWithUnCorrectMiRs_08_04_16.csv'
clinical_path = './preprocessed_data/clinical_kidney.csv'
mirna_save_path = './preprocessed_data/mirna_kidney.csv'

# Load the miRNA table and discard its "Correction" column.
data_mirna = pd.read_csv(raw_mirna_path, sep=',').drop(columns=["Correction"])

# Re-read only the first line of the file to get the header cells verbatim,
# skip the first two cells, and keep the first 12 characters of each remaining
# label (presumably the patient-level part of the sample barcode -- confirm).
header_cells = pd.read_csv(raw_mirna_path, sep=',', header=None, nrows=1).values.tolist()[0]
mirna_samples = [label[:12] for label in header_cells[2:]]

# Relabel the columns, zero-fill missing values, then transpose so that each
# row is one sample; the 'sample' row (the former first column) is dropped.
data_mirna.columns = ['sample'] + mirna_samples
data_mirna = data_mirna.fillna(0.0).T.drop(index='sample')

In [14]:
# Read patient IDs from the preprocessed clinical data
data_clin = pd.read_csv(clinical_path, header=None)

# The file is read without a header, so the first column is labelled 0;
# flatten it into a plain Python list.
clinical_samples = data_clin[0].tolist()
clin_samples = clinical_samples

# Keep only the miRNA rows whose label appears in the clinical list.
in_clinical = data_mirna.index.isin(clin_samples)
data_mirna = data_mirna.loc[in_clinical]

In [15]:
# Remove the rows with same patient ID, keeping the first occurrence of each.
# A boolean mask on the index replaces the reset_index / drop_duplicates /
# set_index round-trip, which also contained a `reset_index(drop=True)` call
# whose result was discarded (a no-op).
is_repeat = data_mirna.index.duplicated(keep='first')
# The index is named 'index', matching what the previous set_index('index') produced.
data_mirna = data_mirna.loc[~is_repeat].rename_axis('index')

In [16]:
# min-max normalization
# Scale every column with a default MinMaxScaler, then rebuild a DataFrame
# that carries the patient IDs and expose them as a regular 'index' column.
scaler = MinMaxScaler()
mirna_0_1 = scaler.fit_transform(data_mirna)
mirna_f_df = pd.DataFrame(mirna_0_1, index=data_mirna.index).reset_index()

In [17]:
# Create all zero vector
# Use the row(s) matching the last ID as a template so the placeholder has the
# same columns, then overwrite the ID with a dummy label and every feature
# column (labelled 0 .. n_columns-2) with 0.0.
last_id = mirna_f_df["index"].iloc[-1]
sample_row = mirna_f_df.loc[mirna_f_df["index"] == last_id].copy()
sample_row['index'] = 'xx'
feature_cols = list(range(mirna_f_df.shape[1] - 1))
sample_row[feature_cols] = 0.0

In [18]:
# Align the miRNA features to the clinical sample order: one row per entry of
# clin_samples, with an all-zero feature row for patients without miRNA data.
#
# A single reindex replaces the previous per-sample loop, whose `i == 0` and
# `else` branches were duplicates of each other and which rescanned the whole
# frame for every clinical sample. It also yields an empty frame, instead of
# raising, when clin_samples is empty.
#
# NOTE: relies on the IDs in mirna_f_df['index'] being unique (duplicate
# patient rows are removed earlier in the notebook).
mirna = (
    mirna_f_df.set_index('index')
    .reindex(clin_samples, fill_value=0.0)
    .rename_axis('index')
    .reset_index()
)

In [19]:
# Save data
# Move the patient IDs into the index, then write only the feature values:
# the CSV has neither a header row nor an ID column.
mirna = mirna.set_index('index')
mirna.to_csv(mirna_save_path, index=False, header=False)

In [20]:
# Share of samples with miRNA data, i.e. rows whose features do not sum to zero.
# The value is a fraction between 0 and 1, not a percentage.
row_totals = mirna.sum(axis=1)
(row_totals != 0).sum() / len(mirna)

0.9041533546325878

In [21]:
# Number of missing samples
# Counts rows whose features sum to exactly zero, which this notebook treats as missing.
mirna.sum(axis=1).eq(0).sum()

90

In [22]:
# Display the final patient-indexed feature matrix (pandas truncates the rendering of large frames).
mirna

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,733,734,735,736,737,738,739,740,741,742
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-KL-8323,0.018662,0.078157,0.404085,0.449613,0.317253,0.024939,0.088730,0.113041,0.249204,0.051379,...,0.025258,0.0,0.0,0.0,0.034593,0.016877,0.006605,0.009234,0.018697,0.018329
TCGA-KL-8324,0.003593,0.083186,0.182353,0.319910,0.258602,0.025422,0.078350,0.088689,0.168116,0.056262,...,0.048865,0.0,0.0,0.0,0.267694,0.020896,0.029816,0.008452,0.010413,0.010131
TCGA-KL-8325,0.012693,0.026698,0.187298,0.444762,0.261796,0.003689,0.007938,0.056523,0.214279,0.083511,...,0.000000,0.0,0.0,0.0,0.000000,0.231129,0.149782,0.176624,0.238073,0.208400
TCGA-KL-8326,0.010211,0.038837,0.120886,0.203941,0.055015,0.055183,0.075167,0.132415,0.082758,0.035716,...,0.000000,0.0,0.0,0.0,0.163483,0.258152,0.167780,0.119308,0.145095,0.145398
TCGA-KL-8327,0.009365,0.207815,0.225008,0.639431,0.379977,0.096012,0.215486,0.088278,0.232145,0.066980,...,0.000000,0.0,0.0,0.0,0.396836,0.103772,0.037885,0.035026,0.065321,0.036044
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-Y8-A898,0.007333,0.277975,0.157177,0.136266,0.124276,0.095535,0.243525,0.105893,0.160297,0.086057,...,0.000000,0.0,0.0,0.0,0.043652,0.000000,0.000000,0.000000,0.000000,0.000000
TCGA-Y8-A8RY,0.002887,0.168574,0.241068,0.050248,0.077346,0.076819,0.364467,0.075690,0.193310,0.058301,...,0.000000,0.0,0.0,0.0,0.057290,0.000000,0.000000,0.000000,0.000000,0.000000
TCGA-Y8-A8RZ,0.006583,0.186278,0.112424,0.096440,0.014736,0.048321,0.037845,0.117088,0.150570,0.115411,...,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
TCGA-Y8-A8S0,0.004368,0.121050,0.333514,0.054425,0.154184,0.080763,0.822802,0.045035,0.191201,0.052573,...,0.000000,0.0,0.0,0.0,0.020000,0.000000,0.000000,0.000000,0.000000,0.000000
