*Kidney Cancer*

In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import MinMaxScaler


# Variance cut-off handed to VarianceSelect when filtering the mRNA features.
THRESHOLD = 1_000_000

# Input files: the raw tab-separated expression matrix and the already preprocessed clinical table.
raw_mrna_path = './data/EBPlusPlusAdjustPANCAN_IlluminaHiSeq_RNASeqV2.geneExp.tsv'
clinical_path = './preprocessed_data/clinical_kidney.csv'

# Output file for the filtered, normalized mRNA matrix.
mrna_save_path = './preprocessed_data/mrna_kidney.csv'

def VarianceSelect(data, t):
	"""Filter the columns of ``data`` with scikit-learn's ``VarianceThreshold``.

	Parameters
	----------
	data : array-like, one row per sample and one column per feature
		Passed unchanged to the selector.
	t : float
		Value used as the selector's ``threshold``.

	Returns
	-------
	tuple
		``(reduced, kept)`` where ``reduced`` is the transformed data holding
		only the columns the selector kept, and ``kept`` is the array of
		integer positions of those columns in ``data``.
	"""
	vt = VarianceThreshold(threshold=t)
	vt.fit(data)
	kept = vt.get_support(indices=True)
	reduced = vt.transform(data)
	return reduced, kept

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Read mRNA data
data_mrna = pd.read_csv(raw_mrna_path, sep='\t')

# Read the header line again as plain data to get the untouched column names,
# drop its first cell, and keep the first 12 characters of every remaining name.
header_cells = pd.read_csv(raw_mrna_path, sep='\t', header=None, nrows=1).values.tolist()[0]
mrna_samples = [name[:12] for name in header_cells[1:]]

# First column is relabelled 'sample'; the rest carry the shortened names.
data_mrna.columns = ['sample'] + mrna_samples

# Missing values become 0.0, then the table is flipped so every former column
# is a row, and the row that came from the 'sample' column is discarded.
data_mrna = data_mrna.fillna(0.0).T.drop(index='sample')

In [4]:
# Read patients ID in preprocessed clinical data
data_clin = pd.read_csv(clinical_path, header=None)

# The file is read without a header, so the first column is labelled 0;
# flatten that one-column frame into a plain list of IDs.
clinical_samples = [row[0] for row in data_clin[[0]].values.tolist()]
clin_samples = clinical_samples

# Keep only the mRNA rows whose index label appears among the clinical IDs.
in_clinical = data_mrna.index.isin(clin_samples)
data_mrna = data_mrna.loc[in_clinical]

In [5]:
# Remove the rows with same patient ID
# Only the first row seen for each index label is kept. The index is named
# 'index' so that a later reset_index() exposes the IDs as a column called 'index'.
first_occurrence = ~data_mrna.index.duplicated(keep='first')
data_mrna = data_mrna.loc[first_occurrence].rename_axis('index')

In [6]:
# Variance threshold
# Only the filtered matrix is used; the positions of the kept columns are ignored.
# The new frame gets default integer row and column labels.
res, _ = VarianceSelect(data=data_mrna, t=THRESHOLD)
mrna_df = pd.DataFrame(data=res)
mrna_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2736,2737,2738,2739,2740,2741,2742,2743,2744,2745
0,0.0,20899.6,3206.37,1267.24,3606.48,1657.63,2422.5,2691.67,77.5652,6753.82,...,16210.6,7168.87,2748.68,4873.76,1836.39,1368.43,2494.93,1306.79,2508.8,8845.0
1,0.0,5456.09,3001.36,959.98,1154.23,2559.78,3149.05,2171.23,0.4956,727.048,...,7743.28,4127.37,4242.35,4478.26,1787.63,1477.39,3534.13,1731.63,3422.13,6207.41
2,0.0,4954.31,4818.19,1933.97,712.383,1526.36,3504.92,680.171,2.4779,1101.41,...,3627.58,2166.88,669.021,3540.85,2108.65,883.33,2299.45,348.139,3435.54,7059.41
3,0.0,1151.21,4749.19,1194.35,702.497,2550.49,2645.31,20.9917,0.0,9511.76,...,6283.75,3439.02,2447.7,9119.44,1832.07,692.363,2947.88,423.815,4653.28,8500.18
4,0.0,13154.1,3393.73,683.922,1572.55,1336.47,3482.35,1325.49,20.3922,3698.04,...,5196.08,3366.27,1461.18,3999.22,3165.49,1995.29,2278.43,2589.02,5109.02,5661.96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
882,3869.36,14414.7,7518.0,556.46,827.984,2778.29,922.082,32.9952,0.0,111.47,...,4463.27,2612.86,4139.56,5985.51,834.244,1911.88,6327.95,433.842,1566.83,5178.91
883,2870.48,5948.34,5874.95,256.598,716.183,5229.4,2019.08,19.8944,0.0,43.0369,...,2462.04,1686.97,6969.14,5210.31,2108.81,1545.68,5524.97,882.663,1770.61,1937.07
884,3971.52,6559.76,8046.75,159.054,757.12,2720.58,1543.79,9.1349,1.0747,56.9586,...,4387.43,1770.02,3447.61,6917.25,1961.31,2285.33,6399.25,389.575,2679.2,3789.36
885,13.1527,7693.15,3778.51,707.615,1163.17,3607.0,859.661,22.0965,2.1044,82.599,...,6574.77,1169.54,4622.91,4322.5,2489.02,1220.57,8043.14,821.255,1703.54,1910.82


In [7]:
# min-max normalization
scaler = MinMaxScaler()
mrna_0_1 = scaler.fit_transform(mrna_df)

# Re-attach the row labels of data_mrna to the scaled array, then move them
# out of the index into an ordinary column.
mrna_f_df = pd.DataFrame(mrna_0_1, index=data_mrna.index).reset_index()

In [8]:
# Create all zero vector
# Use the row(s) of the last listed ID as a template so the placeholder has
# exactly the same columns as mrna_f_df.
last_id = mrna_f_df["index"].iloc[-1]
a = mrna_f_df[mrna_f_df["index"] == last_id]
sample_row = a.copy()
sample_row['index'] = 'xx'

# Assumes every column other than 'index' is labelled 0..n-1; all of them are
# overwritten with 0.0 in a single assignment.
feature_cols = list(range(mrna_f_df.shape[1] - 1))
sample_row[feature_cols] = 0.0

In [9]:
# Fill NaN with zero vectors
# Walk the clinical IDs in order and collect one piece per ID: the matching
# row(s) of mrna_f_df when present, otherwise a copy of the all-zero template
# relabelled with that ID. The concatenated result therefore follows the
# order of clin_samples.
data_list = []
for x in clin_samples:
	mrna_row = mrna_f_df[mrna_f_df['index'] == x]
	if mrna_row.shape[0] == 0:
		# No expression data for this ID: substitute the zero placeholder.
		mrna_row = sample_row.copy()
		mrna_row['index'] = x
	data_list.append(mrna_row)
mrna = pd.concat(data_list)

In [10]:
# Save data
# Move the ID column into the index so that only feature values remain as
# columns. The guard keeps the cell re-runnable: once 'index' is already the
# index there is no such column left to move.
if 'index' in mrna.columns:
	mrna = mrna.set_index('index')

# Neither the row labels nor a header line are written to the file.
mrna.to_csv(mrna_save_path, index=False, header=False)

In [11]:
# Available percentage
# Share of rows whose values do not add up to zero, relative to all rows.
mrna.sum(axis=1).ne(0).sum() / mrna.shape[0]

0.9446219382321619

In [12]:
# Number of missing samples
# Counts the rows whose values add up to exactly zero.
mrna.sum(axis=1).eq(0).sum()

52

In [13]:
# Display the final frame as the cell output.
mrna

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,2736,2737,2738,2739,2740,2741,2742,2743,2744,2745
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-KL-8323,0.004753,0.058694,0.335599,0.019044,0.231036,0.218217,0.102268,0.002273,0.000050,0.003113,...,0.119260,0.037001,0.465083,0.256270,0.031348,0.058983,0.374486,0.062781,0.063224,0.152925
TCGA-KL-8324,0.000846,0.051358,0.296261,0.003181,0.021876,0.325229,0.337593,0.000097,0.000000,0.000304,...,0.061742,0.191575,0.532510,0.576761,0.129704,0.041414,0.415455,0.009090,0.137070,0.094817
TCGA-KL-8325,0.630294,0.040161,0.225456,0.013098,0.088413,0.209288,0.120154,0.000389,0.000000,0.000377,...,0.085125,0.023910,0.400168,0.208857,0.017600,0.038836,1.000000,0.058161,0.064796,0.036634
TCGA-KL-8326,0.000052,0.038355,0.166639,0.009205,0.009991,0.121117,0.077473,0.000197,0.000000,0.002436,...,0.016472,0.133006,0.749156,0.235260,0.093809,0.047130,0.378879,0.029074,0.141862,0.061631
TCGA-KL-8327,0.004497,0.046839,0.146594,0.014370,0.064189,0.165589,0.055814,0.000764,0.000060,0.002212,...,0.061525,0.024324,0.345051,0.685581,0.140844,0.013862,0.317888,0.147276,0.215478,0.078847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA-Y8-A898,0.000037,0.136166,0.173220,0.044950,0.079197,0.069165,0.265310,0.206074,0.001279,0.088531,...,0.126724,0.253156,0.137389,0.290026,0.072379,0.021303,0.105601,0.091203,0.249402,0.560212
TCGA-Y8-A8RY,0.000000,0.045363,0.075366,0.026756,0.127061,0.133917,0.179336,0.341968,0.000119,0.057202,...,0.129147,0.331535,0.466759,0.701285,0.068837,0.041216,0.279777,0.243529,0.228066,0.527812
TCGA-Y8-A8RZ,0.000000,0.035966,0.281383,0.135854,0.077487,0.244660,0.363163,0.018568,0.000344,0.443522,...,0.130868,0.174321,0.178498,0.256269,0.082953,0.031788,0.190167,0.092341,0.082249,0.266661
TCGA-Y8-A8S0,0.000000,0.035167,0.169598,0.046580,0.089124,0.100950,0.315524,0.529382,0.000247,0.017143,...,0.163329,0.324872,0.443167,0.333754,0.047047,0.051820,0.165935,0.249041,0.131271,0.358564
