### Predicting protein abundance using genomic and transcriptomic profiles

##### Name: Swathi Ramachandra Upadhya
##### ID: 18200264

Import packages

In [1]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.stats import zscore
from sklearn import preprocessing

Common file paths (raw inputs and preprocessed outputs)

In [2]:
#Locations of the raw input files and of the preprocessed outputs.
#Paths are assembled with os.path.join instead of hard-coded backslashes so the
#notebook runs on any operating system; the previous literals only resolved on
#Windows and relied on invalid escape sequences (e.g. "\d", "\m") that newer
#Python versions warn about.
RAW_DATA_DIR = "brca_tcga_pub2015"
PREPROCESSED_DIR = "Preprocessed_Data"

#Raw inputs
RPPA_file = os.path.join(RAW_DATA_DIR, "data_rppa.txt")
mRNA_profile_file = os.path.join(RAW_DATA_DIR, "data_RNA_Seq_v2_expression_median.txt")
DNA_profile_file = os.path.join(RAW_DATA_DIR, "data_linear_CNA.txt")
DNA_mutations_file = os.path.join(RAW_DATA_DIR, "data_mutations_extended.txt")

#Preprocessed outputs
protein_expression = os.path.join(PREPROCESSED_DIR, "protein_expression.csv")
gene_mutations = os.path.join(PREPROCESSED_DIR, "Gene_Mutations.csv")
mRNA_DNA_expression_minmax = os.path.join(PREPROCESSED_DIR, "mRNA_DNA_minmax.csv")
mRNA_DNA_expression_zscores = os.path.join(PREPROCESSED_DIR, "mRNA_DNA_zscores.csv")
mRNA_DNA_expression_unnormalized = os.path.join(PREPROCESSED_DIR, "mRNA_DNA_unnormalized.csv")

mRNA_All_minmax = os.path.join(PREPROCESSED_DIR, "mRNA_All_minmax.csv")
mRNA_All_zscores = os.path.join(PREPROCESSED_DIR, "mRNA_All_zscores.csv")
mRNA_All_unnormalized = os.path.join(PREPROCESSED_DIR, "mRNA_All_unnormalized.csv")
mRNA_All_TMM = os.path.join(PREPROCESSED_DIR, "mRNA_All_TMM.csv")

Loading files

In [3]:
#Loading the tab-separated file containing protein (RPPA) profiles.
#The first column (Composite.Element.REF) becomes the index, so rows are
#proteins/antibodies and columns are TCGA samples.
protein_profile_df = pd.read_csv(RPPA_file, sep='\t', index_col=0)
protein_profile_df.head()

Unnamed: 0_level_0,TCGA-EW-A2FR-01,TCGA-EW-A2FS-01,TCGA-EW-A2FW-01,TCGA-EW-A2FV-01,TCGA-EW-A1PC-01,TCGA-GM-A2DD-01,TCGA-BH-A0B6-01,TCGA-A2-A0CK-01,TCGA-AC-A2BK-01,TCGA-AR-A2LO-01,...,TCGA-LL-A5YM-01,TCGA-LL-A5YL-01,TCGA-LL-A5YP-01,TCGA-LL-A5YO-01,TCGA-OL-A66K-01,TCGA-OL-A66J-01,TCGA-OL-A66I-01,TCGA-A2-A4RY-01,TCGA-A2-A4S3-01,TCGA-EW-A423-01
Composite.Element.REF,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
YWHAE|14-3-3_epsilon,-0.099979,-0.21758,0.19681,-0.10105,-0.028874,0.30889,-0.16342,0.005556,0.053787,0.21318,...,0.12024,0.15326,0.037435,-0.16879,0.16306,-0.30825,0.1448,0.39357,0.13181,0.087545
EIF4EBP1|4E-BP1,1.1906,-0.85584,-0.23104,-0.63473,0.018278,-0.31175,-0.61144,-0.3425,0.72766,-0.47646,...,0.19099,-0.39995,0.25498,0.18848,-0.44997,0.11844,-0.34344,-0.378,0.50424,-0.50786
EIF4EBP1|4E-BP1_pS65,-0.10278,-0.32106,-0.34894,-0.70498,-0.44798,-0.2344,-0.34087,-0.65112,0.032733,-0.56906,...,-0.6982,-0.67111,-0.43087,0.20248,-0.92828,-1.4739,-1.131,-0.73644,-0.78896,-0.70795
EIF4EBP1|4E-BP1_pT37_T46,1.4224,-0.68572,-0.55718,-0.34155,-0.24092,0.47271,-0.54054,-0.87302,0.21937,0.48264,...,-0.50868,-0.46893,0.19375,0.93087,-0.42448,-0.7534,-0.80314,-0.89921,-0.6046,-0.14685
TP53BP1|53BP1,-0.087634,-0.35432,0.29387,-0.79471,-0.3431,-0.87133,-0.48045,0.080054,-0.49521,-1.261,...,-0.61428,-0.62054,-0.39736,-0.21278,-0.45755,-0.53277,-1.0858,-1.0568,-0.7062,-1.3477


In [4]:
#Loading the tab-separated file containing mRNA profiles.
#The first column (Hugo_Symbol) becomes the index; Entrez_Gene_Id stays as an
#ordinary column next to the per-sample expression columns.
mRNA_profile_df = pd.read_csv(mRNA_profile_file, sep='\t', index_col=0)
mRNA_profile_df.head()

Unnamed: 0_level_0,Entrez_Gene_Id,TCGA-A1-A0SB-01,TCGA-A1-A0SD-01,TCGA-A1-A0SE-01,TCGA-A1-A0SF-01,TCGA-A1-A0SH-01,TCGA-A1-A0SI-01,TCGA-A1-A0SJ-01,TCGA-A1-A0SK-01,TCGA-A1-A0SM-01,...,TCGA-LL-A5YM-01,TCGA-LL-A5YN-01,TCGA-LL-A5YO-01,TCGA-LL-A5YP-01,TCGA-LQ-A4E4-01,TCGA-MS-A51U-01,TCGA-OL-A66H-01,TCGA-OL-A66I-01,TCGA-OL-A66J-01,TCGA-OL-A66K-01
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
UBE2Q2P2,100134869,14.3935,11.3241,4.4426,10.7401,3.0048,2.9782,4.9419,28.856,7.6484,...,0.0,2.9624,3.6899,6.302,14.1288,7.9343,2.2519,1.2603,5.0428,4.3892
HMGB1P1,10357,116.387,60.263,153.1452,141.1933,79.8003,63.5491,134.8733,1119.1932,119.476,...,101.2865,100.3083,278.5626,206.4376,117.03,150.6834,115.3378,158.3599,124.6327,106.3475
LOC155060,155060,279.7612,83.6986,74.7018,314.4482,95.7054,149.794,63.6488,166.7192,80.8081,...,130.9387,367.2936,165.7717,103.9783,307.2124,540.4278,338.1985,210.746,323.1185,554.5539
RNU12-2P,26823,0.4505,0.3308,0.0,0.0,0.0,0.2943,0.3658,0.3152,0.3157,...,0.0,0.5225,0.0,0.0,0.9974,1.5649,2.0851,0.4173,0.4507,1.2434
SSX9,280660,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
#Loading the tab-separated file containing DNA copy-number (linear CNA) profiles.
#The first column (Hugo_Symbol) becomes the index; Entrez_Gene_Id stays as an
#ordinary column next to the per-sample columns.
DNA_profile_df = pd.read_csv(DNA_profile_file, sep='\t', index_col=0)
DNA_profile_df.head()

Unnamed: 0_level_0,Entrez_Gene_Id,TCGA-A1-A0SB-01,TCGA-A1-A0SD-01,TCGA-A1-A0SE-01,TCGA-A1-A0SF-01,TCGA-A1-A0SH-01,TCGA-A1-A0SI-01,TCGA-A1-A0SJ-01,TCGA-A1-A0SK-01,TCGA-A1-A0SM-01,...,TCGA-LL-A5YM-01,TCGA-LL-A5YN-01,TCGA-LL-A5YO-01,TCGA-LL-A5YP-01,TCGA-LQ-A4E4-01,TCGA-MS-A51U-01,TCGA-OL-A66H-01,TCGA-OL-A66I-01,TCGA-OL-A66J-01,TCGA-OL-A66K-01
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACAP3,116983.0,0.002,-0.521,-0.06,-0.015,-0.398,0.0,-0.022,0.475,-0.595,...,-0.704,-0.041,-0.155,0.937,-0.782,-0.022,0.003,0.238,-0.818,-0.308
ACTRT2,140625.0,0.002,-0.521,-0.06,-0.015,-0.398,0.0,-0.022,0.475,-0.595,...,-0.704,-0.041,-0.155,0.937,-0.782,-0.022,0.003,0.238,-0.818,-0.308
AGRN,375790.0,0.002,-0.521,-0.06,-0.015,-0.398,0.0,-0.022,0.475,-0.595,...,-0.704,-0.041,-0.155,0.937,-0.782,-0.022,0.003,0.238,-0.818,-0.308
ANKRD65,441869.0,0.002,-0.521,-0.06,-0.015,-0.398,0.0,-0.022,0.475,-0.595,...,-0.704,-0.041,-0.155,0.937,-0.782,-0.022,0.003,0.238,-0.818,-0.308
ATAD3A,55210.0,0.002,-0.521,-0.06,-0.015,-0.398,0.0,-0.022,0.475,-0.595,...,-0.704,-0.041,-0.155,0.937,-0.782,-0.022,0.003,0.238,-0.818,-0.308


Data Cleaning and Wrangling

Working with Protein expressions 

In [6]:
#Summary of the protein frame: index range, number of columns, dtypes and memory usage
protein_profile_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 226 entries, YWHAE|14-3-3_epsilon to CTNNA1|alpha-Catenin
Columns: 673 entries, TCGA-EW-A2FR-01 to TCGA-EW-A423-01
dtypes: float64(673)
memory usage: 1.2+ MB


In [7]:
#(number of proteins, number of samples) before any cleaning
protein_profile_df.shape

(226, 673)

In [8]:
#Number of missing values in each sample (column)
protein_profile_df.isnull().sum()

TCGA-EW-A2FR-01    4
TCGA-EW-A2FS-01    4
TCGA-EW-A2FW-01    4
TCGA-EW-A2FV-01    4
TCGA-EW-A1PC-01    4
TCGA-GM-A2DD-01    4
TCGA-BH-A0B6-01    4
TCGA-A2-A0CK-01    4
TCGA-AC-A2BK-01    4
TCGA-AR-A2LO-01    4
TCGA-AR-A2LN-01    4
TCGA-AR-A2LQ-01    4
TCGA-AR-A2LH-01    4
TCGA-AR-A2LE-01    4
TCGA-GM-A2DL-01    4
TCGA-GM-A2DB-01    4
TCGA-GM-A2DH-01    4
TCGA-GM-A2DN-01    4
TCGA-GM-A2DM-01    4
TCGA-GM-A2DC-01    4
TCGA-GM-A2DI-01    4
TCGA-GM-A2DF-01    4
TCGA-AR-A1AM-01    4
TCGA-GM-A2D9-01    4
TCGA-GM-A2DO-01    4
TCGA-GM-A2DK-01    4
TCGA-AR-A2LK-01    4
TCGA-AR-A2LM-01    4
TCGA-B6-A2IU-01    4
TCGA-A2-A0CR-01    4
                  ..
TCGA-GM-A4E0-01    6
TCGA-A2-A4RW-01    6
TCGA-A2-A4S2-01    6
TCGA-A2-A4S0-01    6
TCGA-A7-A4SF-01    6
TCGA-A7-A4SD-01    6
TCGA-A7-A4SE-01    6
TCGA-A7-A4SA-01    6
TCGA-LQ-A4E4-01    6
TCGA-LL-A50Y-01    6
TCGA-MS-A51U-01    6
TCGA-AQ-A54O-01    6
TCGA-AQ-A54N-01    6
TCGA-A2-A4RX-01    6
TCGA-E9-A54X-01    6
TCGA-A7-A5ZV-01    6
TCGA-AC-A5XS-

In [9]:
#Dropping, in place, every protein (row) that has a missing value in any sample
protein_profile_df.dropna(inplace=True)
#Checking the shape after dropping the rows with null values
protein_profile_df.shape

(216, 673)

In [10]:
#Transposing so that samples become the rows and proteins become the columns.
#NOTE: the result is bound to the same name, so running this cell a second
#time would flip the frame back to its original orientation.
protein_profile_df = protein_profile_df.transpose()
protein_profile_df.head()

Composite.Element.REF,YWHAE|14-3-3_epsilon,EIF4EBP1|4E-BP1,EIF4EBP1|4E-BP1_pS65,EIF4EBP1|4E-BP1_pT37_T46,TP53BP1|53BP1,ACACA ACACB|ACC_pS79,ACACA|ACC1,AKT1 AKT2 AKT3|Akt,AKT1 AKT2 AKT3|Akt_pS473,AKT1 AKT2 AKT3|Akt_pT308,...,DPP4|CD26,CHEK1|Chk1_pS296;CHEK1|CHK1_pS296,COG3|COG3,GUSP4|DUSP4,ERCC5|ERCC5,IGFR1|IGF1R_pY1135_Y1136,IRF1|IRF-1,JAK2|Jak2,CDKN2A|P16INK4A;CDKN2A|p16_INK4a,PTPN11|SHP-2_pY542
TCGA-EW-A2FR-01,-0.099979,1.1906,-0.10278,1.4224,-0.087634,-0.25674,0.020495,0.15518,-0.26132,-0.67355,...,0.16226,0.045359,0.33691,1.1265,-0.16888,0.14878,0.081733,0.044856,0.60506,0.40486
TCGA-EW-A2FS-01,-0.21758,-0.85584,-0.32106,-0.68572,-0.35432,-0.33749,-0.63693,-0.2717,-0.68071,-0.5421,...,0.16487,0.11224,0.26021,0.32879,-0.20805,0.22602,0.13419,-0.032116,0.22082,0.13349
TCGA-EW-A2FW-01,0.19681,-0.23104,-0.34894,-0.55718,0.29387,0.86309,0.72192,0.37994,-1.0958,-0.73433,...,0.20789,-0.067348,0.67953,1.9027,0.059103,0.014486,0.093986,-0.044177,-0.65637,-0.30197
TCGA-EW-A2FV-01,-0.10105,-0.63473,-0.70498,-0.34155,-0.79471,0.093332,-0.075767,0.29997,-0.27198,-0.39754,...,-0.008897,-0.000412,0.16104,-0.23685,-0.15707,0.3016,-0.03334,-0.22055,0.34272,0.3612
TCGA-EW-A1PC-01,-0.028874,0.018278,-0.44798,-0.24092,-0.3431,0.92949,0.88123,-0.14121,-0.77562,-1.1793,...,-0.16637,-0.23134,-0.23725,0.12426,-0.40952,-0.069788,-0.16266,0.20387,0.70958,0.042857


Working with mRNA expressions

In [11]:
#Summary of the mRNA frame: index range, number of columns, dtypes and memory usage
mRNA_profile_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20440 entries, UBE2Q2P2 to AKR1C6P
Columns: 818 entries, Entrez_Gene_Id to TCGA-OL-A66K-01
dtypes: float64(817), int64(1)
memory usage: 127.7+ MB


In [12]:
#(number of genes, Entrez_Gene_Id column + number of samples) before any cleaning
mRNA_profile_df.shape

(20440, 818)

In [13]:
#Number of missing values in each column (Entrez_Gene_Id and every sample)
mRNA_profile_df.isnull().sum()

Entrez_Gene_Id     0
TCGA-A1-A0SB-01    0
TCGA-A1-A0SD-01    0
TCGA-A1-A0SE-01    0
TCGA-A1-A0SF-01    0
TCGA-A1-A0SH-01    0
TCGA-A1-A0SI-01    0
TCGA-A1-A0SJ-01    0
TCGA-A1-A0SK-01    0
TCGA-A1-A0SM-01    0
TCGA-A1-A0SN-01    0
TCGA-A1-A0SP-01    0
TCGA-A1-A0SQ-01    0
TCGA-A2-A04N-01    0
TCGA-A2-A04P-01    0
TCGA-A2-A04Q-01    0
TCGA-A2-A04R-01    0
TCGA-A2-A04T-01    0
TCGA-A2-A04U-01    0
TCGA-A2-A04V-01    0
TCGA-A2-A04W-01    0
TCGA-A2-A04X-01    0
TCGA-A2-A04Y-01    0
TCGA-A2-A0CK-01    0
TCGA-A2-A0CL-01    0
TCGA-A2-A0CM-01    0
TCGA-A2-A0CO-01    0
TCGA-A2-A0CP-01    0
TCGA-A2-A0CQ-01    0
TCGA-A2-A0CR-01    0
                  ..
TCGA-GM-A2DH-01    0
TCGA-GM-A2DI-01    0
TCGA-GM-A2DK-01    0
TCGA-GM-A2DL-01    0
TCGA-GM-A2DM-01    0
TCGA-GM-A2DN-01    0
TCGA-GM-A2DO-01    0
TCGA-GM-A3NW-01    0
TCGA-GM-A3NY-01    0
TCGA-GM-A3XG-01    0
TCGA-GM-A3XL-01    0
TCGA-GM-A3XN-01    0
TCGA-GM-A4E0-01    0
TCGA-HN-A2NL-01    0
TCGA-JL-A3YW-01    0
TCGA-JL-A3YX-01    0
TCGA-LL-A440-

In [14]:
#Dropping, in place, every gene (row) that contains a null value
mRNA_profile_df.dropna(inplace=True)
#Checking the shape after dropping the rows with null values
#(unchanged here, i.e. no gene contained a null)
mRNA_profile_df.shape

(20440, 818)

In [15]:
#Transposing the data frame so that patients (samples) become the rows and
#genes become the columns. Entrez_Gene_Id, a column before the transpose,
#becomes the first row of the result.
#NOTE: the result is bound to the same name, so running this cell a second
#time would flip the frame back to its original orientation.
mRNA_profile_df = mRNA_profile_df.transpose()
mRNA_profile_df.head()

Hugo_Symbol,UBE2Q2P2,HMGB1P1,LOC155060,RNU12-2P,SSX9,CXORF67,EFCAB8,SRP14P1,LOC391343,TRIM75P,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,TPTEP1,AKR1C6P
Entrez_Gene_Id,100134900.0,10357.0,155060.0,26823.0,280660.0,340602.0,388795.0,390284.0,391343.0,391714.0,...,7789.0,158586.0,79364.0,440590.0,79699.0,7791.0,23140.0,26009.0,387590.0,389932.0
TCGA-A1-A0SB-01,14.3935,116.387,279.7612,0.4505,0.0,0.901,0.901,1.802,0.0,0.4505,...,95.9568,519.4279,1415.9252,19.3716,1364.5681,6186.7327,1931.2986,1436.1978,552.3144,0.0
TCGA-A1-A0SD-01,11.3241,60.263,83.6986,0.3308,0.0,0.6616,0.3308,4.6315,0.3308,0.3308,...,96.27,578.2814,1225.7051,33.0825,868.0837,3559.6725,1278.9678,1195.6,86.0144,0.0
TCGA-A1-A0SE-01,4.4426,153.1452,74.7018,0.0,0.0,0.0,0.9872,5.5944,0.3291,0.3291,...,95.434,726.6146,1018.84,57.5895,960.5923,3007.8157,926.3677,1075.4422,866.1456,0.0
TCGA-A1-A0SF-01,10.7401,141.1933,314.4482,0.0,0.0,0.0,2.9988,9.4249,0.0,0.0,...,74.1138,533.3625,1053.4444,94.6772,881.2262,5343.4779,934.3482,508.0867,52.2652,0.8568


Working with DNA copy-number (CNA) profiles

In [16]:
#Summary of the DNA frame: index range, number of columns, dtypes and memory usage
DNA_profile_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22247 entries, ACAP3 to MIR3687
Columns: 817 entries, Entrez_Gene_Id to TCGA-OL-A66K-01
dtypes: float64(817)
memory usage: 138.8+ MB


In [17]:
#(number of genes, Entrez_Gene_Id column + number of samples) before any cleaning
DNA_profile_df.shape

(22247, 817)

In [18]:
#Number of missing values in each column (Entrez_Gene_Id and every sample)
DNA_profile_df.isnull().sum()

Entrez_Gene_Id     10
TCGA-A1-A0SB-01     0
TCGA-A1-A0SD-01     0
TCGA-A1-A0SE-01     0
TCGA-A1-A0SF-01     0
TCGA-A1-A0SH-01     0
TCGA-A1-A0SI-01     0
TCGA-A1-A0SJ-01     0
TCGA-A1-A0SK-01     0
TCGA-A1-A0SM-01     0
TCGA-A1-A0SN-01     0
TCGA-A1-A0SP-01     0
TCGA-A1-A0SQ-01     0
TCGA-A2-A04N-01     0
TCGA-A2-A04P-01     0
TCGA-A2-A04Q-01     0
TCGA-A2-A04R-01     0
TCGA-A2-A04T-01     0
TCGA-A2-A04U-01     0
TCGA-A2-A04V-01     0
TCGA-A2-A04W-01     0
TCGA-A2-A04X-01     0
TCGA-A2-A04Y-01     0
TCGA-A2-A0CK-01     0
TCGA-A2-A0CL-01     0
TCGA-A2-A0CM-01     0
TCGA-A2-A0CO-01     0
TCGA-A2-A0CP-01     0
TCGA-A2-A0CQ-01     0
TCGA-A2-A0CR-01     0
                   ..
TCGA-GM-A2DH-01     0
TCGA-GM-A2DI-01     0
TCGA-GM-A2DK-01     0
TCGA-GM-A2DL-01     0
TCGA-GM-A2DM-01     0
TCGA-GM-A2DN-01     0
TCGA-GM-A2DO-01     0
TCGA-GM-A3NW-01     0
TCGA-GM-A3NY-01     0
TCGA-GM-A3XG-01     0
TCGA-GM-A3XL-01     0
TCGA-GM-A3XN-01     0
TCGA-GM-A4E0-01     0
TCGA-HN-A2NL-01     0
TCGA-JL-A3

In [19]:
#Dropping, in place, every gene (row) that contains a null value.
#NOTE(review): in the visible null counts only Entrez_Gene_Id has missing
#values (10), and exactly 10 rows are removed here, so these genes appear to be
#dropped solely because their Entrez ID is missing - confirm this is intended.
DNA_profile_df.dropna(inplace=True)
#Checking the shape after dropping the rows with null values
DNA_profile_df.shape

(22237, 817)

In [20]:
#Transposing so that samples become the rows and genes become the columns;
#Entrez_Gene_Id, a column before the transpose, becomes the first row.
#NOTE: the result is bound to the same name, so running this cell a second
#time would flip the frame back to its original orientation.
DNA_profile_df = DNA_profile_df.transpose()
DNA_profile_df.head()

Hugo_Symbol,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,C1orf159,...,H2AFB2,H2AFB3,MPP1,MTCP1,RAB39B,SMIM9,SNORA36A,SNORA56,TMLHE,VBP1
Entrez_Gene_Id,116983.0,140625.0,375790.0,441869.0,55210.0,83858.0,219293.0,54998.0,126792.0,54991.0,...,474381.0,83740.0,4354.0,4515.0,116442.0,100133000.0,677817.0,677835.0,55217.0,7411.0
TCGA-A1-A0SB-01,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,0.002,...,-0.029,-0.029,-0.029,-0.029,-0.029,-0.029,-0.029,-0.029,-0.029,-0.029
TCGA-A1-A0SD-01,-0.521,-0.521,-0.521,-0.521,-0.521,-0.521,-0.521,-0.521,-0.521,-0.521,...,-0.06,-0.06,-0.06,-0.06,-0.06,-0.06,-0.06,-0.06,-0.06,-0.06
TCGA-A1-A0SE-01,-0.06,-0.06,-0.06,-0.06,-0.06,-0.06,-0.06,-0.06,-0.06,-0.06,...,-0.008,-0.008,-0.008,-0.008,-0.008,-0.008,-0.008,-0.008,-0.008,-0.008
TCGA-A1-A0SF-01,-0.015,-0.015,-0.015,-0.015,-0.015,-0.015,-0.015,-0.015,-0.015,-0.015,...,0.065,0.065,0.065,0.065,0.065,0.065,0.065,0.065,0.065,0.065


Finding list of common patients to be used for prediction

In [21]:
#Samples present in both the protein and the mRNA data
#(np.intersect1d returns the sorted, unique common values)
patients = np.intersect1d(protein_profile_df.index, mRNA_profile_df.index)
#...that are also present in the DNA data
common_patients = np.intersect1d(DNA_profile_df.index, patients)
print("Number of common patients: ", len(common_patients))
print (common_patients)

Number of common patients:  672
['TCGA-A1-A0SF-01' 'TCGA-A1-A0SH-01' 'TCGA-A1-A0SJ-01' 'TCGA-A1-A0SK-01'
 'TCGA-A1-A0SQ-01' 'TCGA-A2-A04N-01' 'TCGA-A2-A04P-01' 'TCGA-A2-A04Q-01'
 'TCGA-A2-A04T-01' 'TCGA-A2-A04U-01' 'TCGA-A2-A04V-01' 'TCGA-A2-A04W-01'
 'TCGA-A2-A04Y-01' 'TCGA-A2-A0CK-01' 'TCGA-A2-A0CL-01' 'TCGA-A2-A0CM-01'
 'TCGA-A2-A0CP-01' 'TCGA-A2-A0CQ-01' 'TCGA-A2-A0CR-01' 'TCGA-A2-A0CS-01'
 'TCGA-A2-A0CT-01' 'TCGA-A2-A0CU-01' 'TCGA-A2-A0CV-01' 'TCGA-A2-A0CW-01'
 'TCGA-A2-A0D0-01' 'TCGA-A2-A0D2-01' 'TCGA-A2-A0D3-01' 'TCGA-A2-A0D4-01'
 'TCGA-A2-A0EO-01' 'TCGA-A2-A0EP-01' 'TCGA-A2-A0EQ-01' 'TCGA-A2-A0ER-01'
 'TCGA-A2-A0ES-01' 'TCGA-A2-A0EU-01' 'TCGA-A2-A0EV-01' 'TCGA-A2-A0EW-01'
 'TCGA-A2-A0EX-01' 'TCGA-A2-A0EY-01' 'TCGA-A2-A0ST-01' 'TCGA-A2-A0SU-01'
 'TCGA-A2-A0SV-01' 'TCGA-A2-A0SW-01' 'TCGA-A2-A0SX-01' 'TCGA-A2-A0T0-01'
 'TCGA-A2-A0T1-01' 'TCGA-A2-A0T2-01' 'TCGA-A2-A0T3-01' 'TCGA-A2-A0T4-01'
 'TCGA-A2-A0T5-01' 'TCGA-A2-A0T6-01' 'TCGA-A2-A0T7-01' 'TCGA-A2-A0YC-01'
 'TCGA-A2-A0YD-01' 

In [22]:
#Keeping only the samples that are present in all three data sets
protein_in_common = protein_profile_df.index.isin(common_patients)
protein_profile_subset = protein_profile_df.loc[protein_in_common]
print("Shape:", protein_profile_subset.shape)

Shape: (672, 216)


In [23]:
#Sorting by sample ID so that the protein, mRNA and DNA subsets all list the
#patients in the same order
protein_profile_subset = protein_profile_subset.sort_index()
protein_profile_subset.head()

Composite.Element.REF,YWHAE|14-3-3_epsilon,EIF4EBP1|4E-BP1,EIF4EBP1|4E-BP1_pS65,EIF4EBP1|4E-BP1_pT37_T46,TP53BP1|53BP1,ACACA ACACB|ACC_pS79,ACACA|ACC1,AKT1 AKT2 AKT3|Akt,AKT1 AKT2 AKT3|Akt_pS473,AKT1 AKT2 AKT3|Akt_pT308,...,DPP4|CD26,CHEK1|Chk1_pS296;CHEK1|CHK1_pS296,COG3|COG3,GUSP4|DUSP4,ERCC5|ERCC5,IGFR1|IGF1R_pY1135_Y1136,IRF1|IRF-1,JAK2|Jak2,CDKN2A|P16INK4A;CDKN2A|p16_INK4a,PTPN11|SHP-2_pY542
TCGA-A1-A0SF-01,-0.147,0.28175,-0.20578,0.044714,0.008776,0.73354,0.36686,0.21003,-0.044336,-0.24436,...,0.024578,-0.070867,0.25644,-0.19604,-0.04173,0.14215,-0.21625,-0.25069,0.1755,0.46743
TCGA-A1-A0SH-01,-0.067673,-0.087407,-0.32596,-0.28286,-0.45921,1.6922,1.6832,0.41123,0.11281,0.11798,...,0.061646,0.017998,0.24249,0.79744,-0.078646,0.093934,0.034296,-0.31866,0.68156,0.27594
TCGA-A1-A0SJ-01,0.2368,0.042356,-0.4575,-1.2239,-0.54766,0.35436,0.53232,0.079475,-0.6781,-0.36234,...,0.060296,-0.2545,0.07192,-0.003138,-0.31495,0.26698,-0.15772,-0.52997,-0.009889,-0.25931
TCGA-A1-A0SK-01,0.15139,2.264,0.14355,1.1153,-0.32004,-0.69362,-0.55526,-0.079882,-0.99645,-0.65934,...,-0.08897,0.17701,0.090481,0.054237,0.43935,0.19846,-0.27685,-0.26378,1.2363,-0.043742
TCGA-A1-A0SQ-01,-0.13107,-0.1602,-0.28631,-0.62308,0.20731,1.5702,1.2738,0.2731,-0.80948,-0.86584,...,0.10679,0.056123,0.2962,0.79723,-0.27705,0.062662,-0.041559,-0.31534,-0.3458,-0.28589


In [24]:
#Keeping only the samples that are present in all three data sets
#(this also removes the Entrez_Gene_Id row, which is not a sample ID)
mRNA_in_common = mRNA_profile_df.index.isin(common_patients)
mRNA_profile_subset = mRNA_profile_df.loc[mRNA_in_common]
print("Shape:", mRNA_profile_subset.shape)

Shape: (672, 20440)


In [25]:
#Sorting by sample ID so that the protein, mRNA and DNA subsets all list the
#patients in the same order
mRNA_profile_subset = mRNA_profile_subset.sort_index()
mRNA_profile_subset.head()

Hugo_Symbol,UBE2Q2P2,HMGB1P1,LOC155060,RNU12-2P,SSX9,CXORF67,EFCAB8,SRP14P1,LOC391343,TRIM75P,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,TPTEP1,AKR1C6P
TCGA-A1-A0SF-01,10.7401,141.1933,314.4482,0.0,0.0,0.0,2.9988,9.4249,0.0,0.0,...,74.1138,533.3625,1053.4444,94.6772,881.2262,5343.4779,934.3482,508.0867,52.2652,0.8568
TCGA-A1-A0SH-01,3.0048,79.8003,95.7054,0.0,0.0,0.0,0.3612,3.9727,0.0,0.0,...,87.7601,581.0946,801.3977,19.8634,1353.2389,5464.9614,1312.7898,1001.1151,455.7746,0.0
TCGA-A1-A0SJ-01,4.9419,134.8733,63.6488,0.3658,0.0,0.0,0.3658,4.7554,0.0,0.7316,...,83.7677,2731.4129,834.385,51.2117,1092.6383,3032.0988,958.3905,718.7929,43.5299,0.0
TCGA-A1-A0SK-01,28.856,1119.1932,166.7192,0.3152,0.0,1.891,0.0,3.4668,0.0,0.0,...,69.0199,610.1481,1775.6067,158.5251,1380.7123,591.2386,1798.9285,1335.6445,49.48,0.0
TCGA-A1-A0SQ-01,1.0368,41.5532,75.8447,0.0,0.0,0.0,0.4034,5.648,0.0,0.0,...,106.1019,617.2466,875.0378,25.416,969.4402,2903.0761,715.6833,691.4776,17.7509,0.0


In [26]:
#Keeping only the samples that are present in all three data sets
#(this also removes the Entrez_Gene_Id row, which is not a sample ID)
DNA_in_common = DNA_profile_df.index.isin(common_patients)
DNA_profile_subset = DNA_profile_df.loc[DNA_in_common]
print("Shape:", DNA_profile_subset.shape)

Shape: (672, 22237)


In [27]:
#Sorting by sample ID so that the protein, mRNA and DNA subsets all list the
#patients in the same order
DNA_profile_subset = DNA_profile_subset.sort_index()
DNA_profile_subset.head()

Hugo_Symbol,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,C1orf159,...,H2AFB2,H2AFB3,MPP1,MTCP1,RAB39B,SMIM9,SNORA36A,SNORA56,TMLHE,VBP1
TCGA-A1-A0SF-01,-0.015,-0.015,-0.015,-0.015,-0.015,-0.015,-0.015,-0.015,-0.015,-0.015,...,0.065,0.065,0.065,0.065,0.065,0.065,0.065,0.065,0.065,0.065
TCGA-A1-A0SH-01,-0.398,-0.398,-0.398,-0.398,-0.398,-0.398,-0.398,-0.398,-0.398,-0.398,...,0.346,0.346,0.346,0.346,0.346,0.346,0.346,0.346,0.346,0.346
TCGA-A1-A0SJ-01,-0.022,-0.022,-0.022,-0.022,-0.022,-0.022,-0.022,-0.022,-0.022,-0.022,...,0.112,0.112,0.112,0.112,0.112,0.112,0.112,0.112,0.112,0.112
TCGA-A1-A0SK-01,0.475,0.475,0.475,0.475,0.475,0.475,0.475,0.475,0.475,0.475,...,-0.125,-0.125,-0.125,-0.125,-0.125,-0.125,-0.125,-0.125,-0.125,-0.125
TCGA-A1-A0SQ-01,-0.027,-0.027,-0.027,-0.027,-0.027,-0.027,-0.027,-0.027,-0.027,-0.027,...,-0.008,-0.008,-0.008,-0.008,-0.008,-0.008,-0.008,-0.008,-0.008,-0.008


Selecting the required proteins

In [28]:
#Proteins of interest as (RPPA column label, gene symbol) pairs. Keeping each
#label next to its short name prevents the two lists from drifting apart.
selected_proteins = [
    ('RB1|Rb', 'RB1'),
    ('CDH1|E-Cadherin', 'CDH1'),
    ('PTEN|PTEN', 'PTEN'),
    ('BRCA2|BRCA2', 'BRCA2'),
    ('CDKN2A|P16INK4A;CDKN2A|p16_INK4a', 'CDKN2A'),
    ('TP53|p53', 'TP53'),
    ('CTNNB1|beta-Catenin', 'CTNNB1'),
    ('CCNE1|Cyclin_E1', 'CCNE1'),
    ('CCND1|Cyclin_D1', 'CCND1'),
    ('CDH2|N-Cadherin', 'CDH2'),
    ('CDH3|P-Cadherin', 'CDH3'),
    ('ERBB2|HER2', 'ERBB2'),
    ('ERBB3|HER3', 'ERBB3'),
]
#Selecting the RPPA columns, then relabelling them with the gene symbols
protein_expression_df = protein_profile_subset[[label for label, _ in selected_proteins]]
protein_expression_df.columns = [gene for _, gene in selected_proteins]
protein_expression_df.head()

Unnamed: 0,RB1,CDH1,PTEN,BRCA2,CDKN2A,TP53,CTNNB1,CCNE1,CCND1,CDH2,CDH3,ERBB2,ERBB3
TCGA-A1-A0SF-01,0.017184,0.70568,0.19773,0.10855,0.1755,-0.47255,0.06978,-0.75401,-0.20236,-0.222,-0.11406,0.25604,0.46141
TCGA-A1-A0SH-01,-0.081896,0.018364,0.69916,-0.28243,0.68156,-0.035545,-0.11192,-0.77956,-0.014111,0.072251,-0.031553,0.68354,0.03367
TCGA-A1-A0SJ-01,-0.10609,0.11431,-0.11055,0.10822,-0.009889,-0.43185,-0.85672,-0.29589,0.29076,-0.22532,0.12823,0.36458,0.43062
TCGA-A1-A0SK-01,-0.060244,-0.96382,-0.081596,0.30915,1.2363,0.89847,-0.85965,0.9267,0.045974,0.38979,-0.22409,-1.0413,-0.056604
TCGA-A1-A0SQ-01,0.3765,0.42549,0.65744,0.003187,-0.3458,-0.020122,0.39368,-1.0275,0.36249,0.29288,-0.14485,0.31196,0.64926


In [29]:
#Writing the selected protein expressions (samples x genes) to the preprocessed CSV
protein_expression_df.to_csv(protein_expression)

DNA Mutations

In [30]:
#Loading the tab-separated file containing DNA mutations (one row per mutation call);
#no index column is set, so the default integer index is used
DNA_mutations_df = pd.read_csv(DNA_mutations_file, sep='\t')
DNA_mutations_df.head()

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Consequence,Variant_Classification,...,DOMAINS,MOTIF_SCORE_CHANGE,PolyPhen,ENSP,Amino_acids,CCDS,EA_MAF,Allele,cDNA_position,PUBMED
0,PTGER3,5733,genome.wustl.edu;unc.edu,GRCh37,1,71512366,71512366,+,"missense_variant,splice_region_variant",Missense_Mutation,...,"Transmembrane_helices:Tmhmm,Pfam_domain:PF0000...",,probably_damaging(0.997),ENSP00000349003,L/V,CCDS655.1,,C,1106/1943,
1,FLG,2312,genome.wustl.edu;unc.edu,GRCh37,1,152285981,152285981,+,missense_variant,Missense_Mutation,...,"Low_complexity_(Seg):Seg,PROSITE_profiles:PS50324",,probably_damaging(0.988),ENSP00000357789,R/W,CCDS30860.1,,A,1417/12747,
2,GPR52,9293,genome.wustl.edu,GRCh37,1,174417411,174417411,+,synonymous_variant,Silent,...,"Transmembrane_helices:Tmhmm,Prints_domain:PR00...",,,ENSP00000356658,I,CCDS30941.1,,A,200/1472,
3,SLC35F3,148641,genome.wustl.edu;unc.edu,GRCh37,1,234452419,234452419,+,synonymous_variant,Silent,...,Pfam_domain:PF06027,,,ENSP00000355577,S,CCDS1600.1,,T,1045/2891,
4,OR2T3,343173,genome.wustl.edu,GRCh37,1,248636826,248636826,+,missense_variant,Missense_Mutation,...,"Pfam_domain:PF00001,Pfam_domain:PF10320,PROSIT...",,benign(0.001),ENSP00000352604,R/C,CCDS31117.1,,T,200/1008,


In [31]:
#Gene symbols of the selected proteins
proteins = list(protein_expression_df.columns)
#Keeping non-silent mutations that fall in one of the selected genes, and only
#the gene / sample columns needed to build the mutation matrix
is_non_silent = DNA_mutations_df['Variant_Classification'] != 'Silent'
is_selected_gene = DNA_mutations_df['Hugo_Symbol'].isin(proteins)
DNA_mutations_subset = (
    DNA_mutations_df
    .loc[is_non_silent & is_selected_gene, ['Hugo_Symbol', 'Tumor_Sample_Barcode']]
    .reset_index(drop=True)
)
print('Shape:\n', DNA_mutations_subset.shape)
DNA_mutations_subset.head()

Shape:
 (531, 2)


Unnamed: 0,Hugo_Symbol,Tumor_Sample_Barcode
0,RB1,TCGA-B6-A0IG-01
1,ERBB3,TCGA-BH-A18G-01
2,RB1,TCGA-A1-A0SI-01
3,RB1,TCGA-A1-A0SI-01
4,TP53,TCGA-A1-A0SI-01


In [32]:
#Creating a patient x gene indicator frame initialised to integer 0 (no mutation).
#Integer zeros keep every cell numeric; the earlier string '0' filler left the
#frame holding str zeros mixed with the int 1 flags written by the next cell.
mutations = pd.DataFrame(0, index=mRNA_profile_subset.index, columns=proteins)
mutations.head()

Unnamed: 0,RB1,CDH1,PTEN,BRCA2,CDKN2A,TP53,CTNNB1,CCNE1,CCND1,CDH2,CDH3,ERBB2,ERBB3
TCGA-A1-A0SF-01,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SH-01,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SJ-01,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SK-01,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SQ-01,0,0,0,0,0,0,0,0,0,0,0,0,0


In [33]:
#Generating mutations data
#Obtaining the column names (Hugo_Symbol, Tumor_Sample_Barcode)
columns = list(DNA_mutations_subset)
#Obtaining the list of existing common patients
tumour_samples = mRNA_profile_subset.index.tolist()
#A set gives constant-time membership checks inside the loop
known_samples = set(tumour_samples)
#Flagging every (patient, gene) pair that has at least one mutation record.
#.loc writes directly into `mutations`; the chained form mutations[gene][sample] = 1
#assigns through an intermediate Series, which pandas does not guarantee to
#propagate back to the frame (and ignores under Copy-on-Write).
for gene, sample in zip(DNA_mutations_subset[columns[0]],
                        DNA_mutations_subset[columns[1]]):
    #Mutations of patients outside the common cohort are skipped
    if sample in known_samples:
        mutations.loc[sample, gene] = 1
mutations.head()

Unnamed: 0,RB1,CDH1,PTEN,BRCA2,CDKN2A,TP53,CTNNB1,CCNE1,CCND1,CDH2,CDH3,ERBB2,ERBB3
TCGA-A1-A0SF-01,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SH-01,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SJ-01,0,0,0,0,0,0,0,0,0,0,0,0,0
TCGA-A1-A0SK-01,0,0,0,0,0,1,0,0,0,0,0,0,0
TCGA-A1-A0SQ-01,0,0,0,0,0,0,0,0,0,0,0,0,0


In [34]:
#Writing the patient x gene mutation indicators to the preprocessed CSV
mutations.to_csv(gene_mutations)

#### Normalization Techniques

Z-score normalization

In [35]:
#Applying ZScore normalization on DNA data; apply() passes one column (gene)
#at a time to zscore, so each gene is standardised across the patients
DNA_profile_zscore = DNA_profile_subset.apply(zscore)
DNA_profile_zscore.head()

Hugo_Symbol,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,C1orf159,...,H2AFB2,H2AFB3,MPP1,MTCP1,RAB39B,SMIM9,SNORA36A,SNORA56,TMLHE,VBP1
TCGA-A1-A0SF-01,0.360959,0.360959,0.360959,0.360959,0.360959,0.360959,0.360959,0.360959,0.360959,0.360959,...,0.16172,0.16172,0.16172,0.16172,0.16172,0.16172,0.16172,0.16172,0.151285,0.16172
TCGA-A1-A0SH-01,-0.882528,-0.882528,-0.882528,-0.882528,-0.882528,-0.882528,-0.882528,-0.882528,-0.882528,-0.882528,...,1.243056,1.243056,1.243056,1.243056,1.243056,1.243056,1.243056,1.243056,1.161162,1.243056
TCGA-A1-A0SJ-01,0.338232,0.338232,0.338232,0.338232,0.338232,0.338232,0.338232,0.338232,0.338232,0.338232,...,0.342584,0.342584,0.342584,0.342584,0.342584,0.342584,0.342584,0.342584,0.320197,0.342584
TCGA-A1-A0SK-01,1.951843,1.951843,1.951843,1.951843,1.951843,1.951843,1.951843,1.951843,1.951843,1.951843,...,-0.569432,-0.569432,-0.569432,-0.569432,-0.569432,-0.569432,-0.569432,-0.569432,-0.53155,-0.569432
TCGA-A1-A0SQ-01,0.321998,0.321998,0.321998,0.321998,0.321998,0.321998,0.321998,0.321998,0.321998,0.321998,...,-0.119196,-0.119196,-0.119196,-0.119196,-0.119196,-0.119196,-0.119196,-0.119196,-0.111068,-0.119196


In [36]:
#Applying Zscore normalization on mRNA Data; apply() passes one column (gene)
#at a time to zscore, so each gene is standardised across the patients.
#NOTE(review): the warning printed below comes from the division inside zscore;
#presumably some genes have zero standard deviation and become NaN - the next
#cell drops genes containing nulls.
mRNA_profile_zscore = mRNA_profile_subset.apply(zscore)
mRNA_profile_zscore.head()

  return (a - mns) / sstd


Hugo_Symbol,UBE2Q2P2,HMGB1P1,LOC155060,RNU12-2P,SSX9,CXORF67,EFCAB8,SRP14P1,LOC391343,TRIM75P,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,TPTEP1,AKR1C6P
TCGA-A1-A0SF-01,0.648331,0.348611,0.770359,-0.860535,-0.0703,-0.190586,0.719543,0.931178,-0.283665,-0.568745,...,0.238689,-0.239582,-0.271725,-0.222943,-0.211122,0.792968,-0.619672,-0.946953,-0.457066,0.454464
TCGA-A1-A0SH-01,-0.75722,-0.56218,-0.70836,-0.860535,-0.0703,-0.190586,-0.527186,-0.611509,-0.283665,-0.568745,...,0.73949,-0.107967,-1.200905,-0.903627,1.252346,0.854083,0.10851,-0.031025,0.991113,-0.083768
TCGA-A1-A0SJ-01,-0.405237,0.254851,-0.925065,-0.223237,-0.0703,-0.190586,-0.525012,-0.390046,-0.283665,1.475888,...,0.592975,5.821247,-1.079296,-0.618408,0.444358,-0.369825,-0.573411,-0.555512,-0.488416,-0.083768
TCGA-A1-A0SK-01,3.940101,14.857648,-0.228301,-0.311392,-0.0703,-0.004913,-0.697917,-0.754652,-0.283665,-0.568745,...,0.05175,-0.027856,2.390553,0.35797,1.337526,-1.597758,1.043918,0.59045,-0.467061,-0.083768
TCGA-A1-A0SQ-01,-1.114818,-1.129592,-0.84262,-0.860535,-0.0703,-0.190586,-0.507239,-0.137487,-0.283665,-0.568745,...,1.41261,-0.008283,-0.929428,-0.853107,0.062384,-0.434733,-1.040418,-0.606257,-0.580936,-0.083768


In [37]:
#The mRNA z-score frame contains null values for some genes. Genes are put on
#the rows so that dropna(), which works row-wise by default, removes every gene
#that has at least one null value.
mRNA_zScore_transpose = mRNA_profile_zscore.T.dropna()
#Checking the shape (genes, patients) after dropping the genes with null values
print("Shape: ", mRNA_zScore_transpose.shape)
mRNA_zScore_transpose.head()

Shape:  (20133, 672)


Unnamed: 0_level_0,TCGA-A1-A0SF-01,TCGA-A1-A0SH-01,TCGA-A1-A0SJ-01,TCGA-A1-A0SK-01,TCGA-A1-A0SQ-01,TCGA-A2-A04N-01,TCGA-A2-A04P-01,TCGA-A2-A04Q-01,TCGA-A2-A04T-01,TCGA-A2-A04U-01,...,TCGA-LL-A5YL-01,TCGA-LL-A5YM-01,TCGA-LL-A5YN-01,TCGA-LL-A5YO-01,TCGA-LL-A5YP-01,TCGA-LQ-A4E4-01,TCGA-MS-A51U-01,TCGA-OL-A66I-01,TCGA-OL-A66J-01,TCGA-OL-A66K-01
Hugo_Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
UBE2Q2P2,0.648331,-0.75722,-0.405237,3.940101,-1.114818,-0.03472,-0.677415,-0.606659,3.057227,-1.208815,...,-0.430676,-1.303211,-0.764925,-0.632733,-0.158099,1.264079,0.1385,-1.074207,-0.386903,-0.505666
HMGB1P1,0.348611,-0.56218,0.254851,14.857648,-1.129592,-0.143191,1.876853,1.00734,1.705495,4.357301,...,0.041,-0.243423,-0.257935,2.386542,1.316537,-0.009862,0.4894,0.603284,0.102927,-0.168341
LOC155060,0.770359,-0.70836,-0.925065,-0.228301,-0.84262,0.125775,0.874649,1.04315,-0.234043,-0.821484,...,-0.278208,-0.47018,1.127598,-0.234706,-0.652434,0.721445,2.297999,0.069324,0.828971,2.393493
RNU12-2P,-0.860535,-0.860535,-0.223237,-0.311392,-0.860535,1.900511,0.043147,-0.188567,-0.25477,-0.860535,...,-0.860535,-0.860535,0.049767,-0.860535,-0.860535,0.87714,1.865841,-0.133513,-0.075323,1.305722
SSX9,-0.0703,-0.0703,-0.0703,-0.0703,-0.0703,-0.0703,-0.0703,-0.0703,-0.0703,-0.0703,...,-0.0703,-0.0703,-0.0703,-0.0703,-0.0703,-0.0703,-0.0703,-0.0703,-0.0703,-0.0703


In [38]:
#Back to the patients x genes orientation, now without the genes that had nulls
mRNA_zScore = mRNA_zScore_transpose.T

In [39]:
#Genes remaining after the null-containing ones were dropped
mRNA_zScore.columns

Index(['UBE2Q2P2', 'HMGB1P1', 'LOC155060', 'RNU12-2P', 'SSX9', 'CXORF67',
       'EFCAB8', 'SRP14P1', 'LOC391343', 'TRIM75P',
       ...
       'ZXDA', 'ZXDB', 'ZXDC', 'ZYG11A', 'ZYG11B', 'ZYX', 'ZZEF1', 'ZZZ3',
       'TPTEP1', 'AKR1C6P'],
      dtype='object', name='Hugo_Symbol', length=20133)

Min-Max normalization

In [40]:
def perform_minmax_normalization(dataframe):
    """Rescale ``dataframe`` with scikit-learn's ``MinMaxScaler``.

    The scaler is fitted on the frame's own values and applied in a single
    ``fit_transform`` call. The scaled array is wrapped back into a DataFrame
    that carries the same index and column labels as the input.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric table to scale.

    Returns
    -------
    pandas.DataFrame
        Scaled values, labelled like ``dataframe``.
    """
    scaled_values = preprocessing.MinMaxScaler().fit_transform(dataframe.values)
    return pd.DataFrame(scaled_values,
                        index=dataframe.index,
                        columns=dataframe.columns)

In [41]:
# Min-max scale the mRNA expression table and preview the first rows.
mRNA_min_max_scaled = perform_minmax_normalization(mRNA_profile_subset)
mRNA_min_max_scaled.head()

Hugo_Symbol,UBE2Q2P2,HMGB1P1,LOC155060,RNU12-2P,SSX9,CXORF67,EFCAB8,SRP14P1,LOC391343,TRIM75P,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,TPTEP1,AKR1C6P
TCGA-A1-A0SF-01,0.266811,0.102965,0.232299,0.0,0.0,0.0,0.165596,0.256402,0.0,0.0,...,0.362558,0.154626,0.343226,0.136973,0.283236,0.203583,0.106969,0.050188,0.019658,0.021305
TCGA-A1-A0SH-01,0.074647,0.046655,0.060467,0.0,0.0,0.0,0.019946,0.108076,0.0,0.0,...,0.437102,0.171136,0.196632,0.028737,0.498917,0.208787,0.168174,0.119445,0.172767,0.0
TCGA-A1-A0SJ-01,0.122769,0.097168,0.035285,0.088368,0.0,0.0,0.0202,0.129369,0.0,0.29443,...,0.415293,0.914898,0.215818,0.07409,0.379839,0.104565,0.110857,0.079787,0.016343,0.0
TCGA-A1-A0SK-01,0.716855,1.0,0.116251,0.076144,0.0,0.01449,0.0,0.094313,0.0,0.0,...,0.334733,0.181185,0.763246,0.229344,0.51147,0.0,0.246798,0.166438,0.018601,0.0
TCGA-A1-A0SQ-01,0.025757,0.011574,0.044865,0.0,0.0,0.0,0.022276,0.153652,0.0,0.0,...,0.537294,0.18364,0.239462,0.03677,0.323545,0.099038,0.071604,0.07595,0.006561,0.0


In [42]:
# Min-max scale the DNA profile table the same way and preview the first rows.
DNA_min_max_scaled = perform_minmax_normalization(DNA_profile_subset)
DNA_min_max_scaled.head()

Hugo_Symbol,ACAP3,ACTRT2,AGRN,ANKRD65,ATAD3A,ATAD3B,ATAD3C,AURKAIP1,B3GALT6,C1orf159,...,H2AFB2,H2AFB3,MPP1,MTCP1,RAB39B,SMIM9,SNORA36A,SNORA56,TMLHE,VBP1
TCGA-A1-A0SF-01,0.353948,0.353948,0.353948,0.353948,0.353948,0.353948,0.353948,0.353948,0.353948,0.353948,...,0.340355,0.340355,0.340355,0.340355,0.340355,0.340355,0.340355,0.340355,0.322958,0.340355
TCGA-A1-A0SH-01,0.204979,0.204979,0.204979,0.204979,0.204979,0.204979,0.204979,0.204979,0.204979,0.204979,...,0.424917,0.424917,0.424917,0.424917,0.424917,0.424917,0.424917,0.424917,0.403198,0.424917
TCGA-A1-A0SJ-01,0.351225,0.351225,0.351225,0.351225,0.351225,0.351225,0.351225,0.351225,0.351225,0.351225,...,0.354499,0.354499,0.354499,0.354499,0.354499,0.354499,0.354499,0.354499,0.336379,0.354499
TCGA-A1-A0SK-01,0.544535,0.544535,0.544535,0.544535,0.544535,0.544535,0.544535,0.544535,0.544535,0.544535,...,0.283178,0.283178,0.283178,0.283178,0.283178,0.283178,0.283178,0.283178,0.268704,0.283178
TCGA-A1-A0SQ-01,0.34928,0.34928,0.34928,0.34928,0.34928,0.34928,0.34928,0.34928,0.34928,0.34928,...,0.318387,0.318387,0.318387,0.318387,0.318387,0.318387,0.318387,0.318387,0.302113,0.318387


#### Store pre-processed data

mRNA expression and DNA copy-number profiles, combined for the 13 selected genes

In [43]:
def select_mRNA_DNA_profiles(mRNA_profiles, DNA_profiles, genes=None):
    """Combine the mRNA and DNA columns of selected genes into one DataFrame.

    For every gene the result holds two adjacent columns: ``mRNA_<gene>`` taken
    from ``mRNA_profiles`` and ``DNA_<gene>`` taken from ``DNA_profiles``.
    Columns appear in the order the genes are listed.

    Parameters
    ----------
    mRNA_profiles : pandas.DataFrame
        Table with one column per gene; its index becomes the result's index.
    DNA_profiles : pandas.DataFrame
        Table with one column per gene; its rows are aligned to the index of
        ``mRNA_profiles`` when the result is built.
    genes : iterable of str, optional
        Gene symbols to select. When omitted, the 13 genes this notebook has
        always used are selected, in their original order.

    Returns
    -------
    pandas.DataFrame
        One ``mRNA_<gene>`` / ``DNA_<gene>`` column pair per requested gene.

    Raises
    ------
    KeyError
        If a requested gene is not a column of both input tables.
    """
    if genes is None:
        genes = ('RB1', 'CDH1', 'PTEN', 'BRCA2', 'CDKN2A', 'TP53', 'CTNNB1',
                 'CCNE1', 'CCND1', 'CDH2', 'CDH3', 'ERBB2', 'ERBB3')
    # Materialise once so a generator argument can be walked twice below.
    genes = tuple(genes)

    # Fail up front with every missing symbol instead of stopping at the first.
    missing = [gene for gene in genes
               if gene not in mRNA_profiles or gene not in DNA_profiles]
    if missing:
        raise KeyError("Genes missing from the mRNA or DNA profiles: {}".format(
            ", ".join(missing)))

    # Insertion order of the dict fixes the column order of the result.
    data = {}
    for gene in genes:
        data['mRNA_' + gene] = mRNA_profiles[gene]
        data['DNA_' + gene] = DNA_profiles[gene]

    combined_dataframe = pd.DataFrame(data, index=mRNA_profiles.index)
    return combined_dataframe

In [44]:
# Z-score variant: combine the selected genes' mRNA and DNA columns,
# write the combined table to CSV, and preview the first rows.
mRNA_DNA_expression_zscore = select_mRNA_DNA_profiles(mRNA_zScore, DNA_profile_zscore)
mRNA_DNA_expression_zscore.to_csv(mRNA_DNA_expression_zscores)
mRNA_DNA_expression_zscore.head()

Unnamed: 0,mRNA_RB1,DNA_RB1,mRNA_CDH1,DNA_CDH1,mRNA_PTEN,DNA_PTEN,mRNA_BRCA2,DNA_BRCA2,mRNA_CDKN2A,DNA_CDKN2A,...,mRNA_CCND1,DNA_CCND1,mRNA_CDH2,DNA_CDH2,mRNA_CDH3,DNA_CDH3,mRNA_ERBB2,DNA_ERBB2,mRNA_ERBB3,DNA_ERBB3
TCGA-A1-A0SF-01,-0.169062,0.645767,0.503546,-0.281316,0.25311,1.470038,-0.656137,0.401993,-0.31003,-0.589121,...,-0.481121,-0.492888,-0.185578,0.028002,-0.155083,-0.280362,-0.255401,-0.350705,1.256673,1.276933
TCGA-A1-A0SH-01,-0.007997,0.511411,-0.218036,-0.50868,0.822029,0.372176,-0.675902,0.271349,-0.051543,0.222722,...,-0.399177,-0.467793,0.156932,0.042018,-0.24748,-0.512646,-0.003031,0.067456,-0.1556,-0.116612
TCGA-A1-A0SJ-01,1.121529,0.058287,5.444905,1.827685,-0.125278,-1.330497,0.110958,1.252463,-0.29817,-0.937053,...,0.665814,0.308381,-0.080529,1.009162,-0.276032,1.87427,-0.221704,-0.068468,-0.061644,-1.372566
TCGA-A1-A0SK-01,-1.97811,-2.821157,-0.985888,-1.290082,-1.230318,0.332732,3.771735,2.943156,0.780958,0.577613,...,-0.671374,-0.382647,1.216341,0.406449,-0.34117,-1.310955,-0.448784,-0.572339,-1.244373,0.130346
TCGA-A1-A0SQ-01,-0.079638,-1.277374,2.149911,0.607234,0.461093,0.306436,-0.72406,-1.468016,-0.299158,0.22968,...,3.892254,2.791061,4.990736,0.037346,-0.548288,0.627413,-0.315679,-0.386201,1.675601,-0.137779


In [45]:
# Un-normalized variant: combine the selected genes' mRNA and DNA columns,
# write the combined table to CSV, and preview the first rows.
mRNA_DNA_unnormalized_data = select_mRNA_DNA_profiles(mRNA_profile_subset, DNA_profile_subset)
mRNA_DNA_unnormalized_data.to_csv(mRNA_DNA_expression_unnormalized)
mRNA_DNA_unnormalized_data.head()

Unnamed: 0,mRNA_RB1,DNA_RB1,mRNA_CDH1,DNA_CDH1,mRNA_PTEN,DNA_PTEN,mRNA_BRCA2,DNA_BRCA2,mRNA_CDKN2A,DNA_CDKN2A,...,mRNA_CCND1,DNA_CCND1,mRNA_CDH2,DNA_CDH2,mRNA_CDH3,DNA_CDH3,mRNA_ERBB2,DNA_ERBB2,mRNA_ERBB3,DNA_ERBB3
TCGA-A1-A0SF-01,1115.5629,0.023,16181.6603,-0.371,2284.2479,0.347,98.5328,0.023,130.6631,-0.362,...,3800.3297,-0.007,33.4155,0.0,1255.6509,-0.371,7976.8748,-0.018,12728.7274,0.394
TCGA-A1-A0SH-01,1206.9721,-0.028,9711.7551,-0.458,2747.6491,0.013,94.9831,-0.028,304.4517,-0.012,...,5316.8889,0.021,445.6623,0.006,1028.5627,-0.458,17990.8177,0.465,7003.4716,-0.001
TCGA-A1-A0SJ-01,1848.011,-0.2,60487.2428,0.436,1976.0402,-0.505,236.3054,0.355,138.6374,-0.512,...,25027.069,0.887,159.8537,0.42,958.3905,0.436,9313.946,0.308,7384.3621,-0.357
TCGA-A1-A0SK-01,88.8749,-1.293,2826.9776,-0.757,1075.9534,0.001,893.7914,1.015,864.1664,0.141,...,279.2405,0.116,1720.769,0.162,798.2981,-0.757,303.4983,-0.274,2589.6628,0.069
TCGA-A1-A0SQ-01,1166.3137,-0.707,30943.4191,-0.031,2453.6561,-0.007,86.3338,-0.707,137.9728,-0.009,...,84739.9576,3.657,6263.6409,0.004,289.2587,-0.031,5585.0731,-0.059,14427.0298,-0.007


In [46]:
# Min-max variant: combine the selected genes' mRNA and DNA columns,
# write the combined table to CSV, and preview the first rows.
mRNA_DNA_minmax = select_mRNA_DNA_profiles(mRNA_min_max_scaled, DNA_min_max_scaled)
mRNA_DNA_minmax.to_csv(mRNA_DNA_expression_minmax)
mRNA_DNA_minmax.head()

Unnamed: 0,mRNA_RB1,DNA_RB1,mRNA_CDH1,DNA_CDH1,mRNA_PTEN,DNA_PTEN,mRNA_BRCA2,DNA_BRCA2,mRNA_CDKN2A,DNA_CDKN2A,...,mRNA_CCND1,DNA_CCND1,mRNA_CDH2,DNA_CDH2,mRNA_CDH3,DNA_CDH3,mRNA_ERBB2,DNA_ERBB2,mRNA_ERBB3,DNA_ERBB3
TCGA-A1-A0SF-01,0.232188,0.571429,0.241819,0.303489,0.290862,0.572826,0.071908,0.35159,0.018993,0.20332,...,0.014719,0.183055,0.001527,0.241286,0.068768,0.303489,0.022728,0.194079,0.503176,0.299033
TCGA-A1-A0SH-01,0.252392,0.549284,0.143802,0.274852,0.353551,0.456165,0.069271,0.337964,0.044777,0.279755,...,0.020905,0.189298,0.020815,0.242531,0.0563,0.274852,0.05239,0.3,0.274617,0.214178
TCGA-A1-A0SJ-01,0.394079,0.474598,0.913037,0.569124,0.249168,0.275236,0.174246,0.440289,0.020176,0.170561,...,0.10131,0.382386,0.007443,0.328423,0.052447,0.569124,0.026689,0.26557,0.289823,0.137701
TCGA-A1-A0SK-01,0.005262,0.0,0.039499,0.176432,0.127404,0.451973,0.662632,0.616618,0.12782,0.313169,...,0.000355,0.210479,0.080472,0.274896,0.043657,0.176432,0.0,0.137939,0.098413,0.229216
TCGA-A1-A0SQ-01,0.243405,0.254451,0.465456,0.415405,0.31378,0.449179,0.062846,0.156559,0.020077,0.280411,...,0.344902,1.0,0.293017,0.242116,0.015709,0.415405,0.015644,0.185088,0.570974,0.212889


mRNA profiles of all ~20,000 genes (20,133 columns)

In [47]:
# Z-scores: write the full mRNA z-score table to CSV.
mRNA_zScore.to_csv(mRNA_All_zscores)

In [48]:
# Un-normalized: write the full mRNA table, as it was before any scaling, to CSV.
mRNA_profile_subset.to_csv(mRNA_All_unnormalized)

In [49]:
# Min-max normalized: write the full min-max scaled mRNA table to CSV.
mRNA_min_max_scaled.to_csv(mRNA_All_minmax)