# Import Dataset

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
# Step 1: mount Google Drive so files in the personal drive are readable from Colab.
from google.colab import drive

drive.mount('/content/gdrive')

# Step 2: folder holding the dataset and folder for saved models
# (ex: model_loc = "/content/gdrive/My Drive/Dataset/")
dataset_dir = "/content/gdrive/My Drive/BIOINFORMATICSII/Project/"
model_loc = "/content/gdrive/My Drive/BIOINFORMATICSII/Project/"

# Show what is available in the dataset folder before loading anything.
print(os.listdir(dataset_dir))

# Omics tables: the first CSV column becomes the row index.
mrna_data = pd.read_csv(os.path.join(dataset_dir, 'BRCA_mRNA.csv'), index_col=0)
miRNA_data = pd.read_csv(os.path.join(dataset_dir, 'BRCA_miRNA.csv'), index_col=0)
DNA_Methylation_data = pd.read_csv(
    os.path.join(dataset_dir, 'BRCA_DNA_Methylation.csv'), index_col=0
)
copy_number_variation_data = pd.read_csv(
    os.path.join(dataset_dir, 'BRCA_Copy Number Variation.csv'), index_col=0
)
multi_omics_data = pd.read_csv(
    os.path.join(dataset_dir, 'BRCA_Multi_Omics.csv'), index_col=0
)

# Label table: read with the default integer index (no index column).
brca_Label = pd.read_csv(os.path.join(dataset_dir, 'BRCA_label.csv'))

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
['BRCA_mRNA.csv', 'BRCA_DNA_Methylation.csv', 'BRCA_miRNA.csv', 'BRCA_Copy Number Variation.csv', 'BRCA_label.csv', 'X_test_rfe_mrna.pkl', 'X_test_rfe_miRNA.pkl', 'X_train_rfe_mrna.pkl', 'X_train_rfe_miRNA.pkl', 'X_train_rfe_dna.pkl', 'X_test_rfe_dna.pkl', 'X_test_rfe_copy_number_variation.pkl', 'X_train_rfe_copy_number_variation.pkl', 'BRCA_Multi_Omics.csv', 'X_test_rfe_multi_omics.pkl', 'X_train_rfe_multi_omics.pkl']


### Concatenate Method

In [None]:
mrna_data.shape

(18206, 671)

In [None]:
miRNA_data.shape

(368, 671)

In [None]:
DNA_Methylation_data.shape

(19049, 671)

In [None]:
copy_number_variation_data.shape

(19568, 671)

In [None]:


# Concatenate the datasets along the rows (axis=0): every table is laid out as
# features x samples with 671 sample columns, so the feature rows of the four
# omics tables are stacked and the sample columns are kept as they are.
multi = pd.concat([mrna_data, miRNA_data, DNA_Methylation_data, copy_number_variation_data], axis=0)



In [None]:
multi.shape

(57191, 671)

In [None]:

# Save the concatenated dataframe to a new CSV file.
# `multi` is the frame built by pd.concat above; `multi_omics_data` is only the
# copy that was read from this same file, so writing it back would never store
# the new concatenation.
multi.to_csv(dataset_dir + 'BRCA_Multi_Omics.csv')

print('Concatenation complete. The file BRCA_Multi_Omics.csv has been saved.')

# Data Preprocessing & EDA


## BRCA_Copy Number Variation

In [None]:
copy_number_variation_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19568 entries, A1BG to pk
Columns: 671 entries, TCGA.3C.AAAU.01 to TCGA.Z7.A8R6.01
dtypes: float64(671)
memory usage: 100.3+ MB


In [None]:
copy_number_variation_data.head(10)

Unnamed: 0,TCGA.3C.AAAU.01,TCGA.3C.AALI.01,TCGA.3C.AALJ.01,TCGA.3C.AALK.01,TCGA.5L.AAT0.01,TCGA.5T.A9QA.01,TCGA.A1.A0SB.01,TCGA.A1.A0SE.01,TCGA.A1.A0SF.01,TCGA.A1.A0SG.01,...,TCGA.UL.AAZ6.01,TCGA.UU.A93S.01,TCGA.V7.A7HQ.01,TCGA.W8.A86G.01,TCGA.WT.AB41.01,TCGA.WT.AB44.01,TCGA.XX.A899.01,TCGA.XX.A89A.01,TCGA.Z7.A8R5.01,TCGA.Z7.A8R6.01
A1BG,0.495252,1.028904,0.003059,-0.258705,-0.083232,-0.486242,-0.111192,-0.102997,0.687116,-0.213873,...,-0.493956,2.097171,-0.594226,0.478862,-1.755051,-0.835261,-0.148311,0.557439,-0.234602,-0.838154
A1CF,-0.938762,-1.575977,0.405308,0.09691,0.060782,2.050968,0.154506,0.018894,1.131535,0.145081,...,-0.299975,6.501006,0.255559,-0.184784,0.371274,0.483323,0.061306,0.627836,0.07073,-0.463861
A2M,0.750838,0.010864,0.103201,-0.16913,-2.090765,-0.216362,-0.062751,-0.056368,0.857217,-0.016369,...,1.13976,-1.938856,0.036395,-0.308274,0.11852,0.266599,-0.089133,0.344469,0.226601,-0.590817
A2ML1,0.748593,0.005894,0.098571,-0.174763,-2.103474,-0.222169,-0.067992,-0.061585,0.855364,-0.021439,...,1.138948,-1.951005,0.031519,-0.314419,0.113946,0.262571,-0.094471,0.340728,0.222425,-0.598003
A3GALT2,1.093743,-1.57568,-0.706498,0.199712,0.37657,-2.334753,0.400443,0.225047,1.226264,0.390699,...,-2.54133,-1.587373,0.371698,0.096424,-1.494803,-2.569588,0.046241,-1.244865,-0.252418,-0.264599
A4GALT,-0.876388,-0.967673,-1.643561,0.570857,0.631333,2.749902,0.620683,0.497449,0.387146,-0.143066,...,-1.655732,0.592157,-1.734085,0.276083,0.805155,-1.733704,0.592537,-0.629158,0.087808,0.082483
A4GNT,1.10946,1.481901,-0.661117,-0.422884,-0.35066,-0.173333,-0.273585,-0.151234,-0.508045,-0.384077,...,1.316971,2.246187,-1.068054,-0.615303,1.713128,-0.412105,-0.391623,-0.897733,-1.021162,-0.788319
AAAS,-1.966983,-0.040373,-1.342185,-0.15621,-0.101592,-0.175416,0.005842,0.145686,1.277645,-0.019366,...,1.79921,-2.374512,0.098871,-0.207226,0.242317,0.359954,-0.09439,0.60003,0.488395,-0.545733
AACS,-1.67642,-0.027748,1.469986,-0.017618,0.100904,0.011759,0.08419,-1.943348,-1.716941,0.046708,...,1.439089,-1.900802,0.14649,-2.367799,0.293883,0.516238,0.009733,0.666163,0.491419,-0.372678
AADAC,1.090124,1.213108,-0.571499,-0.402285,-0.312965,-0.110984,-0.226338,-0.146892,-0.444027,-0.340793,...,-0.534694,0.613451,-0.853823,-0.533347,1.399827,-0.364133,-0.347077,-0.768542,-0.8821,-0.628054


In [None]:
# Display-only preview: all rows, every column except the first one.
# The result is not assigned, so copy_number_variation_data itself is unchanged.
copy_number_variation_data.iloc[0:,1:]

Unnamed: 0,TCGA.3C.AALI.01,TCGA.3C.AALJ.01,TCGA.3C.AALK.01,TCGA.5L.AAT0.01,TCGA.5T.A9QA.01,TCGA.A1.A0SB.01,TCGA.A1.A0SE.01,TCGA.A1.A0SF.01,TCGA.A1.A0SG.01,TCGA.A1.A0SH.01,...,TCGA.UL.AAZ6.01,TCGA.UU.A93S.01,TCGA.V7.A7HQ.01,TCGA.W8.A86G.01,TCGA.WT.AB41.01,TCGA.WT.AB44.01,TCGA.XX.A899.01,TCGA.XX.A89A.01,TCGA.Z7.A8R5.01,TCGA.Z7.A8R6.01
A1BG,1.028904,0.003059,-0.258705,-0.083232,-0.486242,-0.111192,-0.102997,0.687116,-0.213873,-0.005619,...,-0.493956,2.097171,-0.594226,0.478862,-1.755051,-0.835261,-0.148311,0.557439,-0.234602,-0.838154
A1CF,-1.575977,0.405308,0.096910,0.060782,2.050968,0.154506,0.018894,1.131535,0.145081,0.042456,...,-0.299975,6.501006,0.255559,-0.184784,0.371274,0.483323,0.061306,0.627836,0.070730,-0.463861
A2M,0.010864,0.103201,-0.169130,-2.090765,-0.216362,-0.062751,-0.056368,0.857217,-0.016369,-0.034667,...,1.139760,-1.938856,0.036395,-0.308274,0.118520,0.266599,-0.089133,0.344469,0.226601,-0.590817
A2ML1,0.005894,0.098571,-0.174763,-2.103474,-0.222169,-0.067992,-0.061585,0.855364,-0.021439,-0.039804,...,1.138948,-1.951005,0.031519,-0.314419,0.113946,0.262571,-0.094471,0.340728,0.222425,-0.598003
A3GALT2,-1.575680,-0.706498,0.199712,0.376570,-2.334753,0.400443,0.225047,1.226264,0.390699,0.612866,...,-2.541330,-1.587373,0.371698,0.096424,-1.494803,-2.569588,0.046241,-1.244865,-0.252418,-0.264599
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ZYG11B,-1.793233,-0.872324,0.058733,0.242915,-2.475162,0.263210,0.085117,1.176508,0.207398,0.322067,...,-0.169592,0.192176,0.237841,-0.048833,-1.672982,-2.900353,-0.073695,-0.277665,-0.435463,-0.424808
ZYX,1.446230,0.039191,-0.257130,-0.185487,-1.970238,-0.112381,-0.106533,0.704939,-0.262491,0.373527,...,-0.542729,-0.489606,-0.134313,0.310169,-0.002235,1.155757,-0.286372,0.253634,-0.327799,-0.589030
ZZEF1,-0.790000,-1.421040,-1.377441,-0.999581,1.646581,0.848410,-0.727278,-0.501251,0.866003,-0.575828,...,-1.375911,-0.695152,0.920311,-0.919650,-0.690180,0.438043,0.811313,-0.470272,0.723732,-2.171404
ZZZ3,-1.268621,-0.811368,0.098802,0.204322,-1.930651,0.307432,0.230822,-1.278740,0.290087,0.216849,...,-2.491497,-0.100674,0.289605,-0.009609,-1.555790,-2.902013,-0.051046,-0.247150,-0.350742,-0.339178


In [None]:
# Check whether the BRCA_Copy Number Variation dataset contains any null values
# (True if at least one cell is null).
copy_number_variation_data.isnull().any().any()

False

In [None]:
copy_number_variation_data.shape

(19568, 671)

## BRCA_miRNA

In [None]:
miRNA_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 368 entries, hsa.let.7a.1 to hsa.mir.99b
Columns: 671 entries, TCGA.3C.AAAU.01 to TCGA.Z7.A8R6.01
dtypes: float64(671)
memory usage: 1.9+ MB


In [None]:
miRNA_data.head(10)

Unnamed: 0,TCGA.3C.AAAU.01,TCGA.3C.AALI.01,TCGA.3C.AALJ.01,TCGA.3C.AALK.01,TCGA.5L.AAT0.01,TCGA.5T.A9QA.01,TCGA.A1.A0SB.01,TCGA.A1.A0SE.01,TCGA.A1.A0SF.01,TCGA.A1.A0SG.01,...,TCGA.UL.AAZ6.01,TCGA.UU.A93S.01,TCGA.V7.A7HQ.01,TCGA.W8.A86G.01,TCGA.WT.AB41.01,TCGA.WT.AB44.01,TCGA.XX.A899.01,TCGA.XX.A89A.01,TCGA.Z7.A8R5.01,TCGA.Z7.A8R6.01
hsa.let.7a.1,0.068317,-0.301684,-0.15081,0.107831,0.395211,1.135212,1.813887,0.473642,-0.470612,0.791968,...,-3.038971,-0.474111,0.165775,0.706495,-0.086149,0.511958,1.225298,0.667662,-0.211878,0.47424
hsa.let.7a.2,0.068932,-0.318009,-0.122747,0.097594,0.412879,1.140183,1.790792,0.442224,-0.461217,0.841821,...,-3.03734,-0.477443,0.172506,0.706552,-0.07438,0.50424,1.219548,0.666012,-0.210525,0.488117
hsa.let.7a.3,0.073899,-0.30131,-0.126333,0.095545,0.418441,1.143605,1.822272,0.431312,-0.470734,0.805374,...,-3.017967,-0.474484,0.164726,0.727433,-0.102883,0.496009,1.21061,0.675998,-0.220135,0.481577
hsa.let.7b,0.524562,0.419859,-0.958939,0.615389,0.500594,1.159087,2.113821,0.812876,-0.129307,0.867411,...,-3.509653,-0.855498,-0.359797,0.997679,-0.342075,0.422382,0.050804,-0.102655,0.177655,-0.078426
hsa.let.7c,-1.656853,-0.715963,-0.971038,0.711952,0.426323,-1.05887,2.135373,1.031809,0.864211,0.071249,...,-2.45447,0.188937,-0.001077,1.302563,0.506728,1.028977,1.186182,1.534247,1.070328,-0.155655
hsa.let.7d,-0.038283,0.460975,0.866585,-0.454282,-1.545556,0.348033,1.844765,-0.453824,-1.070594,0.499824,...,0.391518,-0.452204,-0.049724,0.053945,0.627048,-0.254668,-1.238797,0.305716,-0.716526,1.384395
hsa.let.7e,0.501125,-1.999304,2.074809,0.227441,-0.952282,1.610267,-0.469603,-0.611333,-0.571769,0.505744,...,-0.503044,-0.048044,1.286869,0.297414,0.303064,0.528134,0.879213,1.276369,-0.598473,1.237552
hsa.let.7f.1,-2.390084,-0.659788,1.080746,-0.735552,-0.280966,-0.141723,0.837358,0.169462,-0.000266,0.85945,...,-1.216759,0.095809,0.834571,-0.058999,0.798165,0.757211,1.455206,1.253312,-0.229774,0.851405
hsa.let.7f.2,-2.406331,-0.651943,1.088164,-0.686623,-0.233746,-0.152147,0.798862,0.184119,0.041877,0.851887,...,-1.168059,0.082452,0.833022,-0.050336,0.818886,0.741367,1.460326,1.240593,-0.190906,0.851526
hsa.let.7g,-0.760042,-1.050266,0.523204,-1.015715,-0.818574,1.243074,2.111705,-0.511843,-1.315689,0.16278,...,0.156156,0.053724,0.091651,0.653836,1.183322,0.378499,0.921697,0.556762,-0.465549,0.409391


In [None]:
# Check whether the BRCA_miRNA dataset contains any null values
# (True if at least one cell is null).
miRNA_data.isnull().any().any()

False

In [None]:
miRNA_data.shape

(368, 671)

## BRCA_DNA_Methylation

In [None]:
DNA_Methylation_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19049 entries, A1BG to pk
Columns: 671 entries, TCGA.3C.AAAU.01 to TCGA.Z7.A8R6.01
dtypes: float64(671)
memory usage: 97.7+ MB


In [None]:
# DNA_Methylation_dataset
DNA_Methylation_data.head(10)

Unnamed: 0,TCGA.3C.AAAU.01,TCGA.3C.AALI.01,TCGA.3C.AALJ.01,TCGA.3C.AALK.01,TCGA.5L.AAT0.01,TCGA.5T.A9QA.01,TCGA.A1.A0SB.01,TCGA.A1.A0SE.01,TCGA.A1.A0SF.01,TCGA.A1.A0SG.01,...,TCGA.UL.AAZ6.01,TCGA.UU.A93S.01,TCGA.V7.A7HQ.01,TCGA.W8.A86G.01,TCGA.WT.AB41.01,TCGA.WT.AB44.01,TCGA.XX.A899.01,TCGA.XX.A89A.01,TCGA.Z7.A8R5.01,TCGA.Z7.A8R6.01
A1BG,-1.211396,0.974531,0.633283,0.354642,-1.939963,1.328608,0.94601,0.930281,-0.519615,0.554409,...,-0.760042,1.159665,0.333349,0.775567,-0.778884,0.5171,0.177384,-0.380611,0.811548,-0.819564
A1CF,-1.801239,-1.091976,-0.258825,0.344929,-0.018208,-1.007907,1.246215,0.360814,-0.426866,-1.357345,...,-0.582298,-2.142905,-0.306365,0.816486,-0.499731,0.824666,0.807188,-0.027147,0.644539,-2.004627
A2M,-0.696763,-1.597015,-0.873714,-1.080573,-0.788607,-0.583001,-0.654375,0.302439,-1.561816,-0.836623,...,-1.001059,-2.915892,0.635853,0.027964,-1.14516,-0.32697,0.569131,-2.613169,-0.507579,-0.339844
A2ML1,0.832338,0.758754,0.753608,0.548382,0.589277,0.870132,-1.878898,0.160075,0.066117,-0.338768,...,0.892903,-2.480427,0.319169,-0.098838,0.830086,0.617726,0.551416,0.101744,0.269641,-0.094397
A3GALT2,0.857098,0.450506,0.82133,0.497375,0.400632,1.183863,-3.31677,0.227579,0.051009,0.300039,...,1.161783,-1.863239,0.350236,-0.879849,-0.098983,0.524779,0.041604,0.537643,-0.322718,0.007108
A4GALT,-0.695053,1.570301,-0.36945,1.342464,1.553188,-0.876086,-1.168542,1.418889,-0.012446,-0.980884,...,-1.081537,1.713744,-0.986258,1.658525,-0.840527,0.887775,0.890293,-0.910491,0.886293,-1.066638
A4GNT,-0.258891,0.330579,-0.315881,-0.518794,-0.299152,0.562873,0.646204,0.812077,-0.205157,-2.506433,...,1.003214,-0.080964,0.514651,1.080107,-0.263919,0.553903,0.012863,-1.371328,0.555758,-1.140229
AAAS,0.601832,-0.912745,0.098281,-0.274796,0.39116,3.94035,-0.857682,0.5373,-0.53217,-0.666725,...,-0.023642,-0.410331,0.899429,-0.651502,0.026079,0.059472,-1.045639,-0.607679,-0.144756,-1.179954
AACS,-0.100803,-0.682832,-5.269406,-0.187178,-0.41044,-0.42269,-0.588374,0.343098,0.397713,0.2346,...,-0.225294,-0.520869,0.015193,0.462241,-0.2359,0.15001,0.102895,0.085117,-0.187391,0.503692
AADAC,-5.232067,-1.232277,-2.441139,-0.23207,-1.676137,-1.472467,0.997476,0.563539,0.714475,0.57141,...,0.008583,-1.804608,0.275197,0.574451,-1.478779,-0.99982,0.20939,-1.533629,-0.931151,0.063754


In [None]:
# Check whether the BRCA_DNA_Methylation dataset contains any null values
# (True if at least one cell is null).
DNA_Methylation_data.isnull().any().any()

False

In [None]:
DNA_Methylation_data.shape

(19049, 671)

## BRCA_mRNA

In [None]:
mrna_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18206 entries, A1BG to ZZZ3
Columns: 671 entries, TCGA.3C.AAAU.01 to TCGA.Z7.A8R6.01
dtypes: float64(671)
memory usage: 93.3+ MB


In [None]:
mrna_data.head(10)

Unnamed: 0,TCGA.3C.AAAU.01,TCGA.3C.AALI.01,TCGA.3C.AALJ.01,TCGA.3C.AALK.01,TCGA.5L.AAT0.01,TCGA.5T.A9QA.01,TCGA.A1.A0SB.01,TCGA.A1.A0SE.01,TCGA.A1.A0SF.01,TCGA.A1.A0SG.01,...,TCGA.UL.AAZ6.01,TCGA.UU.A93S.01,TCGA.V7.A7HQ.01,TCGA.W8.A86G.01,TCGA.WT.AB41.01,TCGA.WT.AB44.01,TCGA.XX.A899.01,TCGA.XX.A89A.01,TCGA.Z7.A8R5.01,TCGA.Z7.A8R6.01
A1BG,1.389927,1.522872,1.788271,-0.288543,-0.46268,-0.451169,-0.805973,-0.472368,0.112737,-0.39748,...,-1.828641,0.756934,1.095019,2.453552,-1.526821,0.108732,-0.21939,-0.11274,-0.920878,0.079873
A1BG.AS1,0.436758,1.815091,0.580557,-0.250194,-0.091285,-0.688598,-1.037605,0.658661,0.627872,0.426477,...,-2.12359,-0.55686,-1.271019,1.915429,-2.519037,-0.643504,0.082909,0.284536,-0.372892,0.539919
A2M,-0.876187,-0.508085,-0.258138,-0.030804,0.28997,-1.641876,1.534552,0.436717,0.476992,0.147622,...,-1.49346,-1.848483,-0.608446,0.584722,-0.267116,-1.271778,0.780906,0.292935,1.88364,-0.469512
A2M.AS1,1.607818,0.863019,-0.491954,-0.579638,0.572883,-0.836306,0.099614,-0.059346,-0.092982,-0.666693,...,0.851988,-2.180389,-1.614497,0.398519,-0.632821,-0.768806,0.200613,0.351478,0.015328,-1.309134
A2ML1,-0.415138,-0.406608,-0.455969,-0.464938,-0.438571,-0.479607,-0.444955,-0.539629,-0.408042,-0.457548,...,0.419145,-0.483924,-0.379297,-0.406793,-0.470376,-0.429807,-0.491028,0.421906,-0.456669,-0.394471
A2MP1,-0.374641,-0.36123,0.855983,-0.313092,-0.090844,-1.01492,1.384198,-0.508012,-0.264776,-0.3432,...,-0.814826,-0.889906,-0.57707,-0.209513,1.692166,-0.517498,1.236668,0.277833,2.279637,-0.730764
A4GALT,-1.727152,-0.964244,0.750127,0.480557,0.179004,0.420826,0.260066,0.05895,0.67689,1.261371,...,0.436093,-2.0231,1.798225,-0.706899,2.508851,-0.148959,0.397136,0.658981,0.319417,1.304757
AAAS,-0.56253,0.525889,-0.401461,-0.426938,0.685542,0.697997,0.127438,-0.117362,1.583352,0.208439,...,1.841616,-2.215116,1.540387,1.881914,1.241782,1.136479,-0.030975,0.359229,0.667426,0.433323
AACS,0.225676,-0.590749,1.077314,-0.739098,-0.623465,0.669477,1.339599,-0.443098,-1.331414,0.918643,...,1.194196,1.802331,0.876724,-1.193957,-0.788798,-1.740086,0.730476,-0.336013,-0.298045,-0.811775
AADAT,-1.337786,-0.72495,0.10188,-0.091341,-0.421059,1.06754,1.377738,-0.566328,-1.010971,-0.671109,...,0.511056,-1.219445,-0.981982,-0.682073,-0.389474,-0.562701,-0.449559,-0.483281,-0.376687,-0.699855


In [None]:
# Check whether the BRCA_mRNA dataset contains any null values
# (True if at least one cell is null).
mrna_data.isnull().any().any()

False

## BRCA_label

In [None]:
brca_Label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 671 entries, 0 to 670
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   671 non-null    object
dtypes: object(1)
memory usage: 5.4+ KB


In [None]:
# BRCA_label dataset: preview the first 10 subtype labels
brca_Label.head(10)

Unnamed: 0,Label
0,LumA
1,Her2
2,LumB
3,LumA
4,LumA
5,LumB
6,Normal
7,LumA
8,LumA
9,LumA


In [None]:
# Check whether the BRCA_label dataset contains any null values
# (True if at least one cell is null).
brca_Label.isnull().any().any()

False

In [None]:
brca_Label.shape

(671, 1)

In [None]:
# Count how many samples carry each value of the 'Label' column.
label_value_counts = brca_Label['Label'].value_counts()

print(label_value_counts)

Label
LumA      353
LumB      132
Basal     113
Her2       42
Normal     31
Name: count, dtype: int64


In [None]:
# Replace the subtype label strings with integer class ids.
# The column is reassigned explicitly instead of calling
# `brca_Label['Label'].replace(..., inplace=True)`: that form is a chained
# in-place assignment, which pandas deprecates and which silently leaves the
# DataFrame unchanged once Copy-on-Write is enabled.
LABEL_ENCODING = {'Normal': 0, 'LumA': 1, 'LumB': 2, 'Basal': 3, 'Her2': 4}

# `.get(label, label)` passes already-encoded integers through untouched, so the
# cell can be re-run safely; the final cast raises if an unknown label string
# is left over instead of letting it slip through.
brca_Label['Label'] = (
    brca_Label['Label']
    .map(lambda label: LABEL_ENCODING.get(label, label))
    .astype('int64')
)

In [None]:
# Recompute the per-label sample counts from the current 'Label' column.
label_value_counts = brca_Label['Label'].value_counts()

print(label_value_counts)

Label
1    353
2    132
3    113
4     42
0     31
Name: count, dtype: int64


There are no missing values in any of the datasets.

## BRCA_Multi_Omics

In [None]:
multi_omics_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 57191 entries, A1BG to pk
Columns: 671 entries, TCGA.3C.AAAU.01 to TCGA.Z7.A8R6.01
dtypes: float64(671)
memory usage: 293.2+ MB


In [None]:
multi_omics_data.head(10)

Unnamed: 0,TCGA.3C.AAAU.01,TCGA.3C.AALI.01,TCGA.3C.AALJ.01,TCGA.3C.AALK.01,TCGA.5L.AAT0.01,TCGA.5T.A9QA.01,TCGA.A1.A0SB.01,TCGA.A1.A0SE.01,TCGA.A1.A0SF.01,TCGA.A1.A0SG.01,...,TCGA.UL.AAZ6.01,TCGA.UU.A93S.01,TCGA.V7.A7HQ.01,TCGA.W8.A86G.01,TCGA.WT.AB41.01,TCGA.WT.AB44.01,TCGA.XX.A899.01,TCGA.XX.A89A.01,TCGA.Z7.A8R5.01,TCGA.Z7.A8R6.01
A1BG,0.495252,1.028904,0.003059,-0.258705,-0.083232,-0.486242,-0.111192,-0.102997,0.687116,-0.213873,...,-0.493956,2.097171,-0.594226,0.478862,-1.755051,-0.835261,-0.148311,0.557439,-0.234602,-0.838154
A1CF,-0.938762,-1.575977,0.405308,0.09691,0.060782,2.050968,0.154506,0.018894,1.131535,0.145081,...,-0.299975,6.501006,0.255559,-0.184784,0.371274,0.483323,0.061306,0.627836,0.07073,-0.463861
A2M,0.750838,0.010864,0.103201,-0.16913,-2.090765,-0.216362,-0.062751,-0.056368,0.857217,-0.016369,...,1.13976,-1.938856,0.036395,-0.308274,0.11852,0.266599,-0.089133,0.344469,0.226601,-0.590817
A2ML1,0.748593,0.005894,0.098571,-0.174763,-2.103474,-0.222169,-0.067992,-0.061585,0.855364,-0.021439,...,1.138948,-1.951005,0.031519,-0.314419,0.113946,0.262571,-0.094471,0.340728,0.222425,-0.598003
A3GALT2,1.093743,-1.57568,-0.706498,0.199712,0.37657,-2.334753,0.400443,0.225047,1.226264,0.390699,...,-2.54133,-1.587373,0.371698,0.096424,-1.494803,-2.569588,0.046241,-1.244865,-0.252418,-0.264599
A4GALT,-0.876388,-0.967673,-1.643561,0.570857,0.631333,2.749902,0.620683,0.497449,0.387146,-0.143066,...,-1.655732,0.592157,-1.734085,0.276083,0.805155,-1.733704,0.592537,-0.629158,0.087808,0.082483
A4GNT,1.10946,1.481901,-0.661117,-0.422884,-0.35066,-0.173333,-0.273585,-0.151234,-0.508045,-0.384077,...,1.316971,2.246187,-1.068054,-0.615303,1.713128,-0.412105,-0.391623,-0.897733,-1.021162,-0.788319
AAAS,-1.966983,-0.040373,-1.342185,-0.15621,-0.101592,-0.175416,0.005842,0.145686,1.277645,-0.019366,...,1.79921,-2.374512,0.098871,-0.207226,0.242317,0.359954,-0.09439,0.60003,0.488395,-0.545733
AACS,-1.67642,-0.027748,1.469986,-0.017618,0.100904,0.011759,0.08419,-1.943348,-1.716941,0.046708,...,1.439089,-1.900802,0.14649,-2.367799,0.293883,0.516238,0.009733,0.666163,0.491419,-0.372678
AADAC,1.090124,1.213108,-0.571499,-0.402285,-0.312965,-0.110984,-0.226338,-0.146892,-0.444027,-0.340793,...,-0.534694,0.613451,-0.853823,-0.533347,1.399827,-0.364133,-0.347077,-0.768542,-0.8821,-0.628054


In [None]:
# Check whether the BRCA_Multi_Omics dataset contains any null values
# (True if at least one cell is null).
multi_omics_data.isnull().any().any()

False

# Data Preparation

Transpose the rows and columns so that each row is a sample and each column is a feature.

## BRCA_Copy Number Variation

In [None]:
# Before transpose
copy_number_variation_data.shape

(19568, 671)

In [None]:
# Transpose copy_number_variation_data: swap rows and columns of the
# (19568, 671) table.
# NOTE: the result is assigned back to the same name, so running this cell a
# second time transposes the frame back to its original orientation.
copy_number_variation_data = copy_number_variation_data.transpose()

In [None]:
# After transpose
copy_number_variation_data.head(10)

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,pk
TCGA.3C.AAAU.01,0.495252,-0.938762,0.750838,0.748593,1.093743,-0.876388,1.10946,-1.966983,-1.67642,1.090124,...,-0.837399,2.020624,1.972719,1.053381,1.254989,1.267331,0.963246,-0.580418,1.715811,-0.864183
TCGA.3C.AALI.01,1.028904,-1.575977,0.010864,0.005894,-1.57568,-0.967673,1.481901,-0.040373,-0.027748,1.213108,...,-1.537275,4.002085,3.897638,1.589442,-1.781406,-1.793233,1.44623,-0.79,-1.268621,0.345052
TCGA.3C.AALJ.01,0.003059,0.405308,0.103201,0.098571,-0.706498,-1.643561,-0.661117,-1.342185,1.469986,-0.571499,...,4.165544,-0.319977,-0.301093,-0.477515,-0.867769,-0.872324,0.039191,-1.42104,-0.811368,-1.357237
TCGA.3C.AALK.01,-0.258705,0.09691,-0.16913,-0.174763,0.199712,0.570857,-0.422884,-0.15621,-0.017618,-0.402285,...,0.080744,-0.316675,-0.297884,-0.267586,0.055935,0.058733,-0.25713,-1.377441,0.098802,0.042601
TCGA.5L.AAT0.01,-0.083232,0.060782,-2.090765,-2.103474,0.37657,0.631333,-0.35066,-0.101592,0.100904,-0.312965,...,0.045801,-0.307593,-0.289062,-0.200167,0.238663,0.242915,-0.185487,-0.999581,0.204322,0.078553
TCGA.5T.A9QA.01,-0.486242,2.050968,-0.216362,-0.222169,-2.334753,2.749902,-0.173333,-0.175416,0.011759,-0.110984,...,1.970711,-0.399235,-0.392526,-0.141518,-2.45795,-2.475162,-1.970238,1.646581,-1.930651,-6.99425
TCGA.A1.A0SB.01,-0.111192,0.154506,-0.062751,-0.067992,0.400443,0.620683,-0.273585,0.005842,0.08419,-0.226338,...,0.13645,-0.179623,-0.164744,-0.084514,0.258798,0.26321,-0.112381,0.84841,0.307432,0.189832
TCGA.A1.A0SE.01,-0.102997,0.018894,-0.056368,-0.061585,0.225047,0.497449,-0.151234,0.145686,-1.943348,-0.146892,...,0.005287,-0.121831,-0.108601,0.039909,0.082111,0.085117,-0.106533,-0.727278,0.230822,0.289698
TCGA.A1.A0SF.01,0.687116,1.131535,0.857217,0.855364,1.226264,0.387146,-0.508045,1.277645,-1.716941,-0.444027,...,1.072825,-0.210171,-0.19442,-0.356381,1.164884,1.176508,0.704939,-0.501251,-1.27874,-0.088652
TCGA.A1.A0SG.01,-0.213873,0.145081,-0.016369,-0.021439,0.390699,-0.143066,-0.384077,-0.019366,0.046708,-0.340793,...,0.127335,-0.254754,-0.237731,-0.196879,0.203426,0.207398,-0.262491,0.866003,0.290087,0.104803


In [None]:
copy_number_variation_data.shape

(671, 19568)

In [None]:
# Attach the subtype label to every row of the transposed table.
# `.values` drops the label index, so labels are matched to rows purely by
# position; the two row counts therefore have to agree.
if copy_number_variation_data.shape[0] != brca_Label.shape[0]:
    raise ValueError(
        "copy_number_variation_data has "
        f"{copy_number_variation_data.shape[0]} rows but brca_Label has "
        f"{brca_Label.shape[0]} labels; the two counts must match."
    )

# Add the new column to the transposed DataFrame
copy_number_variation_data['Label'] = brca_Label['Label'].values

In [None]:
copy_number_variation_data.shape

(671, 19569)

In [None]:
# Check whether the BRCA_Copy Number Variation dataset contains any null values
# after the Label column was added (True if at least one cell is null).
copy_number_variation_data.isnull().any().any()

False

In [None]:
copy_number_variation_data.head(10)

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,pk,Label
TCGA.3C.AAAU.01,0.495252,-0.938762,0.750838,0.748593,1.093743,-0.876388,1.10946,-1.966983,-1.67642,1.090124,...,2.020624,1.972719,1.053381,1.254989,1.267331,0.963246,-0.580418,1.715811,-0.864183,1
TCGA.3C.AALI.01,1.028904,-1.575977,0.010864,0.005894,-1.57568,-0.967673,1.481901,-0.040373,-0.027748,1.213108,...,4.002085,3.897638,1.589442,-1.781406,-1.793233,1.44623,-0.79,-1.268621,0.345052,4
TCGA.3C.AALJ.01,0.003059,0.405308,0.103201,0.098571,-0.706498,-1.643561,-0.661117,-1.342185,1.469986,-0.571499,...,-0.319977,-0.301093,-0.477515,-0.867769,-0.872324,0.039191,-1.42104,-0.811368,-1.357237,2
TCGA.3C.AALK.01,-0.258705,0.09691,-0.16913,-0.174763,0.199712,0.570857,-0.422884,-0.15621,-0.017618,-0.402285,...,-0.316675,-0.297884,-0.267586,0.055935,0.058733,-0.25713,-1.377441,0.098802,0.042601,1
TCGA.5L.AAT0.01,-0.083232,0.060782,-2.090765,-2.103474,0.37657,0.631333,-0.35066,-0.101592,0.100904,-0.312965,...,-0.307593,-0.289062,-0.200167,0.238663,0.242915,-0.185487,-0.999581,0.204322,0.078553,1
TCGA.5T.A9QA.01,-0.486242,2.050968,-0.216362,-0.222169,-2.334753,2.749902,-0.173333,-0.175416,0.011759,-0.110984,...,-0.399235,-0.392526,-0.141518,-2.45795,-2.475162,-1.970238,1.646581,-1.930651,-6.99425,2
TCGA.A1.A0SB.01,-0.111192,0.154506,-0.062751,-0.067992,0.400443,0.620683,-0.273585,0.005842,0.08419,-0.226338,...,-0.179623,-0.164744,-0.084514,0.258798,0.26321,-0.112381,0.84841,0.307432,0.189832,0
TCGA.A1.A0SE.01,-0.102997,0.018894,-0.056368,-0.061585,0.225047,0.497449,-0.151234,0.145686,-1.943348,-0.146892,...,-0.121831,-0.108601,0.039909,0.082111,0.085117,-0.106533,-0.727278,0.230822,0.289698,1
TCGA.A1.A0SF.01,0.687116,1.131535,0.857217,0.855364,1.226264,0.387146,-0.508045,1.277645,-1.716941,-0.444027,...,-0.210171,-0.19442,-0.356381,1.164884,1.176508,0.704939,-0.501251,-1.27874,-0.088652,1
TCGA.A1.A0SG.01,-0.213873,0.145081,-0.016369,-0.021439,0.390699,-0.143066,-0.384077,-0.019366,0.046708,-0.340793,...,-0.254754,-0.237731,-0.196879,0.203426,0.207398,-0.262491,0.866003,0.290087,0.104803,1


In [None]:
# Print the sample count of each encoded class, one per line, for labels 0..4.
for class_id in range(5):
    print(copy_number_variation_data["Label"].value_counts()[class_id])

31
353
132
113
42


## BRCA_miRNA

In [None]:
# Before transpose
miRNA_data.shape

(368, 671)

In [None]:
# Transpose miRNA_data: swap rows and columns of the (368, 671) table.
# NOTE: the result is assigned back to the same name, so running this cell a
# second time transposes the frame back to its original orientation.
miRNA_data = miRNA_data.transpose()

In [None]:
# After transpose
miRNA_data.head(10)

Unnamed: 0,hsa.let.7a.1,hsa.let.7a.2,hsa.let.7a.3,hsa.let.7b,hsa.let.7c,hsa.let.7d,hsa.let.7e,hsa.let.7f.1,hsa.let.7f.2,hsa.let.7g,...,hsa.mir.935,hsa.mir.937,hsa.mir.939,hsa.mir.940,hsa.mir.942,hsa.mir.95,hsa.mir.96,hsa.mir.98,hsa.mir.99a,hsa.mir.99b
TCGA.3C.AAAU.01,0.068317,0.068932,0.073899,0.524562,-1.656853,-0.038283,0.501125,-2.390084,-2.406331,-0.760042,...,1.79547,0.108421,0.179279,1.768897,0.226659,0.943346,1.552613,-0.143572,-1.251868,1.603022
TCGA.3C.AALI.01,-0.301684,-0.318009,-0.30131,0.419859,-0.715963,0.460975,-1.999304,-0.659788,-0.651943,-1.050266,...,-0.409131,0.569264,0.188875,-0.453994,0.804807,-0.828843,0.599794,1.103662,-0.627587,-1.506825
TCGA.3C.AALJ.01,-0.15081,-0.122747,-0.126333,-0.958939,-0.971038,0.866585,2.074809,1.080746,1.088164,0.523204,...,0.742,1.95238,-1.226285,-0.28169,-0.314543,-0.487207,1.261482,1.940594,-0.848552,0.787948
TCGA.3C.AALK.01,0.107831,0.097594,0.095545,0.615389,0.711952,-0.454282,0.227441,-0.735552,-0.686623,-1.015715,...,-0.715237,0.119549,-0.340371,-0.58699,-0.36437,-0.539524,0.856983,-0.831098,0.936474,0.030739
TCGA.5L.AAT0.01,0.395211,0.412879,0.418441,0.500594,0.426323,-1.545556,-0.952282,-0.280966,-0.233746,-0.818574,...,-0.925821,-0.24895,-1.43912,-0.707823,-0.109361,-0.490655,1.188748,-0.885905,0.449284,-0.645458
TCGA.5T.A9QA.01,1.135212,1.140183,1.143605,1.159087,-1.05887,0.348033,1.610267,-0.141723,-0.152147,1.243074,...,2.168396,-0.531098,-0.001485,-0.04238,2.139282,1.716711,0.37587,-0.922881,-1.316179,0.931785
TCGA.A1.A0SB.01,1.813887,1.790792,1.822272,2.113821,2.135373,1.844765,-0.469603,0.837358,0.798862,2.111705,...,-1.03524,-1.000344,-0.912012,-0.327511,0.306851,-0.311971,0.297224,0.390134,2.106744,0.319827
TCGA.A1.A0SE.01,0.473642,0.442224,0.431312,0.812876,1.031809,-0.453824,-0.611333,0.169462,0.184119,-0.511843,...,0.092706,-1.280723,-0.33227,-0.426424,-0.629303,-0.207033,0.066368,-0.330246,1.078635,0.089918
TCGA.A1.A0SF.01,-0.470612,-0.461217,-0.470734,-0.129307,0.864211,-1.070594,-0.571769,-0.000266,0.041877,-1.315689,...,-0.785645,-1.139447,-0.259122,-0.50587,-0.260053,-0.411471,0.268172,-0.481707,1.25571,-0.443763
TCGA.A1.A0SG.01,0.791968,0.841821,0.805374,0.867411,0.071249,0.499824,0.505744,0.85945,0.851887,0.16278,...,0.075832,1.512383,-1.105097,0.315259,-0.42001,1.331634,0.155349,0.490018,-0.062945,1.220345


In [None]:
miRNA_data.shape

(671, 368)

In [None]:
# Attach the subtype label to every row of the transposed table.
# `.values` drops the label index, so labels are matched to rows purely by
# position; the two row counts therefore have to agree.
if miRNA_data.shape[0] != brca_Label.shape[0]:
    raise ValueError(
        f"miRNA_data has {miRNA_data.shape[0]} rows but brca_Label has "
        f"{brca_Label.shape[0]} labels; the two counts must match."
    )

# Add the new column to the transposed DataFrame
miRNA_data['Label'] = brca_Label['Label'].values

In [None]:
miRNA_data.shape

(671, 369)

In [None]:
# Check whether the BRCA_miRNA dataset contains any null values after the
# Label column was added (True if at least one cell is null).
miRNA_data.isnull().any().any()

False

In [None]:
miRNA_data.head(10)

Unnamed: 0,hsa.let.7a.1,hsa.let.7a.2,hsa.let.7a.3,hsa.let.7b,hsa.let.7c,hsa.let.7d,hsa.let.7e,hsa.let.7f.1,hsa.let.7f.2,hsa.let.7g,...,hsa.mir.937,hsa.mir.939,hsa.mir.940,hsa.mir.942,hsa.mir.95,hsa.mir.96,hsa.mir.98,hsa.mir.99a,hsa.mir.99b,Label
TCGA.3C.AAAU.01,0.068317,0.068932,0.073899,0.524562,-1.656853,-0.038283,0.501125,-2.390084,-2.406331,-0.760042,...,0.108421,0.179279,1.768897,0.226659,0.943346,1.552613,-0.143572,-1.251868,1.603022,1
TCGA.3C.AALI.01,-0.301684,-0.318009,-0.30131,0.419859,-0.715963,0.460975,-1.999304,-0.659788,-0.651943,-1.050266,...,0.569264,0.188875,-0.453994,0.804807,-0.828843,0.599794,1.103662,-0.627587,-1.506825,4
TCGA.3C.AALJ.01,-0.15081,-0.122747,-0.126333,-0.958939,-0.971038,0.866585,2.074809,1.080746,1.088164,0.523204,...,1.95238,-1.226285,-0.28169,-0.314543,-0.487207,1.261482,1.940594,-0.848552,0.787948,2
TCGA.3C.AALK.01,0.107831,0.097594,0.095545,0.615389,0.711952,-0.454282,0.227441,-0.735552,-0.686623,-1.015715,...,0.119549,-0.340371,-0.58699,-0.36437,-0.539524,0.856983,-0.831098,0.936474,0.030739,1
TCGA.5L.AAT0.01,0.395211,0.412879,0.418441,0.500594,0.426323,-1.545556,-0.952282,-0.280966,-0.233746,-0.818574,...,-0.24895,-1.43912,-0.707823,-0.109361,-0.490655,1.188748,-0.885905,0.449284,-0.645458,1
TCGA.5T.A9QA.01,1.135212,1.140183,1.143605,1.159087,-1.05887,0.348033,1.610267,-0.141723,-0.152147,1.243074,...,-0.531098,-0.001485,-0.04238,2.139282,1.716711,0.37587,-0.922881,-1.316179,0.931785,2
TCGA.A1.A0SB.01,1.813887,1.790792,1.822272,2.113821,2.135373,1.844765,-0.469603,0.837358,0.798862,2.111705,...,-1.000344,-0.912012,-0.327511,0.306851,-0.311971,0.297224,0.390134,2.106744,0.319827,0
TCGA.A1.A0SE.01,0.473642,0.442224,0.431312,0.812876,1.031809,-0.453824,-0.611333,0.169462,0.184119,-0.511843,...,-1.280723,-0.33227,-0.426424,-0.629303,-0.207033,0.066368,-0.330246,1.078635,0.089918,1
TCGA.A1.A0SF.01,-0.470612,-0.461217,-0.470734,-0.129307,0.864211,-1.070594,-0.571769,-0.000266,0.041877,-1.315689,...,-1.139447,-0.259122,-0.50587,-0.260053,-0.411471,0.268172,-0.481707,1.25571,-0.443763,1
TCGA.A1.A0SG.01,0.791968,0.841821,0.805374,0.867411,0.071249,0.499824,0.505744,0.85945,0.851887,0.16278,...,1.512383,-1.105097,0.315259,-0.42001,1.331634,0.155349,0.490018,-0.062945,1.220345,1


In [None]:
# Print the sample count of each encoded class, one per line, for labels 0..4.
for class_id in range(5):
    print(miRNA_data["Label"].value_counts()[class_id])

31
353
132
113
42


## BRCA_DNA_Methylation

In [None]:
# Before transpose
DNA_Methylation_data.shape

(19049, 671)

In [None]:
# Transpose DNA_Methylation_data: swap rows and columns of the (19049, 671) table.
# NOTE: the result is assigned back to the same name, so running this cell a
# second time transposes the frame back to its original orientation.
DNA_Methylation_data = DNA_Methylation_data.transpose()

In [None]:
# After transpose
DNA_Methylation_data.head(10)

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,pk
TCGA.3C.AAAU.01,-1.211396,-1.801239,-0.696763,0.832338,0.857098,-0.695053,-0.258891,0.601832,-0.100803,-5.232067,...,0.835761,-2.172061,-0.184735,0.821827,-0.842599,1.079773,0.780058,0.747015,0.378339,0.032275
TCGA.3C.AALI.01,0.974531,-1.091976,-1.597015,0.758754,0.450506,1.570301,0.330579,-0.912745,-0.682832,-1.232277,...,0.664052,-0.897618,-1.043726,0.746737,-0.188174,1.107008,0.404568,0.107306,0.271888,-0.147645
TCGA.3C.AALJ.01,0.633283,-0.258825,-0.873714,0.753608,0.82133,-0.36945,-0.315881,0.098281,-5.269406,-2.441139,...,-0.092097,-0.403899,0.163509,0.991162,-0.337345,1.143368,-0.583163,0.034379,-0.170242,0.748945
TCGA.3C.AALK.01,0.354642,0.344929,-1.080573,0.548382,0.497375,1.342464,-0.518794,-0.274796,-0.187178,-0.23207,...,0.277971,0.752473,0.722295,0.352692,-0.606283,1.327138,0.04715,0.014622,0.484366,0.17311
TCGA.5L.AAT0.01,-1.939963,-0.018208,-0.788607,0.589277,0.400632,1.553188,-0.299152,0.39116,-0.41044,-1.676137,...,-0.008991,0.693802,0.21161,0.025049,-0.391619,0.825476,0.272223,0.298979,0.029261,-0.424061
TCGA.5T.A9QA.01,1.328608,-1.007907,-0.583001,0.870132,1.183863,-0.876086,0.562873,3.94035,-0.42269,-1.472467,...,0.351988,1.575007,1.176298,0.677674,5.591173,1.014806,1.304881,1.151613,0.970602,2.038508
TCGA.A1.A0SB.01,0.94601,1.246215,-0.654375,-1.878898,-3.31677,-1.168542,0.646204,-0.857682,-0.588374,0.997476,...,-0.268241,0.144638,0.958181,-0.756234,-1.033358,-0.726589,-0.084193,-0.621557,-0.468274,-0.932571
TCGA.A1.A0SE.01,0.930281,0.360814,0.302439,0.160075,0.227579,1.418889,0.812077,0.5373,0.343098,0.563539,...,-0.279765,0.158648,-0.079927,-0.148292,-0.781399,0.116715,-0.078364,-0.887355,-0.221699,0.062171
TCGA.A1.A0SF.01,-0.519615,-0.426866,-1.561816,0.066117,0.051009,-0.012446,-0.205157,-0.53217,0.397713,0.714475,...,-0.642373,-0.263347,-0.086925,-0.473521,-0.11266,-0.967701,-0.585443,-0.625655,-0.73837,-0.91494
TCGA.A1.A0SG.01,0.554409,-1.357345,-0.836623,-0.338768,0.300039,-0.980884,-2.506433,-0.666725,0.2346,0.57141,...,-0.88378,0.150851,-0.688724,-0.388183,-0.283091,-0.634965,-0.713563,0.09445,-0.381204,0.623692


In [None]:
DNA_Methylation_data.shape

(671, 19049)

In [None]:
# Attach the class label to every sample row of DNA_Methylation_data.
# The labels are copied by position (``.values`` discards the index), so this
# assumes brca_Label lists the samples in the same order as the rows here
# - TODO confirm against the sample IDs.
n_samples = DNA_Methylation_data.shape[0]
n_labels = brca_Label.shape[0]
if n_samples != n_labels:
    # A mismatch means the frame is not in samples-as-rows layout (or the
    # label file belongs to a different cohort); fail with the actual counts.
    raise ValueError(
        f"DNA_Methylation_data has {n_samples} rows but brca_Label has "
        f"{n_labels} labels; the row count must equal the number of labels."
    )
DNA_Methylation_data['Label'] = brca_Label['Label'].values

In [None]:
DNA_Methylation_data.shape

(671, 19050)

In [None]:
# True if any cell of the DNA methylation frame is missing.
DNA_Methylation_data.isna().any(axis=None)

False

In [None]:
DNA_Methylation_data.head(10)

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,pk,Label
TCGA.3C.AAAU.01,-1.211396,-1.801239,-0.696763,0.832338,0.857098,-0.695053,-0.258891,0.601832,-0.100803,-5.232067,...,-2.172061,-0.184735,0.821827,-0.842599,1.079773,0.780058,0.747015,0.378339,0.032275,1
TCGA.3C.AALI.01,0.974531,-1.091976,-1.597015,0.758754,0.450506,1.570301,0.330579,-0.912745,-0.682832,-1.232277,...,-0.897618,-1.043726,0.746737,-0.188174,1.107008,0.404568,0.107306,0.271888,-0.147645,4
TCGA.3C.AALJ.01,0.633283,-0.258825,-0.873714,0.753608,0.82133,-0.36945,-0.315881,0.098281,-5.269406,-2.441139,...,-0.403899,0.163509,0.991162,-0.337345,1.143368,-0.583163,0.034379,-0.170242,0.748945,2
TCGA.3C.AALK.01,0.354642,0.344929,-1.080573,0.548382,0.497375,1.342464,-0.518794,-0.274796,-0.187178,-0.23207,...,0.752473,0.722295,0.352692,-0.606283,1.327138,0.04715,0.014622,0.484366,0.17311,1
TCGA.5L.AAT0.01,-1.939963,-0.018208,-0.788607,0.589277,0.400632,1.553188,-0.299152,0.39116,-0.41044,-1.676137,...,0.693802,0.21161,0.025049,-0.391619,0.825476,0.272223,0.298979,0.029261,-0.424061,1
TCGA.5T.A9QA.01,1.328608,-1.007907,-0.583001,0.870132,1.183863,-0.876086,0.562873,3.94035,-0.42269,-1.472467,...,1.575007,1.176298,0.677674,5.591173,1.014806,1.304881,1.151613,0.970602,2.038508,2
TCGA.A1.A0SB.01,0.94601,1.246215,-0.654375,-1.878898,-3.31677,-1.168542,0.646204,-0.857682,-0.588374,0.997476,...,0.144638,0.958181,-0.756234,-1.033358,-0.726589,-0.084193,-0.621557,-0.468274,-0.932571,0
TCGA.A1.A0SE.01,0.930281,0.360814,0.302439,0.160075,0.227579,1.418889,0.812077,0.5373,0.343098,0.563539,...,0.158648,-0.079927,-0.148292,-0.781399,0.116715,-0.078364,-0.887355,-0.221699,0.062171,1
TCGA.A1.A0SF.01,-0.519615,-0.426866,-1.561816,0.066117,0.051009,-0.012446,-0.205157,-0.53217,0.397713,0.714475,...,-0.263347,-0.086925,-0.473521,-0.11266,-0.967701,-0.585443,-0.625655,-0.73837,-0.91494,1
TCGA.A1.A0SG.01,0.554409,-1.357345,-0.836623,-0.338768,0.300039,-0.980884,-2.506433,-0.666725,0.2346,0.57141,...,0.150851,-0.688724,-0.388183,-0.283091,-0.634965,-0.713563,0.09445,-0.381204,0.623692,1


In [None]:
# Per-class sample counts of the DNA methylation frame, printed for keys 0 through 4.
dna_label_counts = DNA_Methylation_data["Label"].value_counts()
for class_id in range(5):
    print(dna_label_counts[class_id])

31
353
132
113
42


## BRCA_mRNA

In [None]:
# Before transpose
mrna_data.shape

(18206, 671)

In [None]:
# Transpose mrna_data so that each row is one sample.
# NOTE: not idempotent - running this cell a second time flips the frame back.
mrna_data = mrna_data.T

In [None]:
# After transpose
mrna_data.head(10)

Unnamed: 0,A1BG,A1BG.AS1,A2M,A2M.AS1,A2ML1,A2MP1,A4GALT,AAAS,AACS,AADAT,...,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3
TCGA.3C.AAAU.01,1.389927,0.436758,-0.876187,1.607818,-0.415138,-0.374641,-1.727152,-0.56253,0.225676,-1.337786,...,0.38225,0.445598,2.359316,2.242234,1.190506,1.753777,2.013674,-0.424991,1.214399,1.66734
TCGA.3C.AALI.01,1.522872,1.815091,-0.508085,0.863019,-0.406608,-0.36123,-0.964244,0.525889,-0.590749,-0.72495,...,2.927253,0.455835,0.5139,0.17977,1.588651,1.298863,-0.140439,0.635518,0.302547,-1.162286
TCGA.3C.AALJ.01,1.788271,0.580557,-0.258138,-0.491954,-0.455969,0.855983,0.750127,-0.401461,1.077314,0.10188,...,-0.38182,1.900756,-0.505798,0.658038,-1.008467,2.105007,-0.334864,0.573943,-0.379244,-0.547104
TCGA.3C.AALK.01,-0.288543,-0.250194,-0.030804,-0.579638,-0.464938,-0.313092,0.480557,-0.426938,-0.739098,-0.091341,...,-0.029094,-0.210541,-0.185209,-0.06678,-0.759323,0.916528,-0.278901,0.51552,-0.939663,0.057166
TCGA.5L.AAT0.01,-0.46268,-0.091285,0.28997,0.572883,-0.438571,-0.090844,0.179004,0.685542,-0.623465,-0.421059,...,-0.232629,-0.709315,0.770811,1.161026,0.369677,-1.152811,0.075213,-0.100236,-0.485591,1.268702
TCGA.5T.A9QA.01,-0.451169,-0.688598,-1.641876,-0.836306,-0.479607,-1.01492,0.420826,0.697997,0.669477,1.06754,...,0.845089,1.473857,-0.869225,-0.460787,-0.008572,-1.553161,-1.397447,-0.472842,0.190737,-0.622851
TCGA.A1.A0SB.01,-0.805973,-1.037605,1.534552,0.099614,-0.444955,1.384198,0.260066,0.127438,1.339599,1.377738,...,0.051136,-1.22728,0.95557,-0.08316,1.287459,-1.086313,1.450555,1.176967,1.560267,0.954347
TCGA.A1.A0SE.01,-0.472368,0.658661,0.436717,-0.059346,-0.539629,-0.508012,0.05895,-0.117362,-0.443098,-0.566328,...,0.09178,0.231947,0.830896,0.605248,-0.282021,-0.380918,0.230984,-0.169919,-0.492343,0.581995
TCGA.A1.A0SF.01,0.112737,0.627872,0.476992,-0.092982,-0.408042,-0.264776,0.67689,1.583352,-1.331414,-1.010971,...,-1.173102,-0.059546,0.746254,0.15886,-0.112952,0.07701,0.050613,0.889745,-0.513877,-1.422932
TCGA.A1.A0SG.01,-0.39748,0.426477,0.147622,-0.666693,-0.457548,-0.3432,1.261371,0.208439,0.918643,-0.671109,...,-0.69979,-0.359577,-0.703121,-0.338345,0.205879,-1.682207,-0.605257,-0.606344,1.306838,-1.189883


In [None]:
mrna_data.shape

(671, 18206)

In [None]:
brca_Label.shape

(671, 1)

In [None]:
# Attach the class label to every sample row of mrna_data.
# The labels are copied by position (``.values`` discards the index), so this
# assumes brca_Label lists the samples in the same order as the rows here
# - TODO confirm against the sample IDs.
n_samples = mrna_data.shape[0]
n_labels = brca_Label.shape[0]
if n_samples != n_labels:
    # A mismatch means the frame is not in samples-as-rows layout (or the
    # label file belongs to a different cohort); fail with the actual counts.
    raise ValueError(
        f"mrna_data has {n_samples} rows but brca_Label has "
        f"{n_labels} labels; the row count must equal the number of labels."
    )
mrna_data['Label'] = brca_Label['Label'].values

In [None]:
mrna_data.shape

(671, 18207)

In [None]:
# True if any cell of the mRNA frame is missing.
mrna_data.isna().any(axis=None)

False

In [None]:
mrna_data.head(10)

Unnamed: 0,A1BG,A1BG.AS1,A2M,A2M.AS1,A2ML1,A2MP1,A4GALT,AAAS,AACS,AADAT,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,Label
TCGA.3C.AAAU.01,1.389927,0.436758,-0.876187,1.607818,-0.415138,-0.374641,-1.727152,-0.56253,0.225676,-1.337786,...,0.445598,2.359316,2.242234,1.190506,1.753777,2.013674,-0.424991,1.214399,1.66734,1
TCGA.3C.AALI.01,1.522872,1.815091,-0.508085,0.863019,-0.406608,-0.36123,-0.964244,0.525889,-0.590749,-0.72495,...,0.455835,0.5139,0.17977,1.588651,1.298863,-0.140439,0.635518,0.302547,-1.162286,4
TCGA.3C.AALJ.01,1.788271,0.580557,-0.258138,-0.491954,-0.455969,0.855983,0.750127,-0.401461,1.077314,0.10188,...,1.900756,-0.505798,0.658038,-1.008467,2.105007,-0.334864,0.573943,-0.379244,-0.547104,2
TCGA.3C.AALK.01,-0.288543,-0.250194,-0.030804,-0.579638,-0.464938,-0.313092,0.480557,-0.426938,-0.739098,-0.091341,...,-0.210541,-0.185209,-0.06678,-0.759323,0.916528,-0.278901,0.51552,-0.939663,0.057166,1
TCGA.5L.AAT0.01,-0.46268,-0.091285,0.28997,0.572883,-0.438571,-0.090844,0.179004,0.685542,-0.623465,-0.421059,...,-0.709315,0.770811,1.161026,0.369677,-1.152811,0.075213,-0.100236,-0.485591,1.268702,1
TCGA.5T.A9QA.01,-0.451169,-0.688598,-1.641876,-0.836306,-0.479607,-1.01492,0.420826,0.697997,0.669477,1.06754,...,1.473857,-0.869225,-0.460787,-0.008572,-1.553161,-1.397447,-0.472842,0.190737,-0.622851,2
TCGA.A1.A0SB.01,-0.805973,-1.037605,1.534552,0.099614,-0.444955,1.384198,0.260066,0.127438,1.339599,1.377738,...,-1.22728,0.95557,-0.08316,1.287459,-1.086313,1.450555,1.176967,1.560267,0.954347,0
TCGA.A1.A0SE.01,-0.472368,0.658661,0.436717,-0.059346,-0.539629,-0.508012,0.05895,-0.117362,-0.443098,-0.566328,...,0.231947,0.830896,0.605248,-0.282021,-0.380918,0.230984,-0.169919,-0.492343,0.581995,1
TCGA.A1.A0SF.01,0.112737,0.627872,0.476992,-0.092982,-0.408042,-0.264776,0.67689,1.583352,-1.331414,-1.010971,...,-0.059546,0.746254,0.15886,-0.112952,0.07701,0.050613,0.889745,-0.513877,-1.422932,1
TCGA.A1.A0SG.01,-0.39748,0.426477,0.147622,-0.666693,-0.457548,-0.3432,1.261371,0.208439,0.918643,-0.671109,...,-0.359577,-0.703121,-0.338345,0.205879,-1.682207,-0.605257,-0.606344,1.306838,-1.189883,1


In [None]:
# Per-class sample counts of the mRNA frame, printed for keys 0 through 4.
mrna_label_counts = mrna_data["Label"].value_counts()
for class_id in range(5):
    print(mrna_label_counts[class_id])

31
353
132
113
42


## BRCA_Multi_Omics

In [None]:
# Before transpose
multi_omics_data.shape

(57191, 671)

In [None]:
# Transpose multi_omics_data so that each row is one sample.
# NOTE: not idempotent - running this cell a second time flips the frame back.
multi_omics_data = multi_omics_data.T

In [None]:
# After transpose
multi_omics_data.head(10)

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,pk
TCGA.3C.AAAU.01,0.495252,-0.938762,0.750838,0.748593,1.093743,-0.876388,1.10946,-1.966983,-1.67642,1.090124,...,0.835761,-2.172061,-0.184735,0.821827,-0.842599,1.079773,0.780058,0.747015,0.378339,0.032275
TCGA.3C.AALI.01,1.028904,-1.575977,0.010864,0.005894,-1.57568,-0.967673,1.481901,-0.040373,-0.027748,1.213108,...,0.664052,-0.897618,-1.043726,0.746737,-0.188174,1.107008,0.404568,0.107306,0.271888,-0.147645
TCGA.3C.AALJ.01,0.003059,0.405308,0.103201,0.098571,-0.706498,-1.643561,-0.661117,-1.342185,1.469986,-0.571499,...,-0.092097,-0.403899,0.163509,0.991162,-0.337345,1.143368,-0.583163,0.034379,-0.170242,0.748945
TCGA.3C.AALK.01,-0.258705,0.09691,-0.16913,-0.174763,0.199712,0.570857,-0.422884,-0.15621,-0.017618,-0.402285,...,0.277971,0.752473,0.722295,0.352692,-0.606283,1.327138,0.04715,0.014622,0.484366,0.17311
TCGA.5L.AAT0.01,-0.083232,0.060782,-2.090765,-2.103474,0.37657,0.631333,-0.35066,-0.101592,0.100904,-0.312965,...,-0.008991,0.693802,0.21161,0.025049,-0.391619,0.825476,0.272223,0.298979,0.029261,-0.424061
TCGA.5T.A9QA.01,-0.486242,2.050968,-0.216362,-0.222169,-2.334753,2.749902,-0.173333,-0.175416,0.011759,-0.110984,...,0.351988,1.575007,1.176298,0.677674,5.591173,1.014806,1.304881,1.151613,0.970602,2.038508
TCGA.A1.A0SB.01,-0.111192,0.154506,-0.062751,-0.067992,0.400443,0.620683,-0.273585,0.005842,0.08419,-0.226338,...,-0.268241,0.144638,0.958181,-0.756234,-1.033358,-0.726589,-0.084193,-0.621557,-0.468274,-0.932571
TCGA.A1.A0SE.01,-0.102997,0.018894,-0.056368,-0.061585,0.225047,0.497449,-0.151234,0.145686,-1.943348,-0.146892,...,-0.279765,0.158648,-0.079927,-0.148292,-0.781399,0.116715,-0.078364,-0.887355,-0.221699,0.062171
TCGA.A1.A0SF.01,0.687116,1.131535,0.857217,0.855364,1.226264,0.387146,-0.508045,1.277645,-1.716941,-0.444027,...,-0.642373,-0.263347,-0.086925,-0.473521,-0.11266,-0.967701,-0.585443,-0.625655,-0.73837,-0.91494
TCGA.A1.A0SG.01,-0.213873,0.145081,-0.016369,-0.021439,0.390699,-0.143066,-0.384077,-0.019366,0.046708,-0.340793,...,-0.88378,0.150851,-0.688724,-0.388183,-0.283091,-0.634965,-0.713563,0.09445,-0.381204,0.623692


In [None]:
multi_omics_data.shape

(671, 57191)

In [None]:
brca_Label.shape

(671, 1)

In [None]:
# Attach the class label to every sample row of multi_omics_data.
# The labels are copied by position (``.values`` discards the index), so this
# assumes brca_Label lists the samples in the same order as the rows here
# - TODO confirm against the sample IDs.
n_samples = multi_omics_data.shape[0]
n_labels = brca_Label.shape[0]
if n_samples != n_labels:
    # A mismatch means the frame is not in samples-as-rows layout (or the
    # label file belongs to a different cohort); fail with the actual counts.
    raise ValueError(
        f"multi_omics_data has {n_samples} rows but brca_Label has "
        f"{n_labels} labels; the row count must equal the number of labels."
    )
multi_omics_data['Label'] = brca_Label['Label'].values

In [None]:
multi_omics_data.shape

(671, 57192)

In [None]:
# True if any cell of the multi-omics frame is missing.
multi_omics_data.isna().any(axis=None)

False

In [None]:
multi_omics_data.head(10)

Unnamed: 0,A1BG,A1CF,A2M,A2ML1,A3GALT2,A4GALT,A4GNT,AAAS,AACS,AADAC,...,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,pk,Label
TCGA.3C.AAAU.01,0.495252,-0.938762,0.750838,0.748593,1.093743,-0.876388,1.10946,-1.966983,-1.67642,1.090124,...,-2.172061,-0.184735,0.821827,-0.842599,1.079773,0.780058,0.747015,0.378339,0.032275,LumA
TCGA.3C.AALI.01,1.028904,-1.575977,0.010864,0.005894,-1.57568,-0.967673,1.481901,-0.040373,-0.027748,1.213108,...,-0.897618,-1.043726,0.746737,-0.188174,1.107008,0.404568,0.107306,0.271888,-0.147645,Her2
TCGA.3C.AALJ.01,0.003059,0.405308,0.103201,0.098571,-0.706498,-1.643561,-0.661117,-1.342185,1.469986,-0.571499,...,-0.403899,0.163509,0.991162,-0.337345,1.143368,-0.583163,0.034379,-0.170242,0.748945,LumB
TCGA.3C.AALK.01,-0.258705,0.09691,-0.16913,-0.174763,0.199712,0.570857,-0.422884,-0.15621,-0.017618,-0.402285,...,0.752473,0.722295,0.352692,-0.606283,1.327138,0.04715,0.014622,0.484366,0.17311,LumA
TCGA.5L.AAT0.01,-0.083232,0.060782,-2.090765,-2.103474,0.37657,0.631333,-0.35066,-0.101592,0.100904,-0.312965,...,0.693802,0.21161,0.025049,-0.391619,0.825476,0.272223,0.298979,0.029261,-0.424061,LumA
TCGA.5T.A9QA.01,-0.486242,2.050968,-0.216362,-0.222169,-2.334753,2.749902,-0.173333,-0.175416,0.011759,-0.110984,...,1.575007,1.176298,0.677674,5.591173,1.014806,1.304881,1.151613,0.970602,2.038508,LumB
TCGA.A1.A0SB.01,-0.111192,0.154506,-0.062751,-0.067992,0.400443,0.620683,-0.273585,0.005842,0.08419,-0.226338,...,0.144638,0.958181,-0.756234,-1.033358,-0.726589,-0.084193,-0.621557,-0.468274,-0.932571,Normal
TCGA.A1.A0SE.01,-0.102997,0.018894,-0.056368,-0.061585,0.225047,0.497449,-0.151234,0.145686,-1.943348,-0.146892,...,0.158648,-0.079927,-0.148292,-0.781399,0.116715,-0.078364,-0.887355,-0.221699,0.062171,LumA
TCGA.A1.A0SF.01,0.687116,1.131535,0.857217,0.855364,1.226264,0.387146,-0.508045,1.277645,-1.716941,-0.444027,...,-0.263347,-0.086925,-0.473521,-0.11266,-0.967701,-0.585443,-0.625655,-0.73837,-0.91494,LumA
TCGA.A1.A0SG.01,-0.213873,0.145081,-0.016369,-0.021439,0.390699,-0.143066,-0.384077,-0.019366,0.046708,-0.340793,...,0.150851,-0.688724,-0.388183,-0.283091,-0.634965,-0.713563,0.09445,-0.381204,0.623692,LumA


In [None]:
# Sample counts of the five most frequent classes in the multi-omics frame.
# The labels shown for this frame are strings (e.g. "LumA"), so an integer key
# such as value_counts()[0] only works through pandas' deprecated fallback to
# positional indexing. .iloc makes the positional access explicit and yields
# the same most-frequent-first order.
multi_omics_label_counts = multi_omics_data["Label"].value_counts()
for class_count in multi_omics_label_counts.iloc[:5]:
    print(class_count)

353
132
113
42
31


# Feature Selection

## BRCA_Copy Number Variation

### Training of SVM

In [None]:
# Hold out 30% of the copy-number-variation samples; random_state is fixed so
# the split is repeatable.
target = "Label"
varNumber_train_data, varNumber_test_data = train_test_split(
    copy_number_variation_data, test_size=0.30, random_state=1
)

# Feature matrices: numeric columns only, with the target removed explicitly.
# select_dtypes alone keeps an integer-coded "Label" column, which would hand
# the answer to the SVM/RFE step as an ordinary feature (target leakage).
# errors="ignore" covers the case where the label is non-numeric and has
# already been filtered out.
varNumber_X_train = varNumber_train_data.select_dtypes(include=["number"]).drop(columns=[target], errors="ignore")
varNumber_y_train = varNumber_train_data[target]
varNumber_X_test = varNumber_test_data.select_dtypes(include=["number"]).drop(columns=[target], errors="ignore")
varNumber_y_test = varNumber_test_data[target]

In [None]:
# Report the shape of each copy-number-variation split.
for tag, split in (
    ("X train :", varNumber_X_train),
    ("X test :", varNumber_X_test),
    ("Y train :", varNumber_y_train),
    ("Y test :", varNumber_y_test),
):
    print(tag, split.shape)

X train : (469, 19569)
X test : (202, 19569)
Y train : (469,)
Y test : (202,)


In [None]:
# Linear-kernel SVM: the base estimator whose weights RFE uses to rank features.
model = SVC(kernel="linear")

# Recursive feature elimination down to 19000 columns. A step below 1 is read
# by scikit-learn as the fraction of features to drop per iteration.
rfe = RFE(model, n_features_to_select=19000, step=0.05).fit(
    varNumber_X_train, varNumber_y_train
)
rfe  # show the fitted selector as the cell output


In [None]:
# Keep only the RFE-selected columns in both splits (returns NumPy arrays).
X_train_rfe_varNumber, X_test_rfe_varNumber = (
    rfe.transform(split) for split in (varNumber_X_train, varNumber_X_test)
)

In [None]:
print(X_train_rfe_varNumber.shape)
print(X_test_rfe_varNumber.shape)

(469, 19000)
(202, 19000)


### Result

In [None]:
# Names of the columns RFE kept (boolean mask over the training columns).
# Relies on `rfe` still being the selector fitted on the copy-number data.
selected_features_varNumber = varNumber_X_train.columns[rfe.support_]

# Wrap the selected arrays back into DataFrames, restoring sample ids and
# feature names.
X_train_rfe_varNumber = pd.DataFrame(
    data=X_train_rfe_varNumber,
    index=varNumber_X_train.index,
    columns=selected_features_varNumber,
)
X_test_rfe_varNumber = pd.DataFrame(
    data=X_test_rfe_varNumber,
    index=varNumber_X_test.index,
    columns=selected_features_varNumber,
)

In [None]:
# Save the dataframes
X_train_rfe_varNumber.to_pickle('X_train_rfe_copy_number_variation.pkl')
X_test_rfe_varNumber.to_pickle('X_test_rfe_copy_number_variation.pkl')

In [None]:
# Persist the selected copy-number-variation features to Google Drive.
from google.colab import drive

drive.mount('/content/gdrive')

varNumber_drive_targets = {
    '/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_train_rfe_copy_number_variation.pkl': X_train_rfe_varNumber,
    '/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_test_rfe_copy_number_variation.pkl': X_test_rfe_varNumber,
}
for pickle_path, frame in varNumber_drive_targets.items():
    frame.to_pickle(pickle_path)


## BRCA_miRNA

### Training of SVM

In [None]:
# Hold out 30% of the miRNA samples; random_state is fixed so the split is
# repeatable.
target = "Label"
miRNA_train_data, miRNA_test_data = train_test_split(
    miRNA_data, test_size=0.30, random_state=1
)

# Feature matrices: numeric columns only, with the target removed explicitly.
# select_dtypes alone keeps the integer-coded "Label" column, which would hand
# the answer to the SVM/RFE step as an ordinary feature (target leakage).
# errors="ignore" covers the case where the label is non-numeric and has
# already been filtered out.
miRNA_X_train = miRNA_train_data.select_dtypes(include=["number"]).drop(columns=[target], errors="ignore")
miRNA_y_train = miRNA_train_data[target]
miRNA_X_test = miRNA_test_data.select_dtypes(include=["number"]).drop(columns=[target], errors="ignore")
miRNA_y_test = miRNA_test_data[target]

In [None]:
# Report the shape of each miRNA split.
for tag, split in (
    ("X train :", miRNA_X_train),
    ("X test :", miRNA_X_test),
    ("Y train :", miRNA_y_train),
    ("Y test :", miRNA_y_test),
):
    print(tag, split.shape)

X train : (469, 369)
X test : (202, 369)
Y train : (469,)
Y test : (202,)


In [None]:
# Linear-kernel SVM: the base estimator whose weights RFE uses to rank features.
model = SVC(kernel="linear")

# Recursive feature elimination down to 300 columns. A step below 1 is read
# by scikit-learn as the fraction of features to drop per iteration.
rfe = RFE(model, n_features_to_select=300, step=0.05).fit(
    miRNA_X_train, miRNA_y_train
)
rfe  # show the fitted selector as the cell output


In [None]:
# Keep only the RFE-selected columns in both splits (returns NumPy arrays).
x_train_rfe_miRNA, x_test_rfe_miRNA = (
    rfe.transform(split) for split in (miRNA_X_train, miRNA_X_test)
)

In [None]:
print(x_test_rfe_miRNA.shape)
print(x_train_rfe_miRNA.shape)


(202, 300)
(469, 300)


### Result

In [None]:
# Names of the columns RFE kept (boolean mask over the training columns).
# Relies on `rfe` still being the selector fitted on the miRNA data.
selected_features_miRNA = miRNA_X_train.columns[rfe.support_]

# Wrap the selected arrays back into DataFrames, restoring sample ids and
# feature names.
X_train_rfe_miRNA = pd.DataFrame(
    data=x_train_rfe_miRNA,
    index=miRNA_X_train.index,
    columns=selected_features_miRNA,
)
X_test_rfe_miRNA = pd.DataFrame(
    data=x_test_rfe_miRNA,
    index=miRNA_X_test.index,
    columns=selected_features_miRNA,
)

In [None]:
# Save the dataframes
X_train_rfe_miRNA.to_pickle('X_train_rfe_miRNA.pkl')
X_test_rfe_miRNA.to_pickle('X_test_rfe_miRNA.pkl')

In [None]:
# Persist the selected miRNA features to Google Drive.
from google.colab import drive

drive.mount('/content/gdrive')

miRNA_drive_targets = {
    '/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_train_rfe_miRNA.pkl': X_train_rfe_miRNA,
    '/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_test_rfe_miRNA.pkl': X_test_rfe_miRNA,
}
for pickle_path, frame in miRNA_drive_targets.items():
    frame.to_pickle(pickle_path)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## BRCA_DNA_Methylation

### Training of SVM

In [None]:
# Hold out 30% of the DNA methylation samples; random_state is fixed so the
# split is repeatable.
target = "Label"
dna_train_data, dna_test_data = train_test_split(
    DNA_Methylation_data, test_size=0.30, random_state=1
)

# Feature matrices: numeric columns only, with the target removed explicitly.
# select_dtypes alone keeps the integer-coded "Label" column (X previously had
# 19050 columns, i.e. the full frame width including Label), which would hand
# the answer to the SVM/RFE step as an ordinary feature (target leakage).
# errors="ignore" covers the case where the label is non-numeric and has
# already been filtered out.
dna_X_train = dna_train_data.select_dtypes(include=["number"]).drop(columns=[target], errors="ignore")
dna_y_train = dna_train_data[target]
dna_X_test = dna_test_data.select_dtypes(include=["number"]).drop(columns=[target], errors="ignore")
dna_y_test = dna_test_data[target]

In [None]:
# Report the shape of each DNA methylation split.
for tag, split in (
    ("X train :", dna_X_train),
    ("X test :", dna_X_test),
    ("Y train :", dna_y_train),
    ("Y test :", dna_y_test),
):
    print(tag, split.shape)

X train : (469, 19050)
X test : (202, 19050)
Y train : (469,)
Y test : (202,)


In [None]:
# Linear-kernel SVM: the base estimator whose weights RFE uses to rank features.
model = SVC(kernel="linear")

# Recursive feature elimination down to 19000 columns. A step below 1 is read
# by scikit-learn as the fraction of features to drop per iteration.
rfe = RFE(model, n_features_to_select=19000, step=0.05).fit(
    dna_X_train, dna_y_train
)
rfe  # show the fitted selector as the cell output


In [None]:
# Keep only the RFE-selected columns in both splits (returns NumPy arrays).
x_train_rfe_dna, x_test_rfe_dna = (
    rfe.transform(split) for split in (dna_X_train, dna_X_test)
)

In [None]:
print(x_test_rfe_dna.shape)
print(x_train_rfe_dna.shape)


(202, 19000)
(469, 19000)


### Result

In [None]:
# Names of the columns RFE kept (boolean mask over the training columns).
# Relies on `rfe` still being the selector fitted on the DNA methylation data.
selected_features_dna = dna_X_train.columns[rfe.support_]

# Wrap the selected arrays back into DataFrames, restoring sample ids and
# feature names.
X_train_rfe_dna = pd.DataFrame(
    data=x_train_rfe_dna,
    index=dna_X_train.index,
    columns=selected_features_dna,
)
X_test_rfe_dna = pd.DataFrame(
    data=x_test_rfe_dna,
    index=dna_X_test.index,
    columns=selected_features_dna,
)

In [None]:
# Save the dataframes
X_train_rfe_dna.to_pickle('X_train_rfe_dna.pkl')
X_test_rfe_dna.to_pickle('X_test_rfe_dna.pkl')

In [None]:
# Persist the selected DNA methylation features to Google Drive.
from google.colab import drive

drive.mount('/content/gdrive')

dna_drive_targets = {
    '/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_train_rfe_dna.pkl': X_train_rfe_dna,
    '/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_test_rfe_dna.pkl': X_test_rfe_dna,
}
for pickle_path, frame in dna_drive_targets.items():
    frame.to_pickle(pickle_path)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## BRCA_mRNA

### Training of SVM

In [None]:
# Hold out 30% of the mRNA samples; random_state is fixed so the split is
# repeatable.
target = "Label"
mrna_train_data, mrna_test_data = train_test_split(
    mrna_data, test_size=0.30, random_state=1
)

# Feature matrices: numeric columns only, with the target removed explicitly.
# select_dtypes alone keeps the integer-coded "Label" column (X previously had
# 18207 columns, i.e. the full frame width including Label), which would hand
# the answer to the SVM/RFE step as an ordinary feature (target leakage).
# errors="ignore" covers the case where the label is non-numeric and has
# already been filtered out.
mrna_X_train = mrna_train_data.select_dtypes(include=["number"]).drop(columns=[target], errors="ignore")
mrna_y_train = mrna_train_data[target]
mrna_X_test = mrna_test_data.select_dtypes(include=["number"]).drop(columns=[target], errors="ignore")
mrna_y_test = mrna_test_data[target]

In [None]:
# Report the shape of each mRNA split.
for tag, split in (
    ("X train :", mrna_X_train),
    ("X test :", mrna_X_test),
    ("Y train :", mrna_y_train),
    ("Y test :", mrna_y_test),
):
    print(tag, split.shape)

X train : (469, 18207)
X test : (202, 18207)
Y train : (469,)
Y test : (202,)


In [None]:
# Linear-kernel SVM: the base estimator whose weights RFE uses to rank features.
model = SVC(kernel="linear")

# Recursive feature elimination down to 18000 columns. A step below 1 is read
# by scikit-learn as the fraction of features to drop per iteration.
rfe = RFE(model, n_features_to_select=18000, step=0.05).fit(
    mrna_X_train, mrna_y_train
)
rfe  # show the fitted selector as the cell output


In [None]:
# Keep only the RFE-selected columns in both splits (returns NumPy arrays).
x_train_rfe_mrna, x_test_rfe_mrna = (
    rfe.transform(split) for split in (mrna_X_train, mrna_X_test)
)

In [None]:
print(x_test_rfe_mrna.shape)
print(x_train_rfe_mrna.shape)


(202, 18000)
(469, 18000)


### Result

In [None]:
# Names of the columns RFE kept (boolean mask over the training columns).
# Relies on `rfe` still being the selector fitted on the mRNA data.
selected_features_mrna = mrna_X_train.columns[rfe.support_]

# Wrap the selected arrays back into DataFrames, restoring sample ids and
# feature names.
X_train_rfe_mrna = pd.DataFrame(
    data=x_train_rfe_mrna,
    index=mrna_X_train.index,
    columns=selected_features_mrna,
)
X_test_rfe_mrna = pd.DataFrame(
    data=x_test_rfe_mrna,
    index=mrna_X_test.index,
    columns=selected_features_mrna,
)

In [None]:
# Save the dataframes
X_train_rfe_mrna.to_pickle('X_train_rfe_mrna.pkl')
X_test_rfe_mrna.to_pickle('X_test_rfe_mrna.pkl')

In [None]:
# Persist the selected mRNA features to Google Drive.
from google.colab import drive

drive.mount('/content/gdrive')

mrna_drive_targets = {
    '/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_train_rfe_mrna.pkl': X_train_rfe_mrna,
    '/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_test_rfe_mrna.pkl': X_test_rfe_mrna,
}
for pickle_path, frame in mrna_drive_targets.items():
    frame.to_pickle(pickle_path)


Mounted at /content/gdrive


## BRCA_Multi_Omics

### Training of SVM

In [None]:
# Hold out 30% of the multi-omics samples; random_state is fixed so the split
# is repeatable.
target = "Label"
multiOmics_train_data, multiOmics_test_data = train_test_split(
    multi_omics_data, test_size=0.30, random_state=1
)

# Feature matrices: numeric columns only, with the target removed explicitly.
# With string labels select_dtypes already filters "Label" out, but an
# integer-coded label (as in the other omics frames) would stay in X and leak
# the answer into feature selection; the explicit drop covers both cases.
multiOmics_X_train = multiOmics_train_data.select_dtypes(include=["number"]).drop(columns=[target], errors="ignore")
multiOmics_y_train = multiOmics_train_data[target]
multiOmics_X_test = multiOmics_test_data.select_dtypes(include=["number"]).drop(columns=[target], errors="ignore")
multiOmics_y_test = multiOmics_test_data[target]

In [None]:
# Report the shape of each multi-omics split.
for tag, split in (
    ("X train :", multiOmics_X_train),
    ("X test :", multiOmics_X_test),
    ("Y train :", multiOmics_y_train),
    ("Y test :", multiOmics_y_test),
):
    print(tag, split.shape)

X train : (469, 57191)
X test : (202, 57191)
Y train : (469,)
Y test : (202,)


In [None]:
# Linear-kernel SVM: the base estimator whose weights RFE uses to rank features.
model = SVC(kernel="linear")

# Recursive feature elimination down to 57000 columns. A step below 1 is read
# by scikit-learn as the fraction of features to drop per iteration.
rfe = RFE(model, n_features_to_select=57000, step=0.05).fit(
    multiOmics_X_train, multiOmics_y_train
)
rfe  # show the fitted selector as the cell output


In [None]:
# Keep only the RFE-selected columns in both splits (returns NumPy arrays).
x_train_rfe_multiOmics, x_test_rfe_multiOmics = (
    rfe.transform(split) for split in (multiOmics_X_train, multiOmics_X_test)
)

In [None]:
print(x_test_rfe_multiOmics.shape)
print(x_train_rfe_multiOmics.shape)


(202, 57000)
(469, 57000)


### Result

In [None]:
# Names of the columns RFE kept (boolean mask over the training columns).
# Relies on `rfe` still being the selector fitted on the multi-omics data.
selected_features_multiOmics = multiOmics_X_train.columns[rfe.support_]

# Wrap the selected arrays back into DataFrames (same variable names),
# restoring sample ids and feature names.
x_train_rfe_multiOmics = pd.DataFrame(
    data=x_train_rfe_multiOmics,
    index=multiOmics_X_train.index,
    columns=selected_features_multiOmics,
)
x_test_rfe_multiOmics = pd.DataFrame(
    data=x_test_rfe_multiOmics,
    index=multiOmics_X_test.index,
    columns=selected_features_multiOmics,
)

In [None]:
# Save the dataframes
x_train_rfe_multiOmics.to_pickle('X_train_rfe_multi_omics.pkl')
x_test_rfe_multiOmics.to_pickle('X_test_rfe_multi_omics.pkl')

In [None]:
# Persist the selected multi-omics features to Google Drive.
from google.colab import drive

drive.mount('/content/gdrive')

multiOmics_drive_targets = {
    '/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_train_rfe_multi_omics.pkl': x_train_rfe_multiOmics,
    '/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_test_rfe_multi_omics.pkl': x_test_rfe_multiOmics,
}
for pickle_path, frame in multiOmics_drive_targets.items():
    frame.to_pickle(pickle_path)


Mounted at /content/gdrive


# With SMOTE

In [None]:
# Reload the RFE-selected mRNA train/test feature frames from Google Drive.
# NOTE(review): df1 and df2 are not referenced again in the visible part of the
# notebook, and this cell sits before the drive.mount cell that follows it, so
# it presumably depends on Drive already being mounted - confirm or remove.
df1 = pd.read_pickle('/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_train_rfe_mrna.pkl')
df2 = pd.read_pickle('/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_test_rfe_mrna.pkl')


In [None]:
### 1. Link notebook with google drive and access data from your personal Gdrive
from google.colab import drive
# Mount Google Drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


## BRCA_Copy Number Variation

In [None]:
# Reload the RFE-selected copy-number-variation features saved on Google Drive.
X_train_rfe_varNumber, X_test_rfe_varNumber = (
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_train_rfe_copy_number_variation.pkl',
        '/content/gdrive/My Drive/BIOINFORMATICSII/Project/X_test_rfe_copy_number_variation.pkl',
    )
)

In [None]:
X_test_rfe_varNumber.shape

(202, 19000)

In [None]:
X_train_rfe_varNumber.shape

(469, 19000)

In [None]:
varNumber_y_train.shape

(469,)

In [None]:
#before SMOTE
varNumber_y_train.value_counts()

Label
1    245
2     96
3     82
4     25
0     21
Name: count, dtype: int64

In [None]:
# Balance the training classes by synthesising minority-class samples.
# random_state is fixed so the synthetic samples (and every result computed
# from them) are the same on each run.
# NOTE(review): X is reloaded from the pickle while y comes from the earlier
# split cell; they are matched by row position - confirm both stem from the
# same train/test split.
oversample = SMOTE(random_state=1)
X_resampled_varNumber, y_resampled_varNumber = oversample.fit_resample(X_train_rfe_varNumber, varNumber_y_train)
# Keep the labels as a one-column frame named 'Label' for the cells below.
y_resampled_varNumber = pd.DataFrame(y_resampled_varNumber, columns=['Label'])

In [None]:
# after SMOTE: per-class counts for keys 0 through 4
varNumber_resampled_counts = y_resampled_varNumber["Label"].value_counts()
for class_id in range(5):
    print(varNumber_resampled_counts[class_id])

245
245
245
245
245


In [None]:
y_resampled_varNumber.shape

(1225, 1)

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Autoencoder feature extraction followed by an MLP classifier, trained on the
# SMOTE-balanced copy-number-variation features and scored once on the test set.
# NOTE(review): the notebook calls this model an "SDAE", but no noise or
# corruption is applied to the inputs here, so it trains as a plain stacked
# autoencoder - confirm whether denoising was intended.

# Seed Python, NumPy and TensorFlow so weight init, shuffling and dropout
# repeat from run to run.
tf.keras.utils.set_random_seed(1)

# Define the architecture parameters
input_dim = X_resampled_varNumber.shape[1]   # Number of features
hidden1_dim = 64
hidden2_dim = 32
encoding_dim = 16  # Size of the encoded representations

# Carve a stratified validation set out of the training data for early
# stopping. The test set must not steer training: using it as validation data
# makes the final test accuracy an optimistic estimate.
y_train_labels = np.ravel(y_resampled_varNumber)  # 1-D integer class labels
X_fit, X_val, y_fit, y_val = train_test_split(
    X_resampled_varNumber,
    y_train_labels,
    test_size=0.20,
    stratify=y_train_labels,
    random_state=1,
)

# Define early stopping; restore_best_weights rolls back to the epoch with the
# lowest validation loss instead of keeping the weights from `patience` epochs
# after it. tf.keras is used so the callback matches the tf.keras models.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True
)

# Build the autoencoder: input -> 64 -> 32 -> 16 -> 32 -> 64 -> input
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile and train the autoencoder to reconstruct its own input
sdae.compile(optimizer='adam', loss='mean_squared_error')
sdae.fit(
    X_fit, X_fit,
    epochs=250, batch_size=32, shuffle=True,
    validation_data=(X_val, X_val),
    callbacks=[early_stopping],
)

# Extract the encoder part for feature extraction (shares the trained layers)
encoder = Model(inputs=input_layer, outputs=encoded2)

# Extract features using the encoder
X_train_encoded = encoder.predict(X_resampled_varNumber)  # full training set
X_fit_encoded = encoder.predict(X_fit)
X_val_encoded = encoder.predict(X_val)
X_test_encoded = encoder.predict(X_test_rfe_varNumber)

# Define a classifier (simple MLP on the encoded features)
num_classes = len(np.unique(y_train_labels))  # Number of classes in the target

classifier = Sequential([
    Input(shape=(encoding_dim,)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  # Softmax for multiclass classification
])

# Compile and train the classifier; a fresh callback keeps its early-stopping
# state separate from the autoencoder run.
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True
)
classifier.fit(
    X_fit_encoded, y_fit,
    epochs=250, batch_size=32,
    validation_data=(X_val_encoded, y_val),
    callbacks=[early_stopping],
)

# Predict probabilities on the untouched test set
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate the classifier
accuracy = accuracy_score(varNumber_y_test, y_pred)
print(f"Accuracy: {accuracy}")

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Accuracy: 0.6237623762376238


## BRCA_miRNA

In [None]:
X_train_rfe_miRNA.shape

(469, 300)

In [None]:
miRNA_y_test

In [None]:
print(miRNA_y_train.value_counts())

Label
1    245
2     96
3     82
4     25
0     21
Name: count, dtype: int64


In [None]:
# Balance the training classes by synthesising minority-class samples.
# random_state is fixed so the synthetic samples (and every result computed
# from them) are the same on each run.
oversample = SMOTE(random_state=1)
X_resampled_miRNA, y_resampled_miRNA = oversample.fit_resample(X_train_rfe_miRNA, miRNA_y_train)
# Keep the labels as a one-column frame named 'Label' for the cells below.
y_resampled_miRNA = pd.DataFrame(y_resampled_miRNA, columns=['Label'])

In [None]:
# Per-class counts after SMOTE, printed for keys 0 through 4.
miRNA_resampled_counts = y_resampled_miRNA["Label"].value_counts()
for class_id in range(5):
    print(miRNA_resampled_counts[class_id])

245
245
245
245
245


In [None]:
y_resampled_miRNA.shape

(1225, 1)

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Stacked autoencoder feature extractor + MLP classifier for the miRNA features.
#
# Steps: train an autoencoder to reconstruct the SMOTE-balanced training
# features, reuse its 16-unit bottleneck as an encoder, train a small MLP on
# the encoded features, then report accuracy on the test split.
#
# NOTE: although named "SDAE", no noise/corruption is applied to the inputs
# here, so the network is trained as a plain (non-denoising) autoencoder.

# Seed Python, NumPy and TensorFlow so initialisation, shuffling and dropout repeat across runs.
tf.keras.utils.set_random_seed(42)

# Define the architecture parameters
input_dim = X_resampled_miRNA.shape[1]   # Number of features
hidden1_dim = 64
hidden2_dim = 32
encoding_dim = 16  # Size of the encoded representations

# Carve a stratified validation subset out of the training rows. Early
# stopping must not monitor the test split: doing so lets the test data pick
# the stopping epoch and inflates the reported accuracy.
# Caveat: the subset is drawn after SMOTE, so it includes synthetic rows; it
# is a stopping signal only, not a performance estimate.
X_resampled_values = np.asarray(X_resampled_miRNA)
y_labels = np.asarray(y_resampled_miRNA).ravel()  # flatten the (n, 1) label frame to 1-D class ids
fit_idx, val_idx = train_test_split(
    np.arange(len(y_labels)), test_size=0.2, stratify=y_labels, random_state=42
)
X_fit = X_resampled_values[fit_idx]
X_val = X_resampled_values[val_idx]

# Define early stopping. restore_best_weights=True rolls each model back to the
# epoch with the lowest val_loss instead of keeping the weights reached
# `patience` epochs later. One callback per model keeps their state separate.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
classifier_early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Build the autoencoder: input -> 64 -> 32 -> 16 (code) -> 32 -> 64 -> input
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile the autoencoder with a reconstruction (MSE) loss
sdae.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder: input and target are the same matrix
sdae.fit(
    X_fit, X_fit,
    epochs=250, batch_size=32, shuffle=True,
    validation_data=(X_val, X_val),
    callbacks=[early_stopping],
)

# Extract the encoder part for feature extraction (it shares the trained layers)
encoder = Model(inputs=input_layer, outputs=encoded2)

# Encode every resampled training row (same row order as y_labels) and the test rows
X_train_encoded = encoder.predict(X_resampled_values)
X_test_encoded = encoder.predict(X_test_rfe_miRNA)

# Define a classifier (simple MLP on the encoded features)
num_classes = len(np.unique(y_labels))  # Number of classes in the target

classifier = Sequential([
    Input(shape=(encoding_dim,)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  # Softmax for multiclass classification
])

# Compile and train the classifier; the sparse loss takes integer class ids directly
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
classifier.fit(
    X_train_encoded[fit_idx], y_labels[fit_idx],
    epochs=250, batch_size=32,
    validation_data=(X_train_encoded[val_idx], y_labels[val_idx]),
    callbacks=[classifier_early_stopping],
)

# Predict probabilities
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate on the test split, which this cell never uses for training or early stopping
accuracy = accuracy_score(miRNA_y_test, y_pred)
print(f"Accuracy: {accuracy}")

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Accuracy: 0.6633663366336634


## BRCA_DNA_Methylation

In [None]:
# Shape check of the DNA-methylation training feature matrix: (rows, columns).
X_train_rfe_dna.shape

(469, 19000)

In [None]:
# Class distribution of the DNA-methylation training labels, before any oversampling.
print(dna_y_train.value_counts())

Label
1    245
2     96
3     82
4     25
0     21
Name: count, dtype: int64


In [None]:
# Balance the DNA-methylation training set with SMOTE (synthetic minority oversampling).
# A fixed random_state makes the synthetic samples, and therefore every model
# trained on them, reproducible across kernel restarts.
oversample = SMOTE(random_state=42)
X_resampled_dna, y_resampled_dna = oversample.fit_resample(X_train_rfe_dna, dna_y_train)
# Wrap the resampled labels in a one-column frame so later cells can address them as "Label".
y_resampled_dna = pd.DataFrame(y_resampled_dna, columns=['Label'])

In [None]:
# Per-class sample counts after SMOTE, printed one per line for labels 0..4.
label_counts = y_resampled_dna["Label"].value_counts()
for class_label in range(5):
    print(label_counts[class_label])

245
245
245
245
245


In [None]:
# Shape check of the resampled DNA-methylation label frame (a single 'Label' column).
y_resampled_dna.shape

(1225, 1)

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Stacked autoencoder feature extractor + MLP classifier for the DNA-methylation features.
#
# Steps: train an autoencoder to reconstruct the SMOTE-balanced training
# features, reuse its 16-unit bottleneck as an encoder, train a small MLP on
# the encoded features, then report accuracy on the test split.
#
# NOTE: although named "SDAE", no noise/corruption is applied to the inputs
# here, so the network is trained as a plain (non-denoising) autoencoder.

# Seed Python, NumPy and TensorFlow so initialisation, shuffling and dropout repeat across runs.
tf.keras.utils.set_random_seed(42)

# Define the architecture parameters
input_dim = X_resampled_dna.shape[1]   # Number of features
hidden1_dim = 64
hidden2_dim = 32
encoding_dim = 16  # Size of the encoded representations

# Carve a stratified validation subset out of the training rows. Early
# stopping must not monitor the test split: doing so lets the test data pick
# the stopping epoch and inflates the reported accuracy.
# Caveat: the subset is drawn after SMOTE, so it includes synthetic rows; it
# is a stopping signal only, not a performance estimate.
X_resampled_values = np.asarray(X_resampled_dna)
y_labels = np.asarray(y_resampled_dna).ravel()  # flatten the (n, 1) label frame to 1-D class ids
fit_idx, val_idx = train_test_split(
    np.arange(len(y_labels)), test_size=0.2, stratify=y_labels, random_state=42
)
X_fit = X_resampled_values[fit_idx]
X_val = X_resampled_values[val_idx]

# Define early stopping. restore_best_weights=True rolls each model back to the
# epoch with the lowest val_loss instead of keeping the weights reached
# `patience` epochs later. One callback per model keeps their state separate.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
classifier_early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Build the autoencoder: input -> 64 -> 32 -> 16 (code) -> 32 -> 64 -> input
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile the autoencoder with a reconstruction (MSE) loss
sdae.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder: input and target are the same matrix
sdae.fit(
    X_fit, X_fit,
    epochs=250, batch_size=32, shuffle=True,
    validation_data=(X_val, X_val),
    callbacks=[early_stopping],
)

# Extract the encoder part for feature extraction (it shares the trained layers)
encoder = Model(inputs=input_layer, outputs=encoded2)

# Encode every resampled training row (same row order as y_labels) and the test rows
X_train_encoded = encoder.predict(X_resampled_values)
X_test_encoded = encoder.predict(X_test_rfe_dna)

# Define a classifier (simple MLP on the encoded features)
num_classes = len(np.unique(y_labels))  # Number of classes in the target

classifier = Sequential([
    Input(shape=(encoding_dim,)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  # Softmax for multiclass classification
])

# Compile and train the classifier; the sparse loss takes integer class ids directly
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
classifier.fit(
    X_train_encoded[fit_idx], y_labels[fit_idx],
    epochs=250, batch_size=32,
    validation_data=(X_train_encoded[val_idx], y_labels[val_idx]),
    callbacks=[classifier_early_stopping],
)

# Predict probabilities
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate on the test split, which this cell never uses for training or early stopping
accuracy = accuracy_score(dna_y_test, y_pred)
print(f"Accuracy: {accuracy}")

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoc

## BRCA_mRNA

In [None]:
# Shape check of the mRNA training feature matrix: (rows, columns).
X_train_rfe_mrna.shape

(469, 18000)

In [None]:
# Shape check of the mRNA training labels (1-D; length should match the feature matrix rows).
mrna_y_train.shape

(469,)

In [None]:
# Balance the mRNA training set with SMOTE (synthetic minority oversampling).
# A fixed random_state makes the synthetic samples, and therefore every model
# trained on them, reproducible across kernel restarts.
oversample = SMOTE(random_state=42)
X_resampled_mrna, y_resampled_mrna = oversample.fit_resample(X_train_rfe_mrna, mrna_y_train)
# Wrap the resampled labels in a one-column frame so later cells can address them as "Label".
y_resampled_mrna = pd.DataFrame(y_resampled_mrna, columns=['Label'])

In [None]:
# Per-class sample counts after SMOTE, printed one per line for labels 0..4.
label_counts = y_resampled_mrna["Label"].value_counts()
for class_label in range(5):
    print(label_counts[class_label])

245
245
245
245
245


In [None]:
# Shape check of the resampled mRNA label frame (a single 'Label' column).
y_resampled_mrna.shape

(1225, 1)

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Stacked autoencoder feature extractor + MLP classifier for the mRNA features.
#
# Steps: train an autoencoder to reconstruct the SMOTE-balanced training
# features, reuse its 16-unit bottleneck as an encoder, train a small MLP on
# the encoded features, then report accuracy on the test split.
#
# NOTE: although named "SDAE", no noise/corruption is applied to the inputs
# here, so the network is trained as a plain (non-denoising) autoencoder.

# Seed Python, NumPy and TensorFlow so initialisation, shuffling and dropout repeat across runs.
tf.keras.utils.set_random_seed(42)

# Define the architecture parameters
input_dim = X_resampled_mrna.shape[1]   # Number of features
hidden1_dim = 64
hidden2_dim = 32
encoding_dim = 16  # Size of the encoded representations

# Carve a stratified validation subset out of the training rows. Early
# stopping must not monitor the test split: doing so lets the test data pick
# the stopping epoch and inflates the reported accuracy.
# Caveat: the subset is drawn after SMOTE, so it includes synthetic rows; it
# is a stopping signal only, not a performance estimate.
X_resampled_values = np.asarray(X_resampled_mrna)
y_labels = np.asarray(y_resampled_mrna).ravel()  # flatten the (n, 1) label frame to 1-D class ids
fit_idx, val_idx = train_test_split(
    np.arange(len(y_labels)), test_size=0.2, stratify=y_labels, random_state=42
)
X_fit = X_resampled_values[fit_idx]
X_val = X_resampled_values[val_idx]

# Define early stopping. restore_best_weights=True rolls each model back to the
# epoch with the lowest val_loss instead of keeping the weights reached
# `patience` epochs later. One callback per model keeps their state separate.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
classifier_early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Build the autoencoder: input -> 64 -> 32 -> 16 (code) -> 32 -> 64 -> input
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile the autoencoder with a reconstruction (MSE) loss
sdae.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder: input and target are the same matrix
sdae.fit(
    X_fit, X_fit,
    epochs=250, batch_size=32, shuffle=True,
    validation_data=(X_val, X_val),
    callbacks=[early_stopping],
)

# Extract the encoder part for feature extraction (it shares the trained layers)
encoder = Model(inputs=input_layer, outputs=encoded2)

# Encode every resampled training row (same row order as y_labels) and the test rows
X_train_encoded = encoder.predict(X_resampled_values)
X_test_encoded = encoder.predict(X_test_rfe_mrna)

# Define a classifier (simple MLP on the encoded features)
num_classes = len(np.unique(y_labels))  # Number of classes in the target

classifier = Sequential([
    Input(shape=(encoding_dim,)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  # Softmax for multiclass classification
])

# Compile and train the classifier; the sparse loss takes integer class ids directly
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
classifier.fit(
    X_train_encoded[fit_idx], y_labels[fit_idx],
    epochs=250, batch_size=32,
    validation_data=(X_train_encoded[val_idx], y_labels[val_idx]),
    callbacks=[classifier_early_stopping],
)

# Predict probabilities
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate on the test split, which this cell never uses for training or early stopping
accuracy = accuracy_score(mrna_y_test, y_pred)
print(f"Accuracy: {accuracy}")

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoc

## Multi-Omics

In [None]:
# Load the saved multi-omics train/test feature matrices from the project folder on Drive.
# Paths are built from `dataset_dir` (defined in the Drive-mount cell) so the
# project location is configured in one place instead of being repeated here.
# NOTE(review): unpickling can execute arbitrary code — only load files this project wrote itself.
X_train_rfe_multiOmics = pd.read_pickle(os.path.join(dataset_dir, 'X_train_rfe_multi_omics.pkl'))
X_test_rfe_multiOmics = pd.read_pickle(os.path.join(dataset_dir, 'X_test_rfe_multi_omics.pkl'))

In [None]:
# Shape check of the multi-omics test feature matrix: (rows, columns).
X_test_rfe_multiOmics.shape

(202, 57000)

In [None]:
# Shape check of the multi-omics training feature matrix: (rows, columns).
X_train_rfe_multiOmics.shape

(469, 57000)

In [None]:
# Class distribution of the multi-omics training labels before SMOTE oversampling.
multiOmics_y_train.value_counts()

Label
1    245
2     96
3     82
4     25
0     21
Name: count, dtype: int64

In [None]:
# Balance the multi-omics training set with SMOTE (synthetic minority oversampling).
# A fixed random_state makes the synthetic samples, and therefore every model
# trained on them, reproducible across kernel restarts.
oversample = SMOTE(random_state=42)
X_resampled_multiOmics, y_resampled_multiOmics = oversample.fit_resample(X_train_rfe_multiOmics, multiOmics_y_train)
# Wrap the resampled labels in a one-column frame so later cells can address them as "Label".
y_resampled_multiOmics = pd.DataFrame(y_resampled_multiOmics, columns=['Label'])

In [None]:
# Per-class sample counts after SMOTE, printed one per line for labels 0..4.
label_counts = y_resampled_multiOmics["Label"].value_counts()
for class_label in range(5):
    print(label_counts[class_label])

245
245
245
245
245


In [None]:
# Shape check of the resampled multi-omics label frame (a single 'Label' column).
y_resampled_multiOmics.shape

(1225, 1)

In [None]:
# Shape check of the resampled multi-omics feature matrix: (rows, columns).
X_resampled_multiOmics.shape

(1225, 57000)

In [None]:
from keras.callbacks import EarlyStopping

In [None]:
# Stacked autoencoder feature extractor + MLP classifier for the multi-omics features.
#
# Steps: train an autoencoder to reconstruct the SMOTE-balanced training
# features, reuse its 16-unit bottleneck as an encoder, train a small MLP on
# the encoded features, then report accuracy on the test split.
#
# NOTE: although named "SDAE", no noise/corruption is applied to the inputs
# here, so the network is trained as a plain (non-denoising) autoencoder.

# Seed Python, NumPy and TensorFlow so initialisation, shuffling and dropout repeat across runs.
tf.keras.utils.set_random_seed(42)

# Define the architecture parameters
input_dim = X_resampled_multiOmics.shape[1]   # Number of features
hidden1_dim = 64
hidden2_dim = 32
encoding_dim = 16  # Size of the encoded representations

# Carve a stratified validation subset out of the training rows. Early
# stopping must not monitor the test split: doing so lets the test data pick
# the stopping epoch and inflates the reported accuracy.
# Caveat: the subset is drawn after SMOTE, so it includes synthetic rows; it
# is a stopping signal only, not a performance estimate.
X_resampled_values = np.asarray(X_resampled_multiOmics)
y_labels = np.asarray(y_resampled_multiOmics).ravel()  # flatten the (n, 1) label frame to 1-D class ids
fit_idx, val_idx = train_test_split(
    np.arange(len(y_labels)), test_size=0.2, stratify=y_labels, random_state=42
)
X_fit = X_resampled_values[fit_idx]
X_val = X_resampled_values[val_idx]

# Define early stopping. restore_best_weights=True rolls each model back to the
# epoch with the lowest val_loss instead of keeping the weights reached
# `patience` epochs later. One callback per model keeps their state separate.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
classifier_early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Build the autoencoder: input -> 64 -> 32 -> 16 (code) -> 32 -> 64 -> input
input_layer = Input(shape=(input_dim,))
hidden1 = Dense(hidden1_dim, activation='relu')(input_layer)
encoded1 = Dense(hidden2_dim, activation='relu')(hidden1)
encoded2 = Dense(encoding_dim, activation='relu')(encoded1)
decoded1 = Dense(hidden2_dim, activation='relu')(encoded2)
decoded2 = Dense(hidden1_dim, activation='relu')(decoded1)
output_layer = Dense(input_dim, activation='linear')(decoded2)

sdae = Model(inputs=input_layer, outputs=output_layer)

# Compile the autoencoder with a reconstruction (MSE) loss
sdae.compile(optimizer='adam', loss='mean_squared_error')

# Train the autoencoder: input and target are the same matrix
sdae.fit(
    X_fit, X_fit,
    epochs=250, batch_size=32, shuffle=True,
    validation_data=(X_val, X_val),
    callbacks=[early_stopping],
)

# Extract the encoder part for feature extraction (it shares the trained layers)
encoder = Model(inputs=input_layer, outputs=encoded2)

# Encode every resampled training row (same row order as y_labels) and the test rows
X_train_encoded = encoder.predict(X_resampled_values)
X_test_encoded = encoder.predict(X_test_rfe_multiOmics)

# Define a classifier (simple MLP on the encoded features)
num_classes = len(np.unique(y_labels))  # Number of classes in the target

classifier = Sequential([
    Input(shape=(encoding_dim,)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')  # Softmax for multiclass classification
])

# Compile and train the classifier; the sparse loss takes integer class ids directly
classifier.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
classifier.fit(
    X_train_encoded[fit_idx], y_labels[fit_idx],
    epochs=250, batch_size=32,
    validation_data=(X_train_encoded[val_idx], y_labels[val_idx]),
    callbacks=[classifier_early_stopping],
)

# Predict probabilities
y_pred_prob = classifier.predict(X_test_encoded)

# Convert probabilities to class labels
y_pred = np.argmax(y_pred_prob, axis=1)

# Evaluate on the test split, which this cell never uses for training or early stopping
accuracy = accuracy_score(multiOmics_y_test, y_pred)
print(f"Accuracy: {accuracy}")

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Accuracy: 0.6485148514851485
