<a href="https://colab.research.google.com/github/Vivek-1116/SDAE-and-VAE-for-Cancer-Classification-through-Multi-omics-Feature-Extraction/blob/main/DATA_PRE_PROCESSING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MULTI-OMICS DATA PRE-PROCESSING
The dataset for this project is the pre-processed TCGA data from the Multi-Omics Cancer Benchmark.

Data Source : http://acgt.cs.tau.ac.il/multi_omic_benchmark/download.html

Data Description : Lung Squamous Cell Carcinoma (LUSC) Data

Omics Involved : Gene expression, DNA methylation & miRNA expression

## RAW DATA

IMPORT LIBRARIES

In [None]:
import numpy as np #Fundamental package for scientific computing
import pandas as pd #For data manipulation and analysis
import matplotlib.pyplot as plt #2D plotting library
import seaborn as sns #Python data visualization library based on matplotlib
import warnings #Typically issued in situations where it is useful to alert the user of some condition in a program
from collections import Counter #Supports iterations
warnings.filterwarnings('ignore') #Ignores all warnings

MOUNT DRIVE TO GOOGLE COLAB & SET DATA PATH

In [None]:
from google.colab import drive
drive.mount('/content/gdrive/') #Exposes Google Drive under /content/gdrive/ (only works inside Google Colab)
data_path = '/content/gdrive/My Drive/PSM2 VIVEK/LUSC Dataset/Raw/' #Folder holding the raw CSV files read below; change this path accordingly

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


IMPORT DATASET (3 DIFFERENT OMICS & 1 CLINICAL DATA)

In [None]:
print("RAW MULTIOMICS DATASET")


def _read_raw(file_name):
    """Read one comma-separated raw file located in the `data_path` folder."""
    return pd.read_csv(data_path + file_name, sep=',')


# Clinical table (sample IDs and sample types, among other columns)
dataframe0 = _read_raw("lung.csv")
print("CLINICAL DATA :", dataframe0.shape)

# Gene expression table
dataframe1 = _read_raw("exp.csv")
print("GENE EXPRESSION :", dataframe1.shape)

# miRNA expression table
dataframe2 = _read_raw("mirna.csv")
print("MIRNA :", dataframe2.shape)

# DNA methylation table
dataframe3 = _read_raw("methy.csv")
print("DNA METHYLATION :", dataframe3.shape)

RAW MULTIOMICS DATASET
CLINICAL DATA : (626, 164)
GENE EXPRESSION : (20531, 553)
MIRNA : (1046, 388)
DNA METHYLATION : (5000, 413)


DATA TRANSPOSITION

In [None]:
print("TRANSPOSED DATA")

# Swap rows and columns of every omics table so that each row is one
# sample and each column is one feature. The former first column
# ("PatientID") becomes the first row of the transposed frame.
dat1 = dataframe1.transpose()
print ("GENE EXPRESSION : ", dat1.shape)

dat2 = dataframe2.transpose()
print ("MIRNA : ", dat2.shape)

dat3 = dataframe3.transpose()
print ("DNA METHYLATION : ", dat3.shape)

TRANSPOSED DATA
GENE EXPRESSION :  (553, 20531)
MIRNA :  (388, 1046)
DNA METHYLATION :  (413, 5000)


REMOVAL OF DUPLICATION DATA

In [None]:
print("REMOVING DATA DUPLICATION")

# Remove fully identical rows from the clinical table and from each
# transposed omics table, in the same order as they were loaded.
d0, d1, d2, d3 = (
    frame.drop_duplicates() for frame in (dataframe0, dat1, dat2, dat3)
)

# Report the shape that remains after de-duplication.
for title, deduped in (
    ("SURVIVAL : ", d0),
    ("GENE EXPRESSION : ", d1),
    ("MIRNA : ", d2),
    ("DNA METHYLATION : ", d3),
):
    print(title, deduped.shape)

REMOVING DATA DUPLICATION
SURVIVAL :  (626, 164)
GENE EXPRESSION :  (553, 20531)
MIRNA :  (388, 1046)
DNA METHYLATION :  (413, 5000)


## EXPLORATORY DATA ANALYSIS (EDA)

CHECKING FOR NON-NULL OBSERVATIONS & TOTAL NUMBER OF ENTRIES

In [None]:
# Print a labelled structural summary (index range, column count, dtypes,
# memory usage) for each de-duplicated omics table.
for heading, table in (
    ("Gene Expression:", d1),
    ("MiRNA:", d2),
    ("DNA Methylation:", d3),
):
    print(heading)
    table.info()

Gene Expression:
<class 'pandas.core.frame.DataFrame'>
Index: 553 entries, PatientID to TCGA.XC.AA0X.01
Columns: 20531 entries, 0 to 20530
dtypes: object(20531)
memory usage: 86.6+ MB
MiRNA:
<class 'pandas.core.frame.DataFrame'>
Index: 388 entries, PatientID to TCGA.XC.AA0X.01
Columns: 1046 entries, 0 to 1045
dtypes: object(1046)
memory usage: 3.1+ MB
DNA Methylation:
<class 'pandas.core.frame.DataFrame'>
Index: 413 entries, PatientID to TCGA.XC.AA0X.01
Columns: 5000 entries, 0 to 4999
dtypes: object(5000)
memory usage: 15.8+ MB


SUMMARY STATISTICS OF ALL OBSERVED FEATURES & LABELS

In [None]:
print("Gene Expression:")
#All columns are object-dtype here, so describe() reports count/unique/top/freq
#per column rather than numeric statistics such as mean or std
d1.describe()

Gene Expression:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,20491,20492,20493,20494,20495,20496,20497,20498,20499,20500,20501,20502,20503,20504,20505,20506,20507,20508,20509,20510,20511,20512,20513,20514,20515,20516,20517,20518,20519,20520,20521,20522,20523,20524,20525,20526,20527,20528,20529,20530
count,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,...,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0,553.0
unique,20.0,539.0,548.0,553.0,553.0,2.0,553.0,308.0,11.0,2.0,359.0,477.0,548.0,176.0,84.0,6.0,6.0,30.0,188.0,553.0,552.0,325.0,553.0,2.0,30.0,309.0,548.0,553.0,546.0,553.0,86.0,265.0,553.0,535.0,553.0,553.0,421.0,222.0,553.0,409.0,...,553.0,553.0,553.0,553.0,296.0,552.0,552.0,553.0,552.0,407.0,552.0,553.0,553.0,544.0,553.0,553.0,413.0,553.0,508.0,552.0,65.0,553.0,553.0,553.0,553.0,553.0,552.0,553.0,553.0,553.0,552.0,553.0,553.0,540.0,553.0,553.0,553.0,553.0,553.0,193.0
top,0.0,0.0,0.0,112.3263,673.824,0.0,112.8129,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,673.6896,0.0,0.0,1022.0492,0.0,0.0,0.0,0.0,1020.2186,0.0,49.852,0.0,0.0,112.2935,0.0,3070.7156,2047.3269,0.0,0.0,1023.7334,0.0,...,1023.3111,1122.1628,112.7252,112.9733,0.0,6.1285,233.7079,49.5105,1354.1667,0.0,51.8135,112.8136,112.26,0.0,448.957,112.4159,0.0,112.9288,0.0,237.1722,0.0,112.3959,575.1384,224.9797,575.2612,112.6998,223.3503,1267.7074,1022.8669,575.7302,52.6316,224.4735,2047.8646,0.0,778.7611,5119.6368,1122.3707,673.9358,6.1751,0.0
freq,534.0,15.0,5.0,1.0,1.0,552.0,1.0,242.0,543.0,552.0,192.0,73.0,4.0,376.0,470.0,548.0,548.0,524.0,366.0,1.0,2.0,229.0,1.0,552.0,524.0,243.0,6.0,1.0,8.0,1.0,467.0,289.0,1.0,19.0,1.0,1.0,129.0,329.0,1.0,140.0,...,1.0,1.0,1.0,1.0,256.0,2.0,2.0,1.0,2.0,143.0,2.0,1.0,1.0,10.0,1.0,1.0,139.0,1.0,37.0,2.0,489.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,14.0,1.0,1.0,1.0,1.0,1.0,358.0


In [None]:
print("MiRNA:")
#All columns are object-dtype here, so describe() reports count/unique/top/freq
#per column rather than numeric statistics such as mean or std
d2.describe()

MiRNA:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,1006,1007,1008,1009,1010,1011,1012,1013,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023,1024,1025,1026,1027,1028,1029,1030,1031,1032,1033,1034,1035,1036,1037,1038,1039,1040,1041,1042,1043,1044,1045
count,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,...,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0
unique,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,33.0,383.0,388.0,388.0,388.0,388.0,2.0,388.0,2.0,332.0,338.0,388.0,388.0,388.0,388.0,388.0,11.0,160.0,388.0,243.0,12.0,2.0,4.0,3.0,2.0,210.0,126.0,24.0,124.0,...,384.0,242.0,388.0,84.0,388.0,23.0,377.0,100.0,139.0,69.0,388.0,388.0,314.0,2.0,2.0,18.0,2.0,388.0,388.0,388.0,388.0,26.0,342.0,332.0,20.0,382.0,85.0,373.0,387.0,20.0,2.0,2.0,388.0,113.0,387.0,379.0,388.0,388.0,388.0,388.0
top,7679.403313,6379.716329,7166.618752,17129.5073,1573.710113,1535.963682,1023.890151,24.557719,8770.266499,511.224618,575.813626,0.0,0.0,5629.453654,10751.56235,56.652552,36976.07532,0.0,24.683783,0.0,0.0,0.0,24.224433,510.065453,56.815047,25087.09067,11740.84423,0.0,0.0,24.324846,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.102295,0.0,13.142429,0.0,0.0,0.0,0.0,0.0,511.217417,56.676871,0.0,0.0,0.0,0.0,0.0,1022.365877,18943.0404,56.99649,13066.67966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.281229,0.0,0.0,0.0,62.889651,56.352158,510.746011,32767.19806
freq,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,356.0,6.0,1.0,1.0,1.0,1.0,387.0,1.0,387.0,57.0,51.0,1.0,1.0,1.0,1.0,1.0,378.0,229.0,1.0,146.0,377.0,387.0,385.0,386.0,387.0,179.0,263.0,365.0,265.0,...,5.0,147.0,1.0,305.0,1.0,366.0,12.0,289.0,250.0,320.0,1.0,1.0,75.0,387.0,387.0,371.0,387.0,1.0,1.0,1.0,1.0,363.0,47.0,57.0,369.0,7.0,304.0,16.0,2.0,369.0,387.0,387.0,1.0,276.0,2.0,10.0,1.0,1.0,1.0,1.0


In [None]:
print("DNA Methylation:")
#All columns are object-dtype here, so describe() reports count/unique/top/freq
#per column rather than numeric statistics such as mean or std
d3.describe()

DNA Methylation:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,4960,4961,4962,4963,4964,4965,4966,4967,4968,4969,4970,4971,4972,4973,4974,4975,4976,4977,4978,4979,4980,4981,4982,4983,4984,4985,4986,4987,4988,4989,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
count,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,...,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0
unique,413.0,413.0,409.0,413.0,413.0,413.0,413.0,413.0,401.0,413.0,413.0,413.0,413.0,397.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,405.0,411.0,413.0,371.0,413.0,413.0,413.0,...,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,412.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0
top,0.055092,0.443131,0.947105,0.306076,0.065933,0.095614,0.156484,0.048311,0.384277,0.119321,0.053262,0.05501,0.292386,0.407832,0.01488,0.035765,0.047959,0.488015,0.939078,0.152303,0.148785,0.203653,0.013452,0.020353,0.05265,0.016435,0.106421,0.358962,0.497355,0.25902,0.094387,0.077683,0.107052,0.921328,0.790536,0.154553,0.466006,0.365633,0.248878,0.248808,...,0.024018,0.426242,0.036846,0.017782,0.019843,0.026726,0.040711,0.017047,0.025436,0.479141,0.438008,0.018462,0.028356,0.037934,0.030159,0.289771,0.048331,0.020912,0.040691,0.025365,0.03437,0.448768,0.070535,0.016238,0.0683,0.045009,0.064184,0.03309,0.960463,0.030912,0.022613,0.016445,0.455114,0.014005,0.021548,0.451401,0.027349,0.054007,0.030229,0.358823
freq,1.0,1.0,5.0,1.0,1.0,1.0,1.0,1.0,13.0,1.0,1.0,1.0,1.0,17.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,9.0,3.0,1.0,42.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


CHECKING FOR MISSING VALUES

In [None]:
#Counts the NaN entries in every gene expression column (one count per feature column)
print("CHECKING MISSING VALUES IN GENE EXPRESSION")
d1.isnull().sum() 

CHECKING MISSING VALUES IN GENE EXPRESSION


0        0
1        0
2        0
3        0
4        0
        ..
20526    0
20527    0
20528    0
20529    0
20530    0
Length: 20531, dtype: int64

In [None]:
#Counts the NaN entries in every miRNA column (one count per feature column)
print("CHECKING MISSING VALUES IN MIRNA")
d2.isnull().sum()

CHECKING MISSING VALUES IN MIRNA


0       0
1       0
2       0
3       0
4       0
       ..
1041    0
1042    0
1043    0
1044    0
1045    0
Length: 1046, dtype: int64

In [None]:
#Counts the NaN entries in every DNA methylation column (one count per feature column)
print("CHECKING MISSING VALUES IN DNA METHYLATION")
d3.isnull().sum()

CHECKING MISSING VALUES IN DNA METHYLATION


0       0
1       0
2       0
3       0
4       0
       ..
4995    0
4996    0
4997    0
4998    0
4999    0
Length: 5000, dtype: int64

CHECKING FOR MISSING VALUES IN THE FORM OF ZEROS

In [None]:
zeros_genes = (d1 == 0).sum() #Number of entries equal to 0 in each Gene Expression column
print("Printing Columns with Missing Values:")
#The displayed Series lists the zero count of every column, including columns whose count is 0
zeros_genes

Printing Columns with Missing Values:


0        534
1         15
2          5
3          0
4          0
        ... 
20526      0
20527      0
20528      0
20529      0
20530    358
Length: 20531, dtype: int64

In [None]:
zeros_mirna = (d2 == 0).sum() #Number of entries equal to 0 in each MIRNA column
print("Printing Columns with Missing Values:")
#The displayed Series lists the zero count of every column, including columns whose count is 0
zeros_mirna

Printing Columns with Missing Values:


0        0
1        0
2        0
3        0
4        0
        ..
1041    10
1042     0
1043     0
1044     0
1045     0
Length: 1046, dtype: int64

In [None]:
zeros_methy = (d3 == 0).sum() #Number of entries equal to 0 in each DNA Methylation column
print("Printing Columns with Missing Values:")
#The displayed Series lists the zero count of every column, including columns whose count is 0
zeros_methy

Printing Columns with Missing Values:


0       0
1       0
2       0
3       0
4       0
       ..
4995    0
4996    0
4997    0
4998    0
4999    0
Length: 5000, dtype: int64

REPLACING ZEROS WITH NaNs

In [None]:
# Treat zero readings as missing: every 0 in the three omics tables is
# turned into NaN so the column filter that follows can detect it.
d1_new, d2_new, d3_new = (
    table.replace(0, np.nan) for table in (d1, d2, d3)
)

DROPPING COLUMNS CONTAINING NULL VALUES

In [None]:
#Dropping every feature column that contains at least one NaN (i.e. at least one
#zero reading after the replacement step above).
#Only `how` is passed: `how` and `thresh` are mutually exclusive, and newer pandas
#releases raise a TypeError when both are supplied, even as `thresh=None`.
#The other removed arguments (subset=None, inplace=False) were the defaults.
GENE = d1_new.dropna(axis=1, how='any')
MIRNA = d2_new.dropna(axis=1, how='any')
METHY = d3_new.dropna(axis=1, how='any')

OMICS DATA WITHOUT NULL VALUES

In [None]:
GENE.info() #Structure of the gene expression table after the NaN-containing columns were removed

<class 'pandas.core.frame.DataFrame'>
Index: 553 entries, PatientID to TCGA.XC.AA0X.01
Columns: 13434 entries, 3 to 20529
dtypes: object(13434)
memory usage: 56.7+ MB


In [None]:
MIRNA.info() #Structure of the miRNA table after the NaN-containing columns were removed

<class 'pandas.core.frame.DataFrame'>
Index: 388 entries, PatientID to TCGA.XC.AA0X.01
Columns: 230 entries, 0 to 1045
dtypes: object(230)
memory usage: 700.2+ KB


In [None]:
METHY.info() #Structure of the DNA methylation table after the NaN-containing columns were removed

<class 'pandas.core.frame.DataFrame'>
Index: 413 entries, PatientID to TCGA.XC.AA0X.01
Columns: 5000 entries, 0 to 4999
dtypes: object(5000)
memory usage: 15.8+ MB


DROPPING ALL COLUMNS EXCEPT SAMPLE ID & TYPE IN CLINICAL DATA

In [None]:
#Dropping all clinical columns except the sample ID and the sample type (modifies dataframe0 in place).
#The columns are named via the `columns=` keyword: passing the axis positionally
#(`drop(labels, 1, ...)`) is deprecated and no longer accepted by pandas 2.x.
dataframe0.drop(columns=dataframe0.columns.difference(['sampleID','sample_type']), inplace=True)

CLINICAL DATA SEGMENTATION

In [None]:
dataframe0 #Display the reduced clinical table (sampleID and sample_type only)

Unnamed: 0,sampleID,sample_type
0,TCGA-18-3406-01,Primary Tumor
1,TCGA-18-3406-11,Solid Tissue Normal
2,TCGA-18-3407-01,Primary Tumor
3,TCGA-18-3407-11,Solid Tissue Normal
4,TCGA-18-3408-01,Primary Tumor
...,...,...
621,TCGA-O2-A52V-01,Primary Tumor
622,TCGA-O2-A52W-01,Solid Tissue Normal
623,TCGA-O2-A5IB-01,Solid Tissue Normal
624,TCGA-O2-A5IC-01,Solid Tissue Normal


ENCODING SAMPLE TYPE AS CLASS LABELS ("Primary Tumor" = 1, "Solid Tissue Normal" = 0)

In [None]:
#Encoding the sample type as a class label: "Primary Tumor" -> "1", "Solid Tissue Normal" -> "0"
#(the labels are stored as strings; the replacement is applied to the whole frame)
d4=dataframe0.replace(to_replace=["Primary Tumor","Solid Tissue Normal"],value=["1","0"])

RENAMING COLUMN HEADS

In [None]:
#Renaming so the ID column carries the same 'PatientID' name used as the key of the omics tables; the label column becomes 'Class'
Sample=d4.rename(columns={"sampleID":"PatientID","sample_type":"Class"})

In [None]:
#Replacing "-" with "." in the patient IDs (e.g. TCGA-18-3406-01 -> TCGA.18.3406.01)
#so the clinical IDs match the dot-separated IDs used by the omics tables.
#A literal replacement is requested explicitly: the previous pattern '\-' is an
#invalid string escape, and with regex disabled (the pandas 2.x default) it is
#matched as backslash-dash, so no ID would be changed.
Sample['PatientID']=Sample['PatientID'].str.replace('-','.',regex=False)

FINALIZED CLINICAL DATA

In [None]:
Sample #Display the finalized clinical table (PatientID and Class)

Unnamed: 0,PatientID,Class
0,TCGA.18.3406.01,1
1,TCGA.18.3406.11,0
2,TCGA.18.3407.01,1
3,TCGA.18.3407.11,0
4,TCGA.18.3408.01,1
...,...,...
621,TCGA.O2.A52V.01,1
622,TCGA.O2.A52W.01,0
623,TCGA.O2.A5IB.01,0
624,TCGA.O2.A5IC.01,0


STORING PROCESSED MULTIOMICS DATA

In [None]:
#Writing each filtered omics table to Google Drive as CSV.
#The row labels and the integer column header are written too; the row labelled
#'PatientID' therefore lands on the second line of each file, which is why the
#files are re-read with skiprows=1 in the integration step.
#NOTE(review): to_csv is given a path here, so the Processed_* names presumably hold None rather than data - confirm before using them
Processed_Gene=GENE.to_csv(r'/content/gdrive/My Drive/PSM2 VIVEK/LUSC Dataset/processed_Gene_latest.csv')
Processed_MiRNA=MIRNA.to_csv(r'/content/gdrive/My Drive/PSM2 VIVEK/LUSC Dataset/processed_MiRNA_latest.csv')
Processed_Methy=METHY.to_csv(r'/content/gdrive/My Drive/PSM2 VIVEK/LUSC Dataset/processed_Methy_latest.csv')

## MULTI-OMICS DATA INTEGRATION

IMPORT PRE-PROCESSED MULTIOMICS DATA FOR INTEGRATION

In [None]:
print("----PRE-PROCESSED----") 

#skiprows=1 discards the first line of each file (the integer column header written
#by to_csv), so the next line - the row labelled 'PatientID', which holds the feature
#names - becomes the header of the re-loaded table.
dframe1 = pd.read_csv('/content/gdrive/My Drive/PSM2 VIVEK/LUSC Dataset/' + "processed_Gene_latest.csv",skiprows=1)
print("GENE EXPRESSION :", dframe1.shape)

dframe2 = pd.read_csv('/content/gdrive/My Drive/PSM2 VIVEK/LUSC Dataset/' + "processed_MiRNA_latest.csv",skiprows=1)
print("MIRNA :", dframe2.shape)

dframe3 = pd.read_csv('/content/gdrive/My Drive/PSM2 VIVEK/LUSC Dataset/' + "processed_Methy_latest.csv",skiprows=1)
print("DNA METHYLATION :", dframe3.shape)

----PRE-PROCESSED----
GENE EXPRESSION : (552, 13435)
MIRNA : (387, 231)
DNA METHYLATION : (412, 5001)


SETTING PATIENT ID AS DATA INDEX

In [None]:
#Using PatientID as the row index of every table so they can be aligned on it.
#The three omics tables come from the re-loaded CSV files; Clinical is built from
#the in-memory Sample table.
Gene = dframe1.set_index('PatientID')
MiRNA = dframe2.set_index('PatientID')
Methy = dframe3.set_index('PatientID')
Clinical = Sample.set_index('PatientID')

INTEGRATION OF MULTIOMICS

In [None]:
integrate = [Gene,MiRNA,Methy,Clinical] #Integrating all 3 omics with class embedded
#axis=1 places the tables side by side; join='inner' keeps only the PatientIDs present in all four tables
Multiomics = pd.concat(integrate, axis=1, join='inner')

MULTIOMICS DATASET

In [None]:
Multiomics #Display the integrated table: omics features followed by the Class column

Unnamed: 0_level_0,?|10357,?|10431,?|155060,?|57714,?|653553,?|8225,A1BG|1,A2LD1|87769,A2M|2,A4GALT|53947,AAAS|8086,AACS|65985,AADAT|51166,AAGAB|79719,AAK1|22848,AAMP|14,AARS2|57505,AARSD1|80755,AARS|16,AASDHPPT|60496,AASDH|132949,AASS|10157,AATF|26574,AATK|9625,ABAT|18,ABCA11P|79963,ABCA1|19,ABCA2|20,ABCA3|21,ABCA5|23461,ABCA6|23460,ABCA7|10347,ABCA9|10350,ABCB10|23456,ABCB1|5243,ABCB6|10058,ABCB7|22,ABCB8|11194,ABCB9|23457,ABCC10|89845,...,rs2208123,rs2235751,rs2385226,rs2468330,rs2521373,rs264581,rs2804694,rs2857639,rs2959823,rs348937,rs3818562,rs3936238,rs4331560,rs472920,rs4742386,rs5926356,rs5931272,rs5936512,rs5987737,rs6426327,rs6471533,rs654498,rs6546473,rs6626309,rs6982811,rs6991394,rs715359,rs739259,rs7660805,rs7746156,rs798149,rs845016,rs877309,rs9292570,rs9363764,rs939290,rs951295,rs966367,rs9839873,Class
PatientID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
TCGA.18.5592.01,236.8295,1141.0830,88.0285,747.6943,409.8256,498.2193,12.0537,111.0547,2676.6396,2246.0049,833.5312,1965.1173,246.9181,1057.0724,1256.8715,3962.7431,569.7891,693.2700,3203.7257,930.3260,163.6380,164.3686,1859.9215,64.2864,21.9158,51.7432,1202.8089,1341.2474,62.8253,487.2614,36.1611,341.5213,14.9758,1291.2063,26.6642,2960.3360,550.8173,702.4016,191.1789,627.8879,...,0.445828,0.498636,0.973635,0.974207,0.928097,0.460344,0.952743,0.609471,0.906139,0.810115,0.163124,0.154739,0.022984,0.544173,0.832685,0.098743,0.976876,0.943812,0.952602,0.020949,0.519868,0.841502,0.016631,0.923785,0.179910,0.497850,0.419557,0.465892,0.850265,0.461360,0.012726,0.481794,0.239229,0.087837,0.536541,0.022941,0.540579,0.026454,0.045728,1
TCGA.18.5595.01,245.4566,999.4328,68.3494,671.5825,821.0437,423.4260,26.6591,45.0227,7857.6064,193.7039,961.1458,2001.1344,72.8871,889.1095,896.4833,2716.3925,537.1384,513.0459,6287.2944,502.8361,279.3534,346.5683,1592.4560,33.1849,188.3154,101.7754,640.6693,1015.8820,1530.9132,307.7141,14.4640,295.2354,7.6574,442.9949,4.8213,3476.2365,483.5508,228.5876,150.0851,374.3619,...,0.852261,0.516386,0.334989,0.473545,0.055859,0.954779,0.447732,0.615397,0.859717,0.455601,0.959690,0.399521,0.430007,0.691078,0.528738,0.889545,0.015282,0.925424,0.049640,0.037420,0.953146,0.358960,0.017422,0.911405,0.379057,0.500063,0.976334,0.552217,0.036871,0.546697,0.013850,0.396258,0.956542,0.482447,0.566233,0.966587,0.532004,0.510593,0.933625,1
TCGA.21.5782.01,308.5506,923.5955,146.0674,212.3596,1083.1461,767.4157,155.0562,198.5281,9403.2921,1151.6854,1142.6966,1384.2697,39.3258,1355.0562,1052.8090,3691.0112,988.5169,613.4831,7869.6629,719.1011,321.3483,278.6517,2089.8876,48.3146,86.5169,117.4045,1119.1011,861.7978,574.1573,234.8315,51.6854,525.8427,32.5843,277.5281,17.9775,2008.1910,302.2472,831.4607,292.9326,1108.9888,...,0.452553,0.036205,0.967153,0.471226,0.032291,0.438903,0.574025,0.963228,0.610111,0.402136,0.929609,0.030298,0.948934,0.549283,0.696018,0.469880,0.443723,0.696970,0.491719,0.042476,0.956615,0.916568,0.017187,0.513279,0.681604,0.329247,0.511011,0.524875,0.046657,0.028637,0.967372,0.933384,0.544448,0.298030,0.932454,0.030718,0.972964,0.466552,0.930599,1
TCGA.21.5783.01,192.7190,526.9117,150.0900,906.6365,748.4178,607.0371,144.2199,47.5353,5312.0566,340.8233,969.9239,1179.2371,190.1527,1441.6768,1964.2339,2394.7628,663.5749,453.7537,3939.4995,681.6466,291.4707,593.3926,903.1528,115.8335,306.2765,15.7551,1544.1125,1215.5258,372.7574,236.3119,28.7406,138.7679,10.7415,265.9235,21.4829,640.6666,548.9752,1006.2126,238.8841,418.3359,...,0.867139,0.068455,0.029902,0.383294,0.934068,0.032984,0.968455,0.069826,0.936634,0.238830,0.758543,0.971294,0.025164,0.500824,0.960496,0.115242,0.019951,0.039448,0.034378,0.875392,0.510459,0.920425,0.024096,0.064971,0.139338,0.068760,0.971782,0.033474,0.629115,0.032588,0.967869,0.583177,0.020746,0.357359,0.449217,0.963890,0.455323,0.529334,0.936669,1
TCGA.21.5784.01,161.6057,1196.5056,51.7527,212.7613,3455.4904,557.7795,144.1336,106.4647,20321.8932,1511.4453,470.6403,1242.9503,26.9822,1051.8633,543.6249,3648.3468,355.1697,462.2360,2652.2172,872.7192,205.2416,335.7293,1096.5388,65.9073,168.9705,100.3118,770.5407,426.8495,3009.1784,260.9753,60.5994,516.6427,23.0012,389.2514,37.5981,1725.8653,259.6483,669.2469,101.2098,877.1425,...,0.878958,0.046282,0.415254,0.025814,0.426774,0.029040,0.388852,0.038840,0.936436,0.524348,0.462386,0.443197,0.459078,0.966929,0.956077,0.942897,0.013271,0.489159,0.027884,0.024321,0.521091,0.088205,0.462810,0.449937,0.029276,0.051736,0.503357,0.441174,0.956625,0.019335,0.393124,0.503706,0.018432,0.639100,0.030030,0.590748,0.522493,0.031209,0.955941,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TCGA.O2.A52S.01,176.7177,1188.3278,226.8212,1100.5795,172.1854,452.8146,133.1664,75.6954,3629.9421,382.8642,1100.1656,707.7815,125.8278,826.9868,1319.9503,2686.6722,516.5025,791.3907,10120.8609,1137.0033,240.0662,200.7450,3512.0033,258.2781,50.9106,132.8146,2937.8932,1688.7417,482.6159,334.4371,26.0762,227.2351,20.6954,658.5265,34.3543,2513.3320,409.3543,670.9437,146.8212,419.2881,...,0.880683,0.045593,0.030224,0.963950,0.924719,0.041458,0.950275,0.954434,0.928485,0.301883,0.958972,0.026601,0.464005,0.038441,0.060987,0.112787,0.975272,0.046932,0.025996,0.892462,0.073984,0.275673,0.024864,0.941064,0.058028,0.066683,0.452329,0.040967,0.283380,0.749446,0.024437,0.084381,0.030087,0.952404,0.569576,0.043652,0.964282,0.367595,0.931168,1
TCGA.O2.A52V.01,188.7215,1248.0303,147.4945,510.5578,1253.0728,811.4298,148.8055,86.9041,5406.0132,1188.7803,829.0787,731.5895,61.3510,1471.5831,1637.5670,3304.9690,452.1315,614.3502,2855.3419,795.0415,162.6221,1217.3548,1477.8863,41.1808,476.5206,66.1330,1495.5016,750.9192,222.7125,619.3928,66.3935,630.3183,27.7340,758.4830,11.7659,676.2517,329.4464,902.1956,113.3606,846.3074,...,0.547567,0.032090,0.031295,0.443925,0.922467,0.959101,0.025450,0.624942,0.553258,0.393817,0.953462,0.035437,0.030303,0.051817,0.639068,0.870381,0.401359,0.503059,0.043866,0.944060,0.962645,0.876272,0.511247,0.928551,0.060193,0.937401,0.033606,0.403305,0.049309,0.035960,0.965751,0.067980,0.449615,0.472282,0.068062,0.958774,0.514518,0.431195,0.107549,1
TCGA.O2.A52W.01,260.3332,789.3606,854.3794,388.5008,2354.1107,985.4917,52.7243,81.4992,5299.1080,5458.3557,751.2090,1027.9420,49.4358,890.9189,1089.1994,3845.7818,997.2703,537.8829,3266.5234,675.9807,195.0564,679.2047,1721.6550,65.5562,296.0774,178.2053,1082.1762,219.7743,472.3267,365.9323,13.4336,522.8372,14.5083,396.0236,20.4191,304.2826,645.8893,565.2875,162.3536,1877.4852,...,0.884849,0.946784,0.021832,0.968483,0.936312,0.460789,0.491155,0.547796,0.064783,0.360556,0.410325,0.023759,0.020351,0.621506,0.350479,0.118179,0.016919,0.043427,0.957663,0.029629,0.454733,0.573819,0.015640,0.071451,0.966129,0.961666,0.560211,0.024349,0.035562,0.970618,0.971247,0.265095,0.573803,0.506155,0.546692,0.719723,0.530074,0.460235,0.931966,0
TCGA.O2.A5IB.01,160.1624,460.8626,569.2226,932.1086,201.2780,985.0905,14.9281,43.5304,16530.5698,26.8903,706.3365,1427.0501,456.0703,789.4036,835.7295,1313.0990,1310.7481,593.7167,5865.0160,934.5048,392.1725,365.8147,1921.9915,7.9872,105.1651,110.9159,228.1683,337.8594,501.5974,430.5112,4.5261,1315.4952,7.4547,431.8424,16.2407,497.7982,475.2396,688.7646,156.4297,758.7859,...,0.860961,0.024698,0.977430,0.969208,0.423610,0.446033,0.446225,0.960414,0.914394,0.079533,0.468437,0.349740,0.965156,0.036333,0.510305,0.491987,0.646609,0.962580,0.028517,0.029790,0.961354,0.488548,0.018625,0.967592,0.036266,0.529104,0.367031,0.024477,0.512597,0.340154,0.013990,0.031028,0.014438,0.019634,0.948490,0.945375,0.034040,0.501818,0.902703,0


SAVE FINALISED DATASET

In [None]:
#Save Finalised Lung Dataset (integrated omics features plus the Class column) to Google Drive
#NOTE(review): to_csv is given a path here, so Complete_Data presumably holds None rather than data - confirm before using it
Complete_Data = Multiomics.to_csv(r'/content/gdrive/My Drive/PSM2 VIVEK/LUSC Dataset/Complete_MultiOmics.csv')