### **Drive Connection**

In [None]:
!pip install keras==2.9

In [None]:
# Panel of 52 gene identifiers; the cells below use it as the candidate feature list
# for every modality. None of the names contains whitespace, so a whitespace split of
# the block below yields exactly the 52 names, in this order.
genes_52 = """
CFH RHBDL1 OR51B2 OR10G9 MIR592 OR5B17 DEF6 ZNF671
FLACC1 GBP4 FBXO47 SERPINA3 OR8D4 DGKB SYNGR2 CHAD
VTRNA1-2 OR8J3 AGR3 RP11-12M5.1 RPL13AP ARHGAP40 RP4-761J14.9 AC073508.1
AOX1 PTPRQ CYP2F1 PPP2R3A CARD6 CTD-2370N5.3 EMBP1 SNORD32B
RP11-344P13.6 FJX1 OR2T4 FMOD MT1DP ARHGDIB RP11-159D12.2 MESTIT1_1
PRELP OR51B6 SLC2A5 OR10S1 CTD-2298J14.2 SYNPR-AS1 TUSC7 KLK3
TXLNB IFI27 MNDA KRTAP19-1
""".split()

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
import os
from google.colab import drive

# Mount Google Drive into the Colab filesystem.
drive.mount('/content/drive/')

# Work from the project folder on the mounted drive.
os.chdir('/content/drive/My Drive/TCGA Multiomic')

### **Libraries**

In [None]:
# Load the TensorBoard notebook extension
#%load_ext tensorboard
#%tensorflow_version 1.14
#!pip uninstall -y keras-nightly
#!pip install h5py==2.10.0
#!pip install keras==2.2.5 
#pip install keras==2.2.5 

In [None]:
%matplotlib inline  

import imp
import numpy as np
import os
import pandas as pd
#import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from keras.models import Model,Sequential
from keras.layers import Input,Dense,Dropout, concatenate, ReLU, LeakyReLU, Lambda
from keras.optimizers import Adam, SGD, Adadelta
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from keras.callbacks import TensorBoard
#from sklearn.externals import joblib


### **Reading RNA Seq Dataset**

In [None]:
# Read the gzip-compressed, tab-separated HiSeqV2 matrix straight from the Xena hub;
# the first column becomes the index.
dfRNA = pd.read_csv(
    'https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.BRCA.sampleMap%2FHiSeqV2.gz',
    sep='\t',
    compression='gzip',
    index_col=0,
)
# Put the columns in sorted order, then transpose so the former columns become the rows.
dfRNA = dfRNA.reindex(columns=sorted(dfRNA.columns)).T
# Column labels of the transposed table.
RNAcols = dfRNA.columns

In [None]:
# Report the (rows, columns) of the transposed RNA table and preview its first two rows.
print("RNA Matrix Before:", dfRNA.shape)
dfRNA.iloc[:2]

### **Reading CNV Dataset**

In [None]:
# Read the gzip-compressed, tab-separated CNV table from the mounted drive;
# the first column becomes the index.
dfCNV = pd.read_csv(
    '/content/drive/My Drive/CNV/dataset/TCGA_BRCA_CNV.gz',
    sep='\t',
    compression='gzip',
    index_col=0,
)
# Put the columns in sorted order, then transpose so the former columns become the rows.
dfCNV = dfCNV.reindex(columns=sorted(dfCNV.columns)).T
# Column labels of the transposed table.
CNVcols = dfCNV.columns

In [None]:
# Report the (rows, columns) of the transposed CNV table and preview its first two rows.
print("CNV Matrix Before:", dfCNV.shape)
dfCNV.iloc[:2]

### **Reading Methylation Dataset**

In [None]:
# Read the gene-level methylation CSV from the mounted drive; the first column becomes the index.
dfMethyl = pd.read_csv(
    '/content/drive/My Drive/methylation/dataset/genelevelMethyl.csv',
    index_col=0,
)
# Put the columns in sorted order, then transpose so the former columns become the rows.
dfMethyl = dfMethyl.reindex(columns=sorted(dfMethyl.columns)).T
# Column labels of the transposed table.
METHYLcols = dfMethyl.columns

In [None]:
# Rewrite the row labels of the methylation table so they use '-' instead of '.';
# the labels are later intersected with the indexes of the other tables.
# regex=False is required: on pandas < 2.0 the default is regex=True, where '.' means
# "any character" and every label would be turned into a run of dashes.
dfMethyl.index = dfMethyl.index.str.replace('.', '-', regex=False)

#### **ALL COMMON INDEXES**

In [None]:
"""commonIndexesAll = list(set.intersection(set(dfMethyl.index), set(dfCNV.index), set(dfRNA.index)))
print(len(commonIndexesAll))
dfRNA = dfRNA.loc[commonIndexesAll]
dfCNV = dfCNV.loc[commonIndexesAll]
dfMethyl = dfMethyl.loc[commonIndexesAll]
"""

### **Reading Clinical Data for which PAM50 labels are available**

In [None]:
# Alternative remote source (disabled; presumably the same clinical matrix):
#   https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.BRCA.sampleMap%2FBRCA_clinicalMatrix
# The active read uses the gzip-compressed, tab-separated copy on the mounted drive.
dfPAM50 = pd.read_csv(
    '/content/drive/My Drive/CNV/dataset/BRCA_clinicalMatrix.gz',
    sep='\t',
    compression='gzip',
    index_col=0,
)
print("Clinical Data Before:", dfPAM50.shape)
# Keep only the PAM50 call column and discard the rows that have no call.
dfPAM50 = dfPAM50['PAM50Call_RNAseq'].dropna()

### **Finding Common Indexes**

In [None]:
# Row labels present in the PAM50 labels AND in all three omics tables, in sorted order.
# dfRNA must be part of the intersection: the next step selects dfRNA.loc[commonIndexes],
# which raises a KeyError for any label that is missing from the RNA table.
commonIndexes = np.sort(list(set.intersection(
    set(dfPAM50.index),
    set(dfMethyl.index),
    set(dfCNV.index),
    set(dfRNA.index),
)))
len(commonIndexes)

In [None]:
# Restrict the labels and every omics table to the shared rows, in the order of commonIndexes,
# so that row i refers to the same label in all four objects.
dfPAM50, dfMethyl50, dfCNV50, dfRNA50 = (
    table.loc[commonIndexes] for table in (dfPAM50, dfMethyl, dfCNV, dfRNA)
)
# Number of rows per PAM50 call.
dfPAM50.value_counts()

### **Garbage Collection and creation of numpy arrays**

In [None]:
# Raw PAM50 calls as an array, aligned row-for-row with the *50 tables selected above.
YPAM50=dfPAM50.values

In [None]:
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> CLEANING UP MEMORY SPACE <<<<<<<<<<<<<<<<<<<<<<<<<<<<
#del dfPAM50; del dfRNA; del dfMethyl; del dfCNV; del dfRNA50; del dfCNV50; del dfMethyl50; del dfRNAEx; del dfCNVEx; del dfMethylEx; 
# The explicit `del` of the large frames (comment above) is disabled, so this cell
# only triggers a garbage-collection pass.
import gc
gc.collect()

In [None]:
from collections import Counter

# Replace the PAM50 call strings by integer class codes.
label_encoder = LabelEncoder()
YPAM50 = label_encoder.fit_transform(YPAM50)

# Show which integer code each class name received.
le_name_mapping = {
    name: code
    for name, code in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))
}
print(le_name_mapping)

# Number of samples per encoded class.
print(Counter(YPAM50))

In [None]:
# Labels as a single-column 2-D array (one row per sample).
Y = np.reshape(YPAM50, (-1, 1))
Y.shape

In [None]:
# Candidate RNA features: the 52-gene panel (this is an alias of genes_52, not a copy).
rna_genes = genes_52

In [None]:
# Candidate CNV features: the 52-gene panel (this is an alias of genes_52, not a copy).
cnv_genes = genes_52


In [None]:
# Candidate methylation features: the 52-gene panel (this is an alias of genes_52, not a copy).
methyl_genes = genes_52

In [None]:

def _ordered_panel(genes, excluded=()):
    """Return the unique entries of `genes`, in first-seen order, minus `excluded`.

    A plain `list(set(...))` would give a column order that depends on Python's
    per-process string-hash randomisation, i.e. a different feature layout after
    every kernel restart. Keeping the panel order makes the feature matrix
    reproducible.
    """
    skip = set(excluded)
    return [gene for gene in dict.fromkeys(genes) if gene not in skip]


# Panel genes left out of the RNA features
# (presumably because they are absent from the RNA table - TODO confirm).
rna_excluded = [
    'RPL13AP', 'ARHGAP40', 'FLACC1', 'MESTIT1_1', 'CTD-2298J14.2', 'TUSC7', 'RP11-159D12.2',
    'RP11-12M5.1', 'CTD-2370N5.3', 'AC073508.1', 'RP11-344P13.6', 'MIR592', 'SYNPR-AS1',
    'RP4-761J14.9', 'EMBP1',
]

# Panel genes left out of the CNV features
# (presumably because they are absent from the CNV table - TODO confirm).
cnv_excluded = [
    'RPL13AP', 'FLACC1', 'MESTIT1_1', 'KRTAP19-1', 'CTD-2298J14.2', 'RP11-159D12.2', 'RP11-12M5.1',
    'CTD-2370N5.3', 'AC073508.1', 'RP11-344P13.6', 'SYNPR-AS1', 'RP4-761J14.9', 'VTRNA1-2', 'EMBP1',
]

XRNA = dfRNA50[_ordered_panel(rna_genes, rna_excluded)]

XCNV = dfCNV50[_ordered_panel(cnv_genes, cnv_excluded)]

# The methylation features use the full panel.
XMethyl = dfMethyl50[_ordered_panel(methyl_genes)]

In [None]:
# (rows, columns) of each modality block; the row counts must agree for the concatenation below.
XCNV.shape,  XMethyl.shape, XRNA.shape 

In [None]:
# Place the three modality blocks side by side (column order: CNV, methylation, RNA)
# to obtain one feature row per sample.
X = np.concatenate([XCNV, XMethyl, XRNA], axis=1)

X.shape

In [None]:
# Run a garbage-collection pass now that the feature matrix has been assembled.
gc.collect()

In [None]:
from keras.layers import Softmax


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score


def _build_classifier(n_features, n_classes, seed):
  """Create and compile a fresh, untrained classifier.

  Architecture: Dense(40, relu) -> Dropout(0.3) -> Dense(10, relu) -> Dropout(0.3)
  -> Dense(n_classes, linear) -> Softmax. A new Adam optimizer is created for every
  model so that no optimizer state is carried from one fit to the next.
  """
  inputs = Input(shape=(n_features,))
  hidden1 = Dense(40, activation='relu')(inputs)
  dropout1 = Dropout(0.3, seed=seed)(hidden1)
  hidden2 = Dense(10, activation='relu')(dropout1)
  dropout2 = Dropout(0.3, seed=seed)(hidden2)
  logits = Dense(n_classes, activation=None)(dropout2)
  probabilities = Softmax()(logits)

  model = Model(inputs=inputs, outputs=probabilities)
  model.compile(optimizer=Adam(learning_rate=0.002),
                loss='sparse_categorical_crossentropy', metrics=['accuracy'])
  return model


n_splits = 5
# 1-D view of the labels: the splitter, SMOTE and accuracy_score all expect a flat label vector.
y_labels = np.ravel(Y)
n_classes = len(np.unique(y_labels))

# Repeat the stratified k-fold evaluation for several seeds and report the mean fold accuracy.
for seed in [0,1,2,3,4,55,666,7777,88888,999999]:
  # NOTE(review): this seeds NumPy only; the network's weight initialisation is presumably
  # drawn from TensorFlow's own generator, so runs may still differ - confirm and seed it too.
  np.random.seed(seed)
  skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

  fold_accuracies = []
  for train_idx, test_idx in skf.split(X, y_labels):
    X_train, Y_train = X[train_idx], y_labels[train_idx]
    X_test, Y_test = X[test_idx], y_labels[test_idx]

    # Oversample the minority classes on the training part only.
    sm = SMOTE(random_state=seed)
    X_train_res, Y_train_res = sm.fit_resample(X_train, Y_train)

    # A new model per fold is essential: reusing the same layers across folds would start
    # each fold from weights already trained on samples that are now in its test split.
    classifier = _build_classifier(X.shape[1], n_classes, seed)
    classifier.fit(X_train_res, Y_train_res, epochs=100, batch_size=16, verbose=0)

    y_pred = classifier.predict(X_test)
    fold_accuracies.append(accuracy_score(Y_test, y_pred.argmax(axis=-1)))

  print(f'{n_splits} fold Accuracy for seed {seed}: ', sum(fold_accuracies)/len(fold_accuracies))

In [None]:
# Run a garbage-collection pass after the cross-validation runs.
gc.collect()