# Trial code with stratified k-fold cross-validation

In [1]:
import tensorflow as tf
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, Dropout, GaussianNoise
from keras.optimizers import Adam, SGD
from keras.utils import to_categorical 
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline

Using TensorFlow backend.


In [2]:
# Load the dataset and index the rows by the 'Name' column.
df = pd.read_excel('Overall Colorectal Cancer Generated data.xlsx').set_index('Name')

In [3]:
# Separate the target column ('Marker') from the feature columns.
# `df` itself is left untouched, so this cell can be re-run safely: dropping
# in place would make a second run fail on the already-removed column.
df_outputs = df['Marker']
df_inputs = df.drop(columns='Marker')

In [4]:
# Split the data into training (70%) and test (30%) sets.
# random_state pins the shuffle so the hold-out split (and every number
# derived from it) is reproducible; 101 is the seed value the
# cross-validation cell also uses. stratify keeps the class proportions of
# the target the same in both subsets.
train_inputs, test_inputs, train_outputs, test_outputs = train_test_split(
    df_inputs,
    df_outputs,
    test_size=0.3,
    shuffle=True,
    random_state=101,
    stratify=df_outputs,
)

In [5]:
# One-hot encode the class labels for the 2-unit softmax output layer.
# num_classes is fixed so both arrays always have exactly two columns,
# instead of letting each call infer the width from the labels it happens
# to receive. Assumes the labels are encoded as 0/1 -- TODO confirm.
train_outputs = to_categorical(train_outputs, num_classes=2)
test_outputs = to_categorical(test_outputs, num_classes=2)

In [6]:
# Settings used by the network and its training
PIs = 0  # added to number_of_glycans in the commented-out input_shape; meaning of "PIs" not shown here
number_of_glycans = df_inputs.shape[1]  # one per column of df_inputs (the glycans)
iterations = 500  # passed as `epochs` to model.fit


In [7]:
def neural_network():
    """Build and compile the feed-forward classifier.

    Architecture: Dense(24, sigmoid) -> Dense(10, sigmoid) -> Dense(2, softmax).
    No input shape is declared here.

    Returns:
        A compiled ``tf.keras.Sequential`` model using the Adam optimizer
        (learning rate 0.001), categorical cross-entropy loss and the
        accuracy metric.
    """
    dense = tf.keras.layers.Dense
    layers = [
        dense(units=24, activation='sigmoid'),
        dense(units=10, activation='sigmoid'),
        dense(units=2, activation='softmax'),
    ]

    network = tf.keras.Sequential(layers)
    network.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network

    # Earlier experimental architecture (GaussianNoise / Dropout variant), kept for reference:
    #model = Sequential([
        #GaussianNoise(0.1,
        #    input_shape= (number_of_glycans + PIs,)),
        #Dense(units = 24,
              #activation= 'sigmoid',
              #input_shape= (number_of_glycans + PIs,)),
       # Dropout(0.2),
       # Dense(units = 10,
       #       activation='sigmoid'),
       # Dense(units = 2,
       #       activation = 'softmax')])

In [8]:
# Superseded: the model is now compiled inside neural_network().
#model.compile(
    #optimizer = Adam(lr=0.01),
    #loss = 'categorical_crossentropy',
    #metrics = ['accuracy'])

In [9]:
# Build a fresh, compiled network for the hold-out train/test run below.
model = neural_network()

In [10]:
# Fit on the training split; per-epoch shuffling is turned off.
model.fit(
    x=train_inputs.to_numpy(),
    y=train_outputs,
    batch_size=32,
    epochs=iterations,
    shuffle=False,
    verbose=1,
)

Train on 1433 samples
Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500

<tensorflow.python.keras.callbacks.History at 0x16add016358>

In [11]:
# Score the trained model on the held-out test split; prints [loss, accuracy].
test_scores = model.evaluate(x=test_inputs, y=test_outputs, batch_size=32, verbose=1)
print(test_scores)

[0.19943678912108506, 0.9203252]


In [12]:
# Wrap the model-building function so scikit-learn can treat it as an estimator.
ANN = KerasClassifier(build_fn=neural_network, epochs=100, batch_size=32, verbose=1)

In [13]:
# Two-step pipeline: standardize the features, then classify with the wrapped network.
pipeline = Pipeline(steps=[
    ('transformer', StandardScaler()),
    ('estimator', ANN),
])

In [18]:
# 5-fold stratified cross-validation of the whole pipeline, scored with F1.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=101)
c = cross_val_score(
    pipeline,
    df_inputs.to_numpy(),
    df_outputs,
    cv=skf,
    scoring='f1',
    verbose=0,
)

Train on 1638 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
  32/1638 [..............................] - ETA: 0s - loss: 0.1838 - accuracy: 0.9688

KeyboardInterrupt: 

In [19]:
# Per-fold scores, followed by their mean and standard deviation.
print(c)
mean_score, score_spread = c.mean(), c.std()
print("Results: %.2f (%.2f)" % (mean_score, score_spread))

[0.90452261 0.91803279 0.93366093 0.91725768 0.90537084]
Results: 0.92 (0.01)


In [16]:
# Model outputs (one softmax value per class) for the test set, as a DataFrame
labels_test = pd.DataFrame(data=model.predict(test_inputs))

In [17]:
# Model outputs (one softmax value per class) for the training set, as a DataFrame
labels_train = pd.DataFrame(data=model.predict(train_inputs))