In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import math
import random
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split, cross_val_score
from sklearn.preprocessing import  StandardScaler, LabelEncoder, OneHotEncoder, LabelBinarizer, MinMaxScaler, MultiLabelBinarizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Synthetic benchmark data (the real dataset is loaded further down via data()).

seed = 1  # fixed seed so the synthetic data is reproducible
np.random.seed(seed)
whole_X = np.random.uniform(low=0, high=1, size=(1900, 28))
# n: number of samples, p0: number of original variables
n, p0 = whole_X.shape

In [3]:
# Draw 10 distinct column indexes (without replacement) to act as the
# significant variables; seeding makes the selection reproducible.
random.seed(seed)
art = np.asarray(random.sample(range(p0), k=10))

In [4]:
# Make the selected variables significant: add a fixed per-variable offset to
# them in the first half of the samples only.
np.random.seed(seed)
half_n = int(0.5 * n)
sign = np.random.choice([-1, 1], len(art))          # direction of each offset
u = sign * np.random.uniform(0.1, 0.3, len(art))    # signed offset per variable
mat = np.tile(u, (half_n, 1))                       # one copy of u per shifted row
whole_X[:half_n, art] = whole_X[:half_n, art] + mat

In [5]:
# One-hot labels: rows in the first half get [1, 0], the remaining rows [0, 1].
class_idx = (np.arange(n) >= int(0.5 * n)).astype(int)
whole_Y = np.eye(2)[class_idx]

In [6]:
# Shuffle features and labels with one shared permutation, then standardize
# the features.
shuffler = np.random.RandomState(seed)
t = shuffler.permutation(n)
whole_X, whole_Y = whole_X[t], whole_Y[t]
whole_X = preprocessing.scale(whole_X)

In [7]:
# Sequential split of the shuffled data: rows up to 0.8*0.7*n are training,
# rows from there up to 0.8*n are validation, and the rest is test.
train_end = int(0.8*0.7*n)
val_end = int(0.8*n)
train_X, val_X, test_X = np.split(whole_X, [train_end, val_end])
train_Y, val_Y, test_Y = np.split(whole_Y, [train_end, val_end])

In [8]:
# Inspect the standardized synthetic training features (NumPy truncates the printed array).
train_X

array([[ 0.87329573, -1.44324436,  0.81153141, ..., -0.73834841,
        -0.15128311, -0.93853261],
       [ 1.16797989, -1.39601828, -0.72136325, ...,  0.14389252,
         0.5797688 , -1.57298542],
       [-1.23807199, -0.94253513, -0.87217423, ...,  0.46124848,
         0.27846806, -0.37127371],
       ...,
       [ 1.74921042,  1.61597942, -1.34783588, ..., -0.42020446,
        -0.65669324,  0.1893029 ],
       [ 0.11059884, -0.8583228 ,  0.51963101, ..., -0.56545262,
         1.69962963, -0.93356977],
       [-0.36926745, -1.08250683, -0.90500698, ...,  1.34702218,
        -0.71989324, -0.86034265]])

In [9]:
# Inspect the one-hot synthetic training labels.
train_Y

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [10]:
#data function for reading and processing the train and test sets
#necessary as an input for the optimisation algorithm
def data():
    """Load the cleaned dataset and return scaled, encoded train/val/test splits.

    Reads 'Cleaned_Dataframe.xlsx', indexes it by 'Sample', separates the
    'Status' target from the input columns and makes a stratified 60/20/20
    train/validation/test split with a fixed random_state.

    Returns:
        tuple: (X_train_sc, Y_train, X_val_sc, Y_val, X_test_sc, Y_test, lb).
        The X arrays hold the min-max scaled numeric columns followed by the
        binarized 'Gender' column; the Y arrays are the binarized 'Status'
        labels expanded by MultiLabelBinarizer; lb is the LabelBinarizer
        fitted on the training labels.
    """
    #define input processing function
    def process_attributes(df, train, val, test):
        """Scale the numeric columns and binarize 'Gender' for each split.

        Args:
            df: full input frame; used only to fit the scaler and the binarizer.
            train, val, test: the subsets of df to transform.

        Returns:
            tuple: (trainX, valX, testX) arrays with the scaled numeric
            columns followed by the binarized 'Gender' column(s).
        """
        #define and fit the scaler to the full dataset
        # NOTE(review): fitting on every row lets validation/test min/max values
        # influence the scaling; fit on `train` only if strict separation is required.
        cs = MinMaxScaler()
        # use the `df` argument rather than reaching into the enclosing scope
        cs.fit(df.select_dtypes(np.number))

        #scale the numerical input variables
        trainContinuous = cs.transform(train.select_dtypes(np.number))
        valContinuous = cs.transform(val.select_dtypes(np.number))
        testContinuous = cs.transform(test.select_dtypes(np.number))

        #binarize the categorical "Gender" column (fitted on the full dataset)
        genderBinarizer = LabelBinarizer().fit(df["Gender"])
        trainCategorical = genderBinarizer.transform(train["Gender"])
        valCategorical = genderBinarizer.transform(val["Gender"])
        testCategorical = genderBinarizer.transform(test["Gender"])

        # construct our training and testing data points by concatenating
        # the categorical features with the continuous features
        trainX = np.hstack([trainContinuous, trainCategorical])
        valX = np.hstack([valContinuous, valCategorical])
        testX = np.hstack([testContinuous, testCategorical])

        # return the concatenated training and testing data
        return (trainX, valX, testX)

    #read the excel dataset and index it by sample id
    df = pd.read_excel('Cleaned_Dataframe.xlsx').set_index('Sample')

    #separate cancer markers and input data
    df_outputs = df['Status']
    df_inputs = df.drop('Status', axis=1)

    #stratified 60/40 split, then the 40% is halved into validation and test
    X_train, X_test_val, y_train, y_test_val = train_test_split(df_inputs, df_outputs, random_state=100, stratify=df_outputs, test_size=0.4)
    X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val, random_state=100, stratify=y_test_val, test_size=0.5)
    #process the input sets
    (X_train_sc, X_val_sc, X_test_sc) = process_attributes(df_inputs, X_train, X_val, X_test)

    #encode the categorical output variables
    lb = LabelBinarizer()
    lb.fit(y_train)
    train_outputs = lb.transform(y_train)
    val_outputs = lb.transform(y_val)
    test_outputs = lb.transform(y_test)

    #expand the binarized labels into one indicator column per value
    # NOTE(review): this appears to rely on 'Status' having exactly two classes;
    # with more classes LabelBinarizer already returns one-hot rows and this
    # step would presumably mark every row as [1, 1] - confirm before reuse.
    lb2 = MultiLabelBinarizer()
    lb2.fit(train_outputs)
    Y_train = lb2.transform(train_outputs)
    Y_val = lb2.transform(val_outputs)
    Y_test = lb2.transform(test_outputs)

    return X_train_sc, Y_train, X_val_sc, Y_val, X_test_sc, Y_test, lb

# Build the real-data splits; the "2" suffix keeps them apart from the synthetic train_X/train_Y arrays.
train_X2, train_Y2, val_X2, val_Y2, test_X2, test_Y2, lb = data()

In [20]:
# Report the total number of samples across the three splits and the feature count.
total_samples = sum(len(part) for part in (train_X2, test_X2, val_X2))
print('there are', total_samples, 'samples in total', 'with', train_X2.shape[1], 'features')

there are 1957 samples in total with 26 features


In [12]:
# Inspect the encoded training labels of the real dataset.
train_Y2

array([[1, 0],
       [0, 1],
       [1, 0],
       ...,
       [0, 1],
       [1, 0],
       [0, 1]])

In [13]:
# Re-display the synthetic one-hot labels for comparison with train_Y2.
train_Y

array([[0., 1.],
       [1., 0.],
       [0., 1.],
       ...,
       [0., 1.],
       [1., 0.],
       [1., 0.]])

In [21]:
# Number of input features in the real dataset and their column indexes.
n_dim = train_X2.shape[1]
No = np.arange(n_dim)
No

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25])