In [1]:
import math
import time
from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv1D, Flatten, Dropout, MaxPooling1D
from sklearn import tree
from sklearn.metrics import accuracy_score
pd.options.display.max_rows = 8
pd.options.display.max_columns = 9
pd.options.display.float_format = '{:.6f}'.format
pd.set_option('mode.chained_assignment', None)
dataset = pd.read_csv("Dataset_Github_Labeled.csv")
#addNoise(dataset, 1) #second number for 2^power size i.e. 1->2 times, 2-> 4 times, 3->8 times, takes a while
x= dataset.drop(['class'], axis=1)
y= dataset['class']
for i in range (0,dataset.shape[0]):
    if y[i].startswith('High-grade'):
        y[i] = 'High-grade'
    elif y[i].startswith('Low-grade'):
        y[i] = 'Low-grade'
    elif y[i].startswith('Normal'):
        y[i] = 'Normal'



Using TensorFlow backend.


In [4]:
def addNoise(data,powerIN):
    sd = dataset.std(axis=0)
    m = dataset.mean(axis = 0)
    original = (data.shape[0])
    datacopy = data.copy()
    frames = [data,datacopy]
    for i in range(powerIN): #how many duplicates
        data2 = pd.concat(frames,ignore_index=True)
    for i in range(original, data.shape[0]):
        print(i)
        for j in range (1,1365):
            data.iloc[i,j]+=(np.random.normal(m[j], sd[j], 1))[0]
    return data

def splitData(dataset):
    training, validation, test = np.split(dataset.sample(frac=1), [int(.6*len(dataset)), int(.8*len(dataset))]) 
    x_train = training.drop(['class'], axis=1)
    y_train = training['class']
    x_validation=validation.drop(['class'], axis=1)
    y_validation=validation['class']
    x_test=test.drop(['class'], axis=1)
    y_test=test['class']
    lbl_encoder = LabelEncoder()
    y_train= lbl_encoder.fit_transform(y_train)
    y_test= lbl_encoder.fit_transform(y_test)
    y_validation= lbl_encoder.fit_transform(y_validation)
    return x_train, y_train, x_validation, y_validation, x_test, y_test

# Convert numeric features into Dense Tensors, and construct the feature columns
def construct_feature_columns(input_features_DataFrame):
    tensorSet = ([])
    for elem in input_features_DataFrame:
        tensorSet.append( tf.feature_column.numeric_column(str(elem)) )
    return tensorSet

# Create the input function for training + evaluation. boolean = True for training.
def input_fn(features, labels, training=True, batch_size=32 ):
    dataf = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if training:
        dataf = dataf.shuffle(200).repeat()
    return dataf.batch(batch_size=batch_size)

def trainModels(feature_columns, x_train, y_train,x_test,y_test):
    modelCNN,accuracy,predictions = trainCNN(x_train,y_train,x_test, y_test)
    print('STARTING TO TRAIN THE MODELS--------------------------------------------------')
    modelSVM = trainSVM(x_train, y_train)
    print('FINISHED THE SVM--------------------------------------------------------------')
    modelDNN = trainDNN(feature_columns, x_train, y_train)
    print('FINISHED THE DNN--------------------------------------------------------------')
    modelTREE = trainTree(feature_columns, x_train, y_train)
    print('FINISHED THE TREE-------------------------------------------------------------')

    return modelSVM, modelDNN, modelTREE, modelCNN, accuracy,predictions

def trainTree(feature_columns, x_train, y_train):
    treeclassifier = tree.DecisionTreeClassifier()
    treeclassifier = treeclassifier.fit(x_train, y_train)

    return treeclassifier

def trainCNN(trainX, trainY, testX, testY):
    trainY = keras.utils.to_categorical(trainY)
    testY = keras.utils.to_categorical(testY)

    verbose, epochs, batch_size = 0, 10, 32
    trainX = np.expand_dims(trainX, axis=2)
    testX = np.expand_dims(testX, axis=2)

    n_timesteps, n_features, n_outputs = trainX.shape[1], trainX.shape[2], trainY.shape[1]

    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(n_timesteps,n_features)))
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
    model.add(Dropout(0.5))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_outputs, activation='softmax'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])


    model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size, verbose=verbose)
    _, accuracy = model.evaluate(testX, testY, batch_size=batch_size, verbose=0)
    OP = model.predict_proba(testX)

    return model, accuracy, OP

def trainDNN(feature_columns, x_train, y_train):
    learning_rate=0.001
    if (tf.__version__[0] == '2'):
        optimizer_adam= tf.optimizers.Adam(learning_rate=learning_rate)
    else:
        optimizer_adam= tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    hidden_units=[37,30,19]
    model=tf.estimator.DNNClassifier(hidden_units=hidden_units, feature_columns=feature_columns,  optimizer=optimizer_adam, n_classes=3)
    model.train(input_fn=lambda: input_fn(features=x_train, labels=y_train, training=True), steps=1000) # originally steps=1000 from template
    return model

def trainSVM(x_train, y_train):
    model = SVC(kernel='linear', probability = True)
    model.fit(x_train, y_train)
    return model



In [5]:
#split into train/test/val
x_train, y_train, x_validation, y_validation, x_test, y_test = splitData(dataset)
#construct feature columns
x_labels = x.head(0)
feature_columns=construct_feature_columns(x_labels)

modelSVM, modelDNN, modelTREE, modelCNN, accuracy, predictionsCNN = trainModels(feature_columns, x_train, y_train,x_test,y_test)
ptemp = list(modelDNN.predict(input_fn=lambda: input_fn(features=x_test, labels=y_test, training=False)))
predictionsDNN = []
for i in range(len(ptemp)):
    predictionsDNN.append(ptemp[i]["probabilities"])
predictionsSVM = modelSVM.predict_proba(x_test) 
predictionsTREE = modelTREE.predict_proba(x_test)


print ('DNN:' ,predictionsDNN[0])
print ('TREE:', predictionsTREE[0])
print ('SVM:' ,predictionsSVM[0])
print ('CNN: ',predictionsCNN[0])

STARTING TO TRAIN THE MODELS--------------------------------------------------
FINISHED THE SVM--------------------------------------------------------------
INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\R-k-l\\AppData\\Local\\Temp\\tmps71b0q47', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x0000023680A21B00>, '_task_type': 'worker', '_task_id': 

In [6]:
# Doesnt work yet
def cross_validation_sklearn(trained_model, num_folds, X, y):
    i=1
    cv = StratifiedKFold(n_splits=num_folds)
    total_accuracy=0
    total_f1=0
    total_precision=0
    total_recall=0
    total_auc=0
    for train_index, test_index, in cv.split(X, y):
        sm = SMOTE()
        print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train, y_train)
        model.fit(X_train_oversampled, y_train_oversampled) 
        y_pred = model.predict(X_test)
        y_score= model.predict_proba(X_test)[:,1]

        # Compute results.
        accuracy=model.score(X_test, y_test)
        f1=f1_score(y_test, y_pred, average="macro")
        recall=recall_score(y_test, y_pred, average="macro")
        precision=precision_score(y_test, y_pred, average="macro")
        auc=roc_auc_score(y_test, y_score)
        average_precision=average_precision_score(y_test, y_score)

        # Show results. 
        print('__________________________________________________________')
        print('For fold:', i)
        print('Accuracy: {0:0.4f}'.format(accuracy))
        print("f1 score: {0:0.4f}".format(f1))
        print("recall score: {0:0.4f}".format(recall))
        print("precision score: {0:0.4f}".format(precision))
        print('AUC score: {0:0.4f}'.format(auc))
        disp = plot_precision_recall_curve(model, X_test, y_test)
        disp.ax_.set_title('2-class Precision-Recall curve: '
                       'Average Precision={0:0.2f}'.format(average_precision))

        # Update logic for filling in average values for table. 
        total_accuracy+=accuracy
        total_f1+=f1
        total_recall+=recall
        total_precision+=precision
        total_auc+=auc
        i+=1


    print("total_accuracy:", round(total_accuracy/number_splits, 4))
    print("total_f1:", round(total_f1/number_splits, 4))
    print("total_recall:", round(total_recall/number_splits, 4))
    print("total_precision:", round(total_precision/number_splits, 4))
    print("total_auc:", round(total_auc/number_splits, 4))

In [47]:
def max_prediction(predictionsDNN): # used for confusion matrix
    predictionsDNN=list(predictionsDNN)
    aList=[]
    for elem in predictionsDNN:
        aList.append(np.argmax(elem))
    return aList
 

In [55]:
if (tf.__version__[0] == '2'):
    confusion_matrix_DNN = tf.math.confusion_matrix(labels=list(y_test), 
            predictions=max_prediction(predictionsDNN), num_classes=3)
    confusion_matrix_CNN = tf.math.confusion_matrix(labels=list(y_test), 
            predictions=max_prediction(predictionsCNN), num_classes=3)    
else:
    confusion_matrix_DNN = tf.confusion_matrix(labels=list(y_test), 
            predictions=max_prediction(predictionsDNN), num_classes=3)
    confusion_matrix_CNN = tf.confusion_matrix(labels=list(y_test), 
            predictions=max_prediction(predictionsCNN), num_classes=3) 
print("DNN: ")
print(confusion_matrix_DNN)
print("CNN: ")
print(confusion_matrix_CNN)
from sklearn.metrics import confusion_matrix
print("SVM:")
print(confusion_matrix(y_test, max_prediction(predictionsSVM)))
print("Tree:")
print(confusion_matrix(y_test, max_prediction(predictionsTREE)))

DNN: 
tf.Tensor(
[[24  0  0]
 [ 0 19  2]
 [ 4  1 15]], shape=(3, 3), dtype=int32)
CNN: 
tf.Tensor(
[[24  0  0]
 [ 0 21  0]
 [ 8  3  9]], shape=(3, 3), dtype=int32)
SVM:
[[24  0  0]
 [ 0 17  4]
 [ 3  0 17]]
Tree:
[[22  0  2]
 [ 2 14  5]
 [ 5  5 10]]


In [45]:
max_prediction_in_tf(predictionsSVM)

[1,
 2,
 2,
 0,
 1,
 0,
 2,
 1,
 1,
 1,
 0,
 1,
 2,
 2,
 0,
 0,
 0,
 1,
 2,
 1,
 1,
 2,
 0,
 0,
 2,
 0,
 1,
 2,
 1,
 0,
 2,
 0,
 1,
 0,
 2,
 2,
 2,
 2,
 0,
 1,
 0,
 0,
 2,
 0,
 1,
 1,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 1,
 1,
 2,
 2,
 0,
 0,
 2]