In [1]:
import time
from IPython import display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv1D, Flatten, Dropout, MaxPooling1D
from sklearn import tree
from sklearn.metrics import accuracy_score, confusion_matrix
pd.options.display.max_rows = 8
pd.options.display.max_columns = 9
pd.options.display.float_format = '{:.6f}'.format
pd.set_option('mode.chained_assignment', None)

dataset = pd.read_csv("Dataset_Github_Labeled.csv")
#addNoise(dataset, 1) #second number for 2^power size i.e. 1->2 times, 2-> 4 times, 3->8 times, takes a while
x= dataset.drop(['class'], axis=1)
y= dataset['class']
for i in range (0,dataset.shape[0]):
    if y[i].startswith('High-grade'):
        y[i] = 'High-grade'
    elif y[i].startswith('Low-grade'):
        y[i] = 'Low-grade'
    elif y[i].startswith('Normal'):
        y[i] = 'Normal'

def addNoise(data,powerIN):
    sd = dataset.std(axis=0)
    m = dataset.mean(axis = 0)
    original = (data.shape[0])
    datacopy = data.copy()
    frames = [data,datacopy]
    for i in range(powerIN): #how many duplicates
        data2 = pd.concat(frames,ignore_index=True)
    for i in range(original, data.shape[0]):
        for j in range (1,1365):
            data.iloc[i,j]+=(np.random.normal(m[j], sd[j], 1))[0]
    return data

def splitData(dataset):
    training, validation, test = np.split(dataset.sample(frac=1), [int(.6*len(dataset)), int(.8*len(dataset))]) 
    x_train = training.drop(['class'], axis=1)
    y_train = training['class']
    x_validation=validation.drop(['class'], axis=1)
    y_validation=validation['class']
    x_test=test.drop(['class'], axis=1)
    y_test=test['class']
    lbl_encoder = LabelEncoder()
    y_train= lbl_encoder.fit_transform(y_train)
    y_test= lbl_encoder.fit_transform(y_test)
    y_validation= lbl_encoder.fit_transform(y_validation)
    return x_train, y_train, x_validation, y_validation, x_test, y_test

# Convert numeric features into Dense Tensors, and construct the feature columns
def construct_feature_columns(input_features_DataFrame):
  tensorSet = ([])
  for elem in input_features_DataFrame:
    tensorSet.append( tf.feature_column.numeric_column(str(elem)) )
  return tensorSet

# Create the input function for training + evaluation. boolean = True for training.
def input_fn(features, labels, training=True, batch_size=32 ):
    dataf = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if training:
        dataf = dataf.shuffle(200).repeat()
    return dataf.batch(batch_size=batch_size)

def trainModels(feature_columns, x_train, y_train,x_test,y_test):
    modelCNN,accuracy,predictions = trainCNN(x_train,y_train,x_test, y_test)
    modelSVM = trainSVM(x_train, y_train)
    modelDNN = trainDNN(feature_columns, x_train, y_train)
    modelTREE = trainTree(feature_columns, x_train, y_train)
    return modelSVM, modelDNN, modelTREE, modelCNN, accuracy,predictions

def trainTree(feature_columns, x_train, y_train):
    treeclassifier = tree.DecisionTreeClassifier()
    treeclassifier = treeclassifier.fit(x_train, y_train)

    return treeclassifier

def trainCNN(trainX, trainY, testX, testY):
    trainY = keras.utils.to_categorical(trainY)
    testY = keras.utils.to_categorical(testY)

    verbose, epochs, batch_size = 0, 10, 32
    trainX = np.expand_dims(trainX, axis=2)
    testX = np.expand_dims(testX, axis=2)

    n_timesteps, n_features, n_outputs = trainX.shape[1], trainX.shape[2], trainY.shape[1]

    model = Sequential()
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(n_timesteps,n_features)))
    model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
    model.add(Dropout(0.5))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(n_outputs, activation='softmax'))
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])


    model.fit(trainX, trainY, epochs=epochs, batch_size=batch_size, verbose=verbose)
    _, accuracy = model.evaluate(testX, testY, batch_size=batch_size, verbose=0)
    OP = model.predict_proba(testX)

    return model, accuracy, OP

def trainDNN(feature_columns, x_train, y_train):
    learning_rate=0.001
    if (tf.__version__[0] == '2'):
        optimizer_adam= tf.optimizers.Adam(learning_rate=learning_rate)
    else:
        optimizer_adam= tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    hidden_units=[37,30,19]
    model=tf.estimator.DNNClassifier(hidden_units=hidden_units, feature_columns=feature_columns,  optimizer=optimizer_adam, n_classes=3)
    model.train(input_fn=lambda: input_fn(features=x_train, labels=y_train, training=True), steps=1000) # originally steps=1000 from template
    return model

def trainSVM(x_train, y_train):
    model = SVC(kernel='linear', probability = True)
    model.fit(x_train, y_train)
    return model

def getAccuracy(x_test, y_test, modelSVM, modelDNN, modelTREE, accuracyCNN):
    pSVM = modelSVM.predict(x_test)
    accuracySVM = accuracy_score(pSVM,y_test)
    pTREE = modelTREE.predict(x_test)
    accuracyTREE = accuracy_score(pTREE, y_test)
    pDNN = modelDNN.evaluate(input_fn=lambda: input_fn(features=x_test, labels=y_test, training=False), steps=1)
    accuracyDNN = pDNN["accuracy"]

    return accuracySVM, accuracyDNN, accuracyTREE, accuracyCNN




Using TensorFlow backend.


In [2]:
#split into train/test/val
x_train, y_train, x_validation, y_validation, x_test, y_test = splitData(dataset)
#construct feature columns
x_labels = x.head(0)
feature_columns=construct_feature_columns(x_labels)

modelSVM, modelDNN, modelTREE, modelCNN, accuracyCNN, predictionsCNN = trainModels(feature_columns, x_train, y_train,x_test,y_test)
ptemp = list(modelDNN.predict(input_fn=lambda: input_fn(features=x_test, labels=y_test, training=False)))
predictionsDNN = []
for i in range(len(ptemp)):
    predictionsDNN.append(ptemp[i]["probabilities"])
predictionsSVM = modelSVM.predict_proba(x_test) 
predictionsTREE = modelTREE.predict_proba(x_test)

accuracySVM, accuracyDNN, accuracyTREE, accuracyCNN = getAccuracy(x_test, y_test, modelSVM, modelDNN, modelTREE, accuracyCNN)

print(y_train)
predictionsENS = []

for i in range(len(predictionsDNN)):
    tempDNN = predictionsDNN[i]
    tempCNN = predictionsCNN[i]
    tempTREE = predictionsTREE[i]
    tempSVM = predictionsSVM[i]
    score0 = accuracyDNN*tempDNN[0] + accuracyCNN*tempCNN[0] + accuracyTREE*accuracyTREE*tempTREE[0] + accuracySVM*tempSVM[0]
    score1 = accuracyDNN*tempDNN[1] + accuracyCNN*tempCNN[1] + accuracyTREE*accuracyTREE*tempTREE[1] + accuracySVM*tempSVM[1]
    score2 = accuracyDNN*tempDNN[2] + accuracyCNN*tempCNN[2] + accuracyTREE*accuracyTREE*tempTREE[2] + accuracySVM*tempSVM[2]
    if score0 > score1 and score0 > score2:
        predictionsENS.append(0)
    elif score1 > score0 and score1 > score2:
        predictionsENS.append(1)
    elif score2 > score0 and score2 > score1:
        predictionsENS.append(2)


finalAccuracy = accuracy_score(predictionsENS, y_test)
print('DNN:' , accuracyDNN)
print('CNN:', accuracyCNN)
print('SVM:', accuracySVM)
print('TREE: ', accuracyTREE)
print('...................')
print('ENS:' ,finalAccuracy)
print(confusion_matrix(predictionsENS, y_test))

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\R-k-l\\AppData\\Local\\Temp\\tmpsptn3_0h', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x00000275AF4BBAC8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
Instructions for 

In [45]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import precision_score, f1_score, recall_score
import matplotlib.pyplot as plt
def cross_validation_svm(hyperparameters, X, y, num_folds=5):
    """
    Parameters:
        model-sklearn untrained model to be cross validated. 
        X-DataFrame without labels
        y-labels for same size DataFrame as X
        num_folds-number of folds to split for cross validation. 

    Returns:
        sklearn model, best fold for model_untrained. 
    """
    # Perform Cross Validation. 
    cv = StratifiedKFold(n_splits=num_folds)
    i=1
    best_accuracy=0
    best_model_test_index=[]
    for train_index, test_index, in cv.split(X, y):
        # Select the indices. 
        sm = SMOTE()
        # print("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y[train_index], y[test_index]
        X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train, y_train)
        
        model = SVC(kernel=hyperparameters[0], probability=hyperparameters[1])
        model.fit(X_train_oversampled, y_train_oversampled ) 
        y_pred = model.predict(X_test)
        y_score= model.predict_proba(X_test)[:,1]

        # Compute results.
        accuracy=model.score(X_test, y_test)
        if accuracy > best_accuracy:
            best_accuracy=accuracy
            best_model=model
            best_model_test_index = test_index
#         f1=f1_score(y_test, y_pred, average="macro")
#         recall=recall_score(y_test, y_pred, average="macro")
#         precision=precision_score(y_test, y_pred, average="macro")
#         auc=roc_auc_score(y_test, y_score)
#         average_precision=average_precision_score(y_test, y_score)

        # Show results. 
        print('__________________________________________________________')
        print('For fold:', i)
        print('Accuracy: {0:0.4f}'.format(accuracy))
#         print("f1 score: {0:0.4f}".format(f1))
#         print("recall score: {0:0.4f}".format(recall))
#         print("precision score: {0:0.4f}".format(precision))
#         print('AUC score: {0:0.4f}'.format(auc))
#         disp = plot_precision_recall_curve(model, X_test, y_test)
#         disp.ax_.set_title('2-class Precision-Recall curve: '
#                        'Average Percision={0:0.2f}'.format(average_precision))
        i+=1
    return best_model, best_model_test_index

In [46]:
# model = SVC(kernel='linear', probability = True)
hyperparam = ['linear', True]
best_model, abc = cross_validation_svm(hyperparam, x, y, num_folds=5)
# best_model.score(x.iloc[ind], y[ind])

__________________________________________________________
For fold: 1
Accuracy: 0.7846
__________________________________________________________
For fold: 2
Accuracy: 0.8615
__________________________________________________________
For fold: 3
Accuracy: 0.6615
__________________________________________________________
For fold: 4
Accuracy: 0.7846
__________________________________________________________
For fold: 5
Accuracy: 0.7969


In [47]:
best_model.score(x.iloc[abc], y[abc])

0.8615384615384616

In [6]:
# This code block will create n_iter_search models, and order them by accuracy based on hyperparameter values. 

# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

# XGBoost
import numpy as np
import xgboost as xgb
from time import time
import scipy.stats as stats
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
model=xgb.XGBClassifier()
# specify parameters and distributions to sample from
param_dist = {'learning_rate' : [0.05, 0.1, 0.15, 0.2],
              'max_depth ' : [2,3,4]
             }

# run randomized search
n_iter_search = 10
random_search = RandomizedSearchCV(model, param_distributions=param_dist,
                                   n_iter=n_iter_search)

start = time()
random_search.fit(x, y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.cv_results_)

RandomizedSearchCV took 367.45 seconds for 10 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.759 (std: 0.067)
Parameters: {'max_depth ': 3, 'learning_rate': 0.15}

Model with rank: 1
Mean validation score: 0.759 (std: 0.067)
Parameters: {'max_depth ': 4, 'learning_rate': 0.15}

Model with rank: 3
Mean validation score: 0.759 (std: 0.074)
Parameters: {'max_depth ': 2, 'learning_rate': 0.1}

Model with rank: 3
Mean validation score: 0.759 (std: 0.074)
Parameters: {'max_depth ': 4, 'learning_rate': 0.1}

Model with rank: 3
Mean validation score: 0.759 (std: 0.074)
Parameters: {'max_depth ': 3, 'learning_rate': 0.1}

