In [None]:
import numpy as np 
import matplotlib.pyplot as plt
import os
import itertools
import pandas as pd
import seaborn as sns
import numpy
import matplotlib.pyplot
import numpy.linalg 
import numpy.random
from skopt import gp_minimize
from skopt.space import Real, Integer
from sklearn.model_selection import KFold
from mpl_toolkits import mplot3d
from sklearn.feature_selection import mutual_info_classif
from skfeature.function.similarity_based import fisher_score



def parseData(filename):
    """Load a comma-separated data file into a feature matrix and label list.

    Every non-blank line is expected to look like ``id,label,f1,f2,...``.
    The first field is discarded, the second is mapped to a class label
    (``"M"`` -> 1, anything else -> 0) and the remaining fields are parsed
    as floats.

    Args:
        filename: Path of the file to read.

    Returns:
        A tuple ``(X, y)``. ``X`` is a 2-D float array with one row per
        record (shape ``(0, 30)`` when the file holds no records) and ``y``
        is a plain list of 0/1 labels in file order.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
        ValueError: If a feature field is not a valid number or the rows
            do not all have the same number of features.
    """
    rows = []
    y = []
    # The context manager closes the file even if parsing raises.
    with open(filename, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            fields = line.split(",")
            y.append(1 if fields[1] == "M" else 0)
            # float() instead of eval(): file contents must never be executed.
            rows.append([float(value) for value in fields[2:]])

    if not rows:
        return numpy.zeros((0, 30)), y
    # Build the matrix once instead of re-allocating it for every row.
    return numpy.array(rows, dtype=float), y

def deletepng():
    """Delete every entry ending in ".png" directly inside the /work/ folder."""
    folder_path = (r'/work/')
    png_names = [name for name in os.listdir(folder_path) if name.endswith(".png")]
    for name in png_names:
        os.remove(os.path.join(folder_path, name))

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), applied element-wise."""
    denominator = 1 + numpy.exp(-x)
    return 1.0 / denominator

def loss(y, prediction):
    """Return the mean binary cross-entropy between labels and probabilities.

    Computes ``-mean(y * log(p) + (1 - y) * log(1 - p))``.

    Args:
        y: Array-like of true labels (0 or 1).
        prediction: Array-like of predicted probabilities, broadcastable
            against ``y``.

    Returns:
        The mean cross-entropy as a float (0 for a perfect prediction,
        growing as predictions get worse).
    """
    y = numpy.asarray(y, dtype=float)
    # Keep probabilities strictly inside (0, 1) so a saturated sigmoid
    # (exactly 0.0 or 1.0) does not produce log(0) = -inf / NaN.
    eps = 1e-15
    p = numpy.clip(numpy.asarray(prediction, dtype=float), eps, 1 - eps)
    # Both terms are added; the leading minus makes the loss non-negative.
    return -numpy.mean(y * numpy.log(p) + (1 - y) * numpy.log(1 - p))

def gradient(X, y, predictions):
    """Return the averaged gradients of the loss w.r.t. weights and bias.

    Args:
        X: Feature matrix; the first axis indexes samples.
        y: Labels, shaped like ``predictions``.
        predictions: Model outputs for ``X``.

    Returns:
        ``(dw, db)``: ``X.T @ (predictions - y)`` and ``sum(predictions - y)``,
        each scaled by ``1 / number_of_samples``.
    """
    sample_count = X.shape[0]
    scale = 1 / sample_count
    residual = predictions - y
    dw = scale * numpy.dot(X.T, residual)
    db = scale * numpy.sum(residual)
    return dw, db

def plot_2d_boundary(X, y, w, b, filename):
    """Save a 2-D scatter of both classes with the linear decision boundary.

    Samples labelled 1 are drawn as red dots and all others as blue dots,
    using the first two columns of ``X``. The boundary is the line
    ``w[0] * x0 + w[1] * x1 + b = 0`` drawn across the range of column 0.

    Args:
        X: 2-D feature array with at least two columns.
        y: Labels, one per row of ``X`` (list, 1-D or column array).
        w: Weights; only the first two (after flattening) are used.
        b: Bias term.
        filename: Path the figure is written to.
    """
    X = numpy.asarray(X)
    # Boolean mask instead of growing two arrays row by row with vstack.
    positive = numpy.asarray(y).reshape(-1) == 1
    C1 = X[positive]
    C2 = X[~positive]

    # Flatten so a (n, 1) weight column yields plain scalars.
    w0, w1 = numpy.asarray(w, dtype=float).reshape(-1)[:2]
    # An ndarray (not a list) so the line equation is evaluated element-wise.
    x_ticks = numpy.array([numpy.min(X[:, 0]), numpy.max(X[:, 0])])
    # Solve w0*x0 + w1*x1 + b = 0 for x1.
    slope = -w0 / w1
    intercept = -b / w1
    y_ticks = numpy.ravel(slope * x_ticks + intercept)

    matplotlib.pyplot.xlim(numpy.min(X[:, 0]), numpy.max(X[:, 0]))
    matplotlib.pyplot.ylim(numpy.min(X[:, 1]), numpy.max(X[:, 1]))
    matplotlib.pyplot.plot(C1[:, 0], C1[:, 1], 'ro')
    matplotlib.pyplot.plot(C2[:, 0], C2[:, 1], 'bo')
    matplotlib.pyplot.plot(x_ticks, y_ticks, 'k-')
    matplotlib.pyplot.savefig(filename)
    # Release figure memory; this closes every open figure, not just this one.
    matplotlib.pyplot.close('all')

def plot_3d_boundary(X, y, w, b, filename):
    """Save a 3-D scatter of both classes with the planar decision boundary.

    Samples labelled 1 are drawn in red and all others in blue, using the
    first three columns of ``X``. The boundary is the plane
    ``w[0]*x1 + w[1]*x2 + w[2]*x3 + b = 0`` evaluated on a 50x50 grid that
    spans the observed range of the first two columns.

    Args:
        X: 2-D feature array with at least three columns.
        y: Labels, one per row of ``X`` (list, 1-D or column array).
        w: Weights; only the first three (after flattening) are used.
        b: Bias term.
        filename: Path the figure is written to.
    """
    X = numpy.asarray(X)
    # Boolean mask instead of growing two arrays row by row with vstack.
    positive = numpy.asarray(y).reshape(-1) == 1
    C1 = X[positive]
    C2 = X[~positive]

    # Flatten so a (n, 1) weight column yields plain scalars.
    w0, w1, w2 = numpy.asarray(w, dtype=float).reshape(-1)[:3]

    plt.figure(figsize=(10, 10))
    ax = plt.axes(projection='3d')
    x1_vals = np.linspace(numpy.min(X[:, 0]), numpy.max(X[:, 0]), 50)
    x2_vals = np.linspace(numpy.min(X[:, 1]), numpy.max(X[:, 1]), 50)
    x1_mesh, x2_mesh = np.meshgrid(x1_vals, x2_vals)
    # Solve the plane equation for the third coordinate.
    x3_mesh = (-b - w0 * x1_mesh - w1 * x2_mesh) / w2
    ax.scatter(C1[:, 0], C1[:, 1], C1[:, 2], c='r', s=15)
    ax.scatter(C2[:, 0], C2[:, 1], C2[:, 2], c='b', s=15)
    ax.plot_surface(x1_mesh, x2_mesh, x3_mesh, alpha=0.5)
    ax.view_init(10, 20)
    matplotlib.pyplot.savefig(filename)
    # Release figure memory; this closes every open figure, not just this one.
    matplotlib.pyplot.close('all')

def plot_data_2d(X, y, filename):
    """Save a 2-D scatter plot of the two classes.

    Samples labelled 1 are drawn as red dots and all others as blue dots,
    using the first two columns of ``X``; the axes are clipped to the
    observed range of those columns.

    Args:
        X: 2-D feature array with at least two columns.
        y: Labels, one per row of ``X`` (list, 1-D or column array).
        filename: Path the figure is written to.
    """
    X = numpy.asarray(X)
    # Boolean mask instead of growing two arrays row by row with vstack.
    positive = numpy.asarray(y).reshape(-1) == 1
    C1 = X[positive]
    C2 = X[~positive]

    matplotlib.pyplot.xlim(numpy.min(X[:, 0]), numpy.max(X[:, 0]))
    matplotlib.pyplot.ylim(numpy.min(X[:, 1]), numpy.max(X[:, 1]))
    matplotlib.pyplot.plot(C1[:, 0], C1[:, 1], 'ro')
    matplotlib.pyplot.plot(C2[:, 0], C2[:, 1], 'bo')
    matplotlib.pyplot.savefig(filename)
    # Release figure memory; this closes every open figure, not just this one.
    matplotlib.pyplot.close('all')


def plot_data_3d(X, y, filename):
    """Save a 3-D scatter plot of the two classes.

    Samples labelled 1 are drawn in red and all others in blue, using the
    first three columns of ``X``.

    Args:
        X: 2-D feature array with at least three columns.
        y: Labels, one per row of ``X`` (list, 1-D or column array).
        filename: Path the figure is written to.
    """
    X = numpy.asarray(X)
    # Boolean mask instead of growing two arrays row by row with vstack.
    positive = numpy.asarray(y).reshape(-1) == 1
    C1 = X[positive]
    C2 = X[~positive]

    plt.figure(figsize=(10, 10))
    ax = plt.axes(projection='3d')
    ax.scatter(C1[:, 0], C1[:, 1], C1[:, 2], c='r', s=15)
    ax.scatter(C2[:, 0], C2[:, 1], C2[:, 2], c='b', s=15)
    matplotlib.pyplot.savefig(filename)
    # Release figure memory; this closes every open figure, not just this one.
    matplotlib.pyplot.close('all')

def normalize(X):
    """Standardise each column of ``X`` to zero mean and unit variance.

    Args:
        X: Array-like of samples (rows) by features (columns).

    Returns:
        A new float array of the same shape; the input is not modified.
        Columns with zero variance are returned as all zeros instead of NaN.
    """
    X = numpy.asarray(X, dtype=float)
    mean = X.mean(axis=0)
    std = X.std(axis=0)
    # A constant column has std 0; dividing by 1 leaves its centred
    # values (all zeros) untouched instead of producing 0/0 = NaN.
    std = numpy.where(std == 0, 1.0, std)
    # One vectorised pass already standardises every column.
    return (X - mean) / std

def train(X, y, batchSize, learningRate, iterations):
    """Fit logistic-regression parameters with mini-batch gradient descent.

    Weights start at zero and are updated once per mini-batch; batches are
    consecutive row slices of ``X`` taken in the given order (no shuffling).

    NOTE: the features are used exactly as passed in. An earlier version
    called ``normalize(X)`` here but discarded the result, so training has
    always run on the raw features; that dead call is removed. Scale ``X``
    before calling if standardised inputs are wanted, and apply the same
    scaling to data given to ``predict``.

    Args:
        X: 2-D feature array of shape ``(m, n)``.
        y: ``m`` labels (0 or 1), list or array.
        batchSize: Number of rows per mini-batch; must be positive.
        learningRate: Step size of each gradient update.
        iterations: Number of full passes over ``X``.

    Returns:
        ``(w, b)``: the weight column of shape ``(n, 1)`` and the bias.

    Raises:
        ValueError: If ``batchSize`` is not positive.
    """
    if batchSize <= 0:
        raise ValueError("batchSize must be a positive integer")

    m, n = X.shape
    w = numpy.zeros((n, 1))
    b = 0.0

    # Column vector so it lines up with the (batch, 1) predictions.
    y = numpy.asarray(y).reshape(m, 1)

    for _ in range(iterations):
        for start in range(0, m, batchSize):
            end = start + batchSize
            x_batch = X[start:end]
            y_batch = y[start:end]

            predictions = sigmoid(numpy.dot(x_batch, w) + b)
            dw, db = gradient(x_batch, y_batch, predictions)

            w -= learningRate * dw
            b -= learningRate * db

    return w, b

def predict(X, w, b):
    """Classify samples with a fitted logistic-regression model.

    NOTE: the features are used exactly as passed in. An earlier version
    called ``normalize(X)`` here but discarded the result; that dead call is
    removed, so the behaviour is unchanged.

    Args:
        X: 2-D feature array of shape ``(m, n)``.
        w: Weights of shape ``(n, 1)`` or ``(n,)``.
        b: Bias term.

    Returns:
        A 1-D integer array of length ``m``: 1 where the predicted
        probability is strictly greater than 0.5, otherwise 0.
    """
    probabilities = sigmoid(numpy.dot(X, w) + b)
    # Flatten the (m, 1) column and threshold it in one vectorised step.
    return (numpy.asarray(probabilities).reshape(-1) > 0.5).astype(int)

def accuracy(y, predictions):
    """Return the fraction of entries where ``y`` equals ``predictions``."""
    hits = numpy.sum(y == predictions)
    return hits / len(y)

def CV_kfold(X, y, batchSize, learningRate, iterations, k):
    """Train and evaluate one model per fold of a k-fold split.

    Args:
        X: 2-D feature array.
        y: Label array aligned with the rows of ``X``.
        batchSize: Mini-batch size forwarded to ``train``.
        learningRate: Learning rate forwarded to ``train``.
        iterations: Number of passes forwarded to ``train``.
        k: Number of folds.

    Returns:
        ``(atrains, atests, awholes, wks, bks)``: per-fold lists of the
        accuracy on the fold's training rows, on its held-out rows and on
        all of ``X``, followed by the fitted weights and biases.
    """
    kf = KFold(n_splits=k)
    atrains = []
    atests = []
    awholes = []
    wks = []
    bks = []

    # Split the X that was passed in, not the module-level Xs, so the
    # indices always match the arrays they are applied to.
    for train_i, test_i in kf.split(X):
        train_X, test_X = X[train_i], X[test_i]
        train_y, test_y = y[train_i], y[test_i]
        wk, bk = train(train_X, train_y, batchSize, learningRate, iterations)

        predictions_train = predict(train_X, wk, bk)
        predictions_test = predict(test_X, wk, bk)
        predictions_whole = predict(X, wk, bk)

        atrains.append(accuracy(train_y, predictions_train))
        atests.append(accuracy(test_y, predictions_test))
        awholes.append(accuracy(y, predictions_whole))

        wks.append(wk)
        bks.append(bk)

    return atrains, atests, awholes, wks, bks

def print_kfold_accuracy(atrains, atests, awholes):
    """Print the training, testing and overall accuracy of every fold.

    Folds are numbered from 1; the three lists are indexed in parallel.
    """
    for fold, train_acc in enumerate(atrains, start=1):
        position = fold - 1
        print("Fold: ", fold)
        print("Training accuracy: ", train_acc)
        print("Testing accuracy: ", atests[position])
        print("Overall accuracy: ", awholes[position])

def information_gain(X, y, color, filename):
    """Plot and return the mutual information of each feature with ``y``.

    The scores from ``mutual_info_classif`` are drawn as a horizontal bar
    chart in ``color`` and saved to ``filename``.

    Returns:
        The array of scores returned by ``mutual_info_classif``.
    """
    importances = mutual_info_classif(X, y)
    pd.Series(importances).plot(kind="barh", color=color)
    plt.title("Information Gain")
    plt.savefig(filename)
    plt.close('all')
    return importances

def fishers_score(X, y, color, filename):
    """Plot and return the result of ``fisher_score.fisher_score(X, y)``.

    The values are drawn as a horizontal bar chart in ``color`` and saved
    to ``filename``.

    Returns:
        Whatever ``fisher_score.fisher_score`` returned.
    """
    ranks = fisher_score.fisher_score(X, y)
    pd.Series(ranks).plot(kind="barh", color=color)
    plt.title("Fisher Score")
    plt.savefig(filename)
    plt.close('all')
    return ranks

def correlation_coefficient(X, y, filename):
    """Save an annotated heatmap of the feature correlation matrix.

    Args:
        X: Feature matrix; each column becomes one row/column of the heatmap.
        y: Not used by this function.
        filename: Path the figure is written to.

    Returns:
        The correlation matrix as a DataFrame.
    """
    correlation = pd.DataFrame(X).corr()
    plt.figure(figsize=(30, 20))
    sns.heatmap(correlation, annot=True)
    plt.title("Correlation Matrix")
    plt.savefig(filename)
    plt.close('all')
    return correlation

def dispersion_ratio(X, color, filename):
    """Plot and return the arithmetic-to-geometric mean ratio per feature.

    Every value is shifted by +1 before the means are taken. The ratios are
    drawn as a horizontal bar chart in ``color`` and saved to ``filename``.

    Args:
        X: Feature matrix (samples by features). Values must be greater
            than -1 so the shifted data has a defined logarithm.
        color: Bar colour passed to the plot.
        filename: Path the figure is written to.

    Returns:
        A 1-D array with one ratio per column of ``X``.
    """
    shifted = numpy.asarray(X, dtype=float) + 1
    am = numpy.mean(shifted, axis=0)
    # Geometric mean in log space: multiplying hundreds of samples together
    # overflows to inf (or underflows to 0) long before the root is taken.
    gm = numpy.exp(numpy.mean(numpy.log(shifted), axis=0))
    ratio = am / gm
    pd.Series(ratio).plot(kind="barh", color=color)
    plt.title("Dispersion Ratio")
    plt.savefig(filename)
    plt.close('all')
    return ratio

def superimpose_bar(importances, ranks, color1, color2, filename):
    """Overlay two horizontal bar charts and save them to ``filename``.

    ``importances`` is multiplied by 60 and drawn in ``color1``; ``ranks``
    is drawn on top in ``color2`` at 50% opacity.
    """
    scaled_importances = pd.Series(importances) * 60
    rank_series = pd.Series(ranks)
    scaled_importances.plot(kind="barh", color=color1)
    rank_series.plot(kind="barh", color=color2, alpha=0.5)
    plt.title("Super-imposed feature importances")
    plt.savefig(filename)
    plt.close('all')

def weighted_sum_importance(importances, ranks, color, filename):
    """Plot ``60 * importances + ranks`` as a horizontal bar chart.

    The chart is drawn in ``color`` and saved to ``filename``.
    """
    scaled_importances = pd.Series(importances) * 60
    combined = scaled_importances + pd.Series(ranks)
    combined.plot(kind="barh", color=color)
    plt.title("Weighted sum importances")
    plt.savefig(filename)
    plt.close('all')

def grid_best_performance(X, y, batchSize, learningRate, iterations, folds):
    """Grid-search all four hyper-parameters and report the best accuracies.

    The grid covers batch sizes ``5, 10, ...`` below ``batchSize``, learning
    rates ``0.01, 0.02, ...`` below ``learningRate``, iteration counts
    ``250, 500, ...`` below ``iterations`` and fold counts ``2, 3, ...``
    below ``folds``. Every upper bound is exclusive.

    Args:
        X: Feature matrix forwarded to ``CV_kfold``.
        y: Labels forwarded to ``CV_kfold``.
        batchSize: Exclusive upper bound of the batch sizes tried.
        learningRate: Exclusive upper bound of the learning rates tried.
        iterations: Exclusive upper bound of the iteration counts tried.
        folds: Exclusive upper bound of the fold counts tried.

    Returns:
        ``(btraina, btesta, boa)``: the best single-fold training, testing
        and whole-set accuracy seen anywhere in the grid. The three maxima
        may come from different parameter combinations.

    Raises:
        ValueError: If the bounds leave no combination to evaluate.
    """
    # Nudge before truncating: e.g. 0.29 * 100 is 28.999999999999996 in
    # floating point and a bare int() would silently drop the last step.
    lr_steps = int(learningRate * 100 + 1e-9)

    best_overall_accs = []
    best_testing_accs = []
    best_training_accs = []
    grid = itertools.product(
        range(5, batchSize, 5),
        range(1, lr_steps),
        range(250, iterations, 250),
        range(2, folds),
    )
    for bs, lr_step, n_iter, k in grid:
        atrains, atests, awholes, _, _ = CV_kfold(X, y, bs, lr_step / 100, n_iter, k)
        best_overall_accs.append(max(awholes))
        best_testing_accs.append(max(atests))
        best_training_accs.append(max(atrains))

    if not best_overall_accs:
        raise ValueError("hyper-parameter grid is empty; raise the upper bounds")

    boa = max(best_overall_accs)
    btesta = max(best_testing_accs)
    btraina = max(best_training_accs)
    return btraina, btesta, boa

def grid_best_performance_bs(X, y, batchSize, learningRate, iterations, folds):
    """Sweep the batch size and report the best accuracies found.

    Batch sizes ``5, 10, ...`` below ``batchSize`` (exclusive) are each
    evaluated with ``CV_kfold``; all other parameters stay fixed.

    Returns:
        ``(btraina, btesta, boa, btrainai, btestai, boai)``: the best
        single-fold training, testing and whole-set accuracy, followed by
        the batch size at which each was first reached.
    """
    candidates = range(5, batchSize, 5)
    train_peaks, test_peaks, whole_peaks = [], [], []
    for size in candidates:
        fold_train, fold_test, fold_whole, _, _ = CV_kfold(
            X, y, size, learningRate, iterations, folds)
        whole_peaks.append(max(fold_whole))
        test_peaks.append(max(fold_test))
        train_peaks.append(max(fold_train))

    boa = max(whole_peaks)
    btesta = max(test_peaks)
    btraina = max(train_peaks)
    # Indexing the range maps a list position back to the batch size tried.
    boai = candidates[whole_peaks.index(boa)]
    btestai = candidates[test_peaks.index(btesta)]
    btrainai = candidates[train_peaks.index(btraina)]
    return btraina, btesta, boa, btrainai, btestai, boai

def grid_best_performance_lr(X, y, batchSize, learningRate, iterations, folds):
    """Sweep the learning rate and report the best accuracies found.

    Learning rates ``0.01, 0.02, ...`` below ``learningRate`` (exclusive)
    are each evaluated with ``CV_kfold``; all other parameters stay fixed.

    Returns:
        ``(btraina, btesta, boa, btrainai, btestai, boai)``: the best
        single-fold training, testing and whole-set accuracy, followed by
        the learning rate at which each was first reached.

    Raises:
        ValueError: If ``learningRate`` leaves no rate to evaluate.
    """
    # Nudge before truncating: e.g. 0.29 * 100 is 28.999999999999996 in
    # floating point and a bare int() would silently drop the last step.
    lr_steps = int(learningRate * 100 + 1e-9)
    rates = [step / 100 for step in range(1, lr_steps)]
    if not rates:
        raise ValueError("learningRate must exceed 0.01 to have rates to try")

    best_overall_accs = []
    best_testing_accs = []
    best_training_accs = []
    for rate in rates:
        atrains, atests, awholes, _, _ = CV_kfold(X, y, batchSize, rate, iterations, folds)
        best_overall_accs.append(max(awholes))
        best_testing_accs.append(max(atests))
        best_training_accs.append(max(atrains))

    boa = max(best_overall_accs)
    btesta = max(best_testing_accs)
    btraina = max(best_training_accs)
    # Report the exact rate that was evaluated; rebuilding it as
    # idx / 100 + 0.01 can differ from step / 100 in the last bit.
    boai = rates[best_overall_accs.index(boa)]
    btestai = rates[best_testing_accs.index(btesta)]
    btrainai = rates[best_training_accs.index(btraina)]
    return btraina, btesta, boa, btrainai, btestai, boai

def grid_best_performance_it(X, y, batchSize, learningRate, iterations, folds):
    """Sweep the iteration count and report the best accuracies found.

    Iteration counts ``50, 100, ...`` below ``iterations`` (exclusive) are
    each evaluated with ``CV_kfold``; all other parameters stay fixed.

    Returns:
        ``(btraina, btesta, boa, btrainai, btestai, boai)``: the best
        single-fold training, testing and whole-set accuracy, followed by
        the iteration count at which each was first reached.
    """
    candidates = range(50, iterations, 50)
    train_peaks, test_peaks, whole_peaks = [], [], []
    for count in candidates:
        fold_train, fold_test, fold_whole, _, _ = CV_kfold(
            X, y, batchSize, learningRate, count, folds)
        whole_peaks.append(max(fold_whole))
        test_peaks.append(max(fold_test))
        train_peaks.append(max(fold_train))

    boa = max(whole_peaks)
    btesta = max(test_peaks)
    btraina = max(train_peaks)
    # Indexing the range maps a list position back to the count tried.
    boai = candidates[whole_peaks.index(boa)]
    btestai = candidates[test_peaks.index(btesta)]
    btrainai = candidates[train_peaks.index(btraina)]
    return btraina, btesta, boa, btrainai, btestai, boai

def grid_best_performance_k(X, y, batchSize, learningRate, iterations, folds):
    """Sweep the number of folds and report the best accuracies found.

    Fold counts ``2, 3, ...`` below ``folds`` (exclusive) are each evaluated
    with ``CV_kfold``; all other parameters stay fixed.

    Returns:
        ``(btraina, btesta, boa, btrainai, btestai, boai)``: the best
        single-fold training, testing and whole-set accuracy, followed by
        the fold count at which each was first reached.
    """
    candidates = range(2, folds, 1)
    train_peaks, test_peaks, whole_peaks = [], [], []
    for fold_count in candidates:
        fold_train, fold_test, fold_whole, _, _ = CV_kfold(
            X, y, batchSize, learningRate, iterations, fold_count)
        whole_peaks.append(max(fold_whole))
        test_peaks.append(max(fold_test))
        train_peaks.append(max(fold_train))

    boa = max(whole_peaks)
    btesta = max(test_peaks)
    btraina = max(train_peaks)
    # Indexing the range maps a list position back to the fold count tried.
    boai = candidates[whole_peaks.index(boa)]
    btestai = candidates[test_peaks.index(btesta)]
    btrainai = candidates[train_peaks.index(btraina)]
    return btraina, btesta, boa, btrainai, btestai, boai

def objective(params):
    """Objective for ``gp_minimize`` over (batchSize, learningRate, iterations).

    Trains on the module-level ``Xs``/``y`` and scores the model on the
    module-level ``val_x``/``val_y``.

    Args:
        params: Sequence ``(batchSize, learningRate, iterations)``.

    Returns:
        The negated validation accuracy, so that minimising it maximises
        accuracy.
    """
    batchSize, learningRate, iterations = params
    # train() returns exactly (w, b); unpacking a third value raised ValueError.
    w, b = train(Xs, y, batchSize, learningRate, iterations)
    predictions = predict(val_x, w, b)
    val_acc = accuracy(val_y, predictions)
    return -val_acc

def objective4D(params):
    """Objective for ``gp_minimize`` over (batchSize, learningRate, iterations, k).

    Runs ``CV_kfold`` on the module-level ``Xs``/``y`` and scores every
    fold's model on the module-level ``val_x``/``val_y``.

    Returns:
        The negated mean validation accuracy across the folds.
    """
    batchSize, learningRate, iterations, k = params
    _, _, _, wks, bks = CV_kfold(Xs, y, batchSize, learningRate, iterations, k)
    accs = [
        accuracy(val_y, predict(val_x, wk, bks[fold]))
        for fold, wk in enumerate(wks)
    ]
    return -numpy.mean(accs)

def printCVtestAccuracy(test_x, test_y, wks, bks):
    """Print the accuracy of every fold's model on the given test set.

    ``wks`` and ``bks`` hold the weights and bias per fold; folds are
    numbered from 1 in the output.
    """
    for fold, wk in enumerate(wks, start=1):
        acc = accuracy(test_y, predict(test_x, wk, bks[fold - 1]))
        print("Fold: ", fold)
        print("Testing Accuracy: ", acc)

def printCVAllAccuracy(X, y, val_x, val_y, test_x, test_y, wks, bks):
    """Print each fold model's accuracy on three data sets.

    For every (weights, bias) pair in ``wks``/``bks`` the accuracy is
    computed on ``(X, y)``, ``(val_x, val_y)`` and ``(test_x, test_y)`` and
    printed as training, validation and testing accuracy. Folds are
    numbered from 1.
    """
    for fold, wk in enumerate(wks, start=1):
        bk = bks[fold - 1]
        trainAcc = accuracy(y, predict(X, wk, bk))
        valAcc = accuracy(val_y, predict(val_x, wk, bk))
        testAcc = accuracy(test_y, predict(test_x, wk, bk))
        print("Fold: ", fold)
        print("Training Accuracy: ", trainAcc)
        print("Validation Accuracy: ", valAcc)
        print("Testing Accuracy: ", testAcc)
    
def printAccuracy(X, y, w, b):
    """Print the accuracy of the model ``(w, b)`` on the data ``(X, y)``."""
    acc = accuracy(y, predict(X, w, b))
    print("Accuracy: ", acc)

def splitData(X, y, trainSplit, valSplit, testSplit):
    """Cut ``X`` and ``y`` into consecutive train, validation and test parts.

    Rows are taken in their existing order (no shuffling). The training part
    ends at ``int(trainSplit * len(X))`` and the validation part at
    ``int((trainSplit + valSplit) * len(X))``; the test part is whatever
    remains, so ``testSplit`` is not read.

    Returns:
        ``(train_x, train_y, val_x, val_y, test_x, test_y)``.
    """
    total = len(X)
    train_end = int(trainSplit * total)
    val_end = int((trainSplit + valSplit) * total)

    parts = []
    for lo, hi in ((0, train_end), (train_end, val_end), (val_end, None)):
        parts.append(X[lo:hi, :])
        parts.append(y[lo:hi])
    return tuple(parts)

# Script entry point: load the data, pick a feature subset and split it.
# Every experiment below (training runs, cross-validation, Bayesian search,
# feature ranking, plotting) is kept as a commented-out call that can be
# re-enabled by hand.
if __name__ == "__main__":
    # parseData returns the feature matrix and a plain list of 0/1 labels.
    X, y = parseData("wdbc.data")
    y = numpy.array(y)
    # Active feature subset: columns 6, 20 and 26. The commented lines are
    # alternative selections.
    Xs = X[:, [6, 20, 26]]
    #Xs = X[:, [1, 4, 6, 8, 10, 11, 14, 15, 18, 20, 21, 28, 29]]
    #Xs = X
    # 70/15/15 split in file order. This rebinds Xs and y to the training
    # portion; objective() and objective4D() read Xs, y, val_x and val_y as
    # module-level globals.
    # NOTE(review): X stays the full matrix while y is now training-only, so
    # the commented feature-ranking calls that pass (X, y) would get arrays
    # of different lengths - confirm before re-enabling them.
    Xs, y, val_x, val_y, test_x, test_y = splitData(Xs, y, 0.7, 0.15, 0.15)

    # Single training runs and k-fold runs with various hyper-parameters
    # (batch size, learning rate, iterations[, folds]).
    #w, b = train(Xs, y, 50, 0.01, 1000)
    #w, b = train(Xs, y, 20, 0.05, 1000)
    #w, b = train(Xs, y, 96, 0.26673740016620473, 2207)
    #w, b = train(Xs, y, 97, 0.25115243547839416, 975)
    #w, b = train(Xs, y, 5, 1.0, 1612)
    #printAccuracy(test_x, test_y, w, b)
    #atrains, atests, awholes, wks, bks = CV_kfold(Xs, y, 20, 0.05, 1000, 5)
    #atrains, atests, awholes, wks, bks = CV_kfold(Xs, y, 96, 0.26673740016620473, 2207, 13)
    #atrains, atests, awholes, wks, bks = CV_kfold(Xs, y, 25, 0.2939820078373507, 2426, 25)
    #atrains, atests, awholes, wks, bks = CV_kfold(Xs, y, 17, 0.14644249958668085, 2375, 25)
    #atrains, atests, awholes, wks, bks = CV_kfold(Xs, y, 5, 0.8560886757562707, 2500, 8)
    #atrains, atests, awholes, wks, bks = CV_kfold(Xs, y, 5, 0.001, 1894, 22)
    #atrains, atests, awholes, wks, bks = CV_kfold(Xs, y, 250, 0.001, 2295, 2) #all features
    #print_kfold_accuracy(atrains, atests, awholes)
    #printCVtestAccuracy(test_x, test_y, wks, bks)
    #printCVAllAccuracy(Xs, y, val_x, val_y, test_x, test_y, wks, bks)


    # Search space for objective(): batch size, learning rate, iterations.
    space = [
        Integer(5, 250, name = 'batchSize'),
        Real(0.001, 1, name = 'learningRate'),
        Integer(50, 2500, name = 'iterations')
    ]
    # Search space for objective4D(): the same three plus the fold count k.
    space2 = [
        Integer(5, 250, name = 'batchSize'),
        Real(0.001, 1, name = 'learningRate'),
        Integer(50, 2500, name = 'iterations'),
        Integer(2, 25, name = 'k')
    ]
    # Bayesian optimisation runs over the spaces above.
    #optimal = gp_minimize(objective, space, n_calls = 20)
    #print(optimal)
    #optimal2 = gp_minimize(objective4D, space2, n_calls = 50)
    #print(optimal2)

    # Feature-ranking plots and one-parameter-at-a-time sweeps.
    #infog = information_gain(X, y, "#4DBEEE", "info_gain_bar.png")
    #fishers = fishers_score(X, y, "#E50000", "fisher_scores_bar.png")
    #correlation_coefficient(X, y, "correlation_heatmap.png")
    #dispersion_ratio(X, "#4DBEEE", "dispersion_ratio_bar.png")
    #superimpose_bar(infog, fishers, "#4DBEEE", "#E50000", "super_imposed_bar.png")
    #weighted_sum_importance(infog, fishers, "#9A0EEA", "weighted_sum_bar.png")
    #btraina, btesta, boa, btrainai, btestai, boai = grid_best_performance_bs(Xs, y, 20, 0.05, 1000, 5)
    #btraina, btesta, boa, btrainai, btestai, boai = grid_best_performance_lr(Xs, y, 20, 0.05, 1000, 5)
    #btraina, btesta, boa, btrainai, btestai, boai = grid_best_performance_it(Xs, y, 20, 0.05, 1000, 5)
    #btraina, btesta, boa, btrainai, btestai, boai = grid_best_performance_k(Xs, y, 96, 0.26673740016620473, 2207, 25)
    #print(btraina)
    #print(btesta)
    #print(boa)
    #print(btrainai)
    #print(btestai)
    #print(boai)

    # Data and decision-boundary plots (the 2-D/3-D variants expect Xs to
    # have two/three columns respectively).
    #plot_data_2d(Xs, y, "2d_data.png")
    #plot_2d_boundary(Xs, y, w, b, "2d_boundary.png")
    #plot_data_3d(Xs, y, "3d_data.png")
    #plot_3d_boundary(Xs, y, w, b, "3d_boundary.png")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6aa49324-e0d5-405c-b842-af67c1bb30bb' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>