In [1]:
import itertools
import numpy as np
import pandas as pd
from scipy import linalg
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn import preprocessing, datasets
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_mutual_info_score, silhouette_samples, calinski_harabaz_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn import mixture
from sklearn.mixture import GaussianMixture
from sklearn.neural_network import MLPClassifier

from sklearn.decomposition import PCA, FastICA, NMF, IncrementalPCA, FactorAnalysis, SparsePCA, TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
import time

**<font color=black size=5>Load Data</font>**

In [15]:
sklearn_data = datasets.load_breast_cancer()
# sklearn_data = datasets.load_digits()
# print(sklearn_data.DESCR)

x, y = sklearn_data.data, sklearn_data.target
x = preprocessing.scale(x) 
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=1997)

# print(sklearn_data.data.shape)
# print(x.shape,y.shape)

In [16]:
def neural_net(X_train,y_train,X_test,y_test,learning_rate,plotting):
    clf = MLPClassifier(activation='tanh',alpha=1e-03,batch_size='auto',learning_rate='adaptive',learning_rate_init=learning_rate,solver='adam')
    st = time.time()
    clf.fit(X_train,y_train)
    end = time.time()
    train_time = end-st
    stt = time.time()
    y_pred = clf.predict(X_test)
    endt = time.time()
    y_test = np.array(y_test)
    y_pred = np.array(y_pred)
    title = 'Confusion Matrix with Neural Network'
    clf_name = 'neural_net'
    if plotting == True:
        plot_learning_curve(clf,'Learning Curve for Neural Network', X_train, y_train, (0.7, 1.01), n_jobs=4)
        plot_validation_curve(X_train,y_train,clf,clf_name)
        confusion(y_test,y_pred,title)

    print('Accuracy for Neural Network is ' + str(accuracy_score(y_test,y_pred)))
    print('Training time for Neural Network: ' + str(train_time) + ' seconds')
    print('Testing time for Neural Network: ' + str(endt-stt) + ' seconds')
    print()

def tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate,plotting):
    clf = MLPClassifier(activation='relu',max_iter=5000,alpha=1e-05,batch_size='auto',learning_rate='adaptive',learning_rate_init=learning_rate,solver='adam')
    alpha_range = [1e-05,1e-04,1e-03,1e-02,1e-01,1,2,3]
    lr_range = [0.00001,0.0001,0.001,0.01,0.1,0.5,0.8,1]
    params = {'alpha' : alpha_range, 'learning_rate_init' : lr_range, 'activation': ['relu','tanh']}
    clf = GridSearchCV(clf, param_grid=params, cv=5)
    st = time.time()
    clf.fit(X_train,y_train)
    end = time.time()
    train_time = end-st
    stt = time.time()
    y_pred = clf.predict(X_test)
    endt = time.time()
    y_test = np.array(y_test)
    y_pred = np.array(y_pred)
    title = 'Confusion Matrix with Tuned Neural Network'
    clf_name = 'neural_net'
    if plotting == True:
        plot_learning_curve(clf,'Learning Curve for Neural Network', X_train, y_train, (0.7, 1.01), n_jobs=4)
        confusion(y_test,y_pred,title)
        clf_best = MLPClassifier(hidden_layer_sizes=(5, 2), random_state = 3169, max_iter = 1)
        clf_best.set_params(alpha=clf.best_params_['alpha'], learning_rate_init=clf.best_params_['learning_rate_init'])
        num_epochs = 1000
        train_loss = np.zeros(num_epochs)
        train_scores = np.zeros(num_epochs)
        val_scores = np.zeros(num_epochs)
        # Split training set into training and validation
        X_train_, X_val, y_train_, y_val = train_test_split(X_train, y_train, test_size=0.4, random_state=3169)
        for i in range(num_epochs):
            clf_best.fit(X_train_, y_train_)
            train_loss[i] = clf_best.loss_
            train_scores[i] = accuracy_score(y_train_, clf_best.predict(X_train_))
        range_loss = np.arange(num_epochs) + 1
        plt.figure()
        plt.plot(range_loss, train_loss)
        plt.title('Training loss curve for neural network')
        plt.xlabel('Epochs')
        plt.ylabel("Loss")
        plt.grid()
        plt.show()

    print('Accuracy for Tuned Neural Network is ' + str(accuracy_score(y_test,y_pred)))
    print('Training time for Tuned NN: ' + str(train_time) + ' seconds')
    print('Testing time for Tuned NN: ' + str(endt-stt) + ' seconds')
    print('Best parameters: ' + str(clf.best_params_))
    print()
    return clf.best_params_

In [17]:
def K_Means_silhouette_analysis(X,y):
#     range_n_clusters = [3, 7, 10, 15, 17, 19, 20]

    range_n_clusters = [10]

    for n_clusters in range_n_clusters:
        # Create a subplot with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # The 1st subplot is the silhouette plot
        # The silhouette coefficient can range from -1, 1 but in this example all
        # lie within [-0.1, 1]
        ax1.set_xlim([-0.1, 1])
        # The (n_clusters+1)*10 is for inserting blank space between silhouette
        # plots of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])

        # Initialize the clusterer with n_clusters value and a random generator
        # seed of 10 for reproducibility.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(X)

        # The silhouette_score gives the average value for all the samples.
        # This gives a perspective into the density and separation of the formed
        # clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        print("For n_clusters =", n_clusters,
              "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(X, cluster_labels)

        y_lower = 10
        for i in range(n_clusters):
            # Aggregate the silhouette scores for samples belonging to
            # cluster i, and sort them
            ith_cluster_silhouette_values = \
                sample_silhouette_values[cluster_labels == i]

            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, ith_cluster_silhouette_values,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # Label the silhouette plots with their cluster numbers at the middle
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Compute the new y_lower for next plot
            y_lower = y_upper + 10  # 10 for the 0 samples

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line for average silhouette score of all the values
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd Plot showing the actual clusters formed
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
                    c=colors, edgecolor='k')

        # Labeling the clusters
        centers = clusterer.cluster_centers_
        # Draw white circles at cluster centers
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
                    c="white", alpha=1, s=200, edgecolor='k')

        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
                        s=50, edgecolor='k')


        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
                      "with n_clusters = %d" % n_clusters),
                     fontsize=14, fontweight='bold')

        plt.show()
# K_Means_silhouette_analysis(x,y)

def K_Means_Results(best_n,X,y):
    kmeans = KMeans(n_clusters=best_n, random_state=1997)
    kmeans.fit(X)
    print('K-Means Inertia: ', kmeans.inertia_)
    silh_result = silhouette_score(X, kmeans.labels_)
    print('K-Means Silhouette score: ', silh_result)
    AMI = adjusted_mutual_info_score(y, kmeans.labels_)
    print('K-Means Adjusted Mutual Information (AMI) score: ', AMI)
    print()
    return kmeans.fit_transform(X)

In [18]:
def gmm_em(X,y):
    lowest_bic = np.infty
    bic = []
    n_components_range = range(1, 20)
    cv_types = ['spherical', 'tied', 'diag', 'full']
    for cv_type in cv_types:
        for n_components in n_components_range:
            # Fit a Gaussian mixture with EM
            gmm = mixture.GaussianMixture(n_components=n_components,
                                          covariance_type='full')
            gmm.fit(X)
            bic.append(gmm.bic(X))
            if bic[-1] < lowest_bic:
                lowest_bic = bic[-1]
                best_gmm = gmm

    bic = np.array(bic)
    color_iter = itertools.cycle(['navy', 'turquoise', 'cornflowerblue',
                                  'darkorange', 'green'])
    clf = best_gmm
    bars = []

    # Plot the BIC scores
    plt.figure(figsize=(8, 6))
    spl = plt.subplot(2, 1, 1)
    for i, (cv_type, color) in enumerate(zip(cv_types, color_iter)):
        xpos = np.array(n_components_range) + .2 * (i - 2)
        bars.append(plt.bar(xpos, bic[i * len(n_components_range):
                                      (i + 1) * len(n_components_range)],
                            width=.2, color=color))
    plt.xticks(n_components_range)
    plt.ylim([bic.min() * 1.01 - .01 * bic.max(), bic.max()])
    plt.title('BIC score per model')
    xpos = np.mod(bic.argmin(), len(n_components_range)) + .65 +\
        .2 * np.floor(bic.argmin() / len(n_components_range))
    best_num = np.mod(bic.argmin(), len(n_components_range)) + 1
    print(best_num)
    plt.text(xpos, bic.min() * 0.97 + .03 * bic.max(), '*', fontsize=14)
    spl.set_xlabel('Number of components')
    spl.legend([b[0] for b in bars], cv_types)

    # Plot the winner
    splot = plt.subplot(2, 1, 2)
    Y_ = clf.predict(X)
    print(clf.covariances_.shape)
    for i, (mean, cov, color) in enumerate(zip(clf.means_, clf.covariances_,
                                               color_iter)):
        print(cov.shape)
        v, w = linalg.eigh(cov)
        if not np.any(Y_ == i):
            continue
#         plt.scatter(X[Y_ == i, 18], X[Y_ == i, 45], .8, color=color)
        plt.scatter(X[Y_ == i, 0], X[Y_ == i, 1], .8, color=color)


        # Plot an ellipse to show the Gaussian component
        angle = np.arctan2(w[0][1], w[0][0])
        angle = 180. * angle / np.pi  # convert to degrees
        v = 2. * np.sqrt(2.) * np.sqrt(v)
        ell = mpl.patches.Ellipse(mean, v[0], v[1], 180. + angle, color=color)
        ell.set_clip_box(splot.bbox)
        ell.set_alpha(.5)
        splot.add_artist(ell)

    plt.xticks(())
    plt.yticks(())

    plt.title('Selected GMM: full model,' + str(best_num) + ' components')

    plt.subplots_adjust(hspace=.35, bottom=.02)
    plt.show()
    return best_num
# gmm_em(x,y)

In [19]:
def gmm_results(best_n,X,y):
    gmm_best = GaussianMixture(n_components=best_n, random_state=1997)
    gmm_best.fit(X)
    gmm_labels = gmm_best.predict(X)

    print('GMM BIC: ', gmm_best.bic(X))
    score_gmm = silhouette_score(X, gmm_labels)
    print('GMM Silhouette score: ', score_gmm)
    AMI_gmm = adjusted_mutual_info_score(y, gmm_labels)
    print('GMM Adjusted Mutual Information (AMI) score: ', AMI_gmm)
    print()
    return gmm_best.predict_proba(X)

**<font color=black size=5>NN+DR</font>**

In [None]:
best_n = 2

best_X_pca = PCA(n_components = best_n).fit_transform(x)
print('Neural Network with PCA Data: ')
print()
X_train, X_test, y_train, y_test = train_test_split(best_X_pca, y, test_size=0.4, random_state=1997)
# neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)
st = time.time()
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)
end = time.time()
print('PCA Time:' + str(end-st))
# best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)

In [None]:
best_X_ica = FastICA(n_components = best_n).fit_transform(x)
print('Neural Network with ICA Data: ')
print()
X_train, X_test, y_train, y_test = train_test_split(best_X_ica, y, test_size=0.4, random_state=1997)
st = time.time()
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)
end = time.time()
print('ICA Time:' + str(end-st))
# neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)
# best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)

In [None]:
best_X_rp = GaussianRandomProjection(n_components = best_n).fit_transform(x)
print('Neural Network with Randomly Projected Data: ')
print()
X_train, X_test, y_train, y_test = train_test_split(best_X_rp, y, test_size=0.4, random_state=1997)
st = time.time()
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)
end = time.time()
print('RP Time:' + str(end-st))
# neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)
# best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)

In [None]:
best_X_fa = FactorAnalysis(n_components = best_n).fit_transform(x)
print('Neural Network with Factor Analysed Data: ')
print()
X_train, X_test, y_train, y_test = train_test_split(best_X_fa, y, test_size=0.4, random_state=1997)
st = time.time()
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)
end = time.time()
print('FA Time:' + str(end-st))
# neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)
# best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)

**<font color=black size=5>NN+DR+CA</font>**

In [None]:
print('### RUNNING NEURAL NETWORK WITH CLUSTERED DATA AS FEATURES')

print('### PCA')
K_Means_silhouette_analysis(best_X_pca,y)
K_Clustered_X = K_Means_Results(10,best_X_pca,y)
# best_gmm_n = gmm_em(best_X_pca,y)
GMM_Clustered_X = gmm_results(10,best_X_pca,y)


print('K-Means Clustering & Neural Network: ')
X_train, X_test, y_train, y_test = train_test_split(K_Clustered_X, y, test_size=0.4, random_state=1997)
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)

best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)
print()


print('GMM & Neural Network: ')
X_train, X_test, y_train, y_test = train_test_split(GMM_Clustered_X, y, test_size=0.4, random_state=1997)
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)

best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)
print()


In [None]:
print('### ICA')
K_Means_silhouette_analysis(best_X_ica,y)
K_Clustered_X = K_Means_Results(10,x,y)
# best_gmm_n = gmm_em(best_X_ica,y)
GMM_Clustered_X = gmm_results(10,x,y)

print('K-Means Clustering & Neural Network: ')
X_train, X_test, y_train, y_test = train_test_split(K_Clustered_X, y, test_size=0.4, random_state=1997)
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)

# best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)
print()

print('GMM & Neural Network: ')
X_train, X_test, y_train, y_test = train_test_split(GMM_Clustered_X, y, test_size=0.4, random_state=1997)
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)

# best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)
print()

In [None]:
print('### RP')
K_Means_silhouette_analysis(best_X_rp,y)
K_Clustered_X = K_Means_Results(10,x,y)
# best_gmm_n = gmm_em(best_X_rp,y)
GMM_Clustered_X = gmm_results(10,x,y)

print('K-Means Clustering & Neural Network: ')
X_train, X_test, y_train, y_test = train_test_split(K_Clustered_X, y, test_size=0.4, random_state=1997)
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)

# best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)
print()

print('GMM & Neural Network: ')
X_train, X_test, y_train, y_test = train_test_split(GMM_Clustered_X, y, test_size=0.4, random_state=1997)
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)

# best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)
print()

In [None]:
print('### FA')
K_Means_silhouette_analysis(best_X_fa,y)
K_Clustered_X = K_Means_Results(10,x,y)
# best_gmm_n = gmm_em(best_X_fa,y)
GMM_Clustered_X = gmm_results(10,x,y)

print('K-Means Clustering & Neural Network: ')
X_train, X_test, y_train, y_test = train_test_split(K_Clustered_X, y, test_size=0.4, random_state=1997)
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)

# best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)
print()

print('GMM & Neural Network: ')
X_train, X_test, y_train, y_test = train_test_split(GMM_Clustered_X, y, test_size=0.4, random_state=1997)
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)

# best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)
print()

In [22]:
print('### O')
# K_Means_silhouette_analysis(x,y)
K_Clustered_X = K_Means_Results(5,x,y)
# best_gmm_n = gmm_em(x,y)
GMM_Clustered_X = gmm_results(2,x,y)


print('K-Means Clustering & Neural Network: ')
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=1997)
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)

# best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)
print()


print('GMM & Neural Network: ')
# X_train, X_test, y_train, y_test = train_test_split(GMM_Clustered_X, y, test_size=0.4, random_state=1997)
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)

# best_nn = tuned_neural_net(X_train,y_train,X_test,y_test,learning_rate=0.0001,plotting=False)
print()

### O
K-Means Inertia:  8555.100550649087
K-Means Silhouette score:  0.16093390221330767
K-Means Adjusted Mutual Information (AMI) score:  0.29229696479367734

GMM BIC:  6094.040119869131
GMM Silhouette score:  0.31448870991364997
GMM Adjusted Mutual Information (AMI) score:  0.6597028491344276

K-Means Clustering & Neural Network: 
Accuracy for Neural Network is 0.9780701754385965
Training time for Neural Network: 0.11728143692016602 seconds
Testing time for Neural Network: 0.0 seconds


GMM & Neural Network: 
Accuracy for Neural Network is 0.9824561403508771
Training time for Neural Network: 0.14056921005249023 seconds
Testing time for Neural Network: 0.0 seconds




In [45]:
neural_net(X_train,y_train,X_test,y_test,learning_rate=0.1,plotting=False)


Accuracy for Neural Network is 0.9780701754385965
Training time for Neural Network: 0.015623807907104492 seconds
Testing time for Neural Network: 0.0 seconds

