In [1]:
# -*- coding: utf-8 -*-
"""
Created on Fri Nov 2 10:38:28 2018

@author: Shawnzy
"""
import warnings
warnings.filterwarnings('ignore')
#%% Imports
import sys
from collections import defaultdict
from time import perf_counter

try:
    from time import clock
except ImportError:
    # time.clock was removed in Python 3.8; keep the old name usable.
    clock = perf_counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.cluster import KMeans as kmeans
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture as GMM
from sklearn.preprocessing import StandardScaler


# Directory holding results.hdf (both the input datasets and the outputs).
# Command-line variant kept for reference:
#out = './{}/'.format(sys.argv[1])
out = './'

np.random.seed(0)

# Cancer data, stored under the 'cancer_rca_dim15' key. The last column is
# treated as the class label, everything before it as features.
# NOTE(review): the original note says "32 Attributes - 2 Classes", but the
# key name suggests a 15-dimensional projection - confirm the actual width.
cancer = pd.read_hdf(out + 'results.hdf', 'cancer_rca_dim15')
cancerX, cancerY = cancer.iloc[:, :-1], cancer.iloc[:, -1]

# Madelon data, stored under the 'madelon_rca_dim300' key; same layout.
# NOTE(review): the original note says "500 Attributes - 2 Classes", but the
# key name suggests a 300-dimensional projection - confirm the actual width.
madelon = pd.read_hdf(out + 'results.hdf', 'madelon_rca_dim300')
madelonX, madelonY = madelon.iloc[:, :-1], madelon.iloc[:, -1]

# Standardise the features (very important for clustering). Each dataset
# gets its own freshly fitted scaler; the results replace the feature frames.
cancerX = StandardScaler().fit_transform(cancerX)
madelonX = StandardScaler().fit_transform(madelonX)

# Nested result store, indexed as evals[algorithm][dataset][metric][k].
# Metrics recorded by the clustering cells:
#   sum of squared error, log likelihood, adjusted Rand score,
#   adjusted mutual info, homogeneity, completeness, silhouette score,
#   and training times.
evals = defaultdict(
    lambda: defaultdict(
        lambda: defaultdict(dict)
    )
)

# Clustering estimators, reused across all k via set_params.
km = kmeans(random_state=5)
gmm = GMM(random_state=5)

In [2]:
# Sweep the number of clusters on the Cancer data for both K-means and
# Expectation Maximization (Gaussian mixture), recording fit/predict times
# and clustering-quality metrics into evals[algorithm]['Cancer'][metric][k],
# then persist one DataFrame per algorithm to results.hdf.
range_k_clusters = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

# One row per algorithm:
#   (evals key, estimator, name of its cluster-count parameter,
#    key for the model's own score, whether that score is negated)
# km.score() is negated before being stored as 'SSE'; gmm.score() is stored
# unchanged as 'LGL' (log likelihood).
cancer_algorithms = [
    ('Kmeans', km, 'n_clusters', 'SSE', True),
    ('ExpMax', gmm, 'n_components', 'LGL', False),
]

for k in range_k_clusters:
    for algo_name, estimator, k_param, score_key, negate_score in cancer_algorithms:
        record = evals[algo_name]['Cancer']

        # Fit and predict. perf_counter replaces time.clock, which was
        # removed in Python 3.8; it measures elapsed wall-clock time.
        estimator.set_params(**{k_param: k})
        st = perf_counter()
        estimator.fit(cancerX)
        ft = perf_counter()
        predictY = estimator.predict(cancerX)
        pt = perf_counter()

        # Save fit and prediction times
        record['Fit_Time'][k] = ft - st
        record['Predict_Time'][k] = pt - ft

        # Save evaluation metrics (key order is kept so the resulting
        # DataFrame columns come out in the same order as before).
        model_score = estimator.score(cancerX)
        record[score_key][k] = -model_score if negate_score else model_score
        record['ARI'][k] = metrics.adjusted_rand_score(cancerY, predictY)
        record['AMI'][k] = metrics.adjusted_mutual_info_score(cancerY, predictY)
        record['HOM'][k] = metrics.homogeneity_score(cancerY, predictY)
        record['COM'][k] = metrics.completeness_score(cancerY, predictY)
        # Silhouette is skipped for a single cluster, so k == 1 has no entry.
        if k > 1:
            record['SIL'][k] = metrics.silhouette_score(cancerX, predictY)

df_ck = pd.DataFrame(evals['Kmeans']['Cancer'])
df_ce = pd.DataFrame(evals['ExpMax']['Cancer'])

# `key` is passed by keyword: newer pandas deprecates passing it positionally.
df_ck.to_hdf(out + 'results.hdf', key='cancer_kmeans', complib='blosc', complevel=9)
df_ce.to_hdf(out + 'results.hdf', key='cancer_expmax', complib='blosc', complevel=9)

In [3]:
# Sweep the number of clusters on the Madelon data for both K-means and
# Expectation Maximization (Gaussian mixture), recording fit/predict times
# and clustering-quality metrics into evals[algorithm]['Madelon'][metric][k],
# then persist one DataFrame per algorithm to results.hdf.
range_k_clusters = [2, 3, 4, 5, 6, 8, 10, 14, 18, 22, 26, 28, 32, 36, 38, 42, 46, 50, 60, 70]

# One row per algorithm:
#   (evals key, estimator, name of its cluster-count parameter,
#    key for the model's own score, whether that score is negated)
# km.score() is negated before being stored as 'SSE'; gmm.score() is stored
# unchanged as 'LGL' (log likelihood).
madelon_algorithms = [
    ('Kmeans', km, 'n_clusters', 'SSE', True),
    ('ExpMax', gmm, 'n_components', 'LGL', False),
]

for k in range_k_clusters:
    for algo_name, estimator, k_param, score_key, negate_score in madelon_algorithms:
        record = evals[algo_name]['Madelon']

        # Fit and predict. perf_counter replaces time.clock, which was
        # removed in Python 3.8; it measures elapsed wall-clock time.
        estimator.set_params(**{k_param: k})
        st = perf_counter()
        estimator.fit(madelonX)
        ft = perf_counter()
        predictY = estimator.predict(madelonX)
        pt = perf_counter()

        # Save fit and prediction times
        record['Fit_Time'][k] = ft - st
        record['Predict_Time'][k] = pt - ft

        # Save evaluation metrics (key order is kept so the resulting
        # DataFrame columns come out in the same order as before).
        model_score = estimator.score(madelonX)
        record[score_key][k] = -model_score if negate_score else model_score
        record['ARI'][k] = metrics.adjusted_rand_score(madelonY, predictY)
        record['AMI'][k] = metrics.adjusted_mutual_info_score(madelonY, predictY)
        record['HOM'][k] = metrics.homogeneity_score(madelonY, predictY)
        record['COM'][k] = metrics.completeness_score(madelonY, predictY)
        # Guard kept from the original: silhouette is skipped for k == 1,
        # should 1 ever be added to range_k_clusters.
        if k > 1:
            record['SIL'][k] = metrics.silhouette_score(madelonX, predictY)

df_mk = pd.DataFrame(evals['Kmeans']['Madelon'])
df_me = pd.DataFrame(evals['ExpMax']['Madelon'])

# `key` is passed by keyword: newer pandas deprecates passing it positionally.
df_mk.to_hdf(out + 'results.hdf', key='madelon_kmeans', complib='blosc', complevel=9)
df_me.to_hdf(out + 'results.hdf', key='madelon_expmax', complib='blosc', complevel=9)
