In [1]:
import warnings
import numpy as np
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import adjusted_rand_score as ari

In [2]:
np.set_printoptions(suppress=True, precision=3, linewidth=250)

### Loading Data set

In [3]:
x = np.loadtxt("../data/rin.dat")  #  data matrix

print("number of entities:", x.shape[0], ", number of features:", x.shape[1])

with open("../data/namrin", 'r') as fp:  # load names as list of string
    names_ = fp.readlines()

with open("../data/varrin", 'r') as fp:  # load features names as list of string
    features_ = fp.readlines()

number of entities: 91 , number of features: 5


In [4]:
# from sklearn.datasets import load_iris
# iris = load_iris()
# x = iris['data']
# y = iris['target']

### Data Standardization

In [5]:
def standardizer(x):
    
    """
        standardize entity-to-feature data matrix by 
          applying Z-scoring and Range standardization methods
        
        Arguments: 
            x, numpy array, entity-to-feature data matrix
        
        Returns:
            Z-scored and Range standardized data matrices
    """
    
    x_ave = np.mean(x, axis=0)
    x_rng = np.ptp(x, axis=0)
    x_std = np.std(x, axis=0)
    x_zscr_std = np.divide(np.subtract(x, x_ave), x_std)   # Z-scoring standardization
    x_rng_std = np.divide(np.subtract(x, x_ave), x_rng)  # Range standardization 
    return x_zscr_std, x_rng_std


### Applying Batch K-Means 

In [6]:
from sklearn.cluster import KMeans


def apply_kmeans(x_org, n_clusters, n_repeats, std_method='r'):


    """
        Calls Kmeans algorithm from Sklearn library.
        Parameters:
            x, a numpy arrary, entity-to-feature matrix,
            n_clusters, int, number of clusters to detect,
            n_repeats, int, number of repeats for different initilization
        Return:
            centroids, clusters labels over
    """
    
    tmp_inertia = 0
    clusters, best_clusters = {}, {}
    indices, best_indices = {}, {}
    cluster_means, best_cluster_means = {}, {}
    differences, best_differences = {}, {}
    rel_differences, best_rel_differences = {}, {}
    inertia, best_inertia = {}, {}
    g_mean = np.mean(x_org, axis=0)
    
    x_zscr_std, x_rng_std = standardizer(x=x_org)
        
#     x = x_org
    
    for i in range(n_repeats):
        clusters[i] = {}
        cluster_means[i] = {}
        differences[i] = {}
        rel_differences[i] = {}
        indices[i] = {}
        inertia[i] = {}
        
        # instantiate KMeans Alg. object
        km = KMeans(n_clusters=n_clusters, init='random', n_init=1, max_iter=500,
                    tol=1e-4, random_state=i, algorithm='full', )  # verbose=1
        if std_method.lower() == 'r' or \
        std_method.lower() == 'rng' or \
        std_method.lower() == 'range':
            km.fit(x_rng_std)  # Compute k-means by calling fit method 
            
        else:
            km.fit(x_zscr_std)  # Compute k-means by calling fit method 
        
        # Store the computation results per each initilization
        for k in range(n_clusters):
            clusters[i][k] = x_org[np.where(km.labels_==k)]
            indices[i][k] = np.where(km.labels_==k)[0]
            
        inertia[i] = km.inertia_
        
        for k in range(n_clusters):
            cluster_means[i][k] = np.mean(clusters[i][k], axis=0)    
            differences[i][k] = np.subtract(cluster_means[i][k], g_mean)
            rel_differences[i][k] = 100*(np.divide(
                np.subtract(cluster_means[i][k], g_mean), g_mean)
                                        )
        # to chose the best clustering results regarding the inertia
        if i == 0 :
            tmp_inertia = km.inertia_
            delta = 0
        if i != 0:
            delta = tmp_inertia - km.inertia_
        if delta >= 0:
            tmp_inertia = km.inertia_    
            for k in range(n_clusters):
                best_clusters[k] = x_org[np.where(km.labels_==k)]
                best_indices[k] = np.where(km.labels_==k)[0]
            for k in range(n_clusters):
                best_cluster_means[k] = np.mean(best_clusters[k], axis=0)
                best_differences[k] = np.subtract(best_cluster_means[k], g_mean)
                best_rel_differences[k] = 100*(np.divide(
                    np.subtract(best_cluster_means[k], g_mean), g_mean)
                                              )
            best_inertia = km.inertia_
    
    return  clusters, best_clusters, indices, \
best_indices, cluster_means, best_cluster_means, \
differences, rel_differences, best_differences, \
best_rel_differences , inertia, best_inertia

##### Calling the aforementioned function for n_clusters = 4 and demonstrating results

In [17]:
n_clusters_1 = 4
n_clusters_2 = 9

In [8]:
clusters_1, best_clusters_1, indices_1, best_indices_1,\
cluster_means_1, best_cluster_means_1,\
differences_1, rel_differences_1, \
best_differences_1, best_rel_differences_1, \
inertia_1, best_intertia_1 = apply_kmeans(x_org=x, 
                                          n_clusters=n_clusters_1,
                                          n_repeats=10, 
                                          std_method='z')

##### Demonstrating all the results

In [10]:
def demonstrate_results(x, features, clusters, indices, cluster_means,
                        differences, rel_differences, inertia):
    for repeat, results in clusters.items():
        print("results for the initilization number:", repeat,
              "inertia:", inertia[repeat],)
        
        for cluster, result in results.items():
            print("cluster number  : " + str(cluster+1),
                  "Number of el.", len(indices[repeat][cluster]))
            print("cluster mean    : ", cluster_means[repeat][cluster])
            print("grand mean      : ", 
                  np.mean(x, axis=0))
            print("differences     : ", differences[repeat][cluster])
            print("rel. differences: ", rel_differences[repeat][cluster])
        print( ) 
        print("********************************************************************************")
        print( )
    return 

In [11]:
features = [feature.strip().split(",")[0] for feature in features_]

demonstrate_results(x=x, features=features,
                    clusters=clusters_1, 
                    indices=indices_1, 
                    cluster_means=cluster_means_1, 
                    differences=differences_1, 
                    rel_differences=rel_differences_1, 
                    inertia=inertia_1)

results for the initilization number: 0 inertia: 166.7189318164843
cluster number  : 1 Number of el. 29
cluster mean    :  [164.808 155.18    9.616 718.238 145.162]
grand mean      :  [187.291 175.757  11.557 756.239 182.125]
differences     :  [-22.483 -20.576  -1.941 -38.001 -36.963]
rel. differences:  [-12.004 -11.707 -16.794  -5.025 -20.295]
cluster number  : 2 Number of el. 5
cluster mean    :  [ 318.282  293.496   24.758 1026.399  490.126]
grand mean      :  [187.291 175.757  11.557 756.239 182.125]
differences     :  [130.991 117.74   13.201 270.159 308.001]
rel. differences:  [ 69.94   66.99  114.217  35.724 169.116]
cluster number  : 3 Number of el. 44
cluster mean    :  [220.79  207.548  13.305 818.764 204.573]
grand mean      :  [187.291 175.757  11.557 756.239 182.125]
differences     :  [33.499 31.791  1.748 62.525 22.448]
rel. differences:  [17.886 18.088 15.122  8.268 12.326]
cluster number  : 4 Number of el. 13
cluster mean    :  [ 73.685  68.771   4.895 525.48   70.139

##### Demostrating the best obtained results

In [12]:
def demonstrate_best_results(x, features, clusters, indices,
                             cluster_means, differences, rel_differences):
    
    for cluster, result in clusters.items():
        print( )
        print("cluster number  : " + str(cluster+1),
              "Number of el.", len(indices[cluster] ))
        print("grand mean      : ", 
              np.mean(x, axis=0))
        print("cluster mean    : ", cluster_means[cluster])
        print("differences     : ", differences[cluster])
        print("rel. differences: ", rel_differences[cluster])
    

In [13]:
demonstrate_best_results(x=x, features=features,
                         clusters=best_clusters_1,
                         indices=best_indices_1, 
                         cluster_means=best_cluster_means_1, 
                         differences=best_differences_1, 
                         rel_differences=best_rel_differences_1)


cluster number  : 1 Number of el. 13
grand mean      :  [187.291 175.757  11.557 756.239 182.125]
cluster mean    :  [ 73.685  68.771   4.895 525.48   70.139]
differences     :  [-113.606 -106.986   -6.663 -230.759 -111.985]
rel. differences:  [-60.657 -60.872 -57.65  -30.514 -61.488]

cluster number  : 2 Number of el. 28
grand mean      :  [187.291 175.757  11.557 756.239 182.125]
cluster mean    :  [164.928 155.334   9.587 710.445 140.571]
differences     :  [-22.363 -20.423  -1.97  -45.794 -41.554]
rel. differences:  [-11.94  -11.62  -17.047  -6.056 -22.816]

cluster number  : 3 Number of el. 5
grand mean      :  [187.291 175.757  11.557 756.239 182.125]
cluster mean    :  [ 318.282  293.496   24.758 1026.399  490.126]
differences     :  [130.991 117.74   13.201 270.159 308.001]
rel. differences:  [ 69.94   66.99  114.217  35.724 169.116]

cluster number  : 4 Number of el. 45
grand mean      :  [187.291 175.757  11.557 756.239 182.125]
cluster mean    :  [219.471 206.289  13.241 82

In [15]:
for cluster, idx in best_indices_1.items():
    print()
    print("cluster number:", cluster+1)
    for i in idx:
        print(names_[i])
        
    print("********************************************************************")
    


cluster number: 1
'Leningradsk'

'Adygea'

'Kalmykia'

'Krym'

'SevastopolCity'

'Nkaukasus'

'Dagestan'

'Ingush'

'Kabardin'

'Karachai'

'Chechen'

'AltayR'

'Tyva'

********************************************************************

cluster number: 2
'Bryansk'

'Ivanovo'

'Kostroma'

'Lipetsk'

'Tambov'

'Tver'

'Novgorod'

'Pskov'

'South'

'Astrachan'

'Volgograd'

'Nossetia'

'Stavropol'

'Mari'

'Mordovia'

'Orenburg'

'Penza'

'Kurgan'

'Sibir'

'Buryatia'

'Khakas'

'AltayD'

'Baikal'

'Krasnoyarsk'

'Kemerovo'

'Omsk'

'Amur'

'Evreisk'

********************************************************************

cluster number: 3
'Moscow_City'

'NW'

'Karelia'

'Spetersburg'

'Novosibirsk'

********************************************************************

cluster number: 4
'Russia'

'Central'

'Belgorod'

'Vladimir'

'Voronezh'

'Kaluga'

'Kursk'

'Moscow_Obl'

'Orel'

'Ryazan'

'Smolensk'

'Tula'

'Yaroslav'

'Komi'

'Archangel'

'Vologda'

'Kaliningrad'

'Murmansk'

'Kras

In [18]:
clusters_2, best_clusters_2, indices_2, best_indices_2,\
cluster_means_2, best_cluster_means_2,\
differences_2, rel_differences_2, \
best_differences_2, best_rel_differences_2, \
inertia_2, best_intertia_2 = apply_kmeans(x_org=x, 
                                          n_clusters=n_clusters_2,
                                          n_repeats=10, 
                                          std_method='z')

In [20]:
demonstrate_best_results(x=x, features=features,
                         clusters=best_clusters_2,
                         indices=best_indices_2, 
                         cluster_means=best_cluster_means_2, 
                         differences=best_differences_2, 
                         rel_differences=best_rel_differences_2)


cluster number  : 1 Number of el. 2
grand mean      :  [187.291 175.757  11.557 756.239 182.125]
cluster mean    :  [191.75  169.856  22.88  999.787  72.636]
differences     :  [   4.459   -5.901   11.322  243.548 -109.488]
rel. differences:  [  2.381  -3.358  97.964  32.205 -60.117]

cluster number  : 2 Number of el. 6
grand mean      :  [187.291 175.757  11.557 756.239 182.125]
cluster mean    :  [296.576 280.274  16.275 899.14  423.349]
differences     :  [109.285 104.517   4.718 142.901 241.225]
rel. differences:  [ 58.35   59.467  40.821  18.896 132.45 ]

cluster number  : 3 Number of el. 22
grand mean      :  [187.291 175.757  11.557 756.239 182.125]
cluster mean    :  [235.869 222.206  13.694 796.813 206.835]
differences     :  [48.578 46.45   2.136 40.573 24.71 ]
rel. differences:  [25.937 26.428 18.485  5.365 13.568]

cluster number  : 4 Number of el. 3
grand mean      :  [187.291 175.757  11.557 756.239 182.125]
cluster mean    :  [97.154 90.165  6.99  44.633 97.967]
differe

In [21]:
for cluster, idx in best_indices_2.items():
    print()
    print("cluster number:", cluster)
    for i in idx:
        print(names_[i])
    print("********************************************************************")
    


cluster number: 0
'Krasnodar'

'Chukotka'

********************************************************************

cluster number: 1
'Central'

'Moscow_City'

'NW'

'Karelia'

'Murmansk'

'Novosibirsk'

********************************************************************

cluster number: 2
'Russia'

'Voronezh'

'Kaluga'

'Kursk'

'Ryazan'

'Smolensk'

'Tula'

'Yaroslav'

'Komi'

'Kaliningrad'

'Volga'

'Tatarstan'

'Chuvashia'

'Perm'

'Nnogorod'

'Samara'

'Ulianovsk'

'Ural'

'Sverdlovsk'

'Tumen'

'Cheliabinsk'

'Magadan'

********************************************************************

cluster number: 3
'Leningradsk'

'Krym'

'SevastopolCity'

********************************************************************

cluster number: 4
'Moscow_Obl'

'FarEast'

'Sacha'

'Kamchatka'

'Primorsk'

'Khabarovsk'

'Amur'

'Sachalin'

********************************************************************

cluster number: 5
'Spetersburg'

********************************************************