In [1]:
import warnings
import numpy as np
import pandas as pd
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import adjusted_rand_score as ari

In [2]:
np.set_printoptions(suppress=True, precision=3, linewidth=250)

### Loading Data set

In [3]:
demo_xls = pd.ExcelFile("../data/demo.xlsx")
demo_norm = pd.read_excel(demo_xls,
                          "norm").sort_values(by=["SubjID"]).dropna()

demo_dyslexia = pd.read_excel(demo_xls, 
                              "dyslexia").sort_values(by=["SubjID"]).dropna()

demo_norm.shape, demo_dyslexia.shape

((217, 9), (72, 9))

In [4]:
demo_dyslexia.columns

Index(['Group', 'SubjID', 'Sex', 'Grade', 'Age', 'IQ', 'Reading_speed',
       'Sound_detection', 'Sound_change'],
      dtype='object')

In [5]:
features = ["SubjID", "Sex", "Grade", "Age", "IQ", "Sound_detection", "Sound_change", ]

targets = ["SubjID", "Group", "Reading_speed", ]


In [6]:

x_df = pd.concat([demo_norm.loc[:, features], demo_dyslexia.loc[:, features]])
x_df.head()



Unnamed: 0,SubjID,Sex,Grade,Age,IQ,Sound_detection,Sound_change
182,108_norm2,fem,4,10,34,1.0,0.95
184,10_norm2,fem,4,10,23,0.62,0.91
215,11_norm1,fem,2,9,28,0.95,0.91
185,11_norm2,masc,4,10,34,1.0,0.79
210,12_norm1,masc,2,8,32,0.95,0.62


In [7]:
x_df["Sex"].replace({"fem":1, "f":1, "masc":-1, "m":-1}, inplace=True)
x_df.head()

Unnamed: 0,SubjID,Sex,Grade,Age,IQ,Sound_detection,Sound_change
182,108_norm2,1,4,10,34,1.0,0.95
184,10_norm2,1,4,10,23,0.62,0.91
215,11_norm1,1,2,9,28,0.95,0.91
185,11_norm2,-1,4,10,34,1.0,0.79
210,12_norm1,-1,2,8,32,0.95,0.62


In [8]:

y_df = pd.concat([demo_norm.loc[:, targets], demo_dyslexia.loc[:, targets],  ])
y_df.head()

Unnamed: 0,SubjID,Group,Reading_speed
182,108_norm2,norm,139
184,10_norm2,norm,102
215,11_norm1,norm,71
185,11_norm2,norm,110
210,12_norm1,norm,60


In [9]:
y_df["Group"].replace({"norm":0, "dyslexia":1}, inplace=True)
y_df.head()

Unnamed: 0,SubjID,Group,Reading_speed
182,108_norm2,0,139
184,10_norm2,0,102
215,11_norm1,0,71
185,11_norm2,0,110
210,12_norm1,0,60


### Data Standardization

In [10]:
def standardizer(x):
    
    """
        standardize entity-to-feature data matrix by 
          applying Z-scoring and Range standardization methods
        
        Arguments: 
            x, numpy array, entity-to-feature data matrix
        
        Returns:
            Z-scored and Range standardized data matrices
    """
    
    x_ave = np.mean(x, axis=0)
    x_rng = np.ptp(x, axis=0)
    x_std = np.std(x, axis=0)
    x_zscr_std = np.divide(np.subtract(x, x_ave), x_std)   # Z-scoring standardization
    x_rng_std = np.divide(np.subtract(x, x_ave), x_rng)  # Range standardization 
    return x_zscr_std, x_rng_std


### Applying Batch K-Means 

In [11]:


def apply_kmeans(x_org, n_clusters, n_repeats, std_method='r'):
    
    from sklearn.cluster import KMeans


    """
        Calls Kmeans algorithm from Sklearn library.
        Parameters:
            x, a numpy arrary, entity-to-feature matrix,
            n_clusters, int, number of clusters to detect,
            n_repeats, int, number of repeats for different initilization
        Return:
            centroids, clusters labels over
    """
    
    tmp_inertia = 0
    clusters, best_clusters = {}, {}
    indices, best_indices = {}, {}
    cluster_means, best_cluster_means = {}, {}
    differences, best_differences = {}, {}
    rel_differences, best_rel_differences = {}, {}
    inertia, best_inertia = {}, {}
    g_mean = np.mean(x_org, axis=0)
    
    x_zscr_std, x_rng_std = standardizer(x=x_org)
        
    for i in range(n_repeats):
        clusters[i] = {}
        cluster_means[i] = {}
        differences[i] = {}
        rel_differences[i] = {}
        indices[i] = {}
        inertia[i] = {}
        
        # instantiate KMeans Alg. object
        km = KMeans(n_clusters=n_clusters, init='random', n_init=1, max_iter=1000,
                    tol=1e-5, random_state=i, algorithm='full', )  # verbose=1
        if std_method.lower() == 'r' or \
        std_method.lower() == 'rng' or \
        std_method.lower() == 'range':
            km.fit(x_rng_std)  # Compute k-means by calling fit method 
        else:
            km.fit(x_zscr_std)  # Compute k-means by calling fit method 
        
        # Store the computation results per each initilization
        for k in range(n_clusters):
            clusters[i][k] = x_org[np.where(km.labels_==k)]
            indices[i][k] = np.where(km.labels_==k)[0]
            
        inertia[i] = km.inertia_
        
        for k in range(n_clusters):
            cluster_means[i][k] = np.mean(clusters[i][k], axis=0)    
            differences[i][k] = np.subtract(cluster_means[i][k], g_mean)
            rel_differences[i][k] = 100*(np.divide(
                np.subtract(cluster_means[i][k], g_mean), g_mean)
                                        )
        # to chose the best clustering results regarding the inertia
        if i == 0 :
            tmp_inertia = km.inertia_
            delta = 0
        if i != 0:
            delta = tmp_inertia - km.inertia_
        if delta >= 0:
            tmp_inertia = km.inertia_    
            for k in range(n_clusters):
                best_clusters[k] = x_org[np.where(km.labels_==k)]
                best_indices[k] = np.where(km.labels_==k)[0]
            for k in range(n_clusters):
                best_cluster_means[k] = np.mean(best_clusters[k], axis=0)
                best_differences[k] = np.subtract(best_cluster_means[k], g_mean)
                best_rel_differences[k] = 100*(np.divide(
                    np.subtract(best_cluster_means[k], g_mean), g_mean)
                                              )
            best_inertia = km.inertia_
    
    return  clusters, best_clusters, indices, \
best_indices, cluster_means, best_cluster_means, \
differences, rel_differences, best_differences, \
best_rel_differences , inertia, best_inertia

##### Calling the aforementioned function for n_clusters = 4 and demonstrating results

In [12]:
n_clusters_1 = 2
n_clusters_2 = 3

In [13]:
x_df.iloc[:, 1:].values

array([[ 1.   ,  4.   , 10.   , 34.   ,  1.   ,  0.95 ],
       [ 1.   ,  4.   , 10.   , 23.   ,  0.62 ,  0.91 ],
       [ 1.   ,  2.   ,  9.   , 28.   ,  0.95 ,  0.91 ],
       ...,
       [ 1.   ,  3.   ,  9.   , 34.   ,  0.875,  0.5  ],
       [ 1.   ,  3.   , 10.   , 31.   ,  0.958,  0.917],
       [-1.   ,  3.   , 10.   , 32.   ,  0.957,  0.875]])

In [14]:
clusters_1, best_clusters_1, indices_1, best_indices_1,\
cluster_means_1, best_cluster_means_1,\
differences_1, rel_differences_1, \
best_differences_1, best_rel_differences_1, \
inertia_1, best_intertia_1 = apply_kmeans(x_org=x_df.iloc[:, 1:].values, 
                                          n_clusters=n_clusters_1,
                                          n_repeats=10, 
                                          std_method='z')

##### Demonstrating all the results

In [15]:
def demonstrate_results(x, features, clusters, indices, cluster_means,
                        differences, rel_differences, inertia):
    for repeat, results in clusters.items():
        print("results for the initilization number:", repeat,
              "inertia:", inertia[repeat],)
        
        for cluster, result in results.items():
            print("cluster number  : " + str(cluster+1),
                  "Number of el.", len(indices[repeat][cluster]))
            print("cluster mean    : ", cluster_means[repeat][cluster])
            print("grand mean      : ", 
                  np.mean(x, axis=0))
            print("differences     : ", differences[repeat][cluster])
            print("rel. differences: ", rel_differences[repeat][cluster])
        print( ) 
        print("********************************************************************************")
        print( )
    return 

In [16]:
# features = [feature.strip().split(",")[0] for feature in features_]

demonstrate_results(x=x_df.iloc[:, 2:].values, 
                    features=features,
                    clusters=clusters_1, 
                    indices=indices_1, 
                    cluster_means=cluster_means_1, 
                    differences=differences_1, 
                    rel_differences=rel_differences_1, 
                    inertia=inertia_1)

results for the initilization number: 0 inertia: 1263.2686269691771
cluster number  : 1 Number of el. 159
cluster mean    :  [-0.195  4.201 10.478 32.654  0.916  0.837]
grand mean      :  [ 3.104  9.374 31.26   0.898  0.767]
differences     :  [-0.074  1.097  1.104  1.395  0.018  0.07 ]
rel. differences:  [60.988 35.358 11.781  4.461  2.052  9.122]
cluster number  : 2 Number of el. 130
cluster mean    :  [-0.031  1.762  8.023 29.554  0.876  0.682]
grand mean      :  [ 3.104  9.374 31.26   0.898  0.767]
differences     :  [ 0.09  -1.342 -1.351 -1.706 -0.023 -0.086]
rel. differences:  [-74.593 -43.246 -14.409  -5.456  -2.51  -11.157]

********************************************************************************

results for the initilization number: 1 inertia: 1263.2686269691771
cluster number  : 1 Number of el. 130
cluster mean    :  [-0.031  1.762  8.023 29.554  0.876  0.682]
grand mean      :  [ 3.104  9.374 31.26   0.898  0.767]
differences     :  [ 0.09  -1.342 -1.351 -1.706 -0.0

##### Demostrating the best obtained results

In [17]:
def demonstrate_best_results(x, features, clusters, indices,
                             cluster_means, differences, rel_differences):
    
    for cluster, result in clusters.items():
        print( )
        print("cluster number  : " + str(cluster+1),
              "Number of el.", len(indices[cluster] ))
        print("grand mean      : ", 
              np.mean(x, axis=0))
        print("cluster mean    : ", cluster_means[cluster])
        print("differences     : ", differences[cluster])
        print("rel. differences: ", rel_differences[cluster])
    

In [18]:
features[2:]

['Grade', 'Age', 'IQ', 'Sound_detection', 'Sound_change']

In [19]:
demonstrate_best_results(x=x_df.iloc[:, 1:].values,
                         features=features,
                         clusters=best_clusters_1,
                         indices=best_indices_1, 
                         cluster_means=best_cluster_means_1, 
                         differences=best_differences_1, 
                         rel_differences=best_rel_differences_1)


cluster number  : 1 Number of el. 130
grand mean      :  [-0.121  3.104  9.374 31.26   0.898  0.767]
cluster mean    :  [-0.031  1.762  8.023 29.554  0.876  0.682]
differences     :  [ 0.09  -1.342 -1.351 -1.706 -0.023 -0.086]
rel. differences:  [-74.593 -43.246 -14.409  -5.456  -2.51  -11.157]

cluster number  : 2 Number of el. 159
grand mean      :  [-0.121  3.104  9.374 31.26   0.898  0.767]
cluster mean    :  [-0.195  4.201 10.478 32.654  0.916  0.837]
differences     :  [-0.074  1.097  1.104  1.395  0.018  0.07 ]
rel. differences:  [60.988 35.358 11.781  4.461  2.052  9.122]


In [None]:
clusters_2, best_clusters_2, indices_2, best_indices_2,\
cluster_means_2, best_cluster_means_2,\
differences_2, rel_differences_2, \
best_differences_2, best_rel_differences_2, \
inertia_2, best_intertia_2 = apply_kmeans(x_org=x_df.iloc[:, 1:].values, 
                                          n_clusters=n_clusters_2,
                                          n_repeats=10, 
                                          std_method='z')

In [None]:
demonstrate_best_results(x=x_df.iloc[:, 1:].values,
                         features=features,
                         clusters=best_clusters_2,
                         indices=best_indices_2, 
                         cluster_means=best_cluster_means_2, 
                         differences=best_differences_2, 
                         rel_differences=best_rel_differences_2)

In [None]:
features[1:]