In [None]:
import warnings
import numpy as np
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import adjusted_rand_score as ari
import pandas as pd

### Loading Data set
# NOTE(review): the workbook path is empty in this notebook -- point it at the data file before running.
x = pd.read_excel('')  # data matrix
print("number of entities:", x.shape[0], ", number of features:", x.shape[1])

# Binarise the target variable: 1 where Churn == 'Yes', 0 otherwise.
x['Churn'] = x['Churn'].eq('Yes').astype(int)

# Binarise MaritalStatus: 1 where it is 'Yes', 0 otherwise.
# An explicit integer column is used instead of pd.get_dummies(...)['Yes'] because
# get_dummies returns booleans on recent pandas, and a bool column mixed with numeric
# ones makes np.array(x_) an object array that the standardisation step cannot handle.
x['MaritalStatus'] = x['MaritalStatus'].eq('Yes').astype(int)

# Look at how many entities have marital status yes (1) and no (0).
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(x['MaritalStatus'].value_counts())

# Choose the three variables used for K-Means: MonthlyMinutes, RoamingCalls, MaritalStatus.
x_ = x[['MonthlyMinutes', 'RoamingCalls', 'MaritalStatus']]


### Data Standardization

def standardizer(x):
    """
    Standardize an entity-to-feature data matrix with two methods:
    Z-scoring and Range standardization.

    Both methods centre every feature (column) on its mean; Z-scoring then
    divides by the feature's standard deviation, Range standardization by the
    feature's range (max - min).

    A constant feature has zero standard deviation and zero range. Dividing by
    it would produce NaN/inf, so such scales are replaced by 1 and the feature
    comes out as all zeros (its centred value).

    Arguments:
    x, numpy array, entity-to-feature data matrix
    Returns:
    Z-scored and Range standardized data matrices (in that order)
    """
    x_ave = np.mean(x, axis=0)
    x_centred = np.subtract(x, x_ave)

    x_std = np.std(x, axis=0)
    x_rng = np.ptp(x, axis=0)

    # Guard against division by zero for constant features.
    safe_std = np.where(x_std == 0, 1.0, x_std)
    safe_rng = np.where(x_rng == 0, 1.0, x_rng)

    x_zscr_std = np.divide(x_centred, safe_std)  # Z-scoring standardization
    x_rng_std = np.divide(x_centred, safe_rng)  # Range standardization
    return x_zscr_std, x_rng_std

In [None]:
### Applying Batch K-Means
from sklearn.cluster import KMeans


def _summarize_partition(x_org, labels, n_clusters, g_mean):
    """
    Split ``x_org`` according to ``labels`` and summarise every cluster.

    Returns five dicts keyed by cluster number ``0..n_clusters-1``:
    member rows, member row indices, cluster means, absolute differences
    between cluster mean and ``g_mean``, and those differences relative to
    ``g_mean`` in percent.
    """
    members, rows, means, diffs, rel_diffs = {}, {}, {}, {}, {}
    for k in range(n_clusters):
        rows[k] = np.where(labels == k)[0]
        members[k] = x_org[rows[k]]
        means[k] = np.mean(members[k], axis=0)
        diffs[k] = np.subtract(means[k], g_mean)
        # NOTE: a feature whose grand mean is 0 yields inf/nan here.
        rel_diffs[k] = 100 * np.divide(diffs[k], g_mean)
    return members, rows, means, diffs, rel_diffs


def apply_kmeans(x_org, n_clusters, n_repeats, std_method='r'):
    """
    Run sklearn's KMeans ``n_repeats`` times (one random initialisation per
    run, seeded with the run number) and keep the run with the lowest inertia.

    Clustering is done on the standardized data, while the reported cluster
    members, means and differences are computed on the original ``x_org``.

    Parameters:
    x_org, a numpy array, entity-to-feature matrix,
    n_clusters, int, number of clusters to detect,
    n_repeats, int, number of repeats with different initializations,
    std_method, str, 'r' / 'rng' / 'range' (case-insensitive) clusters the
        Range standardized data; any other value clusters the Z-scored data.
    Return (in this order):
    clusters, best_clusters, indices, best_indices, cluster_means,
    best_cluster_means, differences, rel_differences, best_differences,
    best_rel_differences, inertia, best_inertia

    The non-"best" values are dicts keyed by repeat number (and, except for
    ``inertia``, then by cluster number); the "best" values are keyed by
    cluster number only and describe the lowest-inertia run. ``best_inertia``
    is that run's inertia. With ``n_repeats == 0`` every returned dict is empty.
    """
    clusters, indices, cluster_means = {}, {}, {}
    differences, rel_differences, inertia = {}, {}, {}
    best_clusters, best_indices, best_cluster_means = {}, {}, {}
    best_differences, best_rel_differences = {}, {}
    best_inertia = {}  # stays an empty dict when no run is performed

    g_mean = np.mean(x_org, axis=0)
    x_zscr_std, x_rng_std = standardizer(x=x_org)

    # The data to cluster does not change between repeats, so pick it once.
    if std_method.lower() in ('r', 'rng', 'range'):
        x_fit = x_rng_std
    else:
        x_fit = x_zscr_std

    for i in range(n_repeats):
        # instantiate KMeans Alg. object
        # The `algorithm` argument is left at its default: the former value 'full'
        # was deprecated in scikit-learn 1.1 and removed in 1.3 (where the default,
        # 'lloyd', is the same algorithm under its new name).
        km = KMeans(n_clusters=n_clusters, init='random', n_init=1, max_iter=500, tol=1e-4,
                    random_state=i)  # verbose=1
        km.fit(x_fit)  # Compute k-means by calling fit method

        # Store the computation results per each initialization
        (clusters[i], indices[i], cluster_means[i],
         differences[i], rel_differences[i]) = _summarize_partition(x_org, km.labels_, n_clusters, g_mean)
        inertia[i] = km.inertia_

        # Choose the best clustering result regarding the inertia
        # (on a tie the later run wins).
        if i == 0 or km.inertia_ <= best_inertia:
            best_inertia = km.inertia_
            best_clusters = dict(clusters[i])
            best_indices = dict(indices[i])
            best_cluster_means = dict(cluster_means[i])
            best_differences = dict(differences[i])
            best_rel_differences = dict(rel_differences[i])

    # Return only after ALL repeats have been evaluated.
    return clusters, best_clusters, indices, best_indices, cluster_means, best_cluster_means, differences, rel_differences, best_differences, best_rel_differences, inertia, best_inertia


def demonstrate_results(x, features, clusters, indices, cluster_means, differences, rel_differences, inertia):
    """
    Print, for every repeat and every cluster in it, the cluster size, the
    cluster mean, the grand mean of ``x`` and the absolute / relative
    differences between the two.

    All dict arguments are the per-repeat outputs of ``apply_kmeans``.
    ``features`` is accepted but not used in the body.
    """
    for repeat, results in clusters.items():
        print("results for the initialization number:", repeat, "inertia:", inertia[repeat], )

        for cluster, result in results.items():
            print("cluster number : " + str(cluster + 1), "Number of el.", len(indices[repeat][cluster]))
            print("cluster mean : ", cluster_means[repeat][cluster])
            print("grand mean : ", np.mean(x, axis=0))
            print("differences : ", differences[repeat][cluster])
            print("rel. differences: ", rel_differences[repeat][cluster])
        print()
        print("********************************************************************************")
        print()
    return


def demonstrate_best_results(x, features, clusters, indices, cluster_means, differences, rel_differences):
    """
    Print, for every cluster of the best run, its size and share of the data,
    the grand mean of ``x``, the cluster mean, the absolute / relative
    differences, and a small summary table of those four rows.

    All dict arguments are the "best" outputs of ``apply_kmeans``.
    ``features`` is accepted but not used in the body.
    """
    n_entities = len(x)  # share is relative to the actual data size, not a fixed 500
    for cluster, result in clusters.items():
        print()
        print("cluster number : " + str(cluster + 1), "Number of el.", len(indices[cluster]), ', which is ',
              len(indices[cluster]) / n_entities)
        print("grand mean : ", np.mean(x, axis=0))
        print("cluster mean : ", cluster_means[cluster])
        print("differences : ", differences[cluster])
        print("rel. differences: ", rel_differences[cluster])
        print(pd.DataFrame({'Grand mean': np.mean(x, axis=0), 'Cluster mean': cluster_means[cluster],
                            'Absolute diff.': differences[cluster],
                            'Relative diff.': rel_differences[cluster]}).T)


# ---- Run the analysis for two choices of K (top level, not inside apply_kmeans) ----
n_clusters_1 = 5
n_clusters_2 = 9

clusters_1, best_clusters_1, indices_1, best_indices_1, cluster_means_1, best_cluster_means_1, \
    differences_1, rel_differences_1, best_differences_1, best_rel_differences_1, inertia_1, \
    best_intertia_1 = apply_kmeans(x_org=np.array(x_), n_clusters=n_clusters_1, n_repeats=10, std_method='z')

features = list(x_.columns)
demonstrate_results(x=x_, features=features,
                    clusters=clusters_1, indices=indices_1,
                    cluster_means=cluster_means_1, differences=differences_1,
                    rel_differences=rel_differences_1, inertia=inertia_1
                    )

demonstrate_best_results(
    x=x_,
    features=features,
    clusters=best_clusters_1,
    indices=best_indices_1,
    cluster_means=best_cluster_means_1,
    differences=best_differences_1,
    rel_differences=best_rel_differences_1
)

clusters_2, best_clusters_2, indices_2, best_indices_2, cluster_means_2, best_cluster_means_2, \
    differences_2, rel_differences_2, \
    best_differences_2, best_rel_differences_2, \
    inertia_2, best_intertia_2 = apply_kmeans(x_org=np.array(x_), n_clusters=n_clusters_2, n_repeats=10,
                                              std_method='z')

demonstrate_best_results(x=x_, features=features,
                         clusters=best_clusters_2, indices=best_indices_2,
                         cluster_means=best_cluster_means_2, differences=best_differences_2,
                         rel_differences=best_rel_differences_2)