In [None]:
import warnings
import numpy as np
from sklearn.metrics.cluster import contingency_matrix
from sklearn.metrics import adjusted_rand_score as ari
import pandas as pd

### Loading Data set
# NOTE(review): the path below is empty in this notebook, so read_excel has
# no file to open -- fill in the location of the spreadsheet before running.
x = pd.read_excel('') # data matrix
print("number of entities:", x.shape[0], ", number of features:", x.shape[1])

# Binarize the target variable: 1 if 'Yes' else 0.
x['Churn'] = x['Churn'].eq('Yes').astype('int64')

# Binarize MaritalStatus: 1 if 'Yes' else 0.
# An explicit comparison is used instead of pd.get_dummies(...)['Yes'] so the
# column is always an integer 0/1 column regardless of the dtype get_dummies
# would produce, which keeps it usable in the numeric standardization below.
x['MaritalStatus'] = x['MaritalStatus'].eq('Yes').astype('int64')

# Look at how many entities have marital status yes (1) and no (0).
# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print(x['MaritalStatus'].value_counts())

# Choose the three variables for k-means: MonthlyMinutes, RoamingCalls, MaritalStatus
x_ = x[['MonthlyMinutes','RoamingCalls','MaritalStatus']]


### Data Standardization

def standardizer(x):
    """
    Center an entity-to-feature data matrix feature by feature and rescale
    the centered values in two ways: by the standard deviation (Z-scoring)
    and by the range (Range standardization).
    Arguments:
    x, numpy array, entity-to-feature data matrix
    Returns:
    tuple (Z-scored matrix, Range standardized matrix), in that order
    """
    # Per-feature statistics, all computed along axis 0.
    feature_mean = np.mean(x, axis=0)
    feature_range = np.ptp(x, axis=0)  # maximum minus minimum of each feature
    feature_std = np.std(x, axis=0)

    # Both standardizations start from the same mean-centered matrix.
    centered = np.subtract(x, feature_mean)

    z_scored = np.divide(centered, feature_std)
    range_scaled = np.divide(centered, feature_range)
    return z_scored, range_scaled

In [None]:
from sklearn.cluster import KMeans
def apply_kmeans(x_org, n_clusters, n_repeats, std_method='r'):
    """
        Runs the KMeans algorithm from the Sklearn library n_repeats times,
        each time with a different random initialization, on a standardized
        copy of the data, and reports every run plus the run with the
        lowest inertia. Cluster statistics are computed on the original
        (non-standardized) data.

        Parameters:
            x_org, a pandas DataFrame, entity-to-feature matrix; rows are
                selected with .iloc, so a bare numpy array is not accepted,
            n_clusters, int, number of clusters to detect,
            n_repeats, int, number of repeats for different initialization
                (repeat i uses random_state=i),
            std_method, str, 'r', 'rng' or 'range' (case-insensitive) to
                cluster the Range standardized data; any other value
                clusters the Z-scored data
        Return:
            a tuple of twelve items, in this order:
            clusters, best_clusters, indices, best_indices,
            cluster_means, best_cluster_means, differences,
            rel_differences, best_differences, best_rel_differences,
            inertia, best_inertia.
            The plain items are dicts keyed by repeat number and (except
            inertia) then by cluster number; the best_* items hold the
            same quantities for the lowest-inertia repeat, keyed by
            cluster number only.
              clusters: rows of x_org belonging to the cluster,
              indices: positional row indices of the cluster members,
              cluster_means: per-feature mean of the cluster,
              differences: cluster mean minus grand mean,
              rel_differences: differences as a percentage of grand mean,
              inertia: KMeans inertia of the repeat.
            When n_repeats is 0 every returned item is an empty dict.
    """

    clusters, best_clusters = {}, {}
    indices, best_indices = {}, {}
    cluster_means, best_cluster_means = {}, {}
    differences, best_differences = {}, {}
    rel_differences, best_rel_differences = {}, {}
    inertia, best_inertia = {}, {}
    g_mean = np.mean(x_org, axis=0)  # grand mean of every feature

    # standardizer returns (Z-scored, Range standardized); unpack both so the
    # requested variant can be selected once, before the repeats start.
    x_zscr_std, x_rng_std = standardizer(x=x_org)
    if std_method.lower() in ('r', 'rng', 'range'):
        x_std = x_rng_std
    else:
        x_std = x_zscr_std

    best_run = None  # repeat number with the lowest inertia seen so far

    for i in range(n_repeats):
        # instantiate KMeans Alg. object
        # NOTE(review): algorithm='full' is the classic EM-style variant; newer
        # scikit-learn releases call it 'lloyd' and no longer accept 'full' --
        # confirm against the installed version before upgrading.
        km = KMeans(n_clusters=n_clusters, init='random', n_init=1, max_iter=500,
                    tol=1e-4, random_state=i, algorithm='full', )  # verbose=1
        km.fit(x_std)  # Compute k-means by calling fit method (once per repeat)

        # Store the computation results per each initialization
        clusters[i], indices[i] = {}, {}
        cluster_means[i], differences[i], rel_differences[i] = {}, {}, {}
        for k in range(n_clusters):
            members = np.where(km.labels_ == k)[0]  # positional row numbers
            indices[i][k] = members
            # labels_ are positional, hence .iloc (not label-based .loc)
            clusters[i][k] = x_org.iloc[members]
            cluster_means[i][k] = np.mean(clusters[i][k], axis=0)
            differences[i][k] = np.subtract(cluster_means[i][k], g_mean)
            rel_differences[i][k] = 100 * np.divide(differences[i][k], g_mean)
        inertia[i] = km.inertia_

        # to choose the best clustering results regarding the inertia;
        # on a tie the later repeat wins
        if best_run is None or km.inertia_ <= inertia[best_run]:
            best_run = i

    if best_run is not None:
        # Shallow copies: the best_* dicts are separate containers holding the
        # same per-cluster objects as the winning repeat.
        best_clusters = dict(clusters[best_run])
        best_indices = dict(indices[best_run])
        best_cluster_means = dict(cluster_means[best_run])
        best_differences = dict(differences[best_run])
        best_rel_differences = dict(rel_differences[best_run])
        best_inertia = inertia[best_run]

    return (clusters, best_clusters, indices,
            best_indices, cluster_means, best_cluster_means,
            differences, rel_differences, best_differences,
            best_rel_differences, inertia, best_inertia)