In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import FastICA
from sklearn.manifold import MDS
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
import skfuzzy as fuzz

In [2]:
# Plotting backend selection: the active magic opens figures in separate Qt5
# windows; swap it with the commented-out line to render figures inline.
# %matplotlib inline
%matplotlib qt5
# Render every figure at 120 dots per inch.
plt.rcParams['figure.dpi'] = 120

# Data preprocessing:

In [32]:
def read_data(path="e-shop data and description/e-shop clothing 2008.csv"):
    """Load the e-shop CSV and drop the columns that are not used downstream.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the semicolon-delimited CSV file. Defaults to the path
        this notebook originally hard-coded (relative to the working directory).

    Returns
    -------
    pandas.DataFrame
        The file contents without the 'year' and 'page 2 (clothing model)' columns.

    Raises
    ------
    KeyError
        If either of the two dropped columns is missing from the file.
    """
    data = pd.read_csv(path, delimiter=";")
    return data.drop(columns=['year', 'page 2 (clothing model)'])

In [33]:
def normalize(data, verbose=True):
    """Standardize every column of ``data`` with scikit-learn's ``StandardScaler``.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame to scale; its column names and index are carried over.
    verbose : bool, optional
        When True, print the mean of each scaled column followed by a blank
        line. When False, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        Scaled copy of ``data`` with the same columns and index.
    """
    scaler = StandardScaler()
    normalized_data = pd.DataFrame(
        data=scaler.fit_transform(data),
        columns=data.columns,
        index=data.index,  # keep row labels so results can be joined back to the input
    )
    if verbose:
        for column in normalized_data.columns:
            print(column, "column mean value is:", normalized_data[column].mean())
        # The separator line belongs to the verbose report, so it is silenced with it.
        print()
    return normalized_data

# Dimension reduction:

## PCA

In [5]:
def pca2components(data, verbose=True):
    """Project ``data`` onto two principal components and scatter-plot the result.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Input handed to ``PCA.fit_transform``.
    verbose : bool, optional
        When True, print the per-component explained variance ratio and the
        combined percentage (truncated to an integer).

    Returns
    -------
    pandas.DataFrame
        Two-column frame with columns 'dim 1' and 'dim 2'.
    """
    reducer = PCA(n_components=2)
    projected = pd.DataFrame(data=reducer.fit_transform(data), columns=['dim 1', 'dim 2'])
    projected.plot.scatter('dim 1', 'dim 2', s=10)
    if not verbose:
        return projected
    ratios = reducer.explained_variance_ratio_
    print(ratios)
    total_percent = str(int(sum(ratios) * 100)) + "%"
    print("\n total variance:", total_percent, "\n")
    return projected

# Clustering methods:

### performing k-means and computing wss and silhouette score
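WSS stands for Within-cluster Sum of Squares: the sum of squared distances from every sample to the centre of its assigned cluster (scikit-learn exposes it as `KMeans.inertia_`).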

In [6]:
def k_means_optimization(data, max_k, verbose=True, random_state=100):
    """Fit K-Means for k = 1..max_k and collect WSS and silhouette scores.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Samples to cluster.
    max_k : int
        Largest number of clusters to try (inclusive).
    verbose : bool, optional
        When True, print the scores for each k as they are computed.
    random_state : int or None, optional
        Seed used for both the K-Means initialisation and the silhouette
        sub-sampling, so repeated runs give the same curves. Pass None for
        unseeded runs.

    Returns
    -------
    tuple of (list, list)
        ``wss`` holds ``(k, inertia)`` pairs for k = 1..max_k; ``s`` holds
        ``(k, silhouette)`` pairs for k = 2..max_k.
    """
    wss = []
    s = []
    for k in range(1, max_k + 1):
        if verbose:
            print("computing with k =", k, end=' ')
        # `algorithm='full'` was renamed to 'lloyd' in scikit-learn 1.1 and the old
        # spelling is rejected by later releases; leaving the argument out works on
        # both old and new versions. n_init is pinned because its default changed
        # between releases.
        kmeans = KMeans(n_clusters=k, n_init=10, random_state=random_state).fit(data)
        wss_k = kmeans.inertia_
        wss.append((k, wss_k))
        if k == 1:
            # The silhouette score needs at least two clusters, so k = 1 only reports WSS.
            if verbose:
                print("--> wss:", wss_k)
            continue
        # The score is estimated on a sample of at most 3000 points to keep it affordable.
        s_k = silhouette_score(data, labels=kmeans.labels_, sample_size=3000,
                               random_state=random_state)
        s.append((k, s_k))
        if verbose:
            print("--> wss:", wss_k, "silhouette:", s_k)
    print('done')
    return (wss, s)

In [7]:
def plot_k_means_optimization(test_name: str, test_df, max_k: int, lfs=10, tfs=8):
    """Plot one K-Means selection metric against the number of clusters.

    Parameters
    ----------
    test_name : str
        Metric name; used as the column name and, with underscores replaced
        by spaces, as the y-axis label.
    test_df : sequence of (k, value) pairs
        Scores as returned by ``k_means_optimization``.
    max_k : int
        Largest k that was evaluated; x ticks are placed at 2, 4, ... up to
        and including ``max_k`` when it is even.
    lfs : int, optional
        Font size of the axis labels.
    tfs : int, optional
        Font size of the tick labels.
    """
    test_df = pd.DataFrame(data=test_df, columns=['k', test_name])
    test_ax = test_df.plot(x='k', y=test_name)
    test_ax.set_xlabel('k clusters', fontsize=lfs)
    test_ax.set_ylabel(test_name.replace('_', ' '), fontsize=lfs)
    # `max_k + 1` so the last evaluated k also gets a tick.
    test_ax.set_xticks(range(2, max_k + 1, 2))
    # tick_params replaces the per-tick `tick.label` access, which current
    # Matplotlib no longer provides.
    test_ax.tick_params(axis='both', which='major', labelsize=tfs)
    plt.show()

In [8]:
# Notebook-level font sizes. NOTE(review): none of the plotting calls visible in
# this notebook pass these values, so the functions fall back to their own
# defaults (which happen to be the same numbers).
lfs=10 #label font_size
tfs=8 #tick font_size
# Same backend selection as the configuration cell near the top of the notebook.
# %matplotlib inline
%matplotlib qt5

In [11]:
def cluster_color(label):
    """Return the matplotlib colour assigned to a cluster label.

    The palette has twelve entries; labels beyond that wrap around, so
    label 12 shares its colour with label 0.
    """
    palette = (
        'b', 'g', 'r', 'c', 'm',
        '#fdff03', '#055803', '#a6fc00', '#9d70d1',
        '#ff703b', '#3a70d8', '#ff70b5',
    )
    slot = label % len(palette)
    return palette[slot]

In [12]:
def plot_clusters(data, labels, n_clusters, lfs=10, tfs=8):
    """Scatter-plot the first two columns of ``data``, one colour per cluster.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        Samples to plot; only columns 0 and 1 are drawn.
    labels : array-like of int
        Cluster label of each row of ``data``. Lists are accepted as well as
        NumPy arrays.
    n_clusters : int
        Labels 0..n_clusters-1 are drawn; any other label value is skipped.
    lfs : int, optional
        Font size of the axis labels.
    tfs : int, optional
        Font size of the tick labels.
    """
    points = np.asarray(data)
    # Coerce to an array so `labels == label` yields an element-wise mask even
    # when a plain list is passed in.
    labels = np.asarray(labels)
    clusters_ax = plt.gca()
    for label in range(n_clusters):
        members = labels == label
        clusters_ax.scatter(points[members, 0], points[members, 1], s=40, c=cluster_color(label))
    clusters_ax.set_xlabel('dim1', fontsize=lfs)
    clusters_ax.set_ylabel('dim2', fontsize=lfs)
    # `tfs` was accepted but never applied; honour it here.
    clusters_ax.tick_params(axis='both', which='major', labelsize=tfs)
    plt.show()

In [13]:
def k_means(n_clusters: int, points=None, random_state=100):
    """Cluster samples with K-Means and return one label per sample.

    Parameters
    ----------
    n_clusters : int
        Number of clusters to form.
    points : array-like or pandas.DataFrame, optional
        Samples to cluster. When omitted, the notebook-level ``data`` variable
        is used, which keeps existing ``k_means(n)`` calls working.
    random_state : int or None, optional
        Seed for the centroid initialisation so labels are reproducible
        between runs. Pass None for an unseeded run.

    Returns
    -------
    numpy.ndarray
        Cluster index of each sample.
    """
    if points is None:
        points = data  # fall back to the notebook global
    # `algorithm='full'` was renamed to 'lloyd' in scikit-learn 1.1 and the old
    # spelling is rejected by later releases; omitting it works across versions.
    # n_init is pinned because its default changed between releases.
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=random_state)
    return kmeans.fit_predict(points)

In [15]:
def gmm(n_clusters: int, points=None):
    """Cluster samples with a Gaussian mixture and return one label per sample.

    Parameters
    ----------
    n_clusters : int
        Number of mixture components.
    points : array-like or pandas.DataFrame, optional
        Samples to cluster. When omitted, the notebook-level ``data`` variable
        is used, which keeps existing ``gmm(n)`` calls working.

    Returns
    -------
    numpy.ndarray
        Index of the component predicted for each sample.
    """
    if points is None:
        points = data  # fall back to the notebook global
    # The estimator gets its own name so it no longer shadows this function.
    # NOTE(review): a fresh estimator is built on every call and fitted once, so
    # warm_start looks like it has no effect here - confirm before removing it.
    mixture = GaussianMixture(n_components=n_clusters, covariance_type='full',
                              init_params='kmeans', warm_start=True, n_init=3,
                              random_state=100, verbose=2)
    mixture.fit(points)
    return mixture.predict(points)

In [None]:
def fuzzy(fuzzy_n_clusters: int, points=None, seed=None):
    """Cluster samples with fuzzy c-means and return hard labels.

    Parameters
    ----------
    fuzzy_n_clusters : int
        Number of clusters.
    points : array-like or pandas.DataFrame, optional
        Samples to cluster, one row per sample. When omitted, the
        notebook-level ``data`` variable is used, which keeps existing
        ``fuzzy(n)`` calls working.
    seed : int or None, optional
        Forwarded to ``skfuzzy.cluster.cmeans`` so the random initialisation
        can be reproduced. The default None leaves the run unseeded, as before.

    Returns
    -------
    numpy.ndarray
        For each sample, the index of the cluster with the highest membership.
    """
    if points is None:
        points = data  # fall back to the notebook global
    # The input is transposed before the call, as in the original code
    # (presumably cmeans expects features along axis 0 - see the skfuzzy docs).
    features_by_sample = np.asarray(points).T
    # Only the membership matrix (second result) is needed; the rest is discarded.
    _, membership, *_ = fuzz.cluster.cmeans(
        features_by_sample, fuzzy_n_clusters, 2, error=0.005, maxiter=1000, seed=seed
    )
    # Harden the fuzzy memberships: each sample goes to its strongest cluster.
    return np.argmax(membership, axis=0)

# Main:

### preprocess:

In [17]:
# Notebook-wide switch passed as `verbose=` to the preprocessing, PCA and
# K-Means optimisation calls below.
verbose = False

In [40]:
# Load the CSV and standardize it. NOTE(review): `data` is rebound at each
# stage, so after this cell the unscaled frame is no longer available.
data = read_data()
data = normalize(data, verbose=verbose)




### dimension reduction:

In [41]:
# Replace `data` with its two-component PCA projection. NOTE(review): re-running
# this cell applies PCA to the already projected frame; re-run the preprocessing
# cell first when repeating it.
data = pca2components(data, verbose=verbose)

### K-Means
We choose the optimum k by looking at the WSS and the Silhouette score graphs.

In [20]:
# Evaluate K-Means for k = 1..max_k and keep the WSS and silhouette curves.
max_k = 20
wss, s = k_means_optimization(data, max_k, verbose=verbose)

done


In [21]:
# WSS versus k curve.
plot_k_means_optimization(test_name='wss_error', test_df=wss, max_k=max_k)

In [22]:
# Silhouette score versus k curve (starts at k = 2).
plot_k_means_optimization(test_name='silhouette', test_df=s, max_k=max_k)

In [23]:
# Number of K-Means clusters, chosen from the WSS and silhouette graphs above.
km_clusters = 3

In [43]:
# k_means() names its cluster-count parameter `n_clusters`; passing `k=` raises
# a TypeError.
k_means_labels = k_means(n_clusters=km_clusters)

### GMM

In [25]:
# Fit the Gaussian mixture on the current `data`.
# NOTE(review): unlike km_clusters, the notebook shows no selection curve
# justifying 8 components.
gmm_clusters = 8
gmm_labels = gmm(gmm_clusters)

Initialization 0
Initialization converged: True	 time lapse 6.60234s	 ll -3.37332
Initialization 1
  Iteration 10	 time lapse 6.91945s	 ll change 0.00102
Initialization converged: True	 time lapse 7.47511s	 ll -3.37711
Initialization 2
  Iteration 10	 time lapse 5.88326s	 ll change 0.00103
Initialization converged: True	 time lapse 6.21526s	 ll -3.38396


### fuzzy c-means

In [None]:
# Fit fuzzy c-means on the current `data`.
# NOTE(review): this cell has no execution count in the saved notebook, so
# `fuzzy_labels` may not exist yet when the fuzzy plot cell further down runs.
fuzzy_clusters = 6
fuzzy_labels = fuzzy(fuzzy_clusters)

### plot clusters

In [45]:
# Scatter of `data` coloured by K-Means label.
plot_clusters(data, k_means_labels, n_clusters=km_clusters)

  app.exec_()


In [31]:
# Scatter of `data` coloured by Gaussian-mixture label.
plot_clusters(data, gmm_labels, n_clusters=gmm_clusters)

In [31]:
# Scatter of `data` coloured by the hardened fuzzy c-means label.
plot_clusters(data, fuzzy_labels, n_clusters=fuzzy_clusters)