In [3]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture
from sklearn.manifold import MDS
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [4]:
# Figure backend selection: the active magic uses the Qt5 backend (figures open
# in separate windows); the commented-out alternative renders them in the notebook.
# %matplotlib inline
%matplotlib qt5
# Render every figure at 120 dots per inch.
plt.rcParams['figure.dpi'] = 120

# Data preprocessing:

In [5]:
def read_data(path="e-shop data and description/e-shop clothing 2008.csv"):
    """Load the e-shop clothing CSV and drop the columns that are not used.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the semicolon-delimited CSV file. The default is the
        path this notebook has always read, so ``read_data()`` is unchanged.

    Returns
    -------
    pandas.DataFrame
        The file contents without the ``'year'`` and
        ``'page 2 (clothing model)'`` columns.

    Raises
    ------
    KeyError
        If either of the two dropped columns is missing from the file.
    """
    data = pd.read_csv(path, delimiter=";")
    # An earlier draft label-encoded the clothing-model codes; the column is
    # now dropped instead, together with 'year'.
    return data.drop(columns=['year', 'page 2 (clothing model)'])

## data normalization

In [6]:
def normalized(data, verbose=True):
    """Standardise every column of ``data`` with ``StandardScaler``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are scaled.
    verbose : bool, optional
        When true, print the mean of each scaled column followed by one
        blank line. When false, nothing is printed.

    Returns
    -------
    pandas.DataFrame
        The scaled values under the same column labels as ``data``
        (with a fresh default index).
    """
    scaled_values = StandardScaler().fit_transform(data)
    normalized_data = pd.DataFrame(data=scaled_values, columns=data.columns)
    if verbose:
        for column in normalized_data.columns:
            print(column, "column mean value is:", normalized_data[column].mean())
        # The blank separator line belongs to the verbose report; it used to
        # be emitted unconditionally, leaving empty output in quiet runs.
        print()
    return normalized_data

# Dimension reduction

## PCA

In [7]:
def pca2components(data, verbose=True, random_state=100):
    """Project ``data`` onto two principal components and scatter-plot them.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Input passed to ``PCA.fit_transform``.
    verbose : bool, optional
        When true, print the explained-variance ratio of each component
        and their truncated percentage total.
    random_state : int or None, optional
        Seed forwarded to ``PCA`` so that repeated runs give the same
        projection when a randomised solver is selected. Defaults to 100,
        the seed used elsewhere in this notebook.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``'dim 1'`` and ``'dim 2'``, one row per input sample.

    Notes
    -----
    As a side effect a scatter plot of the two components is drawn.
    """
    pca = PCA(n_components=2, random_state=random_state)
    principal_components = pca.fit_transform(data)
    # Use a new name instead of rebinding the ``data`` argument.
    projected = pd.DataFrame(data=principal_components, columns=['dim 1', 'dim 2'])
    projected.plot.scatter('dim 1', 'dim 2', s=10)
    if verbose:
        print(pca.explained_variance_ratio_)
        print("\n total variance:", str(int((sum(pca.explained_variance_ratio_)*100))) + "%", "\n")
    return projected

# Clustering methods

## K-Means

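### Performing k-means and computing WSS and silhouette score

WSS stands for Within-cluster Sum of Squares: the sum of squared distances from each sample to the centre of its assigned cluster (scikit-learn exposes it as `inertia_`).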

In [8]:
def k_means_optimization(data, max_k, verbose=True, random_state=100):
    """Fit K-Means for every k in 1..max_k and collect two quality scores.

    Parameters
    ----------
    data : array-like or pandas.DataFrame
        Samples to cluster.
    max_k : int
        Largest number of clusters to try (inclusive).
    verbose : bool, optional
        When true, print one progress line per k with the scores obtained.
    random_state : int or None, optional
        Seed forwarded to ``KMeans`` so the curves are reproducible.
        Defaults to 100, the seed already used for the silhouette sample.

    Returns
    -------
    tuple(list, list)
        ``(wss, s)`` where ``wss`` holds ``(k, inertia)`` for every k and
        ``s`` holds ``(k, silhouette)`` for k >= 2 only (the score is not
        computed for a single cluster).

    Notes
    -----
    ``'done'`` is printed at the end regardless of ``verbose``.
    """
    wss = []
    s = []
    for k in range(1, max_k + 1):
        if verbose:
            print("computing with k =",k, end=' ')
        # 'lloyd' is the current name of the former 'full' algorithm, which
        # recent scikit-learn releases no longer accept.
        kmeans = KMeans(n_clusters=k, algorithm='lloyd', random_state=random_state).fit(data)
        wss_k = kmeans.inertia_
        wss.append((k, wss_k))
        if k == 1:
            if verbose:
                print("--> wss:", wss_k)
            continue
        # The silhouette is estimated on a seeded sample of 3000 points.
        s_k = silhouette_score(data, labels=kmeans.labels_, sample_size=3000, random_state=100)
        s.append((k, s_k))
        if verbose:
            print("--> wss:", wss_k, "silhouette:", s_k)
    print('done')
    return (wss, s)

#### plot K-Means optimization:

In [9]:
def plot_k_means_optimization(test_name: str, test_df, max_k: int, lfs=10, tfs=8):
    """Plot one K-Means quality score against the number of clusters.

    Parameters
    ----------
    test_name : str
        Column name for the score; underscores are shown as spaces in the
        y-axis label.
    test_df : sequence of (k, score) pairs
        Data to plot, e.g. one of the lists returned by
        ``k_means_optimization``.
    max_k : int
        Largest k that was evaluated; x ticks are placed on every even k
        up to and including this value.
    lfs : int, optional
        Font size of the axis labels.
    tfs : int, optional
        Font size of the tick labels.
    """
    scores = pd.DataFrame(data=test_df, columns=['k',test_name])
    test_ax = scores.plot(x='k',y=test_name)
    test_ax.set_xlabel('k clusters', fontsize=lfs)
    test_ax.set_ylabel(test_name.replace('_', ' '), fontsize=lfs)
    # k runs up to and including max_k, so the range must include it too.
    test_ax.set_xticks(range(2, max_k + 1, 2))
    # tick_params replaces the per-tick `tick.label` access, which recent
    # Matplotlib releases no longer provide.
    test_ax.tick_params(axis='both', which='major', labelsize=tfs)
    plt.show()

#### WSS of k-clusters plot:

In [535]:
# Notebook-level font sizes. The plotting helpers define their own lfs/tfs
# keyword defaults, so these globals only affect code that reads them directly.
lfs=10 # axis-label font size
tfs=8 # tick-label font size
# Backend selection (repeats the choice made in the configuration cell).
# %matplotlib inline
%matplotlib qt5

In [536]:
# Inline WSS plotting code retired; use
# plot_k_means_optimization(test_name='wss_error', test_df=wss, max_k=max_k) instead.

#### Silhouette score of k-clusters plot:

In [537]:
# Inline silhouette plotting code retired; use
# plot_k_means_optimization(test_name='silhouette', test_df=s, max_k=max_k) instead.

### color function

In [601]:
def cluster_color(label):
    """Return the plotting colour assigned to an integer cluster label.

    The palette has twelve entries; labels outside ``0..11`` wrap around
    via the modulo of the palette length.
    """
    palette = (
        'b', 'g', 'r', 'c', 'm', '#fdff03',
        '#055803', '#a6fc00', '#9d70d1', '#ff703b', '#3a70d8', '#ff70b5',
    )
    slot = label % len(palette)
    return palette[slot]

### plot clusters:

In [624]:
def plot_clusters(data, labels, n_clusters, lfs=10, tfs=8):
    """Scatter-plot the first two columns of ``data``, one colour per cluster.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose first two columns are used as x and y coordinates.
    labels : array-like of int
        Cluster label of each row of ``data``. Lists are accepted as well
        as NumPy arrays.
    n_clusters : int
        Labels ``0 .. n_clusters - 1`` are drawn.
    lfs : int, optional
        Font size of the axis labels.
    tfs : int, optional
        Font size of the tick labels.
    """
    import numpy as np  # local import: numpy is not imported at file level

    points = data.values
    # A plain list compared with an int yields a single False, which would
    # silently select no rows; an array gives the element-wise mask we need.
    labels = np.asarray(labels)
    for label in range(n_clusters):
        members = labels == label
        plt.scatter(points[members, 0], points[members, 1], s=40, c=cluster_color(label))
    clusters_ax = plt.gca()
    clusters_ax.set_xlabel('dim1', fontsize=lfs)
    clusters_ax.set_ylabel('dim2', fontsize=lfs)
    # Apply the tick font size, which was previously accepted but never used.
    clusters_ax.tick_params(axis='both', which='major', labelsize=tfs)
    plt.show()

### K-Means
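We choose the optimum k by looking at the WSS and the silhouette score graphs: look for the "elbow" where the WSS curve stops dropping sharply, and prefer a k with a high silhouette score.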

In [540]:
def k_means(k: int, points=None, random_state=100):
    """Cluster the samples into ``k`` groups with K-Means.

    Parameters
    ----------
    k : int
        Number of clusters.
    points : array-like or pandas.DataFrame, optional
        Samples to cluster. When omitted, the notebook-level ``data``
        variable is used, which is what this function always read before.
    random_state : int or None, optional
        Seed forwarded to ``KMeans`` for reproducible labels. Defaults to
        100, the seed used elsewhere in this notebook.

    Returns
    -------
    Whatever ``KMeans.fit_predict`` returns: one cluster label per sample.
    """
    if points is None:
        # Backward-compatible fallback to the global frame.
        points = data
    # 'lloyd' is the current name of the former 'full' algorithm, which
    # recent scikit-learn releases no longer accept.
    kmeans = KMeans(n_clusters=k, algorithm='lloyd', random_state=random_state)
    return kmeans.fit_predict(points)

In [541]:
# Inline K-Means cluster plotting code retired; use
# plot_clusters(data, labels, n_clusters) instead.

## GMM

In [542]:
def gmm(n_clusters: int, points=None):
    """Cluster the samples with a full-covariance Gaussian mixture model.

    Parameters
    ----------
    n_clusters : int
        Number of mixture components.
    points : array-like or pandas.DataFrame, optional
        Samples to fit and label. When omitted, the notebook-level ``data``
        variable is used, which is what this function always read before.

    Returns
    -------
    Whatever ``GaussianMixture.predict`` returns: one component label per
    sample.

    Notes
    -----
    The estimator is created with ``verbose=2``, so fitting progress is
    printed.
    """
    if points is None:
        # Backward-compatible fallback to the global frame.
        points = data
    # Named `model` so the local no longer shadows this function's own name.
    model = GaussianMixture(n_components=n_clusters, covariance_type='full', init_params='kmeans'
                            , warm_start=True, n_init=3, random_state=100, verbose=2)
    model.fit(points)
    return model.predict(points)

In [543]:
# Inline GMM cluster plotting code retired; use
# plot_clusters(data, labels, n_clusters) instead.

# Main:

### preprocess:

In [10]:
# Forwarded to the helper functions below; False turns off their per-step reporting.
verbose = False

In [11]:
# Load the CSV, then standardise every remaining column.
# NOTE: `data` is rebound at each pipeline stage; re-run from this cell to
# start again from the raw file.
data = read_data()
data = normalized(data, verbose=verbose)




### dimension reduction:

In [12]:
# Replace the standardised frame with its two-column PCA projection.
# NOTE(review): re-running only this cell applies PCA to the already-projected
# frame; re-run the preprocess cell first.
data = pca2components(data, verbose=verbose)

### K-Means

In [631]:
# Sweep k from 1 to max_k and collect the WSS and silhouette score lists.
max_k = 20
wss, s = k_means_optimization(data, max_k, verbose=verbose)

done


In [632]:
# WSS (inertia) as a function of k.
plot_k_means_optimization(test_name='wss_error', test_df=wss, max_k=max_k)

In [633]:
# Silhouette score as a function of k (available for k >= 2 only).
plot_k_means_optimization(test_name='silhouette', test_df=s, max_k=max_k)

In [637]:
# Number of clusters for the final K-Means fit — presumably read off the two
# plots above; TODO record the reasoning here.
km_clusters = 8

In [638]:
# Final K-Means fit; k_means() reads the notebook-level `data` variable.
k_means_labels = k_means(k=km_clusters)

### GMM

In [639]:
# Fit a Gaussian mixture with the chosen number of components; gmm() reads the
# notebook-level `data` variable and prints its fitting progress.
gmm_clusters = 8
gmm_labels = gmm(gmm_clusters)

Initialization 0
Initialization converged: True	 time lapse 6.47335s	 ll -3.37332
Initialization 1
  Iteration 10	 time lapse 6.98993s	 ll change 0.00102
Initialization converged: True	 time lapse 7.52894s	 ll -3.37711
Initialization 2
  Iteration 10	 time lapse 7.02007s	 ll change 0.00103
Initialization converged: True	 time lapse 7.58760s	 ll -3.38396


### plot clusters

In [1]:
# K-Means clusters in the two PCA dimensions.
# NOTE(review): the recorded NameError comes from running this cell before the
# cell that defines plot_clusters (execution count 1); run the notebook top to bottom.
plot_clusters(data, k_means_labels, n_clusters=km_clusters)

NameError: name 'plot_clusters' is not defined

In [644]:
# GMM clusters in the two PCA dimensions.
plot_clusters(data, gmm_labels, n_clusters=gmm_clusters)