In [357]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import MDS
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [302]:
# %matplotlib inline
%matplotlib qt5
plt.rcParams['figure.dpi'] = 120

# Data preproccesing

In [231]:
data = pd.read_csv("e-shop data and description/e-shop clothing 2008.csv", delimiter=";")
clothing_model = data["page 2 (clothing model)"]
clothing_model_list = clothing_model.values.tolist()
clothing_signs = []
clothing_signs_dict = {}
for sign in clothing_model_list:
    if sign not in clothing_signs:
        clothing_signs.append(sign)
for letter in ['P','C','B','A']:
    clothing_signs.sort(key=lambda x: int(x[1:]) if x[0]==letter else 1000)
i = 1
for sign in clothing_signs:
    clothing_signs_dict[sign] = i
    i += 1
# print('total amount of different clothing signs:',len(clothing_signs))
# print(clothing_signs_dict)
data = data.replace({'page 2 (clothing model)':clothing_signs_dict})
data = data.drop(['year', 'page 2 (clothing model)'], axis=1)
data

Unnamed: 0,month,day,order,country,session ID,page 1 (main category),colour,location,model photography,price,price 2,page
0,4,1,1,29,1,1,1,5,1,28,2,1
1,4,1,2,29,1,1,1,6,1,33,2,1
2,4,1,3,29,1,2,10,2,1,52,1,1
3,4,1,4,29,1,2,6,6,2,38,2,1
4,4,1,5,29,1,2,4,3,2,52,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...
165469,8,13,1,29,24024,2,2,4,1,67,1,1
165470,8,13,1,9,24025,1,3,4,1,62,1,1
165471,8,13,1,34,24026,1,3,1,1,43,2,1
165472,8,13,2,34,24026,3,12,1,1,43,1,1


## data normalization

In [232]:
scaler = StandardScaler()
normalized_data = pd.DataFrame(data=scaler.fit_transform(data), columns=data.columns)
for column in normalized_data.columns:
    print(column, "column mean value is:", normalized_data[column].mean())

month column mean value is: 1.070023469479941e-13
day column mean value is: 3.9497263970964005e-15
order column mean value is: -4.940267696348297e-15
country column mean value is: 7.67555573766148e-15
session ID column mean value is: -4.310364104593465e-14
page 1 (main category) column mean value is: 2.827548316713672e-16
colour column mean value is: -5.2480806093639846e-15
location column mean value is: -3.5164966061067447e-15
model photography column mean value is: 5.905206450482421e-15
price column mean value is: 2.8704868164960917e-15
price 2 column mean value is: 1.2170342704916164e-14
page column mean value is: -2.4309092421336522e-14


# Dimension reduction

## PCA

In [233]:
pca = PCA(n_components=2)
principal_components = pca.fit_transform(normalized_data)
data = pd.DataFrame(data=principal_components, columns = ['dim 1', 'dim 2'])
data.plot.scatter('dim 1', 'dim 2', s=10)
print(pca.explained_variance_ratio_)
print("\n total variance:", str(int((sum(pca.explained_variance_ratio_)*100))) + "%")

[0.1731577  0.16582818]

 total variance: 33%


In [234]:
data

Unnamed: 0,dim 1,dim 2
0,-0.644907,-2.271486
1,-0.806228,-2.248928
2,-1.559609,-1.873883
3,0.453565,-2.369259
4,-1.033731,-1.891246
...,...,...
165469,-1.922648,2.864679
165470,-2.285834,2.586794
165471,-0.593539,2.547174
165472,0.004884,2.601500


# Clustering methods

## K-Means

### performing k-means and computing wss and silhouette score
WSS stands for Within-Cluster-Sum of Squared Errors

In [354]:
max_k = 20
wss = []
s = []
for k in range(1, max_k + 1):
    print("computing with k =",k, end=' ')
    kmeans = KMeans(n_clusters=k, algorithm='full').fit(data)
    wss_k = kmeans.inertia_
    wss.append((k, wss_k))
    if k != 1:
        s_k = silhouette_score(data, labels=kmeans.labels_, sample_size=3000, random_state=100)
        s.append((k, s_k))
        print("--> wss:", wss_k, "silhouette:", s_k)
    else:
        print("--> wss:", wss_k)

computing with k = 1 --> wss: 673120.2056813681
computing with k = 2 --> wss: 421099.3943938402 silhouette: 0.37985061435615564
computing with k = 3 --> wss: 272270.0629038268 silhouette: 0.38728721381712605
computing with k = 4 --> wss: 198141.09482258247 silhouette: 0.37773779989067807
computing with k = 5 --> wss: 166121.76665189117 silhouette: 0.3709950767220615
computing with k = 6 --> wss: 138531.04058565563 silhouette: 0.37237416609200713
computing with k = 7 --> wss: 115454.77114195618 silhouette: 0.3721312089874406
computing with k = 8 --> wss: 100938.88547309872 silhouette: 0.3807548566296538
computing with k = 9 --> wss: 88135.41300242335 silhouette: 0.38293652298971886
computing with k = 10 --> wss: 77196.66270036924 silhouette: 0.38452780250025065
computing with k = 11 --> wss: 69666.7332882685 silhouette: 0.38333656178745273
computing with k = 12 --> wss: 64066.12997054764 silhouette: 0.38802074854507734
computing with k = 13 --> wss: 58867.078867443845 silhouette: 0.3850

#### WSS of k-clusters plot:

In [349]:
lfs=10 #label font_size
tfs=8 #tick font_size
# %matplotlib inline
%matplotlib qt5

In [352]:
wss_df = pd.DataFrame(data=wss, columns=['k','wss'])
wss_ax = wss_df.plot(x='k',y='wss')
wss_ax.set_xlabel('k clusters', fontsize=lfs)
wss_ax.set_ylabel('wss', fontsize=lfs)
for tick in wss_ax.xaxis.get_major_ticks() + wss_ax.yaxis.get_major_ticks():
    tick.label.set_fontsize(tfs) 
plt.show()

#### Silhouette score of k-clusters plot:

In [353]:
s_df = pd.DataFrame(data=s, columns=['k','s'])
s_ax = s_df.plot(x='k',y='s')
s_ax.set_xlabel('k clusters', fontsize=lfs)
s_ax.set_ylabel('Silhouette score', fontsize=lfs)
for tick in wss_ax.xaxis.get_major_ticks() + wss_ax.yaxis.get_major_ticks():
    tick.label.set_fontsize(tfs) 
plt.show()

### Plot of K-Means with optimum k
We choose the optimum k by looking at the WSS and the Silhouette score graphs.

In [374]:
k=5
kmeans = KMeans(n_clusters=k, algorithm='full')
labels = kmeans.fit_predict(data)

In [377]:
data.values.tolist()[labels==0, 0]

TypeError: list indices must be integers or slices, not tuple