In [None]:
from os.path import join
from datetime import datetime
import pandas as pd
import numpy as np
from numpy.random import seed
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples, silhouette_score
from silhouette_plot import plot_silhouette, plot2d_cluster_assignments, plot3d_cluster_assignments, format_silhouette_plot, metric_plot_update, format3D

%matplotlib notebook
pd.options.display.float_format = '{:,.2f}'.format
seed(42)

In [None]:
from jupyterthemes import jtplot
jtplot.style(theme='onedork', context='talk', fscale=1.4, spines=False, 
             gridlines='--', ticks=True, grid=False, figsize=(7, 5))

### Load the transaction data

In [None]:
data = pd.read_csv('wholesale_customers_data.csv')
data.info()

### And review the first few rows

In [None]:
data.head()

In [None]:
products = ['Fresh', 'Milk', 'Grocery', 'Frozen', 'Detergents_Paper', 'Delicatessen']

#### There's a wide range of transaction values

In [None]:
# Two stacked panels sharing the x-axis: distribution on top, box plot below.
fig, axes = plt.subplots(nrows=2, sharex=True)
# Total spend per customer, summed over all product categories.
transaction_value = data.loc[:, products].sum(axis=1)
# NOTE(review): distplot is deprecated in newer seaborn releases; switch to
# histplot(..., kde=True) once the environment is confirmed to provide it.
sns.distplot(transaction_value, ax=axes[0])
# Box-plot styling shared with the box plots in later cells.
flierprops = dict(markerfacecolor='0.75', markersize=5, linestyle='none')
whiskerprops = capprops = dict(c='white')
# Pass the values explicitly as `x=`: newer seaborn versions interpret a first
# positional argument as `data=`, which would no longer lay the box out along
# the x-axis shared with the distribution panel.
sns.boxplot(x=transaction_value, ax=axes[1], flierprops=flierprops, whiskerprops=whiskerprops, capprops=capprops)
axes[0].set_title('Transaction Value Distribution')

### And plenty of outliers for each product

In [None]:
# One box per product category, drawn on a fresh figure.
fig, ax = plt.subplots()
sns.boxplot(data=data[products], ax=ax, flierprops=flierprops,
            whiskerprops=whiskerprops, capprops=capprops)
ax.set_title('Transaction Value Distributions by Product')

### Sampling Customer Profiles

#### Let's sample 5 customers

In [None]:
# Draw five random customers and relabel them 0..4.
sample = data.sample(5).reset_index(drop=True)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
sample_with_stats = pd.concat([sample, data.agg(['median', 'mean'])])
sample_with_stats

#### And plot the transaction profiles

In [None]:
# Products on the x-axis, one bar per sampled customer / summary statistic.
profile_by_product = sample_with_stats[products].transpose()
profile_by_product.plot(kind='bar')
plt.tight_layout()

### Basket Shares

In [None]:
product_sales = data[products]
# Share of each product in a customer's total spend (each row sums to 1).
basket_shares = product_sales.div(product_sales.sum(axis=1), axis=0)
# Open a new figure first: with the interactive notebook backend the previous
# figure stays current, so the box plot would otherwise be drawn on top of it.
plt.figure()
sns.boxplot(data=basket_shares, flierprops=flierprops, whiskerprops=whiskerprops, capprops=capprops)
plt.title('Basket Share Distributions')

In [None]:
# Pairwise correlations between the product basket shares.
corr_matrix = basket_shares.corr()
# Float array of the same shape with ones on the main diagonal, zeros elsewhere.
mask = np.eye(*corr_matrix.shape)


#### Compare the share of each product in the sampled customers' purchases

In [None]:
# `sample` was re-indexed to 0..4, so its index no longer identifies the sampled
# customers in `basket_shares`; compute the shares from the sampled rows directly.
sample_sales = sample[products]
sample_profile = sample_sales.div(sample_sales.sum(axis=1), axis=0)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
sample_profile_with_stats = pd.concat([sample_profile, basket_shares.agg(['median', 'mean'])])
sample_profile_with_stats

#### Visualize as a horizontal bar chart and a heatmap

In [None]:
# Top panel: stacked basket shares (median row left out); bottom panel: annotated heatmap.
fig, axes = plt.subplots(nrows=2)
bar_ax, heat_ax = axes
profiles_without_median = sample_profile_with_stats.drop('median')
profiles_without_median.plot.barh(stacked=True, ax=bar_ax)
sns.heatmap(sample_profile_with_stats, annot=True, fmt='.1%', ax=heat_ax)
plt.tight_layout()

### Sales by Region

In [None]:
data.groupby('Region')[products].mean().plot.bar(title='Sales by Region')

### Sales by Channel

In [None]:
data.groupby('Channel')[products].mean().plot.bar(title='Sales by Channel')

### Using categorical variables

In [None]:
pd.get_dummies(data, columns=['Channel', 'Region']).head()

In [None]:
# Shift the channel codes so the smallest code becomes 0. Unlike a fixed
# `sub(1)`, re-running this cell leaves already-shifted codes unchanged.
# NOTE(review): assumes the raw codes start at 1, as the original `sub(1)` implied.
data['Channel'] = data['Channel'] - data['Channel'].min()

In [None]:
# Region is not used as a clustering feature below.
data = data.drop(['Region'], axis='columns')

### KMeans with raw data

#### Compute Principal Components to facilitate 2D visualization

In [None]:
def get_components(data):
    """Fit a PCA on `data` and project it onto the leading principal axes.

    Returns a tuple of: the first two principal axes as columns, the first
    three principal axes as columns, `data` multiplied by each of those two
    matrices, and the explained variance ratio of every component.
    """
    fitted = PCA().fit(data)
    loadings = fitted.components_.T  # one principal axis per column
    C2, C3 = loadings[:, :2], loadings[:, :3]
    # Plain matrix products: the data is not re-centered before projecting.
    return C2, C3, data.dot(C2), data.dot(C3), fitted.explained_variance_ratio_

#### Evaluate 2 to `max_clusters` Clusters (2-6 by default)

In [None]:
def create_figure(max_clusters):
    """Create a `max_clusters` x 3 grid of axes for the cluster diagnostics.

    In every row except the first, the right-most 2D axes is replaced by a 3D
    axes passed through `format3D`. Returns the figure and the axes array.
    """
    n_rows, n_cols = max_clusters, 3
    fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols,
                             figsize=(12, 3 * n_rows))
    last_col = n_cols - 1
    for row in range(1, n_rows):
        axes[row, last_col].remove()
        # 1-based grid position of the last cell in this row.
        position = (row + 1) * n_cols
        axes[row, last_col] = format3D(
            fig.add_subplot(n_rows, n_cols, position, projection='3d'))
    return fig, axes

In [None]:
def evaluate_clusters(data, max_clusters=6, fig_n=10):
    """Fit KMeans for 2..max_clusters clusters and plot diagnostics for every fit.

    The first figure row receives the inertia per cluster count, the average
    silhouette score per cluster count and the PCA explained variance ratios.
    Each following row belongs to one cluster count: a silhouette plot plus the
    cluster assignments projected onto the first two and three principal axes.

    Parameters
    ----------
    data : array with `.shape` and `.dot` (callers pass NumPy arrays)
        Feature matrix to cluster, one row per observation.
    max_clusters : int, default 6
        Largest number of clusters to evaluate; evaluation starts at 2.
    fig_n : int, default 10
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    None
        The function only draws on a newly created figure.
    """
    cluster_list = list(range(2, max_clusters + 1))
    # Explicit float dtype: without it an empty Series defaults to object dtype
    # on current pandas, which is not usable as numeric plot data.
    inertias = pd.Series(index=cluster_list, dtype=float)
    silhouette_scores = inertias.copy()
    C2, C3, data2d, data3d, ev = get_components(data)
    fig, axes = create_figure(max_clusters)
    pd.Series(ev, index=list(range(1, data.shape[1] + 1))).plot.bar(title='PCA Explained Variance', ax=axes[0][2])
    for n_cluster in cluster_list:
        kmeans = KMeans(n_clusters=n_cluster, random_state=42).fit(data)
        centroids, assignments, inertia = kmeans.cluster_centers_, kmeans.labels_, kmeans.inertia_
        inertias[n_cluster] = inertia
        metric_plot_update(inertias, fig, axes[0][0])
        # Compute the per-sample silhouettes once; the average score is their
        # mean, so a separate silhouette_score call would repeat the same work.
        silhouette_values = silhouette_samples(data, assignments)
        silhouette_avg = silhouette_values.mean()
        silhouette_scores[n_cluster] = silhouette_avg
        metric_plot_update(silhouette_scores, fig, axes[0][1], kind='bar', title='Silhouette Scores')
        # Row 0 holds the summary plots, so n clusters live in row n - 1.
        silhouette_plot, cluster_plot2d, clusterplot3d = axes[n_cluster - 1]
        y_lower = 10
        for i in range(n_cluster):
            y_lower = plot_silhouette(np.sort(silhouette_values[assignments == i]), y_lower, i, n_cluster, silhouette_plot)
        format_silhouette_plot(silhouette_plot, silhouette_avg)
        # Centroids are projected with the same matrices as the data.
        plot2d_cluster_assignments(*data2d.T, centroids.dot(C2), assignments, n_cluster, cluster_plot2d)
        plot3d_cluster_assignments(*data3d.T, centroids.dot(C3), assignments, n_cluster, clusterplot3d)
        fig.tight_layout()

    fig.suptitle('KMeans Silhouette Plot with Wholesale Data', fontsize=14)
    fig.tight_layout()


In [None]:
evaluate_clusters(product_sales.values)

### Evaluate standardized raw data

In [None]:
# Shared scaler instance; later cells refit it on other feature sets.
scaler = StandardScaler()
standardized_data = scaler.fit(product_sales).transform(product_sales)
evaluate_clusters(standardized_data)

### Evaluate log-transformed Data

In [None]:
# Natural log of the sales figures.
log_sales = product_sales.apply(np.log)
# Evaluate the log-transformed values; the cell previously passed the raw
# `product_sales` again, repeating the raw-data evaluation.
evaluate_clusters(log_sales.values)

### Evaluate standardized log-transformed Data

In [None]:
# Refit the shared scaler on the log-transformed sales, then scale them.
log_sales_standardized = scaler.fit(log_sales).transform(log_sales)
evaluate_clusters(log_sales_standardized)

### Add Channel Information

In [None]:
# Append the channel code as an extra feature next to the log sales.
log_sales_channel = log_sales.join(data['Channel'])
log_sales_channel_standardized = scaler.fit(log_sales_channel).transform(log_sales_channel)
evaluate_clusters(log_sales_channel_standardized)

### Remove Outliers

In [None]:
outliers = LocalOutlierFactor(n_neighbors=20, contamination=.1)
# Fit on the feature columns only, so that re-running this cell does not feed
# an 'inlier' flag from a previous run back into the detector as a feature.
lof_features = log_sales_channel.drop('inlier', axis=1, errors='ignore')
log_sales_channel['inlier'] = outliers.fit_predict(lof_features)
# Keep the rows flagged 1, as the original filter did.
log_sales_channel_clean = lof_features.loc[log_sales_channel.inlier == 1]

log_sales_channel_clean_standardized = scaler.fit_transform(log_sales_channel_clean)
evaluate_clusters(log_sales_channel_clean_standardized)

### Evaluate basket share data

In [None]:
evaluate_clusters(basket_shares.values)

### Evaluate Standardized Basket Shares

In [None]:
# Refit the shared scaler on the basket shares, then scale them.
basket_shares_standardized = scaler.fit(basket_shares).transform(basket_shares)
evaluate_clusters(basket_shares_standardized)

### Include Channel Information

In [None]:
# `data.Channel` was already shifted in the "Using categorical variables"
# section; subtracting 1 again here produced -1/0 codes.
basket_shares_channel = data[['Channel']].join(basket_shares)

basket_shares_channel_standardized = scaler.fit_transform(basket_shares_channel)

evaluate_clusters(basket_shares_channel_standardized)

### Also add total transaction value

In [None]:
# Add each customer's total spend as a 'total' column.
total_spend = transaction_value.rename('total').to_frame()
basket_shares_total = basket_shares_channel.join(total_spend)

basket_shares_total_standardized = scaler.fit(basket_shares_total).transform(basket_shares_total)

evaluate_clusters(basket_shares_total_standardized)

### All data

In [None]:
# Basket shares, channel and total spend joined with the log sales; the log
# columns get a '_log' suffix because they share names with the share columns.
all_data = basket_shares_total.join(log_sales, rsuffix='_log')
all_data_standardized = scaler.fit(all_data).transform(all_data)
evaluate_clusters(all_data_standardized)

### Display centroids for log-transformed data

In [None]:
log_sales_channel_clean.head()

#### Recompute Clusters

In [None]:
# Refit the scaler on the cleaned data so that the inverse_transform call in
# the following cells maps the centroids back with matching statistics.
log_sales_channel_clean_standardized = scaler.fit_transform(
                    log_sales_channel_clean)

# Fixed random_state, matching evaluate_clusters, so the centroids do not
# depend on how many draws were taken from the global RNG before this cell.
kmeans = KMeans(n_clusters=2, random_state=42).fit(log_sales_channel_clean_standardized)
centroids = kmeans.cluster_centers_
centroids

### Transform back to feature space

In [None]:
# Undo the standardization applied before clustering.
centroids_unscaled = scaler.inverse_transform(centroids)
centroids_rescaled = pd.DataFrame(centroids_unscaled,
                                  columns=log_sales_channel_clean.columns)
centroids_rescaled

### Apply exponential function

In [None]:
# Invert the log transform on the product columns only (Channel is left as is).
product_centroids = centroids_rescaled.loc[:, products]
centroids_rescaled.loc[:, products] = np.exp(product_centroids)
centroids_rescaled

### Centroid Profiles 

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the centroid
# rows and the overall mean/median rows, aligning them by column name.
pd.concat([centroids_rescaled, data.agg(['mean', 'median'])])

In [None]:
pd.Series(kmeans.labels_).value_counts()

In [None]:
kmeans.inertia_

### Repeat for 3 Clusters

In [None]:
# Fixed random_state, matching evaluate_clusters, so the centroids do not
# depend on how many draws were taken from the global RNG before this cell.
kmeans = KMeans(n_clusters=3, random_state=42).fit(log_sales_channel_clean_standardized)
centroids = kmeans.cluster_centers_
centroids

### Profiles for 3 Centroids

In [None]:
# Undo the standardization, then the log transform on the product columns.
centroids_rescaled = pd.DataFrame(scaler.inverse_transform(centroids), columns=log_sales_channel_clean.columns)
centroids_rescaled.loc[:, products] = np.exp(centroids_rescaled.loc[:, products])
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the centroid
# rows and the overall mean/median rows, aligning them by column name.
pd.concat([centroids_rescaled, data.agg(['mean', 'median'])])

In [None]:
centroids_rescaled[products].plot.bar();

In [None]:
pd.Series(kmeans.labels_).value_counts()

In [None]:
kmeans.inertia_