In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
np.random.seed(42)

In [4]:
# Load the wholesale customers CSV (relative path, resolved against the
# current working directory) and preview the first rows.
df = pd.read_csv('data/wholesale_customers.csv')
df.head()

Unnamed: 0,Channel,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,2,3,12669,9656,7561,214,2674,1338
1,2,3,7057,9810,9568,1762,3293,1776
2,2,3,6353,8808,7684,2405,3516,7844
3,1,3,13265,1196,4221,6404,507,1788
4,2,3,22615,5410,7198,3915,1777,5185


In [5]:
# Remove the 'Channel' and 'Region' columns and keep the rest.
# errors='ignore' makes this cell safe to re-run: `df` is reassigned here, so
# on a second execution the columns are already gone and the default
# errors='raise' would fail with a KeyError.
df = df.drop(columns=['Channel', 'Region'], errors='ignore')
df.head()

Unnamed: 0,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen
0,12669,9656,7561,214,2674,1338
1,7057,9810,9568,1762,3293,1776
2,6353,8808,7684,2405,3516,7844
3,13265,1196,4221,6404,507,1788
4,22615,5410,7198,3915,1777,5185


In [6]:
"""
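ML models usually prefer features on a comparable scale. Common options:
1) Min-Max Scaling: rescales values to the range 0 to 1
2) Standard Scaling: zero mean and unit variance (not bounded to -1 to 1)
3) Log Transformation: compresses large values to reduce skew
"""

'\nML models usually prefer features on a comparable scale. Common options:\n1) Min-Max Scaling: rescales values to the range 0 to 1\n2) Standard Scaling: zero mean and unit variance (not bounded to -1 to 1)\n3) Log Transformation: compresses large values to reduce skew\n'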

In [7]:
# Feature matrix as a plain NumPy array (one row per DataFrame row).
X = df.to_numpy()
# Elementwise log(1 + x) of the raw values.
X_log = np.log1p(X)

In [8]:
# Inspect the log1p-transformed matrix.
X_log

array([[ 9.44699227,  9.17543832,  8.93089098,  5.37063803,  7.89170466,
         7.19967835],
       [ 8.861917  ,  9.19125948,  9.16628399,  7.47477218,  8.09985791,
         7.48268183],
       [ 8.75683981,  9.08352921,  8.94702566,  7.7857209 ,  8.16536363,
         8.96763167],
       ...,
       [ 9.58410839,  9.64788537, 10.3170531 ,  6.08221891,  9.60521628,
         7.53262362],
       [ 9.23902501,  7.59186171,  7.71110125,  6.94601399,  5.12989871,
         7.66199756],
       [ 7.93307977,  7.43779512,  7.82843636,  4.18965474,  6.16961073,
         3.97029191]], shape=(440, 6))

Helper Functions

In [9]:
def log1p_transform(X):
    """Return the elementwise natural logarithm of ``1 + X``.

    Parameters
    ----------
    X : array_like
        Values to transform.

    Returns
    -------
    numpy.ndarray or scalar
        ``log(1 + X)`` computed by ``numpy.log1p``.
    """
    log_scaled = np.log1p(X)
    return log_scaled

def minmax_transform(X, axis=None):
    """Linearly rescale values to the range [0, 1].

    Parameters
    ----------
    X : array_like
        Numeric data to rescale.
    axis : int, tuple of int or None, default None
        Axis along which the minimum and maximum are taken. ``None`` (the
        default) uses one global minimum/maximum over all of ``X``.
        ``axis=0`` on a 2-D array rescales every column independently.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``X``. Any slice whose minimum
        equals its maximum (zero range) maps to 0 instead of NaN.
    """
    X = np.asarray(X)
    # keepdims=True keeps the reduced axes so the statistics broadcast
    # against X for any choice of `axis`.
    X_min = np.min(X, axis=axis, keepdims=True)
    value_range = np.max(X, axis=axis, keepdims=True) - X_min
    # A constant slice has zero range; dividing by 1 there yields 0 rather
    # than a 0/0 NaN (the numerator is already 0).
    safe_range = np.where(value_range == 0, 1, value_range)
    return (X - X_min) / safe_range

def standard_transform(X, axis=None):
    """Standardize values to zero mean and unit (population) standard deviation.

    Parameters
    ----------
    X : array_like
        Numeric data to standardize.
    axis : int, tuple of int or None, default None
        Axis along which the mean and standard deviation are taken. ``None``
        (the default) uses one global mean/std over all of ``X``.
        ``axis=0`` on a 2-D array standardizes every column independently.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``X``. Any slice whose standard
        deviation is exactly 0 maps to 0 instead of NaN.
    """
    X = np.asarray(X)
    # keepdims=True keeps the reduced axes so the statistics broadcast
    # against X for any choice of `axis`.
    X_mean = np.mean(X, axis=axis, keepdims=True)
    X_std = np.std(X, axis=axis, keepdims=True)
    # A constant slice has zero spread; dividing by 1 there yields 0 rather
    # than a 0/0 NaN (the numerator is already 0).
    safe_std = np.where(X_std == 0, 1, X_std)
    return (X - X_mean) / safe_std

In [10]:
# Rescale the log-transformed matrix with minmax_transform (called with its
# defaults).
minmax_X = minmax_transform(X_log)

In [11]:
# Inspect the min-max scaled matrix.
minmax_X

array([[0.78707638, 0.76056084, 0.73668234, 0.38904606, 0.63521234,
        0.56764033],
       [0.72994746, 0.76210568, 0.75966698, 0.59450151, 0.65553719,
        0.59527384],
       [0.71968734, 0.7515865 , 0.73825779, 0.62486369, 0.66193341,
        0.74026983],
       ...,
       [0.8004649 , 0.80669232, 0.87203234, 0.45852745, 0.80252596,
        0.60015034],
       [0.76676968, 0.60593456, 0.61757755, 0.5428716 , 0.36553938,
        0.61278289],
       [0.63925236, 0.59089093, 0.62903459, 0.27373048, 0.46706071,
        0.25231108]], shape=(440, 6))

In [12]:
# Sanity check: overall minimum and maximum of the scaled matrix.
np.min(minmax_X), np.max(minmax_X)

(np.float64(0.0), np.float64(1.0))

In [None]:
def pairwise_distances(X, squared=False):
    """Euclidean distance between every pair of rows of ``X``.

    Uses the expansion ``||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b`` so the whole
    matrix is computed with one matrix product.

    Parameters
    ----------
    X : array_like of shape (n, d)
        One point per row.
    squared : bool, default False
        If True, return squared Euclidean distances (the raw value of the
        expansion, clipped at 0) instead of taking the square root.

    Returns
    -------
    numpy.ndarray of shape (n, n)
        ``D[i, j]`` is the distance between row ``i`` and row ``j``; the
        diagonal is exactly 0 and no entry is negative.
    """
    X = np.asarray(X, dtype=float)
    X_norm_sq = np.sum(X ** 2, axis=1, keepdims=True)
    sq_dists = X_norm_sq + X_norm_sq.T - 2 * np.dot(X, X.T)
    # Floating-point cancellation can leave tiny negative values, which would
    # turn into NaN under the square root; clip them to 0.
    np.maximum(sq_dists, 0, out=sq_dists)
    # A point is at distance 0 from itself; remove round-off on the diagonal.
    np.fill_diagonal(sq_dists, 0)
    if squared:
        return sq_dists
    return np.sqrt(sq_dists)

def silhouette_score(X, labels):
    unique_labels = np.unique(labels)

    n = len(X)

    distances = pairwise_distances(X)
    silhouette_vals = np.zeros(n)
    
    for i in range(n):
        same_cluster = labels == labels[i]
        if np.sum(same_cluster) > 1:
            a_i = np.mean(distances[i, same_cluster])
        else:
            a_i = 0
        
        
        