<div style="text-align:center"><span style="font-size:2em; font-weight: bold;">Lecture 9—Clustering</span></div>

# $k$-Means clustering

In [1]:
import numpy as np
import pandas as pd
from cleands import *

Data Generating Process

In [2]:
x1 = np.random.normal(loc=np.random.uniform(size=(5,))*10-5,size=(500,5))
x2 = np.random.normal(loc=np.random.uniform(size=(5,))*10-5,size=(500,5))
x3 = np.random.normal(loc=np.random.uniform(size=(5,))*10-5,size=(500,5))
x = np.vstack((x1,x2,x3))
shuffle = np.random.permutation(x.shape[0])
x = x[shuffle,:]
membership = shuffle.copy()
for i in range(len(membership)):
    if membership[i]<500: membership[i]=0
    elif membership[i]<1000: membership[i]=1
    else: membership[i]=2
np.unique(membership,return_counts=True)
membership

array([1, 2, 2, ..., 1, 1, 0])

In [3]:
# Plotting multiple histograms for each column
num_cols = x1.shape[1]  # Number of columns in x1

plt.figure(figsize=(12, 8))

for i in range(num_cols):
    plt.subplot(2, 3, i+1)  # Creating subplots in a 2x3 grid
    plt.hist(x1[:, i], bins=30, alpha=0.7)
    plt.title(f'Column {i+1} Histogram')
    plt.xlabel('Values', color='white')
    plt.ylabel('Frequency', color='white')
    plt.xticks(color='white')
    plt.yticks(color='white')

plt.tight_layout()
plt.show()

NameError: name 'plt' is not defined

In [None]:
# Plotting multiple histograms for each column
num_cols = x.shape[1]  # Number of columns in x1

plt.figure(figsize=(12, 8))

for i in range(num_cols):
    plt.subplot(2, 3, i+1)  # Creating subplots in a 2x3 grid
    plt.hist(x[:, i], bins=30, alpha=0.7)
    plt.title(f'Column {i+1} Histogram',color='white')
    plt.xlabel('Values', color='white')
    plt.ylabel('Frequency', color='white')
    plt.xticks(color='white')
    plt.yticks(color='white')

plt.tight_layout()
plt.show()

Calculate means of membership variable

In [None]:
means = []
for i in range(3):
    mean = x[membership==i,:].mean(0)
    means += [mean]
means = np.array(means)
means

In [None]:
x

kmeans step 1: randomly guess

In [None]:
k = 3
n = x.shape[0]
group = np.random.randint(k,size=(n,))
group

kmeans step 2: calculate means of each cluster

In [None]:
means = []
for i in range(k):
    mean = x[group==i,:].mean(0)
    means += [mean]
means = np.array(means)
means

kmeans step 3: group each point to its closest mean

In [None]:
dists = []
for i in range(k):
    dist = x-means[i,:]
    dist = (dist**2).sum(1)
    dists += [dist]
dists = np.array(dists)
group = dists.argmin(0)
group

kmeans step 4: go back to step 2 until converges...

Putting it all together:

In [None]:
k = 3
n = x.shape[0]
max_iters = 100
newgroup = np.random.randint(k,size=(n,))
group = np.zeros((n,))
for j in range(max_iters):
    if (group==newgroup).all(): break
    print('iteration')
    group = newgroup
    dists = []
    for i in range(k):
        mean = x[group==i,:].mean(0)
        dist = x-mean
        dist = (dist**2).sum(1)
        dists += [dist]
    dists = np.array(dists)
    newgroup = dists.argmin(0)
group

In [None]:
(membership==group).mean()

Confusion matrix

In [None]:
membershipohe = np.zeros((membership.size, membership.max()+1))
membershipohe[np.arange(membership.size),membership] = 1
groupohe = np.zeros((group.size, group.max()+1))
groupohe[np.arange(group.size),group] = 1
membershipohe.T@groupohe

accuracy

In [None]:
(membershipohe.T@groupohe).max(1).sum()/groupohe.sum()

putting all this in a function

In [None]:
def kmeans(x,k,max_iters=100,seed=None):
    n = x.shape[0]
    if seed != None: np.random.seed(seed)
    newgroup = np.random.randint(k,size=(n,))
    group = np.zeros((n,))
    for j in range(max_iters):
        if (group==newgroup).all(): break
        #print('iteration')
        group = newgroup
        dists = []
        for i in range(k):
            mean = x[group==i,:].mean(0)
            dist = x-mean
            dist = (dist**2).sum(1)
            dists += [dist]
        dists = np.array(dists)
        newgroup = dists.argmin(0)
    return newgroup

Total within sum of squares calculation

In [None]:
k = 5
group = kmeans(x,k)
means = np.array([x[group==i,:].mean(0) for i in range(k)])
wss = [((x[group==i,:]-means[i,:])**2).sum() for i in range(k)]
total_wss = sum(wss)
total_wss

Loop process and get min twss

In [None]:
def rep_kmeans(x,k,max_iters=100,seed=None,n_start=100):
    twss = []
    groups = []
    for i in range(n_start):
        group = kmeans(x,k,max_iters,seed)
        means = np.array([x[group==i,:].mean(0) for i in range(k)])
        wss = [((x[group==i,:]-means[i,:])**2).sum() for i in range(k)]
        total_wss = sum(wss)
        groups += [group]
        twss += [total_wss]
    group = groups[np.array(twss).argmin()]
    return group

In [None]:
k = 10
group = rep_kmeans(x,k,n_start=250)
means = np.array([x[group==i,:].mean(0) for i in range(k)])
wss = [((x[group==i,:]-means[i,:])**2).sum() for i in range(k)]
total_wss = sum(wss)
total_wss

Automatic elbow detector

In [None]:
def auto_kmeans(x,k_max=10,max_iters=100,seed=None,n_start=100):
    groups = []
    twss = []
    for k in range(1,k_max):
        group = rep_kmeans(x,k,max_iters,seed,n_start)
        means = np.array([x[group==i,:].mean(0) for i in range(k)])
        wss = [((x[group==i,:]-means[i,:])**2).sum() for i in range(k)]
        total_wss = sum(wss)
        groups += [group]
        twss += [total_wss]
    twss = np.array(twss)
    dwss = -np.diff(twss)
    dwss = np.insert(dwss,0,dwss.sum()/np.log(k_max))
    dwss = np.trim_zeros(dwss)
    ratio = dwss[:-1]/dwss[1:]
    ratio = ratio[:k_max]
    k = ratio.argmax()
    return groups[k]

In [None]:
result = auto_kmeans(x)
np.unique(result)

In [None]:
membershipohe = np.zeros((membership.size, membership.max()+1))
membershipohe[np.arange(membership.size),membership] = 1
resultohe = np.zeros((result.size, result.max()+1))
resultohe[np.arange(result.size),result] = 1
membershipohe.T@resultohe

In [None]:
(membershipohe.T@resultohe).max(1).sum()/membershipohe.sum()

# Programming challenges

## Quick sort

Write a program which implements the quick sort algorithm.



## $k$-Means class structure

Write a class structure for our k-means code
