# Final Exam Second Semester 2566 - K-Means (Cancer Patient Problem)


This exam problem has an objective to develop a K-Means algorithm to group cancer patient data into k clusters according to 23 features such as age, gender, air pollution, alcohol use, dust, allergy, occupational, hazards, genetic risk, chronic lung disease, etc.

In [2]:
# used for manipulating directory paths
import os

# Scientific and vector computation for python
import numpy as np

from sklearn.cluster import KMeans

# library written for this exam
# import utilsKMeans as utils

%load_ext autoreload
%autoreload 2

# tells matplotlib to embed plots within the notebook
%matplotlib inline

import random 
random.seed(10)

### We start the exam by first loading the dataset. 

In [3]:
# Load the dataset into the variable X 
data = np.loadtxt(os.path.join('Data', 'KMeans_CancerData.txt'))
X = data

m = X.shape[0] # number of training examples

FileNotFoundError: Data\KMeans_CancerData.txt not found.

In [12]:
X.shape

(1000, 23)

### Set initial centroids

In [36]:
# Initial centroids
# For consistency, here we set centroids to specific values
# but in practice you should generate them automatically, such as by
# settings them to be random examples.

initial_centroids = np.array([[44.,  1.,  2.,  1.,  5.,  3.,  2.,  3.,  2.,  4.,  1.,  4.,  2.,
         4.,  6.,  7.,  2.,  5.,  8.,  1.,  3.,  2.,  3.],
       [35.,  1.,  2.,  1.,  5.,  3.,  2.,  3.,  2.,  4.,  1.,  4.,  2.,
         4.,  6.,  7.,  2.,  5.,  8.,  1.,  3.,  2.,  3.],
       [54.,  1.,  6.,  7.,  7.,  7.,  7.,  6.,  7.,  7.,  7.,  8.,  7.,
         7.,  5.,  3.,  2.,  7.,  8.,  2.,  4.,  5.,  3.],
       [62.,  1.,  6.,  8.,  7.,  7.,  7.,  6.,  7.,  7.,  8.,  7.,  7.,
         9.,  3.,  2.,  4.,  1.,  4.,  2.,  4.,  2.,  3.],
       [24.,  2.,  3.,  2.,  2.,  1.,  1.,  1.,  1.,  1.,  4.,  2.,  3.,
         6.,  2.,  1.,  2.,  3.,  4.,  2.,  1.,  1.,  1.]])

In [37]:
def findClosestCentroids(X, centroids): # step 1
    """
    Computes the centroid memberships for every example.
    
    Parameters
    ----------
    X : array_like
        The dataset of size (m, n) where each row is a single example. 
        That is, we have m examples each of n dimensions.
        
    centroids : array_like
        The K-means centroids of size (K, n). K is the number
        of clusters, and n is the the data dimension.
    
    Returns
    -------
    idx : array_like
        A vector of size (m, ) which holds the centroids assignment for each
        example (row) in the dataset X.
    
    Instructions
    ------------
    Go over every example, find its closest centroid, and store
    the index inside `idx` at the appropriate location.
    Concretely, idx[i] should contain the index of the centroid
    closest to example i. Hence, it should be a value in the 
    range 0..K-1

    Note
    ----
    You can use a for-loop over the examples to compute this.
    """
    # Set K
    K = centroids.shape[0]

    # You need to return the following variables correctly.
    idx = np.zeros(X.shape[0], dtype=int)

    # ====================== YOUR CODE HERE ======================
    
    for i in range(idx.size):
        J = np.sqrt(np.sum(np.square(X[i] - centroids),axis = 1)) #norm เฉยๆ
        # we can also set J = np.sum(np.square(X[i] - centroids),axis = 1) #norm กำลังสอง
        
        idx[i] = np.argmin(J)
    
    # =============================================================
    return idx

In [38]:
K = 5   

# Find the closest centroids for the examples using the initial_centroids
idx = findClosestCentroids(X, initial_centroids)
print(idx)

[1 4 1 1 2 1 0 4 1 0 2 3 1 1 4 3 4 1 1 4 4 2 3 1 1 3 1 4 1 1 1 0 1 1 4 4 4
 4 1 0 4 4 0 4 4 0 4 1 4 1 4 4 4 1 1 0 0 2 0 1 3 3 0 0 1 0 0 1 3 1 0 3 1 1
 4 1 4 2 0 1 0 2 0 1 1 1 1 2 1 2 3 2 1 2 1 4 1 1 4 1 2 1 2 1 1 1 1 2 1 4 4
 0 4 4 4 1 1 1 4 1 4 2 3 1 1 4 3 4 1 1 4 4 2 3 1 1 3 1 4 1 1 1 0 1 1 4 1 4
 4 1 0 4 4 0 4 4 0 0 1 4 1 4 4 4 1 1 0 0 1 0 1 3 3 0 0 1 0 0 1 1 1 0 3 1 1
 4 1 4 2 0 4 0 2 0 1 1 1 1 2 1 2 4 2 1 2 1 4 1 1 4 1 2 4 2 1 1 1 1 2 1 4 4
 0 1 1 4 1 1 1 4 1 4 2 3 1 0 4 3 4 1 1 4 4 2 3 1 4 3 1 4 1 1 1 0 1 1 4 4 4
 4 1 0 4 4 0 4 4 0 0 1 4 1 4 4 4 1 1 0 0 4 0 1 3 3 0 0 1 0 0 1 4 1 0 3 1 1
 4 1 4 2 0 0 0 2 0 1 1 1 1 2 1 2 1 2 1 2 1 4 1 1 4 1 2 4 2 1 1 1 1 2 1 4 4
 0 4 1 4 1 1 1 4 1 4 2 3 1 4 4 3 4 1 1 4 4 2 3 1 4 3 1 4 1 1 1 0 1 1 4 4 4
 4 1 0 4 4 0 4 4 0 1 1 4 4 1 0 0 0 1 0 0 1 0 1 3 3 0 0 1 0 0 1 0 1 0 3 0 1
 1 1 4 2 0 0 0 2 0 0 0 1 0 2 1 2 0 3 0 2 1 1 1 1 4 1 2 1 2 1 1 1 1 2 1 1 4
 0 2 3 4 1 1 1 4 1 4 2 3 1 3 4 3 4 1 1 4 4 2 3 1 0 3 1 4 1 1 1 0 1 1 4 0 4
 4 1 0 4 4 0 4 4 0 1 1 4 

In [55]:
for i in range(idx.size):
    J = (1/m) * np.sum(np.square(X[i] - initial_centroids),axis = 1)
print(J)

[0.258 0.393 0.288 0.429 0.873]


In [50]:
utils.runkMeans

<function utilsKMeans.runkMeans(X, centroids, findClosestCentroids, computeCentroids, max_iters=10, plot_progress=False)>

In [51]:
def computeCentroids(X, idx, K): # step 2
    """
    Returns the new centroids by computing the means of the data points
    assigned to each centroid.
    
    Parameters
    ----------
    X : array_like
        The datset where each row is a single data point. That is, it 
        is a matrix of size (m, n) where there are m datapoints each
        having n dimensions. 
    
    idx : array_like 
        A vector (size m) of centroid assignments (i.e. each entry in range [0 ... K-1])
        for each example.
    
    K : int
        Number of clusters
    
    Returns
    -------
    centroids : array_like
        A matrix of size (K, n) where each row is the mean of the data 
        points assigned to it.
    
    Instructions
    ------------
    Go over every centroid and compute mean of all points that
    belong to it. Concretely, the row vector centroids[i, :]
    should contain the mean of the data points assigned to
    cluster i.

    Note:
    -----
    You can use a for-loop over the centroids to compute this.
    """
    # Useful variables
    m, n = X.shape
    
    # You need to return the following variables correctly.
    centroids = np.zeros((K, n))


    # ====================== YOUR CODE HERE ======================
    for i in range(K):
        centroids[i] = np.mean(X[idx == i], axis = 0)
        
    
    # =============================================================
    return centroids

In [56]:
K = 5
p = utils.runkMeans(X, initial_centroids, findClosestCentroids, computeCentroids, max_iters=10)
print(p)

(array([[40.6893617 ,  1.43829787,  2.18297872,  2.05531915,  3.43404255,
         2.8       ,  2.30212766,  2.99574468,  2.35319149,  2.78723404,
         2.51489362,  2.86382979,  2.32340426,  3.37446809,  3.45531915,
         3.4       ,  2.3787234 ,  3.32340426,  4.72340426,  2.37021277,
         3.06808511,  2.4893617 ,  3.26382979],
       [31.85928144,  1.38023952,  5.16766467,  6.70658683,  6.67664671,
         6.56287425,  6.35628743,  5.85628743,  6.3502994 ,  6.07185629,
         5.5       ,  5.82335329,  6.38922156,  6.86526946,  4.64071856,
         4.2245509 ,  5.24850299,  3.95508982,  3.6257485 ,  4.61976048,
         4.15568862,  4.46107784,  3.        ],
       [47.24342105,  1.32894737,  4.57236842,  6.15131579,  6.44078947,
         6.28289474,  6.03947368,  5.21052632,  5.67763158,  5.75657895,
         4.44078947,  5.03289474,  5.44078947,  5.80263158,  4.80921053,
         4.86842105,  5.78289474,  4.17763158,  4.15131579,  4.56578947,
         3.80921053,  5.098

In [57]:
for i in range(idx.size):
    J = (1/m) * np.sum(np.square(X[i] - p),axis = 1)
print(J)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2,) + inhomogeneous part.