In [1]:
# the code below will not run if you do not have numpy, pandas, and plotly installed
# do this to install from powershell/cmd:
#   python -m pip install pandas plotly nbformat
#    or
#   python3 -m pip install pandas plotly nbformat
# do this to install from linux/mac terminal:
#   sudo python -m pip install pandas plotly nbformat
#    or
#   sudo python3 -m pip install pandas plotly nbformat


# first, let's load what we need from numpy, plotly, and scipy
import numpy as np
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook_connected"


In [2]:
# let's create a data variable which will have the Iris dataset (from plotly.express.data.iris())
data = px.data.iris()

# let's see this data
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id
0,5.1,3.5,1.4,0.2,setosa,1
1,4.9,3.0,1.4,0.2,setosa,1
2,4.7,3.2,1.3,0.2,setosa,1
3,4.6,3.1,1.5,0.2,setosa,1
4,5.0,3.6,1.4,0.2,setosa,1
...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3
146,6.3,2.5,5.0,1.9,virginica,3
147,6.5,3.0,5.2,2.0,virginica,3
148,6.2,3.4,5.4,2.3,virginica,3


In [3]:
# create a 3d scatter-plot to show the data across sepal_length, sepal_width, petal_width dimensions
fig = px.scatter_3d(data, x='sepal_length', y='sepal_width', z='petal_width', color='species')
fig.show()


In [4]:
# create a 3d scatter-plot to show the data across sepal_length, sepal_width, petal_length dimensions
fig = px.scatter_3d(data, x='sepal_length', y='sepal_width', z='petal_length', color='species')
fig.show()

#feel free to change dimensions to see this data in different ways...

In [5]:
# let's create a numpy array called inputs with the 4 dimensions of iris data
inputs = data[['sepal_length','sepal_width','petal_length','petal_width']].to_numpy()


In [7]:
# below are some helper functions that you can use when writing your kmeans algorithm

# find which vector from the array of vectors is closest to targetVector
def findClosest(targetVector, vectors):
    # there is a built-in scipy function which creates a searchable KDTree of vectors:
    #   kdtree = spatial.KDTree(vectors)
    # this can be later queries for closest vector:
    #   kdtree.query(targetVector)
    # however, we'll just use vector math to get our closest vector, without a need to import scipy:
    return (((vectors - targetVector)**2).sum(1)).argmin()

# makes random vectors
def makeRandomVectors(numberOfVectors, sizeOfEachVector):
    return np.random.rand(numberOfVectors, sizeOfEachVector)

# get a vector that contains means of vectors
def averageVector(vectors):
    return vectors.mean(0)

# get a vector that contains max values from vectors
def maxVector(vectors):
    return vectors.max(0)

# filter vectors by array of booleans
def filterVectors(vectors, filterArray):
    return vectors[filterArray]

# get array of booleans depending on whether each value in vector is equal to testValue
def isValueInVector(vector, testValue):
    return vector == testValue

# return true if any value in vector is true
def anyTrueValues(vector):
    return vector.any()


In [8]:

# k-means
def kMeans(inputVectors, numberOfClusters=2):
    # make random centroids
    centroids = makeRandomVectors(numberOfClusters, inputVectors.shape[1])
    centroids *= maxVector(inputVectors)
    print('random centroids', centroids)
    
    # do until centroids stop changing
    while True:
        groupings = np.zeros(len(inputVector), dtype = int)
        # Expectations: assign each point to its closest centroid
        for i, inputVector in enumerate(inputVectors):
            group = findClosest(inputVector, centroids)
            groupings[i] = group
            
            
            
        # Check for bad centroids:
        # if any of the random centroids have no values assigned to them, restart kMeans
        # for group in groups:
        #     if not groups[group]:
        #         return kMeans(inputVectors, numberOfClusters)
       
        # Maximization: compute new centroids
        newCentroids = np.zeros((numberOfClusters, inputVectors.shape[1]))
        for group in range(numberOfClusters):
            isVectorInGrp = isValueInVector(groupings, group)
            allVectorssInGrp = filterVectors(inputVectors, isVectorInGrp)
            newCentroids[group] = averageVector(allVectorssInGrp)
            
    
        if (newCentroids == centroids).all():
            break
        
        centroids = newCentroids
        print('new centroids', centroids)
    
    #return all the assignments (should be 150 for the Iris dataset)
    groupings = []
    for inputVector in inputVectors:
        for group in groups:
            if inputVector in groups[group]:
                groupings = group.append(group)
                
    return groupings
    


In [9]:
# use k-means to cluster the data into 2 clusters
clusters = kMeans(inputs,2)

random centroids [[5.11841743 0.33988244 5.78608907 1.7432382 ]
 [2.44963577 0.50712604 4.29720254 0.56613265]]


UnboundLocalError: cannot access local variable 'inputVector' where it is not associated with a value

In [9]:
# add k-means-generated cluster id's to data
data['kmeans cluster']=np.array(clusters,dtype=str)
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id,kmeans cluster
0,5.1,3.5,1.4,0.2,setosa,1,
1,4.9,3.0,1.4,0.2,setosa,1,
2,4.7,3.2,1.3,0.2,setosa,1,
3,4.6,3.1,1.5,0.2,setosa,1,
4,5.0,3.6,1.4,0.2,setosa,1,
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3,
146,6.3,2.5,5.0,1.9,virginica,3,
147,6.5,3.0,5.2,2.0,virginica,3,
148,6.2,3.4,5.4,2.3,virginica,3,


In [10]:
# create a 3d scatter-plot to show the data across sepal_length, sepal_width, petal_width dimensions
fig = px.scatter_3d(data, x='sepal_length', y='sepal_width', z='petal_width', color='kmeans cluster')
fig.show()

In [11]:
# use k-means to cluster the data into 3 clusters
clusters = kMeans(inputs, 3)

random centroids [[4.99709422 3.25420424 0.7587246  0.28680834]
 [3.26990738 3.950738   2.77212314 0.96283896]
 [2.17622277 2.92780252 4.32054086 0.82508547]]


In [12]:
# add k-means-generated cluster id's to data
data['kmeans cluster']=np.array(clusters,dtype=str)
data

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,species_id,kmeans cluster
0,5.1,3.5,1.4,0.2,setosa,1,
1,4.9,3.0,1.4,0.2,setosa,1,
2,4.7,3.2,1.3,0.2,setosa,1,
3,4.6,3.1,1.5,0.2,setosa,1,
4,5.0,3.6,1.4,0.2,setosa,1,
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica,3,
146,6.3,2.5,5.0,1.9,virginica,3,
147,6.5,3.0,5.2,2.0,virginica,3,
148,6.2,3.4,5.4,2.3,virginica,3,


In [13]:
# create a 3d scatter-plot to show the data across sepal_length, sepal_width, petal_width dimensions
fig = px.scatter_3d(data, x='sepal_length', y='sepal_width', z='petal_width', color='kmeans cluster')
fig.show()