In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
import random as rand

In [2]:
iris_data = load_iris()

In [3]:
# PLACE THE IRIS DATA IN A PANDAS
# DATAFRAME
df = pd.DataFrame(data=iris_data.data, 
                  columns=iris_data.feature_names)
df['target'] = pd.Series(iris_data.target)

In [4]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [5]:
iris_data.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [6]:
points = df.drop(columns='target').values.tolist()

In [7]:
# initialization function
# Place k centroids at a random location
def place_centroids(points, k):
    # List to keep track of centroids
    centroids = []
    # Place K centroids in random locations
    # each point is a vector of 4 features
    for i in range(k):
        centroid = points[rand.randint(0, 149)]
        centroids.append(centroid)
    return centroids
    
    

In [8]:
centroids = place_centroids(points, 3)

In [9]:
# assign the centroids to all points 
def assign_cetroids(points, centroids):
    assignments = []
    for dpoint in points:
        # define a data point to a centroid's distnace 
        dpoint_centroid_distance = []
        for centroid in centroids:
            dc = np.linalg.norm(np.array(dpoint) - np.array(centroid))
            dpoint_centroid_distance.append(dc)
        # find the nearest centroid to that point
        nearest_distance = np.argmin(dpoint_centroid_distance)
        assignments.append(nearest_distance)
    return assignments
        

In [10]:
assignments = assign_cetroids(points, centroids)

In [11]:
# Update and compute new centroids
def update_centroids(points, centroids, assignments, k):
    new_centroids = []
    for cluster in range(k):
        point_cluster = []
        for i in range(len(points)):
            if assignments[i] == cluster:
                point_cluster.append(points[i])
        mean_c = np.mean(point_cluster, axis=0)
        new_centroids.append(mean_c)
    return new_centroids

In [12]:
new_centroids = update_centroids(points, centroids, assignments, 3)

In [13]:
new_centroids

[array([6.5       , 2.825     , 4.79285714, 1.60714286]),
 array([5.46168224, 3.11308411, 3.14485981, 0.97102804]),
 array([7.34      , 3.09333333, 6.2       , 2.06666667])]

In [14]:
# minimize the objective function
# to minimize the within-cluster sum of squared error
def cal_sse(points, assignments, centroids):
    errors = []
    for i in range(len(points)):
        # get the assigned centroid for each point
        centroid = centroids[assignments[i]]
        # computer the error, compute the distance between point and its nearest centroid
        error = np.linalg.norm(np.array(points[i]) - np.array(centroid))
        errors.append(pow(error, 2))
    sse = sum(errors)
    return sse
        
        

In [15]:
def kmeans_clustering(all_vals,K,max_iter = 100, tol = pow(10,-3) ):
    it = -1
    all_sse = []
    assignments = []
    
    #Place K centroids at random locations
    centroids = place_centroids(all_vals, K)

   
    #Until algorithm converges
    while (len(all_sse)<=1 or (it < max_iter and np.absolute(all_sse[it] - all_sse[it-1])/all_sse[it-1] >= tol)):
        it += 1
        #Assign all data points to the closest center
        assignments = assign_cetroids(all_vals, centroids)
        
        #Compute the new centroids
        centroids = update_centroids(all_vals, centroids, assignments, K)
        
        #Compute SSE
        sse_kmeans = cal_sse(all_vals, assignments, centroids)
        all_sse.append(sse_kmeans)        
        
     
    return (assignments, centroids, all_sse, it)
        

In [16]:
result = kmeans_clustering(points,K=3)

In [17]:
centroids_x = [result[1][x][0] for x in range(len(result[1]))] #sepal_length: [0] 
centroids_y = [result[1][x][2] for x in range(len(result[1]))] #petal_length: [2]

In [18]:
x = df['sepal length (cm)'].values.tolist()
y = df['petal length (cm)'].values.tolist()
assignments = result[0]

In [19]:
plt.scatter(x, y, c= assignments)
plt.plot(centroids_x, centroids_y, c='white', marker='.', linewidth='0.01', markerfacecolor='red', markersize=22)
plt.title("K-means Visualization")
plt.xlabel("sepal_length")
plt.ylabel("petal_length")

NameError: name 'plt' is not defined