In [1]:
import numpy as np
import pandas as pd

In [2]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
df = pd.read_csv(url, names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class'])
data = df.iloc[:, :-1].values
data[:10]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [3]:
def cluster_centroids(points):
    centroid = [0 for i in range(4)]
    for j in range(4):
        for point in points:
            centroid[j] += point[j]
        centroid[j] = centroid[j] / len(points)
    return centroid
cluster_centroids(data)

[5.843333333333335, 3.0540000000000007, 3.7586666666666693, 1.1986666666666672]

In [4]:
def euclidean_distance(c1, c2):
    sum_dis = 0
    print(c1, c2)
    for i in range(len(c1)): # 4 dimensions
        sum_dis += (c1[i] - c2[i])**2
    return np.sqrt(sum_dis)

In [5]:
def convert_index_to_point(cluster):
    points = []
    for i in cluster:
        points.append(data[i])
    return points
# cluster = [0, 2, 3, 4] (index of points in the original data)

In [6]:
clusters = []
print((data[:10]))
for point in range(len(data)):
    clusters.append([point])

[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]]


In [7]:
clusters

[[0],
 [1],
 [2],
 [3],
 [4],
 [5],
 [6],
 [7],
 [8],
 [9],
 [10],
 [11],
 [12],
 [13],
 [14],
 [15],
 [16],
 [17],
 [18],
 [19],
 [20],
 [21],
 [22],
 [23],
 [24],
 [25],
 [26],
 [27],
 [28],
 [29],
 [30],
 [31],
 [32],
 [33],
 [34],
 [35],
 [36],
 [37],
 [38],
 [39],
 [40],
 [41],
 [42],
 [43],
 [44],
 [45],
 [46],
 [47],
 [48],
 [49],
 [50],
 [51],
 [52],
 [53],
 [54],
 [55],
 [56],
 [57],
 [58],
 [59],
 [60],
 [61],
 [62],
 [63],
 [64],
 [65],
 [66],
 [67],
 [68],
 [69],
 [70],
 [71],
 [72],
 [73],
 [74],
 [75],
 [76],
 [77],
 [78],
 [79],
 [80],
 [81],
 [82],
 [83],
 [84],
 [85],
 [86],
 [87],
 [88],
 [89],
 [90],
 [91],
 [92],
 [93],
 [94],
 [95],
 [96],
 [97],
 [98],
 [99],
 [100],
 [101],
 [102],
 [103],
 [104],
 [105],
 [106],
 [107],
 [108],
 [109],
 [110],
 [111],
 [112],
 [113],
 [114],
 [115],
 [116],
 [117],
 [118],
 [119],
 [120],
 [121],
 [122],
 [123],
 [124],
 [125],
 [126],
 [127],
 [128],
 [129],
 [130],
 [131],
 [132],
 [133],
 [134],
 [135],
 [136],
 [137],
 [138]

In [8]:
while len(clusters) > 3:
    centroids = []
    # clusters is like: [[0,1,2],[3,4]]
    # cluster is :[0,1,2], [3,4]
    for cluster in clusters:
        points = convert_index_to_point(cluster)
        centroids.append(cluster_centroids(points))

    # distance matrix    
    matrix = [[0 for i in range(len(centroids))] for j in range(len(centroids))]
    
    # min_dis = [i, j, distance]
    min_dis = [-1, -1, np.inf]
    
    length = len(centroids)
    for i in range(length):
        for j in range(i + 1, length):
            matrix[i][j] = euclidean_distance(centroids[i], centroids[j])
            if matrix[i][j] < min_dis[2]:
                min_dis = [i, j, matrix[i][j]]
    # print("min distance = ", min_dis[2], " cluster num = ", len(clusters))
    clusters[min_dis[0]] += clusters[min_dis[1]]
    # print("cluster[min_dis[0]] = ", clusters[min_dis[0]])
    del clusters[min_dis[1]]
    

[5.1, 3.5, 1.4, 0.2] [4.9, 3.0, 1.4, 0.2]
[5.1, 3.5, 1.4, 0.2] [4.7, 3.2, 1.3, 0.2]
[5.1, 3.5, 1.4, 0.2] [4.6, 3.1, 1.5, 0.2]
[5.1, 3.5, 1.4, 0.2] [5.0, 3.6, 1.4, 0.2]
[5.1, 3.5, 1.4, 0.2] [5.4, 3.9, 1.7, 0.4]
[5.1, 3.5, 1.4, 0.2] [4.6, 3.4, 1.4, 0.3]
[5.1, 3.5, 1.4, 0.2] [5.0, 3.4, 1.5, 0.2]
[5.1, 3.5, 1.4, 0.2] [4.4, 2.9, 1.4, 0.2]
[5.1, 3.5, 1.4, 0.2] [4.9, 3.1, 1.5, 0.1]
[5.1, 3.5, 1.4, 0.2] [5.4, 3.7, 1.5, 0.2]
[5.1, 3.5, 1.4, 0.2] [4.8, 3.4, 1.6, 0.2]
[5.1, 3.5, 1.4, 0.2] [4.8, 3.0, 1.4, 0.1]
[5.1, 3.5, 1.4, 0.2] [4.3, 3.0, 1.1, 0.1]
[5.1, 3.5, 1.4, 0.2] [5.8, 4.0, 1.2, 0.2]
[5.1, 3.5, 1.4, 0.2] [5.7, 4.4, 1.5, 0.4]
[5.1, 3.5, 1.4, 0.2] [5.4, 3.9, 1.3, 0.4]
[5.1, 3.5, 1.4, 0.2] [5.1, 3.5, 1.4, 0.3]
[5.1, 3.5, 1.4, 0.2] [5.7, 3.8, 1.7, 0.3]
[5.1, 3.5, 1.4, 0.2] [5.1, 3.8, 1.5, 0.3]
[5.1, 3.5, 1.4, 0.2] [5.4, 3.4, 1.7, 0.2]
[5.1, 3.5, 1.4, 0.2] [5.1, 3.7, 1.5, 0.4]
[5.1, 3.5, 1.4, 0.2] [4.6, 3.6, 1.0, 0.2]
[5.1, 3.5, 1.4, 0.2] [5.1, 3.3, 1.7, 0.5]
[5.1, 3.5, 1.4, 0.2] [4.8, 3.4, 1.

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



[6.1, 2.6, 5.6, 1.4]
[6.0, 2.7, 5.1, 1.6] [7.7, 3.0, 6.1, 2.3]
[6.0, 2.7, 5.1, 1.6] [6.3, 3.4, 5.6, 2.4]
[6.0, 2.7, 5.1, 1.6] [6.7, 3.1, 5.6, 2.4]
[6.0, 2.7, 5.1, 1.6] [6.9, 3.1, 5.1, 2.3]
[6.0, 2.7, 5.1, 1.6] [6.8, 3.2, 5.9, 2.3]
[6.0, 2.7, 5.1, 1.6] [6.7, 3.3, 5.7, 2.5]
[6.0, 2.7, 5.1, 1.6] [6.7, 3.0, 5.2, 2.3]
[6.0, 2.7, 5.1, 1.6] [6.3, 2.5, 5.0, 1.9]
[6.0, 2.7, 5.1, 1.6] [6.5, 3.0, 5.2, 2.0]
[6.0, 2.7, 5.1, 1.6] [6.2, 3.4, 5.4, 2.3]
[6.0, 2.7, 5.1, 1.6] [5.9, 3.0, 5.1, 1.8]
[6.0, 3.4, 4.5, 1.6] [6.7, 3.1, 4.7, 1.5]
[6.0, 3.4, 4.5, 1.6] [6.3, 2.3, 4.4, 1.3]
[6.0, 3.4, 4.5, 1.6] [5.666666666666667, 2.966666666666667, 4.166666666666667, 1.2666666666666666]
[6.0, 3.4, 4.5, 1.6] [5.5, 2.6, 4.4, 1.2]
[6.0, 3.4, 4.5, 1.6] [5.65, 2.75, 4.15, 1.3]
[6.0, 3.4, 4.5, 1.6] [6.2, 2.9, 4.3, 1.3]
[6.0, 3.4, 4.5, 1.6] [5.1, 2.5, 3.0, 1.1]
[6.0, 3.4, 4.5, 1.6] [6.3, 3.3, 6.0, 2.5]
[6.0, 3.4, 4.5, 1.6] [5.8, 2.7, 5.1, 1.9]
[6.0, 3.4, 4.5, 1.6] [7.1, 3.0, 5.9, 2.1]
[6.0, 3.4, 4.5, 1.6] [6.3, 2.9, 5.6, 

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [9]:
print(clusters)

[[0, 17, 27, 28, 7, 39, 4, 40, 49, 10, 48, 19, 21, 46, 20, 31, 36, 23, 26, 43, 44, 1, 45, 12, 9, 34, 37, 25, 29, 30, 2, 3, 47, 6, 35, 11, 24, 8, 38, 42, 13, 22, 5, 18, 16, 32, 33, 14, 15, 41], [50, 52, 86, 76, 77, 54, 58, 65, 75, 51, 56, 85, 63, 91, 78, 73, 71, 74, 97, 70, 127, 138, 149, 72, 83, 133, 123, 126, 146, 101, 142, 113, 121, 114, 68, 87, 119, 53, 89, 69, 80, 81, 64, 79, 59, 55, 90, 67, 82, 92, 88, 95, 96, 94, 99, 61, 66, 84, 62, 106, 57, 93, 98, 60], [100, 103, 116, 137, 111, 104, 128, 132, 110, 147, 112, 139, 141, 145, 120, 143, 140, 144, 124, 115, 136, 148, 108, 134, 102, 125, 129, 107, 130, 135, 105, 122, 118, 109, 117, 131]]


In [10]:
labels = df.iloc[:, -1].values
for i in range(3):
    points = clusters[i]
    for j in range(len(points)):
        print("Cluster", (i+1), points[j], labels[points[j]])

Cluster 1 0 Iris-setosa
Cluster 1 17 Iris-setosa
Cluster 1 27 Iris-setosa
Cluster 1 28 Iris-setosa
Cluster 1 7 Iris-setosa
Cluster 1 39 Iris-setosa
Cluster 1 4 Iris-setosa
Cluster 1 40 Iris-setosa
Cluster 1 49 Iris-setosa
Cluster 1 10 Iris-setosa
Cluster 1 48 Iris-setosa
Cluster 1 19 Iris-setosa
Cluster 1 21 Iris-setosa
Cluster 1 46 Iris-setosa
Cluster 1 20 Iris-setosa
Cluster 1 31 Iris-setosa
Cluster 1 36 Iris-setosa
Cluster 1 23 Iris-setosa
Cluster 1 26 Iris-setosa
Cluster 1 43 Iris-setosa
Cluster 1 44 Iris-setosa
Cluster 1 1 Iris-setosa
Cluster 1 45 Iris-setosa
Cluster 1 12 Iris-setosa
Cluster 1 9 Iris-setosa
Cluster 1 34 Iris-setosa
Cluster 1 37 Iris-setosa
Cluster 1 25 Iris-setosa
Cluster 1 29 Iris-setosa
Cluster 1 30 Iris-setosa
Cluster 1 2 Iris-setosa
Cluster 1 3 Iris-setosa
Cluster 1 47 Iris-setosa
Cluster 1 6 Iris-setosa
Cluster 1 35 Iris-setosa
Cluster 1 11 Iris-setosa
Cluster 1 24 Iris-setosa
Cluster 1 8 Iris-setosa
Cluster 1 38 Iris-setosa
Cluster 1 42 Iris-setosa
Cluster 1

In [11]:
err_num = 0
for i in range(3):
    points = clusters[i]
    for j in range(len(points)):
        if i ==0 and labels[points[j]] != "Iris-setosa":
            err_num+=1
        elif i ==1 and labels[points[j]] != "Iris-versicolor":
            err_num +=1
        elif i ==2 and labels[points[j]] != "Iris-virginica":
            err_num +=1
print("Accuracy =", (1-err_num/len(data))*100, "%")

Accuracy = 90.66666666666666 %
