In [34]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from numpy.linalg import norm

In [35]:
# Read the space-delimited file; header=None means the first line is data, not column names.
dataset = pd.read_csv('iris_new_data.csv', sep=' ', header=None)

In [36]:
# Show the loaded frame to eyeball the parsed columns.
dataset

Unnamed: 0,0,1,2,3
0,5.7,4.4,1.5,0.4
1,5.5,4.2,1.4,0.2
2,5.2,4.1,1.5,0.1
3,5.8,4.0,1.2,0.2
4,5.4,3.9,1.7,0.4
...,...,...,...,...
145,4.5,2.3,1.3,0.3
146,6.0,2.2,5.0,1.5
147,6.2,2.2,4.5,1.5
148,6.0,2.2,4.0,1.0


In [37]:
# Take every row and column of the frame as a plain NumPy array for clustering.
X = dataset.to_numpy()

In [38]:
class Kmeans:
    """K-means clustering by alternating assignment and centroid-update steps.

    Parameters
    ----------
    n_cluster : int
        Number of clusters to form (must be >= 1).
    max_iter : int, default 100
        Upper bound on assignment/update iterations (must be >= 1).
    random_state : int, default 68
        Seed for the private random generator used to pick the initial
        centroids, so repeated fits on the same data are reproducible.

    Attributes set by ``fit``
    -------------------------
    centroids : ndarray of shape (n_cluster, n_features)
    labels : ndarray of shape (n_samples,), values in ``range(n_cluster)``
    error : float, sum of squared distances of samples to their centroid
    """

    def __init__(self, n_cluster, max_iter=100, random_state=68):
        if n_cluster < 1:
            raise ValueError("n_cluster must be at least 1")
        if max_iter < 1:
            raise ValueError("max_iter must be at least 1")
        self.n_cluster = n_cluster
        self.max_iter = max_iter
        self.random_state = random_state

    def initial_centroids(self, X):
        """Pick ``n_cluster`` distinct rows of ``X`` as starting centroids.

        Raises
        ------
        ValueError
            If ``X`` has fewer rows than ``n_cluster``.
        """
        X = np.asarray(X)
        n_samples = X.shape[0]
        if self.n_cluster > n_samples:
            raise ValueError(
                "n_cluster=%d exceeds the number of samples (%d)"
                % (self.n_cluster, n_samples)
            )
        # Use a private generator: creating a RandomState and then calling the
        # module-level np.random.permutation would ignore the seed entirely.
        rng = np.random.RandomState(self.random_state)
        random_indx = rng.permutation(n_samples)
        return X[random_indx[:self.n_cluster]]

    def compute_dist(self, X, centroids):
        """Return an (n_samples, n_cluster) array of squared Euclidean distances."""
        X = np.asarray(X, dtype=float)
        centroids = np.asarray(centroids, dtype=float)
        dist = np.zeros((X.shape[0], self.n_cluster))
        for k in range(self.n_cluster):
            diff = X - centroids[k, :]
            # Squared distance directly; no need for sqrt followed by square.
            dist[:, k] = np.sum(diff * diff, axis=1)
        return dist

    def compute_sse(self, X, labels, centroids):
        """Return the sum of squared distances of each sample to its centroid."""
        X = np.asarray(X, dtype=float)
        centroids = np.asarray(centroids, dtype=float)
        labels = np.asarray(labels)
        total = np.float64(0.0)
        for k in range(self.n_cluster):
            diff = X[labels == k] - centroids[k]
            total += np.sum(diff * diff)
        return total

    def compute_centroids(self, X, labels, prev_centroids=None):
        """Return the mean of the samples assigned to each cluster.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        labels : array-like of shape (n_samples,)
        prev_centroids : array-like of shape (n_cluster, n_features), optional
            Centroids from the previous iteration. A cluster with no members
            keeps its previous centroid when this is given; otherwise its row
            is NaN.
        """
        X = np.asarray(X, dtype=float)
        labels = np.asarray(labels)
        centroids = np.full((self.n_cluster, X.shape[1]), np.nan)
        for k in range(self.n_cluster):
            members = X[labels == k, :]
            if members.shape[0] > 0:
                centroids[k, :] = members.mean(axis=0)
            elif prev_centroids is not None:
                # Empty cluster (e.g. duplicate seed rows): averaging nothing
                # would give NaN, and argmin would then send every sample to
                # the NaN column on the next assignment step.
                centroids[k, :] = prev_centroids[k]
        return centroids

    def find_nearest_cluster(self, dist):
        """Return, for each row of ``dist``, the index of the smallest entry."""
        return np.argmin(dist, axis=1)

    def fit(self, X):
        """Cluster ``X`` and store ``centroids``, ``labels`` and ``error``.

        Iterates until the centroids stop changing or ``max_iter`` is reached.
        """
        X = np.asarray(X, dtype=float)
        self.centroids = self.initial_centroids(X)
        for _ in range(self.max_iter):
            old_centroids = self.centroids
            dist = self.compute_dist(X, old_centroids)
            self.labels = self.find_nearest_cluster(dist)
            self.centroids = self.compute_centroids(X, self.labels, old_centroids)
            # Once assignments are stable the recomputed means are bit-identical.
            if np.array_equal(old_centroids, self.centroids):
                break
        self.error = self.compute_sse(X, self.labels, self.centroids)

    def predict(self, X):
        """Return the index of the nearest fitted centroid for each row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if not hasattr(self, "centroids"):
            raise RuntimeError("Kmeans.predict called before fit")
        dist = self.compute_dist(X, self.centroids)
        return self.find_nearest_cluster(dist)

In [39]:
# Three clusters, at most 100 iterations; random_state is left at the class default.
km = Kmeans(n_cluster=3,max_iter=100)

In [22]:
# Run the clustering; this sets km.centroids, km.labels and km.error.
km.fit(X)

In [23]:
# Keep a handle on the fitted cluster centres.
centroids = km.centroids

In [24]:
# Zero-based cluster index assigned to each row of X.
labels = km.labels

In [25]:
# Inspect the zero-based assignments.
labels

array([2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2,
       2, 2, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 1, 2, 2, 0, 0,
       0, 0, 0, 1, 1, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2,
       2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2,
       2, 2, 2, 2, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1], dtype=int64)

In [26]:
# Shift to one-based cluster ids. Derive from km.labels rather than from
# `labels` itself so re-running this cell does not add 1 a second time.
labels = km.labels + 1

In [27]:
# Inspect the shifted assignments.
labels

array([3, 3, 3, 3, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3,
       3, 3, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 2, 3, 3, 1, 1,
       1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 3,
       3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
       3, 3, 3, 3, 3, 1, 1, 2, 2, 2, 2, 2, 2, 2, 3, 1, 1, 1, 1, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 1,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2], dtype=int64)

In [28]:
# Write one label per line to a text file.
np.savetxt('clust_result2.dat', labels , fmt='%s')