In [62]:
import pandas as pd
import numpy as np
import matplotlib as plt

animals = pd.read_csv('/content/animals', header = None)
countries = pd.read_csv('/content/countries', header = None)
fruits = pd.read_csv('/content/fruits', header = None)
veggies = pd.read_csv('/content/veggies', header = None)

x_animals = np.array([np.array(str(np.array(i)).split(" "))[1:].astype(float) for i in animals[0]])
y_animals = np.array([0 for i in animals[0]])

x_countries = np.array([np.array(str(np.array(i)).split(" "))[1:].astype(float) for i in countries[0]])
y_countries = np.array([1 for i in countries[0]])

x_fruits = np.array([np.array(str(np.array(i)).split(" "))[1:].astype(float) for i in fruits[0]])
y_fruits = np.array([2 for i in fruits[0]])

x_veggies = np.array([np.array(str(np.array(i)).split(" "))[1:].astype(float) for i in veggies[0]])
y_veggies = np.array([3 for i in veggies[0]])

_X = np.concatenate([x_animals, x_countries, x_veggies, x_fruits])
_y = np.concatenate([y_animals, y_countries, y_fruits, y_veggies])


(50, 300)
(50,)
(161, 300)
(161,)
(58, 300)
(58,)
(58, 300)
(58,)


In [84]:

np.random.seed(42)


def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))


class KMeans:
    def __init__(self, K=5, max_iters=100, plot_steps=False):
        self.K = K
        self.max_iters = max_iters
        self.plot_steps = plot_steps

        # list of sample indices for each cluster
        self.clusters = [[] for _ in range(self.K)]
        # the centers (mean feature vector) for each cluster
        self.centroids = []

    def predict(self, X):
        self.X = X
        self.n_samples, self.n_features = X.shape

        # initialize
        random_sample_idxs = np.random.choice(self.n_samples, self.K, replace=False)
        self.centroids = [self.X[idx] for idx in random_sample_idxs]
        # print(random_sample_idxs)
        # print(self.centroids)

        # Optimize clusters
        for _ in range(self.max_iters):
            # Assign samples to closest centroids (create clusters)
            self.clusters = self._create_clusters(self.centroids)

            # if self.plot_steps:
            #     self.plot()

            # Calculate new centroids from the clusters
            centroids_old = self.centroids
            self.centroids = self._get_centroids(self.clusters)
            print(self.centroids.shape)

            # check if clusters have changed
            if self._is_converged(centroids_old, self.centroids):
                print("converged")
                print(_)
                break

            # if self.plot_steps:
            #     self.plot()

        # Classify samples as the index of their clusters
        return self._get_cluster_labels(self.clusters)

    def _get_cluster_labels(self, clusters):
        # each sample will get the label of the cluster it was assigned to
        labels = np.empty(self.n_samples)

        for cluster_idx, cluster in enumerate(clusters):
            for sample_index in cluster:
                labels[sample_index] = cluster_idx
        return labels

    def _create_clusters(self, centroids):
        # Assign the samples to the closest centroids to create clusters
        clusters = [[] for _ in range(self.K)]
        for idx, sample in enumerate(self.X):
            centroid_idx = self._closest_centroid(sample, centroids)
            clusters[centroid_idx].append(idx)
        return clusters

    def _closest_centroid(self, sample, centroids):
        # distance of the current sample to each centroid
        distances = [euclidean_distance(sample, point) for point in centroids]
        closest_index = np.argmin(distances)
        return closest_index

    def _get_centroids(self, clusters):
        # assign mean value of clusters to centroids
        centroids = np.zeros((self.K, self.n_features))
        for cluster_idx, cluster in enumerate(clusters):
            cluster_mean = np.mean(self.X[cluster], axis=0)
            # cluster_median = np.median(self.X[cluster], axis=0)
            centroids[cluster_idx] = cluster_mean
        return centroids

    def _is_converged(self, centroids_old, centroids):
        # distances between each old and new centroids, fol all centroids
        distances = [
            euclidean_distance(centroids_old[i], centroids[i]) for i in range(self.K)
        ]
        return sum(distances) == 0

    # def plot(self):
    #     fig, ax = plt.subplots(figsize=(12, 8))

    #     for i, index in enumerate(self.clusters):
    #         point = self.X[index].T
    #         ax.scatter(*point)

    #     for point in self.centroids:
    #         ax.scatter(*point, marker="x", color="black", linewidth=2)

    #     plt.show()


# Testing
if __name__ == "__main__":
    # from sklearn.datasets import make_blobs

    # X, y = make_blobs(
    #     centers=3, n_samples=500, n_features=2, shuffle=True, random_state=40
    # )
    # print(X.shape)
    # print(X)
    # print(y)

    print(_X.shape)
    # print(_X)
    # print(_y)

    clusters = len(np.unique(_y))
    print(clusters)

    k = KMeans(K=clusters, max_iters=150, plot_steps=True)
    y_pred = k.predict(_X)



    # k.plot()

(327, 300)
4
[231 110 250   9]
[array([ 2.5316e-01,  1.8264e-01, -2.7487e-01,  9.8020e-02,  4.4124e-01,
       -9.6528e-01, -5.0654e-02, -5.9358e-01, -4.3078e-01, -1.0226e+00,
        5.2074e-01, -3.3690e-02,  4.5034e-01,  4.5660e-01,  5.2442e-01,
       -2.5182e-01,  2.1656e-02,  3.4084e-01,  2.0542e-01,  3.2101e-01,
        1.0683e-01,  4.1240e-01, -2.1667e-01,  4.0969e-01,  6.8011e-02,
       -4.1328e-01, -3.0766e-01, -6.5344e-01, -2.5346e-03, -3.4729e-01,
        5.1200e-01,  2.5382e-01, -1.3925e-03,  3.2606e-02,  4.2447e-01,
        2.9593e-01, -3.0377e-01, -1.3645e-01,  7.8503e-02, -7.9713e-02,
        9.0570e-02, -4.1105e-01,  4.1533e-02, -5.0155e-01, -3.1418e-01,
       -4.2424e-01, -3.9796e-01, -2.7441e-01,  2.8547e-01,  3.5568e-01,
        3.1032e-01, -2.9850e-01,  7.8357e-01,  4.4660e-02, -1.2244e+00,
       -2.7668e-01,  1.7946e-02,  1.3843e-01,  2.9247e-01, -6.3251e-01,
        2.1300e-01,  4.2669e-01, -2.7817e-01,  6.2453e-01, -4.0906e-01,
       -2.5550e-01,  2.4480e-01,

In [71]:
def accuracy(y_true, y_pred):
  accuracy = np.sum(y_true == y_pred) / len(y_true)
  return accuracy

In [81]:
print(accuracy(y, y_pred))

0.334


In [82]:
print(y_pred)

[0. 2. 1. 1. 2. 2. 0. 0. 0. 0. 2. 2. 1. 0. 2. 1. 2. 2. 0. 1. 0. 2. 2. 2.
 0. 0. 0. 2. 0. 1. 2. 0. 2. 1. 1. 0. 2. 1. 0. 0. 2. 0. 2. 2. 1. 1. 1. 1.
 2. 2. 1. 1. 0. 1. 1. 2. 1. 2. 1. 2. 0. 1. 2. 0. 1. 0. 0. 2. 2. 1. 1. 0.
 2. 1. 0. 2. 2. 0. 2. 1. 0. 1. 1. 2. 1. 2. 0. 1. 1. 0. 0. 0. 2. 0. 1. 2.
 1. 0. 1. 2. 1. 2. 1. 2. 2. 0. 2. 2. 2. 1. 1. 1. 2. 2. 2. 0. 2. 0. 0. 2.
 1. 1. 2. 1. 2. 0. 1. 2. 2. 2. 2. 1. 1. 1. 2. 0. 2. 1. 2. 2. 0. 1. 2. 0.
 0. 2. 0. 0. 0. 0. 2. 2. 2. 2. 0. 1. 1. 1. 0. 1. 1. 0. 1. 1. 2. 0. 1. 1.
 2. 2. 0. 1. 2. 1. 1. 1. 1. 1. 1. 0. 2. 1. 2. 0. 2. 0. 1. 0. 2. 2. 1. 1.
 0. 2. 1. 1. 0. 1. 1. 0. 0. 1. 2. 2. 2. 0. 0. 0. 1. 2. 2. 2. 2. 0. 1. 0.
 2. 1. 0. 0. 1. 2. 1. 1. 1. 2. 2. 1. 1. 0. 1. 2. 0. 2. 1. 1. 2. 2. 0. 0.
 1. 1. 0. 0. 0. 1. 1. 0. 0. 1. 0. 1. 2. 2. 1. 1. 0. 1. 2. 2. 2. 2. 2. 2.
 0. 2. 1. 1. 0. 2. 0. 0. 0. 1. 0. 1. 2. 1. 2. 1. 2. 0. 0. 1. 1. 1. 2. 0.
 0. 1. 1. 0. 2. 2. 0. 0. 0. 1. 2. 2. 1. 1. 2. 1. 0. 2. 2. 1. 0. 0. 2. 2.
 2. 0. 2. 1. 1. 2. 0. 0. 2. 0. 0. 0. 0. 2. 0. 1. 2.