In [163]:
# Set up
import math
import random
import pandas as pd
import numpy as np
import scipy

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn import datasets
from sklearn.model_selection import train_test_split
import tqdm

# Three-entry colormap built from pure red, blue and green hex codes.
cmap_bold = ListedColormap(['#FF0000', '#0000FF', '#00FF00'])

In [386]:
# Load iris and split it into label / feature frames.
iris = datasets.load_iris()
names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target']
# Features and labels are first glued into one 5-column frame ...
iris_df = pd.DataFrame(np.c_[iris['data'], iris['target']], columns=names)

# ... then the label column is split off and only the features are kept.
targets = iris_df['target']
iris_df = iris_df[names[:-1]]
# Two-feature view (sepal length / sepal width).
iris_df_2d = iris_df[names[:2]]

In [387]:
# Peek at 5 randomly drawn rows; no random_state is passed, so the rows differ between runs.
iris_df.sample(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
13,4.3,3.0,1.1,0.1
133,6.3,2.8,5.1,1.5
104,6.5,3.0,5.8,2.2
33,5.5,4.2,1.4,0.2
26,5.0,3.4,1.6,0.4


In [388]:
class K_MEANS:
    '''
    Divides given data into K clusters.
    
    Fields:
        df: Dataset without 'target' attribute, stored as an N x M numpy array
        N: Size of dataset (number of rows of df)
        M: Number of columns of df
        k: Number of clusters
        MAX_ITER: Number of iterations
        colors: Length-N label array; initialised to 0, the initial
            centroids are labelled 1..k
        centroids: Row indices (into df) of the points used as centroids
        distances: N x N matrix of pairwise squared Euclidean distances,
            where len(df) = N
    '''
    
    def __init__(self, data, k = 3, MAX_ITER = 100):
        self.df = np.array(data)
        (self.N, self.M) = self.df.shape
        self.k = k;
        self.MAX_ITER = MAX_ITER
        self.colors = np.zeros(self.N)
        self.centroids = self.random_centroids()
        self.distances = self.compute_distances()

        
    def random_centroids(self):
        '''
        Chose k random centroids.
        Color centroids.
        
        Returns:
            List of random k indices
        '''
        centrs = random.sample(range(self.N), self.k)
        for c, i in enumerate(centrs):
            self.colors[i] = c + 1
            
        return centrs
    
    
    def compute_distances(self):
        '''
        Returns: 
            N x N matrix of distances
        '''
        X = self.df
        dists = -2 * np.dot(X, X.T) + np.sum(X * X, axis=1) + np.sum(X * X, axis=1)[:, np.newaxis]
        return dists
        
    def nearest_centroid(self, point):
        return min((self.distances[point][c], i) for i, c in enumerate(self.centroids))
    
    
    def plot_2d(self):
        '''
        Scatter-plot the first two columns of df.

        Rows whose index is listed in self.centroids are drawn as large red
        markers, every other row as a small blue marker.
        '''
        points = np.asarray(self.df)
        is_centroid = np.isin(np.arange(len(points)), self.centroids)

        plt.figure(dpi=200)
        # Two vectorized scatter calls instead of one call per point; the
        # centroids go last so they are drawn on top of the regular points.
        plt.scatter(points[~is_centroid, 0], points[~is_centroid, 1],
                    c='blue', edgecolor='black', s=20)
        plt.scatter(points[is_centroid, 0], points[is_centroid, 1],
                    c='red', edgecolor='black', s=100)
        plt.show()
            
        