<a href="https://colab.research.google.com/github/Riptide898/CatsandDogs/blob/master/Wine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

In [0]:
# The algorithm translated into Python code might look like this. Then we just need to implement each of the functions.
def k_means(data, k, max_iterations):
    """Cluster ``data`` into ``k`` groups and return a cluster label per data point.

    Starts from the centres supplied by ``get_initial_centres`` and then
    alternates ``assign_clusters`` / ``revise_centres`` until the centres stop
    changing or ``max_iterations`` passes have been made.

    Note: a later cell in this notebook defines ``k_means`` again (adding an
    iteration-count print); after running all cells, that later definition is
    the one bound to the name.

    Args:
        data: the data points; passed unchanged to the helper functions.
        k: number of clusters.
        max_iterations: upper bound on assign/revise passes; must be >= 1.

    Returns:
        The cluster assignment produced by the last ``assign_clusters`` call.

    Raises:
        ValueError: if ``max_iterations`` is less than 1.
    """
    # With zero passes the loop body never runs, so there would be no
    # assignment to return; fail early with a clear message instead.
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1, got %r" % (max_iterations,))

    centres = get_initial_centres(data, k)

    for _ in range(max_iterations):
        cluster_assignment = assign_clusters(data, centres)
        new_centres = revise_centres(data, cluster_assignment, k)
        if np.array_equal(new_centres, centres):  # centres unchanged -> converged
            break
        centres = new_centres
    return cluster_assignment

In [0]:
# Should take the data and number of clusters (k) as input, return k random values from the data
def get_initial_centres(data, k):
    """Pick ``k`` randomly chosen data points to use as the starting centres.

    Args:
        data: the data points, one per row (assumed to be a NumPy array --
            it must support ``len``, ``.copy()`` and in-place shuffling).
        k: number of centres to return.

    Returns:
        The first ``k`` rows of a shuffled copy of ``data``.

    Raises:
        ValueError: if ``k`` is not between 1 and the number of data points.
    """
    num_data_points = len(data)
    # Slicing would silently return fewer than k rows (or none), which only
    # surfaces later as an obscure error in the clustering loop.
    if not 1 <= k <= num_data_points:
        raise ValueError(
            "k must be between 1 and the number of data points (%d), got %r"
            % (num_data_points, k)
        )

    # Shuffle a copy so the caller's array keeps its original row order
    data_points = data.copy()
    np.random.shuffle(data_points)  # uses NumPy's global random state
    return data_points[:k]

In [0]:
from scipy.spatial import distance # cluster assignment

# Should take the data and the k centres and return an np array with a cluster assignment for each data point
def assign_clusters(data, centres):
    """Assign every data point to its nearest centre.

    Args:
        data: 2-D array with one data point per row.
        centres: 2-D array with one centre per row and the same number of
            columns as ``data``.

    Returns:
        Float array of shape ``(number of data points, 1)``. Entry ``i`` is the
        row index into ``centres`` of the centre with the smallest squared
        Euclidean distance to ``data[i]``; ties go to the lowest index.
    """
    # One call computes every point-to-centre squared distance:
    # row i holds the distances from data[i] to each centre.
    distances = distance.cdist(data, centres, 'sqeuclidean')

    # argmin returns the first minimum, so ties resolve to the lowest index
    nearest = np.argmin(distances, axis=1)

    # Return the labels as a float column vector, one row per data point
    return nearest.astype(float).reshape(-1, 1)

In [0]:
# Should take the data, cluster assignments and number of clusters and return the point closest to the average for each cluster

def revise_centres(data, cluster_assignment, k):
    """Return, for each cluster, the member point closest to the cluster's mean.

    Args:
        data: 2-D array with one data point per row.
        cluster_assignment: one label per data point, as a flat array or a
            column vector. Labels are expected to be the cluster indices
            ``0 .. k-1``, as produced by ``assign_clusters``.
        k: number of clusters.

    Returns:
        Float array of shape ``(k, number of columns in data)``; row ``i`` is
        the revised centre of cluster ``i``. A cluster with no members is
        re-seeded with the data point that lies farthest from the revised
        centre of its own cluster.

    Raises:
        ValueError: if ``data`` contains no points.
    """
    points = np.asarray(data, dtype=float)
    if points.shape[0] == 0:
        raise ValueError("data must contain at least one point")
    labels = np.asarray(cluster_assignment).ravel()

    revised_centres = np.zeros(shape=(k, points.shape[1]))
    # Squared distance from every point to the revised centre of its own
    # cluster; only needed to re-seed clusters that ended up empty.
    dist_to_own_centre = np.zeros(points.shape[0])
    empty_clusters = []

    for i in range(k):
        in_cluster = labels == i
        if not in_cluster.any():
            # Happens e.g. when two centres are identical points: every tie
            # goes to the lower index and the other cluster gets no members.
            empty_clusters.append(i)
            continue

        members = points[in_cluster]
        average_position = members.mean(axis=0)  # axis=0 averages each column

        # The new centre is the actual member nearest to the average position;
        # argmin keeps the first member on ties.
        offsets = ((members - average_position) ** 2).sum(axis=1)
        new_centre = members[np.argmin(offsets)]
        revised_centres[i] = new_centre
        dist_to_own_centre[in_cluster] = ((members - new_centre) ** 2).sum(axis=1)

    for i in empty_clusters:
        farthest = np.argmax(dist_to_own_centre)
        revised_centres[i] = points[farthest]
        # Zero it out so two empty clusters are not given the same point
        dist_to_own_centre[farthest] = 0.0

    return revised_centres

In [0]:
def k_means(data, k, max_iterations):
    """Cluster ``data`` into ``k`` groups, reporting how many passes were needed.

    This definition replaces the ``k_means`` from the earlier cell; it runs the
    same loop and additionally prints the number of iterations performed.

    Args:
        data: the data points; passed unchanged to the helper functions.
        k: number of clusters.
        max_iterations: upper bound on assign/revise passes; must be >= 1.

    Returns:
        The cluster assignment produced by the last ``assign_clusters`` call.

    Raises:
        ValueError: if ``max_iterations`` is less than 1.
    """
    # With zero passes neither the loop counter nor the assignment would ever
    # be set, so fail early with a clear message instead.
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1, got %r" % (max_iterations,))

    centres = get_initial_centres(data, k)

    for i in range(max_iterations):
        cluster_assignment = assign_clusters(data, centres)
        new_centres = revise_centres(data, cluster_assignment, k)
        if np.array_equal(new_centres, centres):  # centres unchanged -> converged
            break
        centres = new_centres

    # i is zero-based, so the number of passes made is i + 1
    print('Number of iterations: %d' % (i + 1))
    return cluster_assignment