In [None]:
import pandas as pd
import numpy as np
import os
import time
import math
import warnings # current version of seaborn generates a bunch of warnings that we'll ignore
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import cdist
from sil_coef import silhouette_coefficient
import random
# Seed BOTH generators: pandas' DataFrame.sample (used to pick the initial
# centroids) and scikit-learn draw from NumPy's global RNG, which the
# stdlib `random` seed does not touch.
random.seed(10)
np.random.seed(10)

warnings.filterwarnings("ignore")
sns.set(style="white", color_codes=True)
%matplotlib inline

def get_iris_data(file_name):
    """Load the raw iris file into a DataFrame with short column names.

    ``file_name`` is joined onto the current working directory. The file is
    read as header-less, comma-separated text and its five columns are
    labelled s_len, s_wid, p_len, p_wid and species.
    """
    data_path = os.path.join(os.getcwd(), file_name)
    column_labels = ['s_len', 's_wid', 'p_len', 'p_wid', 'species']

    # read_csv defaults to a comma separator, matching the file's layout
    return pd.read_csv(data_path, names=column_labels, header=None)

In [None]:
# Load the iris measurements and preview the first five rows
df = get_iris_data(file_name='iris.data')
df.head(n=5)

In [None]:
# Report the row and column counts of the loaded frame
n_records, n_columns = df.shape
print("The data set contains {} records and {} features.".format(n_records, n_columns))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Sepal length against sepal width, one colour per species
g = sns.FacetGrid(data=df, hue="species", height=5)
g = g.map(plt.scatter, "s_len", "s_wid")
g.add_legend()

plt.show()


In [None]:
# Distribution of petal width within each species
sns.boxplot(data=df, x="species", y="p_wid")

In [None]:
class KMeans:
    """Plain k-means clustering over the columns of a pandas DataFrame.

    After ``train`` returns, ``centroids`` holds one row per cluster and
    ``clusters[i]`` holds (as a float) the row position in ``centroids`` of
    the centroid closest to row ``i`` of the training data.
    """

    def __init__(self, n_clusters):
        """Create an untrained model that will look for ``n_clusters`` clusters."""
        self.data = pd.DataFrame()
        self.n_clusters = n_clusters
        self.centroids = pd.DataFrame()
        # Zero-filled placeholder; np.ndarray(1) would expose uninitialised memory
        self.clusters = np.zeros(1)
        self.old_centroids = pd.DataFrame()
        self.verbose = False
        self.predictions = list()

    def train(self, df, verbose, random_state=None):
        """Fit the centroids to ``df`` and assign every row to a cluster.

        Parameters
        ----------
        df : pandas.DataFrame
            Training data. It is copied, so the caller's frame is never
            modified. A 'species' column, if present, is excluded; every
            remaining column is used as a coordinate.
        verbose : bool
            When true, print the centroids and cluster memberships on every
            iteration, pausing three seconds between iterations.
        random_state : optional
            Forwarded to ``DataFrame.sample`` when drawing the initial
            centroids. ``None`` (the default) keeps the previous behaviour of
            drawing from NumPy's global random state.

        Raises
        ------
        ValueError
            Raised by ``DataFrame.sample`` when ``n_clusters`` exceeds the
            number of distinct rows.
        """
        self.verbose = verbose
        self.data = df.copy(deep=True)

        if 'species' in self.data.columns:
            self.data = self.data.drop(columns='species')

        self.clusters = np.zeros(len(self.data))

        # Randomly initialize centroids from distinct rows, so that no two
        # centroids start at the same location
        unique_rows = self.data.drop_duplicates().reset_index(drop=True)
        self.centroids = (unique_rows
                          .sample(n=self.n_clusters, random_state=random_state)
                          .reset_index(drop=True))

        if self.verbose:
            print("\nRandomly initiated centroids:")
            print(self.centroids)

        # Initialize old centroids as a matrix of all 0's
        self.old_centroids = pd.DataFrame(np.zeros(shape=(self.n_clusters, self.data.shape[1])),
                                          columns=self.data.columns)

        # The data never changes during training, so convert it to an array once
        points = self.data.to_numpy()

        # Repeat assignment + centroid update until the centroids stop moving
        while not self.old_centroids.equals(self.centroids):

            if self.verbose:
                time.sleep(3)

            # Stash old centroids
            self.old_centroids = self.centroids.copy(deep=True)

            # Euclidean distance from every point to every centroid in one shot:
            # the result has one row per data point and one column per centroid
            centers = self.centroids.to_numpy()
            distances = np.linalg.norm(points[:, np.newaxis, :] - centers[np.newaxis, :, :], axis=2)

            # Assign each data point to its closest centroid (kept as floats,
            # matching the zero-initialised array above)
            self.clusters = distances.argmin(axis=1).astype(float)

            # Recompute each centroid as the mean of the points assigned to it
            for cls in range(0, self.n_clusters):

                cls_idx = np.where(self.clusters == cls)[0]

                # A cluster that received no points simply keeps its centroid
                if len(cls_idx) > 0:
                    self.centroids.loc[cls] = self.data.iloc[cls_idx].mean()

                if self.verbose:
                    print("\nRow indices belonging to cluster {}: [n={}]".format(cls, len(cls_idx)))
                    print(cls_idx)

            if self.verbose:
                print("\nOld centroids:")
                print(self.old_centroids)
                print("New centroids:")
                print(self.centroids)

In [None]:
# Fit the hand-written k-means to the iris data using three clusters
number_of_clusters = 3
kmeans = KMeans(number_of_clusters)
kmeans.train(df, verbose=False)

In [None]:
# Extract the results
df['cluster'] = kmeans.clusters

# Label a COPY of the centroids for plotting. Assigning the column directly on
# kmeans.centroids would add a string column to the fitted model's own frame.
centroids = kmeans.centroids.assign(cluster='centroid')

# Stack the data points and the labelled centroid rows into one frame
all_df = pd.concat([df, centroids])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Sepal length against sepal width coloured by assigned cluster; the centroid
# rows carry the label 'centroid' and so form their own hue group
g = sns.FacetGrid(data=all_df, hue="cluster", height=5)
g = g.map(sns.scatterplot, "s_len", "s_wid", marker="o")
g.add_legend()

plt.show()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Same axes coloured by the true species label, for comparison with the
# cluster assignment
g = sns.FacetGrid(data=all_df, hue="species", height=5)
g = g.map(plt.scatter, "s_len", "s_wid")
g.add_legend()

plt.show()


In [None]:
# Set up parameters
sil_coefs = list()
distortions = list()
K = [1, 2, 3, 4, 5]

# Cluster on the measurement columns only. `df` may already carry a 'cluster'
# column from an earlier fit; feeding that back in would make the previous
# run's labels a feature of the next one.
features = df.drop(columns=['species', 'cluster'], errors='ignore')

# Test out multiple values for k
for k in K:
    kmeans = KMeans(n_clusters=k)
    kmeans.train(df=features, verbose=False)

    # Extract the results
    df['cluster'] = kmeans.clusters

    # Calculate silhouette coefficient
    s_c = silhouette_coefficient(df=df, clusters=kmeans.clusters, n_clusters=k)

    # Calculate distortion: mean distance from each point to its nearest centroid.
    # Selecting by column name keeps centroid and data columns in the same order.
    centroid_vals = kmeans.centroids[features.columns]
    distortion = np.min(cdist(features, centroid_vals, 'euclidean'), axis=1).mean()

    print("For k={}\tAvg. Sil. Coef: {}\tDistortion: {}".format(k, s_c, distortion))

    # Keep track of cluster size metrics
    distortions.append(distortion)
    sil_coefs.append(s_c)

# Elbow plot
plt.plot(K, distortions, 's-', markersize=8, color='cadetblue', mec='gray')
plt.xlabel('k')
plt.xticks(K)
plt.ylabel('Distortion')
plt.title('Elbow Method for Finding Optimal k')
plt.show()

In [None]:
from sklearn.cluster import KMeans as sklKMeans
from sklearn import datasets

k = 2

# Re-load data set and separate the measurement columns from the species label
iris = get_iris_data('iris.data')
iris_features = iris[iris.columns[:-1]]

# Pin the seed: with init='random' and no random_state the starting centroids,
# and therefore the reported numbers, change from run to run
kmeans = sklKMeans(n_clusters=k, init='random', random_state=10).fit(iris_features.values)

# Calculate silhouette coefficient
s_c = silhouette_coefficient(df=iris, clusters=kmeans.labels_, n_clusters=k)

# Calculate distortion: mean distance from each point to its nearest centroid
distortion = sum(np.min(cdist(iris_features, kmeans.cluster_centers_, 'euclidean'), axis=1)) / iris.shape[0]
print("For k={}\tAvg. Sil. Coef: {}\tDistortion: {}".format(k, s_c, distortion))