<a href="https://colab.research.google.com/github/WilliamPoe/CSCI-290/blob/main/notebooks/k_means_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [28]:
class KMeans_clustering:

    def __init__(self, k=3, max_iterations=100):
        self.k = k
        self.max_iterations = max_iterations

    def euclidean_distance(self, point1, point2): # Calculate the euclidean distance
        return np.sqrt(np.sum((np.array(point1) - np.array(point2))**2))

    def ran_centroids(self, data): # Picks random centroids from dataset
        ran_centroids = data.sample(self.k).index
        return data.loc[ran_centroids]

    def calc_centroids(self, clusters, data): # Calculate the new centroids from the clusters
        centroids = pd.DataFrame(columns=data.columns, index=range(self.k))
        for i, cluster in enumerate(clusters):
            centroids.iloc[i] = np.mean(data.iloc[[idx for idx in cluster]], axis=0)
        return centroids

    def distances(self, point, centroids): # Calculate the distance between the data and the centroids using euclidean distances function
        distances = [self.euclidean_distance(point, centroid) for _, centroid in centroids.iterrows()]
        return np.argmin(distances)

    def clustering(self, data, centroids): # Clusters the data together based off of the calculated distances
        clusters = [[] for _ in range(self.k)]
        labels = np.empty(self.n_samples)
        for idx in range(data.shape[0]):
            point = data.iloc[idx]
            cluster_index = self.distances(point, centroids)
            clusters[cluster_index].append(idx)
            labels[idx] = cluster_index
        return clusters, labels

    def fit(self, data): # Fits the data
      self.n_samples, self.n_features = data.shape[0], data.shape[1]
      print('The number of samples is:',self.n_samples)
      print('The number of features is:',self.n_features)
      self.centroids = self.ran_centroids(data)

      for iteration in range(self.max_iterations):
        self.clusters, self.labels = self.clustering(data, self.centroids)
        cur_centroids = self.calc_centroids(self.clusters, data)

        #if (iteration / 10) % 1 == 0 or iteration == 1: if you don't want to plot every iteration
        print('CLUSTERS:', self.clusters)

        for cluster in range(self.k):
          for point in self.clusters[cluster]:
            inertia = inertia + np.sum((data.iloc[point] - cur_centroids[cluster])**2)
        #print('INERTIA:', inertia)

        plt.figure(figsize=(5, 3))

        # Plots the data points with cluster colors
        sns.scatterplot( x = data.iloc[:, 0], y = data.iloc[:, 1], hue = self.labels, palette='viridis')

        # Plots the centroids with a different color
        print('test1:',cur_centroids.iloc[:, 0])
        print('test2:',cur_centroids.iloc[:, 1])
        sns.scatterplot( x = cur_centroids.iloc[:, 0], y = cur_centroids.iloc[:, 1], c = "black", marker="X", s = 100);

        plt.title(f'Cluster Visualization - Iteration {iteration}')
        plt.xlabel('Feature 1')
        plt.ylabel('Feature 2')

        plt.show()

        if self.centroids.equals(cur_centroids.set_index(self.centroids.index)):
          print('The centroids have converged')
          print('The number of iterations is:',iteration)
          break
        self.centroids = cur_centroids

In [29]:
#df = pd.read_csv('https://github.com/WilliamPoe/CSCI-290/raw/refs/heads/main/Data/penguins.csv')
df = pd.read_csv('https://raw.githubusercontent.com/WilliamPoe/CSCI-290/refs/heads/main/Data/Iris.csv')

def plot_clusters(data, labels, centroids):
    plt.figure(figsize=(5, 3))

    # Plots the data points with cluster colors
    sns.scatterplot( x = data.iloc[:, 0], y = data.iloc[:, 1], hue = labels, palette='viridis', s=50)

    # Plots the centroids with a different color
    sns.scatterplot( x = centroids.iloc[:, 0], y = centroids.iloc[:, 1], c = "black", marker="x", s = 100);

    plt.title('Cluster Visualization')
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')

    plt.show()

numeric_df = df.select_dtypes(include=[np.number])

#numeric_df = numeric_df.drop(columns='year')
numeric_df = numeric_df.drop(columns='Id')
numeric_df = numeric_df.dropna()
print(numeric_df.dtypes)


model = KMeans_clustering(k=3)
model.fit(numeric_df)
labels = model.labels
centroids = model.centroids


plot_clusters(numeric_df, labels, centroids)


SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
dtype: object
The number of samples is: 150
The number of features is: 4
CLUSTERS: [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 98], [53, 57, 59, 60, 68, 80, 81, 83, 89, 90, 93, 101, 106, 113, 114, 119, 121, 128, 132, 134, 142, 146], [50, 51, 52, 54, 55, 56, 58, 61, 62, 63, 64, 65, 66, 67, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 82, 84, 85, 86, 87, 88, 91, 92, 94, 95, 96, 97, 99, 100, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 115, 116, 117, 118, 120, 122, 123, 124, 125, 126, 127, 129, 130, 131, 133, 135, 136, 137, 138, 139, 140, 141, 143, 144, 145, 147, 148, 149]]


UnboundLocalError: local variable 'inertia' referenced before assignment

In [49]:
numeric_df

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
0,39.1,18.7,181.0,3750.0
1,39.5,17.4,186.0,3800.0
2,40.3,18.0,195.0,3250.0
4,36.7,19.3,193.0,3450.0
5,39.3,20.6,190.0,3650.0
...,...,...,...,...
339,55.8,19.8,207.0,4000.0
340,43.5,18.1,202.0,3400.0
341,49.6,18.2,193.0,3775.0
342,50.8,19.0,210.0,4100.0
