# Comp 5130 Project

# K-Means Clustering

## Authors: Kevin Dong, Maci Hadley, Marshall Nelson

### Dr. Yang Zhou


#### Imports

In [2]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random as random



#### Load Dataset  
We are using the simple circles dataset from UC Irvine.

In [27]:
 # Define the file to open
file_path = 'circles.txt'

# Parse the comma-separated file into an array of points: the first row is
# skipped (skiprows=1) and only the first two columns are kept.
coordinates = np.loadtxt(
    file_path,
    delimiter=',',
    skiprows=1,
    usecols=(0, 1),
)

# Debug aid, disabled by default: dump the loaded points.
# print("Points:", coordinates)

Points: [[  3.15676 116.12252]
 [ 16.14436  16.8166 ]
 [100.31212  64.99025]
 ...
 [ 81.18879 184.85662]
 [116.47674 116.45223]
 [120.84808  75.07245]]


#### Define K-Means Clustering Function
Self-implemented, without using an external k-means library.


In [31]:
def initialize_centroids(data, k):
    """Pick ``k`` distinct rows of ``data`` to serve as the starting centroids.

    Args:
        data: Indexable array of points; rows are selected with an integer
            index array.
        k: Number of centroids to draw.

    Returns:
        A new array holding the ``k`` selected rows (a copy, not a view).

    Raises:
        ValueError: Raised by ``np.random.choice`` when ``k`` exceeds
            ``len(data)``, because rows are drawn without replacement.
    """
    # Sampling without replacement guarantees no row index is picked twice.
    chosen_rows = np.random.choice(len(data), size=k, replace=False)
    return np.array(data[chosen_rows])

def assign_clusters(data, centroids):
    """Label every point with the index of its nearest centroid.

    Distances are Euclidean. When a point is equally close to several
    centroids, the lowest centroid index wins (``np.argmin`` semantics).

    Args:
        data: Array-like of shape (n_points, n_features).
        centroids: Array-like of shape (k, n_features).

    Returns:
        Integer array of shape (n_points,) whose i-th entry is the row index
        in ``centroids`` closest to ``data[i]``. An empty ``data`` of shape
        (0, n_features) yields an empty integer array.

    Note:
        All distances are computed in a single broadcast operation, which
        allocates a temporary of shape (n_points, k, n_features).
    """
    points = np.asarray(data)
    centers = np.asarray(centroids)
    # (n, 1, d) - (1, k, d) -> (n, k, d): every point-to-centroid offset at
    # once, replacing the per-point Python loop.
    offsets = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
    distances = np.linalg.norm(offsets, axis=2)
    return np.argmin(distances, axis=1)

def update_centroids(data, clusters, k):
    """Recompute each centroid as the mean of the points assigned to it.

    Args:
        data: Array of points, one row per point.
        clusters: Array of cluster labels, aligned with the rows of ``data``.
        k: Number of clusters; labels ``0 .. k-1`` are processed in order.

    Returns:
        Array with one row per cluster. A cluster with no assigned points
        gets a randomly chosen row of ``data`` as its centroid instead of a
        mean.
    """
    recomputed = []
    for label in range(k):
        members = data[clusters == label]
        if len(members) == 0:
            # Empty cluster: re-seed its centroid from a random data point.
            fallback_row = np.random.choice(len(data))
            recomputed.append(data[fallback_row])
            continue
        recomputed.append(np.mean(members, axis=0))
    return np.array(recomputed)

        
            

#### Perform K-Means on the dataset  

In [32]:
def kmeans(data, k, max_iters=100, tol=1e-4):
    """Cluster ``data`` into ``k`` groups by alternating assignment and update steps.

    Starting from centroids chosen by ``initialize_centroids``, each
    iteration assigns every point to its nearest centroid and then recomputes
    the centroids. Iteration stops early once every centroid coordinate moves
    by less than ``tol`` in a single update.

    Args:
        data: Array of points, one row per point.
        k: Number of clusters.
        max_iters: Maximum number of assign/update iterations.
        tol: Per-coordinate movement threshold below which the centroids are
            considered converged.

    Returns:
        Tuple ``(clusters, centroids)``: the cluster label of each point and
        the centroid array those labels were computed against.

    Side effects:
        Prints a message with the iteration count when convergence is reached.
    """
    # Initialize centroids
    centroids = initialize_centroids(data, k)
    for i in range(max_iters):
        # Assign clusters
        clusters = assign_clusters(data, centroids)
        # Update centroids
        new_centroids = update_centroids(data, clusters, k)
        # Check for convergence: on success the labels above already match
        # `centroids`, so the pair is returned as-is.
        if np.all(np.abs(new_centroids - centroids) < tol):
            print(f"Converged after {i+1} iterations.")
            break
        centroids = new_centroids
    else:
        # The loop ran out of iterations (or max_iters was 0). `centroids`
        # now holds the latest update, but any existing labels were computed
        # against the previous centroids, so reassign once to keep the
        # returned labels consistent with the returned centroids.
        clusters = assign_clusters(data, centroids)
    return clusters, centroids

In [36]:
# Number of clusters to fit.
k = 100
clusters, centroids = kmeans(data=coordinates, k=k)

# Show the fitted centroids first, then the per-point cluster labels.
print(centroids, clusters, sep="\n")

Converged after 15 iterations.
[[ 1.59152447e+02  1.79933850e+02]
 [ 1.82630991e+02  1.61392822e+02]
 [ 4.02920262e+01  5.44020455e+01]
 [ 1.40026865e+02  1.31929394e+02]
 [ 3.96761900e+01  7.96235782e+01]
 [ 1.79875178e+02  9.99519463e+01]
 [ 1.79891007e+02  5.36377576e+01]
 [ 1.16271057e+02  1.17381537e+02]
 [ 1.36385668e+02  2.23176277e+01]
 [ 1.36418452e+02  1.01751587e+02]
 [-9.27741000e-02  1.40244033e+02]
 [ 4.02858017e+01  1.19580183e+02]
 [ 1.80302278e+02  8.00208115e+01]
 [ 4.32199909e+01  9.89102002e+01]
 [ 9.97575962e+01  2.00270981e+01]
 [ 2.01760978e+01  1.20058594e+02]
 [ 6.01548000e-02  5.96413229e+01]
 [ 6.04726321e+01  7.99657609e+01]
 [ 1.20283778e+02  1.69903934e+02]
 [ 2.01402318e+01  3.36777831e+00]
 [ 1.59541968e+02  6.03387535e+01]
 [-1.66649000e-01  1.39901100e-01]
 [ 6.01209191e+01  6.03748878e+01]
 [ 1.39710069e+02  6.06412753e+01]
 [ 1.19498603e+02  5.94612926e+01]
 [ 1.76308668e+02  1.59106212e+02]
 [ 8.01644809e+01 -4.53388800e-01]
 [ 1.43242202e+02  2.260