# Comp 5130 Project

# K-Means Clustering

## Authors: Kevin Dong, Maci Hadley, Marshall Nelson

### Dr. Yang Zhou


#### Imports

In [2]:
import sys
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random as random



#### Load Dataset  
We are using the simple circles dataset from UC Irvine.

In [27]:
 # Define the file to open
file_path = 'circles.txt'

# Parse the comma-separated file into a NumPy array of points: skip the first
# line (presumably a header -- confirm against the file) and keep only the
# first two columns.
coordinates = np.loadtxt(
    file_path,
    usecols=(0, 1),
    skiprows=1,
    delimiter=',',
)

# Uncomment to inspect the loaded points.
# print("Points:", coordinates)

Points: [[  3.15676 116.12252]
 [ 16.14436  16.8166 ]
 [100.31212  64.99025]
 ...
 [ 81.18879 184.85662]
 [116.47674 116.45223]
 [120.84808  75.07245]]


#### Define K-Means Clustering Function
Self-implemented, without using an external k-means library.


In [31]:
def initialize_centroids(data, k):
    """Pick ``k`` rows of ``data`` at random to serve as starting centroids.

    Parameters
    ----------
    data : array-like
        Points to cluster, one per row. Coerced with ``np.asarray`` so plain
        nested lists are accepted as well as NumPy arrays.
    k : int
        Number of centroids to draw.

    Returns
    -------
    numpy.ndarray
        A new array holding the ``k`` selected rows.

    Raises
    ------
    ValueError
        Propagated from ``np.random.choice`` when ``k`` is larger than the
        number of rows, because sampling is done without replacement.
    """
    data = np.asarray(data)
    # Draw row indices without replacement so the same row is never picked
    # twice as a starting centroid.
    indices = np.random.choice(len(data), k, replace=False)
    centroids = np.array(data[indices])
    return centroids

def assign_clusters(data, centroids):
    """Label every point with the index of its nearest centroid.

    Distances are Euclidean and are computed for all point/centroid pairs in
    one broadcast operation instead of a Python loop over the points.

    Parameters
    ----------
    data : array-like, shape (n_points, n_features)
        Points to label. Coerced with ``np.asarray``.
    centroids : array-like, shape (k, n_features)
        Current centroid positions. Coerced with ``np.asarray``.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``n_points``; entry ``j`` is the row index in
        ``centroids`` closest to ``data[j]``. Ties go to the lowest index
        (``np.argmin`` returns the first minimum).
    """
    data = np.asarray(data)
    centroids = np.asarray(centroids)

    # With no points there is nothing to label; return an integer array so
    # the dtype matches the non-empty case.
    if len(data) == 0:
        return np.empty(0, dtype=np.intp)

    # (n, 1, d) - (1, k, d) broadcasts to (n, k, d): one offset vector per
    # point/centroid pair.
    offsets = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
    distances = np.linalg.norm(offsets, axis=2)
    return np.argmin(distances, axis=1)

def update_centroids(data, clusters, k):
    """Recompute each of the ``k`` centroids from its assigned points.

    For every label in ``range(k)`` the new centroid is the mean of the rows
    of ``data`` whose entry in ``clusters`` equals that label. A label with no
    assigned rows is re-seeded with one row of ``data`` chosen at random via
    ``np.random.choice``.

    Returns the new centroids stacked into a single array, one per label.
    """
    def _centroid_for(label):
        members = data[clusters == label]
        if len(members) == 0:
            # Empty cluster: fall back to a randomly chosen data point.
            return data[np.random.choice(len(data))]
        return members.mean(axis=0)

    return np.array([_centroid_for(label) for label in range(k)])

def euclidian_distance(record_a, record_b):
    """Return the Euclidean distance between two records.

    The result is the square root of the sum of the squared element-wise
    differences ``record_a - record_b``.
    """
    squared_gaps = np.square(record_a - record_b)
    return np.sqrt(np.sum(squared_gaps))
        
            

#### Perform K-Means on the dataset  

In [32]:
def kmeans(data, k, max_iters=100, tol=1e-4):
    """Cluster ``data`` into ``k`` groups by alternating assign/update steps.

    Parameters
    ----------
    data : numpy.ndarray
        Points to cluster, one per row.
    k : int
        Number of clusters.
    max_iters : int, optional
        Upper bound on the number of assign/update rounds.
    tol : float, optional
        Convergence threshold: iteration stops once every coordinate of every
        centroid moves by less than ``tol`` in one update.

    Returns
    -------
    tuple
        ``(clusters, centroids)`` where ``clusters`` holds the centroid index
        assigned to each point and ``centroids`` holds the centroid
        positions those labels were computed against.

    Notes
    -----
    Prints a message with the iteration count when convergence is reached.
    """
    # Initialize centroids
    centroids = initialize_centroids(data, k)
    for i in range(max_iters):
        # Assign clusters
        clusters = assign_clusters(data, centroids)
        # Update centroids
        new_centroids = update_centroids(data, clusters, k)
        # Check for convergence
        if np.all(np.abs(new_centroids - centroids) < tol):
            print(f"Converged after {i+1} iterations.")
            break
        centroids = new_centroids
    else:
        # The loop ran out without converging (or never ran because
        # max_iters <= 0). `centroids` now holds the latest update, so the
        # labels must be recomputed to match it; this also guarantees
        # `clusters` is bound when the loop body never executed.
        clusters = assign_clusters(data, centroids)
    return clusters, centroids

In [34]:
# Seed NumPy's global RNG so the random choices made during clustering are
# the same on every Restart & Run All. The value itself is arbitrary.
np.random.seed(42)

# Number of clusters to fit.
k = 100
clusters, centroids = kmeans(coordinates, k)

Converged after 15 iterations.
