In [1]:
import matplotlib
import matplotlib.pyplot as plt
import math
import random
import os

In [2]:
matplotlib.use('Agg') 

In [3]:
def read_data(filename):
    points = []
    with open(filename, 'r') as f:
        for line in f:
            parts = line.split()
            if len(parts)>=2:
                points.append([int(parts[0]), int(parts[0])])
    return points

In [4]:
def filter_data(points, keep_percentage):
    if keep_percentage >= 100:
        return points
    count = int(len(points)*(keep_percentage/100))
    return random.sample(points, count)

In [5]:
full_data = read_data('unbalance.txt')
partial_data = filter_data(full_data, 1)
print(partial_data)

[[525797, 525797], [179169, 179169], [152133, 152133], [171678, 171678], [212286, 212286], [179440, 179440], [171715, 171715], [152646, 152646], [213931, 213931], [151022, 151022], [147282, 147282], [440743, 440743], [149504, 149504], [179540, 179540], [205390, 205390], [182442, 182442], [214523, 214523], [209654, 209654], [208573, 208573], [180335, 180335], [181400, 181400], [206857, 206857], [175371, 175371], [217332, 217332], [213131, 213131], [153181, 153181], [209089, 209089], [176770, 176770], [153307, 153307], [182342, 182342], [182508, 182508], [204756, 204756], [474062, 474062], [178479, 178479], [530296, 530296], [146819, 146819], [179491, 179491], [151784, 151784], [173760, 173760], [147551, 147551], [177795, 177795], [209078, 209078], [205923, 205923], [153503, 153503], [208996, 208996], [210975, 210975], [151372, 151372], [177748, 177748], [208949, 208949], [204437, 204437], [148286, 148286], [145533, 145533], [186111, 186111], [208521, 208521], [146343, 146343], [152732, 

In [6]:
def euclidean_dist_sq(p1, p2):
    return (p1[0]-p2[0])**2 + (p1[1]-p2[1])**2

In [7]:
def get_closest_centroid(point, centroids):
    min_dist = float('inf')
    index = -1
    for i, c in enumerate(centroids):
        d = euclidean_dist_sq(point, c)
        if d < min_dist:
            min_dist = d
            index = i
    return index, min_dist

In [8]:
test_cents = [[0,0], [10,10]]
get_closest_centroid([1,1],test_cents)

(0, 2)

#### The k-mean core algorithm 

In [9]:
def init_random(points, k):
    return random.sample(points, k)

def assign_clusters(points, centroids):
    clusters = [[] for _ in centroids]
    total_wcss = 0
    for p in points:
        idx, dist_sq = get_closest_centroid(p, centroids)
        clusters[idx].append(p)
        total_wcss += dist_sq
    return clusters, total_wcss

def update_centroids(clusters):
    new_centroids = []
    for cluster in clusters:
        if not cluster: continue
        sum_x = sum(p[0] for p in cluster)
        sum_y = sum(p[1] for p in cluster)
        n = len(cluster)
        new_centroids.append([sum_x / n, sum_y / n])
    return new_centroids

In [10]:
def kmeans_algorithm(points, k, max_iters = 100):
    centroids = init_random(points, k)
    for _ in range(max_iters):
        clusters, wcss = assign_clusters(points, centroids)
        new_centroids = update_centroids(clusters)
        if new_centroids == centroids:
            break
        centroids = new_centroids
    return clusters, centroids, wcss

In [11]:
def generate_elbow_plot(data, max_k = 20):
    wcss_history = []
    k_range = range(1, max_k + 1)
    
    for k in k_range:
        _, _, wcss = kmeans_algorithm(data, k)
        wcss_history.append(wcss)
        print(f"  k={k}: Error={int(wcss)}")

    plt.figure(figsize=(10, 6))
    plt.plot(k_range, wcss_history, 'bx-')
    plt.xlabel('k (Number of clusters)')
    plt.ylabel('WCSS Error')
    plt.title('The Elbow Method')    
    plt.savefig('elbow_curve.png')
    plt.close()
    print("Saved 'elbow_curve.png'.")

In [23]:
generate_elbow_plot(sample_data, 10)

  k=1: Error=17734001337707
  k=2: Error=1884043174653
  k=3: Error=709191938630
  k=4: Error=351098826652
  k=5: Error=345575290431
  k=6: Error=341596224055
  k=7: Error=343450803625
  k=8: Error=334689239565
  k=9: Error=334393081639
  k=10: Error=332430942110
Saved 'elbow_curve.png'.


In [37]:
def plot_results(clusters, centroids, k):
    print("Plotting Clusters...")
    plt.figure(figsize=(10, 8))
    colors = ['r', 'g', 'b', 'c', 'm', 'y', 'orange', 'purple', 'brown']
    
    for i, cluster in enumerate(clusters):
        if not cluster: continue
        #p_clus = random.sample(cluster, 1000) if len(cluster) > 1000 else cluster
        p_clus = cluster
        
        x = [p[0] for p in p_clus]
        y = [p[1] for p in p_clus]
        col = colors[i % len(colors)]
        plt.plot(x, y, 'o', markersize=4, color=col, alpha=0.6, label=f'C{i+1}')

    cx = [c[0] for c in centroids]
    cy = [c[1] for c in centroids]
    plt.plot(cx, cy, 'X', markersize=15, color='black', label='Centroids')
    
    plt.title(f"k-Means (k={k})")
    plt.legend()
    plt.savefig('kmeans_final5.png')
    plt.close()
    print("Saved 'kmeans_final.png'. Done!")

In [39]:
k = 2
runs = 5
data = read_data('unbalance.txt')

best_wcss = float('inf')
best_clusters = None
best_centroids = None

print(f"Running k-Means (k={k}) with {runs} restarts...")
for i in range(runs):
    clusters, cents, wcss = kmeans_algorithm(data, k)
    if wcss < best_wcss:
        best_wcss = wcss
        best_clusters = clusters
        best_centroids = cents

print(f"Best Run WCSS: {int(best_wcss)}")
plot_results(best_clusters, best_centroids, k)

Running k-Means (k=2) with 5 restarts...
Best Run WCSS: 7011360115400
Plotting Clusters...
Saved 'kmeans_final.png'. Done!
