Implementasi Python Dengan Dataset CSV

In [1]:
# import the libraries used by this notebook
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm
import time
import itertools

# Read the dataset from a file.
datasetPath = "customer.csv"
# Load the comma-separated file as a numeric array; missing values are filled with NaN.
# NOTE(review): a header row or non-numeric columns would presumably also come back
# as NaN under genfromtxt's default float dtype - confirm against customer.csv.
dataset = np.genfromtxt(datasetPath, delimiter=",", filling_values=np.nan)

# Parameters for k-means clustering.
k = 2 # desired number of clusters
iterationCounter = 0 # iteration counter
input = dataset # input data; NOTE(review): this name shadows Python's built-in input()

# Pick the initial cluster centres at random.
def initCentroid(dataIn, k):
  """Return ``k`` distinct rows of ``dataIn`` to use as starting centroids.

  Row indices are drawn without replacement from NumPy's global random
  state, so seeding ``np.random`` beforehand makes the choice repeatable.
  """
  rowCount = dataIn.shape[0]
  chosenRows = np.random.choice(rowCount, k, replace=False)
  return dataIn[chosenRows]

In [2]:
# Plot the clustering result of one iteration.
def plotClusterResult(listClusterMembers, centroid, iteration, converged):
  """Draw every cluster and its centroid on the shared "result" figure.

  Args:
    listClusterMembers: one sequence of data rows per cluster. Only the first
      two columns of each row are plotted.
    centroid: 2-D array or matrix with one row per cluster; it is read as
      ``centroid[i, 0]`` / ``centroid[i, 1]``.
    iteration: iteration label appended to the plot title (converted with
      ``str``, so both strings and integers are accepted).
    converged: ``0`` shows the figure in interactive mode and pauses briefly;
      ``1`` shows the figure and blocks. For any other value neither the
      legend nor ``show`` is called.
  """
  n = len(listClusterMembers)
  # One colour per cluster, shared by the cluster's points and its centroid so
  # the two can be matched visually.
  colors = cm.rainbow(np.linspace(0, 1, n))
  markers = itertools.cycle(('.', '*', '^', 'x', '+'))
  plt.figure("result")
  plt.clf()
  plt.title("iteration-" + str(iteration))
  for i in range(n):
    # Advance the marker cycle even for an empty cluster so that the marker
    # assigned to each cluster does not depend on which clusters are empty.
    pointMarker = next(markers)
    members = listClusterMembers[i]
    if len(members) == 0:
      continue  # an empty cluster has no points to draw
    points = np.asarray(members).reshape(len(members), -1)
    # `color=` (not `c=`) is the unambiguous way to pass one RGBA colour.
    plt.scatter(points[:, 0], points[:, 1],
                marker=pointMarker, s=100, color=colors[i],
                label="klaster-" + str(i+1))
  for i in range(n):
    plt.scatter(centroid[i, 0], centroid[i, 1], marker=next(markers),
                color=colors[i], label="centroid-" + str(i+1))
  if converged == 0:
    # Intermediate iteration: refresh the figure without blocking.
    plt.legend()
    plt.ion()
    plt.show()
    plt.pause(0.1)
  if converged == 1:
    # Final result: keep the window open until the user closes it.
    plt.legend()
    plt.show(block=True)

In [5]:
# Main k-means routine (plots the clusters after every centroid update).
def kMeans(data, centroidInit):
  """Run k-means on ``data`` until the centroids stop changing.

  The number of clusters is the number of rows in ``centroidInit``. Each pass
  increments the module-level ``iterationCounter``, prints the new centroids
  and, unless the run has converged, plots the clusters via
  ``plotClusterResult`` and sleeps for one second.

  NOTE(review): a later cell in this notebook defines another function that is
  also named ``kMeans``; once that cell has run, this version is shadowed.

  Args:
    data: 2-D array-like, one sample per row.
    centroidInit: 2-D array-like, one initial centroid per row.

  Returns:
    ``(listClusterMembers, centroids)``: a list with one list of member rows
    per cluster, and the final centroids as an ``np.matrix``.
  """
  global iterationCounter
  points = np.asarray(data)
  centroids = np.array(centroidInit, dtype=float)
  # Derive the cluster count from the centroids that were actually passed in;
  # the module-level k can be rebound by other cells and then disagree.
  nCluster = centroids.shape[0]
  # Loop until convergence.
  while True:
    iterationCounter += 1
    # distances[r, c] is the Euclidean distance from sample r to centroid c.
    distances = np.linalg.norm(
        points[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=2)
    # Assign every sample to its nearest centroid.
    labels = np.argmin(distances, axis=1)
    # Recompute each centroid as the mean of its members.
    listClusterMembers = []
    newCentroid = centroids.copy()
    for c in range(nCluster):
      memberRows = points[labels == c]
      listClusterMembers.append(list(memberRows))
      if len(memberRows) > 0:
        newCentroid[c] = memberRows.mean(axis=0)
      # An empty cluster keeps its previous centroid: it has no mean to take.
    print("iter: ", iterationCounter)
    print("centroid: ", newCentroid)
    # Stop once the centroids no longer move. NaN entries are compared as
    # equal, otherwise a NaN centroid would make this test fail forever.
    if np.array_equal(centroids, newCentroid, equal_nan=True):
      break
    # Adopt the new centroids for the next pass.
    centroids = newCentroid
    # Plot the clusters of this iteration.
    plotClusterResult(listClusterMembers, np.asmatrix(centroids),
                      str(iterationCounter), 0)
    time.sleep(1) # one-second pause so each plot can be looked at
  return listClusterMembers, np.asmatrix(centroids)

In [10]:
import numpy as np

def kMeans(data, centroidInit, maxIter=100, tol=1e-4):
    """Cluster ``data`` with k-means, starting from ``centroidInit``.

    Args:
        data: 2-D array-like, one sample per row.
        centroidInit: 2-D array-like, one initial centroid per row; the number
            of rows is the number of clusters. It is copied, not modified.
        maxIter: maximum number of assignment/update passes.
        tol: the run stops once every centroid moves by less than this
            Euclidean distance in one pass.

    Returns:
        ``(clusterResults, centroids)``: the cluster index of every sample and
        the float centroids those indices were computed against.
    """
    # Work in floating point: with integer input the centroid array would
    # inherit an integer dtype and every mean would be silently truncated.
    data = np.asarray(data, dtype=float)
    centroids = np.array(centroidInit, dtype=float)
    # Number of clusters
    k = centroids.shape[0]
    # Cluster index per sample (all zeros if maxIter is 0)
    clusterResults = np.zeros(data.shape[0], dtype=int)

    for iterationCounter in range(maxIter):
        # Distance from every sample to every centroid
        distances = np.linalg.norm(data[:, np.newaxis] - centroids, axis=2)
        # Nearest centroid for every sample
        clusterResults = np.argmin(distances, axis=1)

        # Start from the old centroids so an empty cluster keeps its position
        newCentroids = centroids.copy()
        for i in range(k):
            # Samples currently assigned to cluster i
            clusterData = data[clusterResults == i]
            if len(clusterData) > 0:
                # New centroid is the mean position of the members
                newCentroids[i] = clusterData.mean(axis=0)

        # Debugging: show the centroids of this iteration
        print(f"Iterasi {iterationCounter + 1}")
        print("Centroid Baru:\n", newCentroids)

        # Convergence check: no centroid moved by tol or more
        if np.all(np.linalg.norm(newCentroids - centroids, axis=1) < tol):
            print("K-Means telah konvergen.")
            break

        # Use the new centroids in the next iteration
        centroids = newCentroids

    return clusterResults, centroids


# Example usage
# Input data (100 samples, 2 features)
np.random.seed(42)  # fixed seed so the random draws are repeatable
data = np.random.rand(100, 2)

# Initial centroids: k distinct rows of the data (3 clusters)
# NOTE(review): this rebinds the module-level name k, which the first cell set to 2.
k = 3
centroidInit = data[np.random.choice(data.shape[0], k, replace=False)]

# Run the k-means algorithm
clusterResults, finalCentroids = kMeans(data, centroidInit)

print("\nHasil Akhir:")
print("Centroid Akhir:\n", finalCentroids)


Iterasi 1
Centroid Baru:
 [[0.8039633  0.57026999]
 [0.25937141 0.55876692]
 [0.34600441 0.11071069]]
Iterasi 2
Centroid Baru:
 [[0.81167067 0.56668218]
 [0.20718828 0.66171551]
 [0.38646391 0.16319889]]
Iterasi 3
Centroid Baru:
 [[0.81167067 0.56668218]
 [0.19283281 0.69933275]
 [0.37796131 0.18907176]]
Iterasi 4
Centroid Baru:
 [[0.8039633  0.57026999]
 [0.18668273 0.71210228]
 [0.36787846 0.19320081]]
Iterasi 5
Centroid Baru:
 [[0.8039633  0.57026999]
 [0.18520943 0.72228065]
 [0.36376248 0.20008043]]
Iterasi 6
Centroid Baru:
 [[0.8039633  0.57026999]
 [0.18520943 0.72228065]
 [0.36376248 0.20008043]]
K-Means telah konvergen.

Hasil Akhir:
Centroid Akhir:
 [[0.8039633  0.57026999]
 [0.18520943 0.72228065]
 [0.36376248 0.20008043]]
