<a href="https://colab.research.google.com/github/SoparAwayyy/Machine-Learning/blob/main/Week6/6_Dwi_Saputra_Sopar_Siagian_Reproduce_kode_Sklearn_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MeanShift, estimate_bandwidth
from sklearn.datasets import make_blobs
from sklearn import covariance, cluster, manifold
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import LocallyLinearEmbedding
from sklearn.metrics import silhouette_score
from sklearn.cluster import Birch
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Mount Google Drive inside the Colab runtime so files under /content/drive
# can be read by the cells below. This import only exists on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the customer dataset on the mounted Google Drive.
file_path = '/content/drive/MyDrive/ML DataSet/Mall_Customers.csv'
# Load the CSV into the DataFrame that every clustering cell below reads from.
df = pd.read_csv(file_path)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded DataFrame.
df.info()

In [None]:
# Show the first rows of the DataFrame as a quick sanity check of the load.
df.head()

# **Affinity Propagation Clustering**

In [None]:
# Select the columns used for clustering.
X = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

# Standardize the features so that each one contributes on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Run Affinity Propagation; the exemplars it picks are rows of X_scaled.
af = AffinityPropagation(preference=-50, random_state=0).fit(X_scaled)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)

print("Estimated number of clusters:", n_clusters_)

# silhouette_score raises ValueError unless the number of distinct labels is
# between 2 and n_samples - 1. That can happen here when the algorithm does
# not converge or collapses everything into one cluster, so guard the call
# instead of letting the whole cell fail.
n_distinct_labels = len(np.unique(labels))
if 1 < n_distinct_labels < len(X_scaled):
    print("Silhouette Coefficient:", silhouette_score(X_scaled, labels, metric='sqeuclidean'))
else:
    print("Silhouette Coefficient: undefined (needs at least 2 clusters and fewer clusters than samples)")

# Visualize the clustering on the first two scaled features (Age, Annual Income).
plt.figure(figsize=(10, 8))
colors = plt.cm.viridis(np.linspace(0, 1, n_clusters_))

for k, col in zip(range(n_clusters_), colors):
    class_members = labels == k
    # The cluster center is the exemplar sample chosen by the algorithm.
    cluster_center = X_scaled[cluster_centers_indices[k]]
    plt.scatter(X_scaled[class_members, 0], X_scaled[class_members, 1], color=col, marker=".", label=f'Cluster {k}')
    plt.scatter(cluster_center[0], cluster_center[1], s=200, color=col, marker="o", label=f'Cluster {k} center')
    # Draw a line from the exemplar to each member of its cluster.
    for x in X_scaled[class_members]:
        plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], color=col)

plt.title("Estimated number of clusters: %d" % n_clusters_)
plt.xlabel('Age (scaled)')
plt.ylabel('Annual Income (k$) (scaled)')
# Only build a legend when something was plotted; an empty legend just warns.
if n_clusters_ > 0:
    plt.legend()
plt.show()

# **Mean Shift Clustering**

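> Mean Shift is run on the standardized features with a bandwidth estimated from the data, so the number of clusters is not fixed in advance.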



In [None]:
# Select the columns used for clustering.
X = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

# Standardize the features so that each one contributes on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Estimate the kernel bandwidth from the data instead of hard-coding it.
bandwidth = estimate_bandwidth(X_scaled, quantile=0.2, n_samples=500)

# Fit Mean Shift; the number of clusters is an output of the algorithm.
ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X_scaled)
labels = ms.labels_
cluster_centers = ms.cluster_centers_

labels_unique = np.unique(labels)
n_clusters_ = len(labels_unique)

print("Number of estimated clusters: %d" % n_clusters_)

# Visualize the clustering on the first two scaled features (Age, Annual Income).
plt.figure(figsize=(10, 8))

# Iterate over the labels that actually occur rather than assuming 0..n-1.
for k in labels_unique:
    cluster_points = X_scaled[labels == k]
    plt.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {k}')

# Overlay every cluster center in a single call.
plt.scatter(cluster_centers[:, 0], cluster_centers[:, 1], color='black', marker='x', s=100, label='Centroids')
plt.title("Estimated number of clusters: %d" % n_clusters_)
plt.xlabel('Age (scaled)')
plt.ylabel('Annual Income (k$) (scaled)')
plt.legend()
plt.show()

# **BIRCH Clustering**

In [None]:
# Preview the first rows of the dataset.
print(df.head())

# Feature columns fed to the clustering step.
X = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

# Standardize the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Fit BIRCH; with n_clusters=None the subclusters are returned as-is.
birch = Birch(threshold=0.5, n_clusters=None).fit(X_scaled)

# Keep the per-sample cluster labels and attach them to the DataFrame.
labels = birch.labels_
df['Cluster'] = labels

# Report how many clusters were found and how many samples each one holds.
print("Jumlah cluster yang dihasilkan:", len(set(labels)))
print(df['Cluster'].value_counts())

# Plot income against spending score, colored by cluster.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Cluster',
    palette='viridis',
    legend='full',
    ax=ax,
)
ax.set_title('Clustering menggunakan BIRCH')
ax.set_xlabel('Annual Income (k$)')
ax.set_ylabel('Spending Score (1-100)')
plt.show()

# **K-Means Clustering**

In [None]:
# Preview the first rows of the dataset.
print(df.head())

# Select the features used for clustering.
X = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

# Standardize the features so that each one contributes on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Number of clusters to look for.
n_clusters = 5

# Fit K-Means. n_init is pinned explicitly: its default changed from 10 to
# 'auto' in scikit-learn 1.4 (and 1.2-1.3 warn about it), so leaving it
# implicit makes the result depend on the installed version.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(X_scaled)

# Per-sample cluster labels.
labels = kmeans.labels_

# Attach the labels to the DataFrame (overwrites any earlier 'Cluster' column).
df['Cluster'] = labels

# Report the clustering result.
print("Jumlah cluster yang dihasilkan:", n_clusters)
print(df['Cluster'].value_counts())

# Plot income against spending score, colored by cluster.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Annual Income (k$)', y='Spending Score (1-100)', hue='Cluster', palette='viridis', legend='full')
plt.title('Clustering menggunakan K-Means')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.show()

# **Hierarchical Clustering**

In [None]:
# Preview the first rows of the dataset.
print(df.head())

# Select the features used for clustering.
X = df[['Age', 'Annual Income (k$)', 'Spending Score (1-100)']]

# Standardize the features so that each one contributes on the same scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Number of clusters to cut the hierarchy into.
n_clusters = 5

# Fit agglomerative (hierarchical) clustering with Ward linkage.
# The `affinity` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
# where passing it raises TypeError. Ward linkage only works with Euclidean
# distance, which is the default, so the argument is simply omitted; this
# behaves the same on old and new scikit-learn versions.
hc = AgglomerativeClustering(n_clusters=n_clusters, linkage='ward')
hc.fit(X_scaled)

# Per-sample cluster labels.
labels = hc.labels_

# Attach the labels to the DataFrame (overwrites any earlier 'Cluster' column).
df['Cluster'] = labels

# Report the clustering result.
print("Jumlah cluster yang dihasilkan:", n_clusters)
print(df['Cluster'].value_counts())

# Plot income against spending score, colored by cluster.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='Annual Income (k$)', y='Spending Score (1-100)', hue='Cluster', palette='viridis', legend='full')
plt.title('Clustering menggunakan Hierarchical Clustering')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.show()