<a href="https://colab.research.google.com/github/raihanewubd/CSE457/blob/main/Lab_2_Unsupervised_learning_Clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Dataset

In [None]:
!gdown --id 1nLdjq_y0hJ4_A-kH6MZb9x-GDk4sNrRY

# Preprocessing and Feature Engineering

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA

# Load the adult.csv dataset and preview its first rows.
# (Only the last expression of a cell is displayed, so a single head() call is enough.)
df = pd.read_csv('adult.csv')
df.head()

In [None]:
# Treat '?' placeholders as missing values. The pattern tolerates surrounding
# whitespace, so both ' ?' and '?' are recognised; numeric columns are unaffected.
df = df.replace(r'^\s*\?\s*$', np.nan, regex=True)

# Drop rows with missing values (rebinding instead of inplace keeps the cell re-runnable)
df = df.dropna()

# Drop the 'income' column as it is not needed for unsupervised learning
df_unsupervised = df.drop(columns=['income'])

# Split the features into categorical and numerical
categorical_features = df_unsupervised.select_dtypes(include=['object']).columns
numerical_features = df_unsupervised.select_dtypes(include=['int64', 'float64']).columns

# Standard scaling for numerical features only
scaler = StandardScaler()
scaled_numerical_data = scaler.fit_transform(df_unsupervised[numerical_features])

# One-hot encoding for categorical features only
# (drop='first' omits one level per feature to avoid redundant columns)
encoder = OneHotEncoder(drop='first')
encoded_categorical_data = encoder.fit_transform(df_unsupervised[categorical_features])

# Concatenate the scaled numerical and encoded categorical data;
# the encoder output is sparse, so it is densified before stacking
processed_data = np.hstack([scaled_numerical_data, encoded_categorical_data.toarray()])

# Convert to DataFrame with column names in the same order as the stacked blocks
final_columns = numerical_features.tolist() + encoder.get_feature_names_out(categorical_features).tolist()
final_df = pd.DataFrame(processed_data, columns=final_columns)

# Persist the processed features for later use
final_df.to_csv('processed_data.csv', index=False)

final_df.head()


# Apply Clustering

In [None]:
!gdown --id 1cO2SQHvkg1SWUJgk4FU_mVx_l_NSnrn8

In [None]:
# Load the newly provided, already-preprocessed dataset
file_path = 'processed_data_adults.csv'
processed_data_df = pd.read_csv(file_path)

# Display the first few rows to understand the structure
processed_data_df.head()


**Elbow Method**
The Elbow Method can help identify the optimal number of clusters for K-Means by plotting the Within-Cluster Sum of Squares (WCSS) for different values of $k$ and looking for an "elbow" point where the rate of decrease sharply diminishes. This point suggests a good balance between cluster compactness and complexity.

In [None]:
# Elbow method: fit K-Means for K = 1..10 and record each model's inertia,
# which is the within-cluster sum of squares (WCSS)
k_range = range(1, 11)
wcss = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(processed_data_df)
    wcss.append(kmeans.inertia_)

# Draw WCSS against K; the "elbow" is where the curve starts to flatten
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_range, wcss, marker='o', linestyle='-', color='b')
ax.set_title("Elbow Method for Optimal K in K-Means")
ax.set_xlabel("Number of Clusters (K)")
ax.set_ylabel("Within-Cluster Sum of Squares (WCSS)")
ax.set_xticks(k_range)
plt.show()


In [None]:
# Define number of clusters for K-Means
n_clusters = 8

# Columns this notebook derives from the features. They must never be fed back in as
# model inputs; errors='ignore' lets the cell run whether or not they exist yet, so
# re-running the cell gives the same result as the first run.
derived_cols = ['KMeans_Cluster', 'DBSCAN_Cluster', 'GMM_Cluster',
                'PCA1', 'PCA2', 'tSNE1', 'tSNE2']
data = processed_data_df.drop(columns=derived_cols, errors='ignore')

# Apply K-Means clustering on the feature matrix only
kmeans = KMeans(n_clusters=n_clusters, max_iter=30, random_state=42)
processed_data_df['KMeans_Cluster'] = kmeans.fit_predict(data)

# Apply DBSCAN clustering on the same feature matrix
# (previously the frame already held 'KMeans_Cluster', leaking those labels into DBSCAN)
dbscan = DBSCAN(eps=0.5, min_samples=5)
processed_data_df['DBSCAN_Cluster'] = dbscan.fit_predict(data)

# PCA for 2D visualization
pca = PCA(n_components=2, random_state=42)
pca_result = pca.fit_transform(data)
processed_data_df['PCA1'] = pca_result[:, 0]
processed_data_df['PCA2'] = pca_result[:, 1]

# Calculate evaluation scores on the feature matrix.
# NOTE: DBSCAN marks noise points with label -1; the scores below treat that label
# as one more cluster.
silhouette_kmeans = silhouette_score(data, processed_data_df['KMeans_Cluster'])
silhouette_dbscan = silhouette_score(data, processed_data_df['DBSCAN_Cluster'])
dbi_kmeans = davies_bouldin_score(data, processed_data_df['KMeans_Cluster'])
dbi_dbscan = davies_bouldin_score(data, processed_data_df['DBSCAN_Cluster'])

# Print evaluation scores
print(f"Silhouette Score (K-Means): {silhouette_kmeans}")
print(f"Silhouette Score (DBSCAN): {silhouette_dbscan}")
print(f"Davies-Bouldin Index (K-Means): {dbi_kmeans}")
print(f"Davies-Bouldin Index (DBSCAN): {dbi_dbscan}")




**1. Silhouette Score**

The Silhouette Score measures how similar an object is to its own cluster (cohesion) compared to other clusters (separation). It ranges from -1 to +1:

* +1: Indicates that samples are far from neighboring clusters, suggesting well-separated clusters.
* 0: Indicates that samples are close to the boundary between clusters.
* -1: Indicates that samples may have been assigned to the wrong cluster, with distances closer to other clusters than their own.

**Interpretation of Your Values**

* K-Means Silhouette Score: 0.117
* DBSCAN Silhouette Score: -0.385

Only the DBSCAN score is negative, suggesting that many of its samples are closer to a cluster other than the one they were assigned to. The K-Means score is positive but close to 0, which indicates overlapping, poorly separated clusters. By this metric DBSCAN performs worse than K-Means here. A higher (closer to +1) score is preferable.

**2. Davies-Bouldin Index**

The Davies-Bouldin Index (DBI) assesses the average "similarity" ratio of each cluster with its most similar cluster. This index is non-negative, where:

* 0: Perfect score, indicating clusters are compact and well-separated.
* Higher values indicate worse clustering, with clusters that overlap or have high within-cluster spread.

**Interpretation of Your Values**

* K-Means DBI: 11.03
* DBSCAN DBI: 1.53

The K-Means DBI is quite high, suggesting large overlap and poor separation. The DBSCAN score is considerably better (lower), suggesting it may have produced slightly more compact clusters.

**Summary: Good vs. Bad Clustering**

* Silhouette Score: Closer to +1 is better; scores near 0 or negative suggest poor clustering.
* Davies-Bouldin Index: Lower values are better, ideally approaching 0.

Given these criteria:

* DBSCAN has the better Davies-Bouldin Index, meaning its clusters are more compact and less overlapping.
* K-Means has a Silhouette Score close to 0 and DBSCAN has a negative one, indicating that the clustering structure is not clearly defined in either case.

# Visualization with PCA

In [None]:
# Side-by-side PCA scatter plots, one panel per clustering algorithm,
# with points coloured by the cluster label each algorithm assigned
pca_panels = [
    ('KMeans_Cluster', "PCA Visualization of K-Means Clusters"),
    ('DBSCAN_Cluster', "PCA Visualization of DBSCAN Clusters"),
]

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
for ax, (label_col, panel_title) in zip(axes, pca_panels):
    sns.scatterplot(data=processed_data_df, x='PCA1', y='PCA2', hue=label_col,
                    palette='viridis', s=60, alpha=0.7, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel("PCA Component 1")
    ax.set_ylabel("PCA Component 2")

plt.tight_layout()
plt.show()


# GMM

In [None]:
from sklearn.mixture import GaussianMixture

# Feature matrix for GMM: drop every column this notebook derives from the features.
# errors='ignore' keeps the cell re-runnable (on a re-run 'GMM_Cluster' and the t-SNE
# columns already exist and must not be used as inputs).
gmm_features = processed_data_df.drop(
    columns=['KMeans_Cluster', 'DBSCAN_Cluster', 'GMM_Cluster',
             'PCA1', 'PCA2', 'tSNE1', 'tSNE2'],
    errors='ignore',
)

# Apply Gaussian Mixture Model with the same number of components as K-Means clusters
# (n_clusters, set in the K-Means cell above)
gmm = GaussianMixture(n_components=n_clusters, random_state=42)
processed_data_df['GMM_Cluster'] = gmm.fit_predict(gmm_features)

# Calculate silhouette and Davies-Bouldin scores for GMM on the same feature matrix
silhouette_gmm = silhouette_score(gmm_features, processed_data_df['GMM_Cluster'])
dbi_gmm = davies_bouldin_score(gmm_features, processed_data_df['GMM_Cluster'])

# Print the evaluation scores for GMM
print(f"Silhouette Score (GMM): {silhouette_gmm}")
print(f"Davies-Bouldin Index (GMM): {dbi_gmm}")

# PCA visualization for GMM clusters (reuses the PCA1/PCA2 columns computed earlier)
plt.figure(figsize=(7, 6))
sns.scatterplot(data=processed_data_df, x='PCA1', y='PCA2', hue='GMM_Cluster', palette='viridis', s=60, alpha=0.7)
plt.title("PCA Visualization of GMM Clusters")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.show()


The Gaussian Mixture Model (GMM) clustering results are as follows:

* Silhouette Score: 0.112, indicating weak cohesion and separation within clusters.
* Davies-Bouldin Index: 2.39, suggesting room for improvement in cluster definition.

# Visualization using t-SNE
(Takes ~30min to complete)

In [None]:
from sklearn.manifold import TSNE

# Feature matrix for t-SNE: exclude every derived column. Including 'tSNE1'/'tSNE2'
# with errors='ignore' means a re-run does not feed the previous embedding back in.
tsne_features = processed_data_df.drop(
    columns=['KMeans_Cluster', 'DBSCAN_Cluster', 'GMM_Cluster',
             'PCA1', 'PCA2', 'tSNE1', 'tSNE2'],
    errors='ignore',
)

# Apply t-SNE for 2D visualization (slow on the full dataset)
tsne = TSNE(n_components=2, random_state=42)
tsne_result = tsne.fit_transform(tsne_features)

# Add t-SNE results to the DataFrame
processed_data_df['tSNE1'] = tsne_result[:, 0]
processed_data_df['tSNE2'] = tsne_result[:, 1]

# One panel per algorithm: (cluster-label column, panel title)
tsne_panels = [
    ('KMeans_Cluster', "t-SNE Visualization of K-Means Clusters"),
    ('DBSCAN_Cluster', "t-SNE Visualization of DBSCAN Clusters"),
    ('GMM_Cluster', "t-SNE Visualization of GMM Clusters"),
]

# Plot t-SNE visualization for K-Means, DBSCAN, and GMM clusters
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
for ax, (label_col, panel_title) in zip(axes, tsne_panels):
    sns.scatterplot(data=processed_data_df, x='tSNE1', y='tSNE2', hue=label_col,
                    palette='viridis', s=60, alpha=0.7, ax=ax)
    ax.set_title(panel_title)
    ax.set_xlabel("t-SNE Component 1")
    ax.set_ylabel("t-SNE Component 2")

plt.tight_layout()
plt.show()


# Exercise

* Apply K-Means, DBSCAN and GMM over the following dataset
* Visualize using PCA

In [None]:
!gdown --id 1Q6pdhzWFu2oegWMPvrOE8dWTra8FJsTf

In [None]:
# Read the exercise dataset into `df` (rebinding the name used for the adult data above)
# and preview its first rows
mall_csv_path = "Mall_Customers.csv"
df = pd.read_csv(mall_csv_path)
df.head()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture

Step 3: Preprocess Data

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Drop the CustomerID column. errors='ignore' lets the cell be re-run after the
# column is already gone instead of raising KeyError.
df = df.drop(columns="CustomerID", errors="ignore")

# Encode Gender as integers; LabelEncoder numbers the sorted classes, so Female=0, Male=1
le = LabelEncoder()
df['Gender'] = le.fit_transform(df['Gender'])

# Standardize every remaining column to zero mean and unit variance
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df)

In [None]:
# Show the column labels that remain after preprocessing (CustomerID is dropped in the previous cell)
df.columns

PCA for Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
# Project the standardized features onto their first two principal components
# and wrap the result in a DataFrame with named columns for plotting
pca = PCA(n_components=2)
df_pca = pd.DataFrame(pca.fit_transform(df_scaled), columns=['PC1', 'PC2'])

K-Means Clustering

In [None]:
# Reduce features to 2D for visualization
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df_scaled)  # df_scaled from preprocessing step

# Convert to DataFrame for plotting
df_pca = pd.DataFrame(df_pca, columns=['PC1', 'PC2'])

# Cluster labels are NOT attached here: kmeans_labels, dbscan_labels and gmm_labels do
# not exist yet at this point of a fresh top-to-bottom run (referencing them raised
# NameError). Each clustering cell below adds its own label column to df_pca right
# after fitting its model.

In [None]:
# Cluster the standardized data with K-Means (5 clusters chosen as an example)
# and store the labels next to the PCA coordinates
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans_labels = kmeans.fit_predict(df_scaled)
df_pca['KMeans_Cluster'] = kmeans_labels

# Scatter the PCA projection, coloured by K-Means cluster
fig, ax = plt.subplots(figsize=(7,5))
sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue='KMeans_Cluster', palette='Set2', s=60, ax=ax)
ax.set_title("K-Means Clusters (PCA)")
plt.show()

DBSCAN Clustering

In [None]:
# Cluster the standardized data with DBSCAN and store the labels next to the
# PCA coordinates (eps / min_samples may need tuning)
dbscan = DBSCAN(eps=1.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(df_scaled)
df_pca['DBSCAN_Cluster'] = dbscan_labels

# Scatter the PCA projection, coloured by DBSCAN cluster
fig, ax = plt.subplots(figsize=(7,5))
sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue='DBSCAN_Cluster', palette='Set1', s=60, ax=ax)
ax.set_title("DBSCAN Clusters (PCA)")
plt.show()

GMM Clustering

In [None]:
# Fit a 5-component Gaussian Mixture on the standardized data and store the
# predicted component of each row next to the PCA coordinates
gmm = GaussianMixture(n_components=5, random_state=42)
gmm_labels = gmm.fit_predict(df_scaled)
df_pca['GMM_Cluster'] = gmm_labels

# Scatter the PCA projection, coloured by GMM component
fig, ax = plt.subplots(figsize=(7,5))
sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue='GMM_Cluster', palette='Set3', s=60, ax=ax)
ax.set_title("GMM Clusters (PCA)")
plt.show()

Cluster Summary

In [None]:
# Cluster-label columns added by this cell; they are identifiers, not measurements
cluster_cols = ['KMeans_Cluster', 'DBSCAN_Cluster', 'GMM_Cluster']

# Only the customer attributes are averaged. The filter also keeps the summary
# stable when the cell is re-run and the label columns already exist in df.
feature_cols = [col for col in df.columns if col not in cluster_cols]

# Add clusters to original dataset
df['KMeans_Cluster'] = kmeans_labels
df['DBSCAN_Cluster'] = dbscan_labels
df['GMM_Cluster'] = gmm_labels

# Summary statistics: per-cluster mean of each feature. Averaging the other
# algorithms' cluster IDs is meaningless, so those columns are left out.
summaries = [
    ("K-Means Cluster Summary:", 'KMeans_Cluster'),
    ("\nDBSCAN Cluster Summary:", 'DBSCAN_Cluster'),
    ("\nGMM Cluster Summary:", 'GMM_Cluster'),
]
for heading, cluster_col in summaries:
    print(heading)
    print(df.groupby(cluster_col)[feature_cols].mean())

In [None]:
# Re-fit the three existing estimators on the standardized data, in the order
# K-Means, DBSCAN, GMM, and collect the labels each one assigns
kmeans_labels, dbscan_labels, gmm_labels = (
    estimator.fit_predict(df_scaled) for estimator in (kmeans, dbscan, gmm)
)

In [None]:
# Project the standardized data onto two principal components
pca = PCA(n_components=2)
df_pca_array = pca.fit_transform(df_scaled)

# Build the plotting frame in one expression: PCA coordinates first, then one
# label column per clustering algorithm (column order: PC1, PC2, KMeans, DBSCAN, GMM)
df_pca = pd.DataFrame(df_pca_array, columns=['PC1', 'PC2']).assign(
    **{
        'KMeans': kmeans_labels,
        'DBSCAN': dbscan_labels,
        'GMM': gmm_labels,
    }
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One panel per algorithm: (label column in df_pca, colour palette, panel title)
panel_specs = [
    ('KMeans', 'Set2', "K-Means Clusters (PCA)"),
    ('DBSCAN', 'Set1', "DBSCAN Clusters (PCA)"),
    ('GMM', 'Set3', "GMM Clusters (PCA)"),
]

# Draw the same PCA projection three times, coloured by each algorithm's labels
fig, axes = plt.subplots(1, 3, figsize=(21,6))
for ax, (label_col, palette_name, panel_title) in zip(axes, panel_specs):
    sns.scatterplot(data=df_pca, x='PC1', y='PC2', hue=label_col,
                    palette=palette_name, s=60, ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()