The autoencoder model is built with the Keras library. The neural network is trained and validated on the one-hot encoded data before clustering is applied.

Autoencoder model:

In [None]:
# Import libraries
import keras
from keras import layers, regularizers
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


# Convert DataFrame to NumPy array
X = final_df.values  # Ensure final_df contains only numeric values

# Train-Test Split (80% train, 20% test).
# The split is done BEFORE scaling so the scaler statistics are learned from
# the training rows only; fitting on the full dataset would leak information
# from the held-out rows into the training inputs.
X_train_raw, X_test_raw = train_test_split(X, test_size=0.2, random_state=42)

# Normalization: fit on the training split, then apply the same transform to
# the test split and to the full dataset (the latter is encoded for clustering).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
X_normalized = scaler.transform(X)

# Build Autoencoder
input_dim = X_normalized.shape[1]  # Number of features

# Input layer
input_data = keras.Input(shape=(input_dim,))

# Encoder: 128 -> 32 -> 16 units, each layer L2-regularized.
# The final 16-unit layer is the latent representation used for clustering.
encoded = layers.Dense(128, activation='relu', kernel_regularizer=regularizers.l2(0.01))(input_data)
encoded = layers.Dense(32, activation='relu', kernel_regularizer=regularizers.l2(0.01))(encoded)
encoded = layers.Dense(16, activation='relu', kernel_regularizer=regularizers.l2(0.01))(encoded)


# Decoder: 16 -> 32 -> input_dim; linear output because the targets are
# standardized values that can be negative.
decoded = layers.Dense(16, activation='relu')(encoded)
decoded = layers.Dense(32, activation='relu')(decoded)
decoded = layers.Dense(input_dim, activation='linear')(decoded)

# Compile Autoencoder
autoencoder = keras.Model(input_data, decoded)
autoencoder.compile(optimizer=Adam(learning_rate=0.0005), loss='mean_squared_error')

# EarlyStopping to prevent overfitting: stop after 15 epochs without
# improvement in val_loss and roll back to the best weights seen.
early_stopping = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

# Train the Autoencoder (input and target are the same array: reconstruction).
# NOTE(review): X_test is used here as the early-stopping validation set and
# again below as the "test" set, so the reported test loss is not measured on
# fully unseen data. Consider a separate validation split.
history = autoencoder.fit(
    X_train, X_train,
    epochs=200,
    batch_size=128,
    shuffle=True,
    validation_data=(X_test, X_test),
    callbacks=[early_stopping],
    verbose=1
)

# Plot Training & Validation Loss
plt.figure(figsize=(8, 6))
plt.plot(history.history['loss'], label='Training Loss')
plt.plot(history.history['val_loss'], label='Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training & Validation Loss')
plt.legend()
plt.show()

# Evaluate on Test Set
test_loss = autoencoder.evaluate(X_test, X_test)
print(f"Test Reconstruction Loss: {test_loss:.4f}")

# Extract Encoded Representations for Clustering: the encoder shares its
# layers (and trained weights) with the autoencoder above.
encoder = keras.Model(input_data, encoded)
X_encoded = encoder.predict(X_normalized)

The elbow method is used to determine the optimal number of clusters (k) for K-means clustering.

In [None]:
# Elbow method: fit K-means on the latent representations for every candidate
# k and record the inertia (sum of squared distances to the closest centroid).
k_values = range(1, 11)  # Candidate k from 1 to 10
inertia_values = []

for k in k_values:
    # fit() returns the fitted estimator, so construction and fitting chain.
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X_encoded)
    inertia_values.append(kmeans.inertia_)

# Inertia versus k; the "elbow" of the curve suggests the number of clusters.
elbow_fig, elbow_ax = plt.subplots(figsize=(8, 6))
elbow_ax.plot(k_values, inertia_values, marker='o')
elbow_ax.set_xlabel('Number of Clusters (k)')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method for Optimal k')
plt.show()

K-means Clustering:

In [None]:
# Evaluation of Clustering - Silhouette Score
from sklearn.metrics import silhouette_score, silhouette_samples

n_clusters = 3  # Number of clusters based on the Elbow Method

# Perform K-means clustering on the latent representations (X_encoded)
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
clusters_labels = kmeans.fit_predict(X_encoded)  # X_encoded for clustering

# Add cluster labels to a DataFrame (one row per sample, one column per
# latent dimension, plus the assigned cluster)
clustered_data = pd.DataFrame(X_encoded)
clustered_data['Cluster'] = clusters_labels  # Add cluster labels as a new column

# Visualization using 1st and 2nd dimensions of X_encoded data
plt.figure(figsize=(6, 4))
plt.scatter(X_encoded[:, 0], X_encoded[:, 1], c=clusters_labels, cmap='viridis')
plt.xlabel('Latent Dimension 1')
plt.ylabel('Latent Dimension 2')
plt.title('K-means Clustering on Latent Space')
plt.colorbar(label='Cluster')

# Save image (must happen before plt.show(), which releases the figure)
image4_path = '/content/drive/MyDrive/k-means.png'
plt.savefig(image4_path, dpi=300, bbox_inches='tight')  # Save the plot as an image

plt.show()

# Per-sample silhouette coefficients. They are computed once and reused:
# the overall silhouette score is by definition the mean of these values,
# so calling silhouette_score() as well would repeat the pairwise-distance
# computation for the same result.
silhouette_values = silhouette_samples(X_encoded, clusters_labels)

# Overall Silhouette Score
silhouette_avg = silhouette_values.mean()
print(f"Silhouette Score for {n_clusters} clusters: {silhouette_avg:.2f}")

# Evaluation for each cluster: mean silhouette of the samples assigned to it
for i in range(n_clusters):
    cluster_silhouette_avg = silhouette_values[clusters_labels == i].mean()
    print(f"Cluster {i}: Silhouette Score = {cluster_silhouette_avg:.2f}")

Hierarchical Clustering:

In [None]:
# Load libraries
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.spatial.distance import pdist, squareform


# Convert pandas Series to list
location_labels = location_data.tolist()


# Unique locations in order of first appearance. dict.fromkeys() keeps the
# order deterministic; list(set(...)) can reorder string labels between runs,
# which would change the colour given to each location.
unique_locations = list(dict.fromkeys(location_labels))

# Create color dictionary for each location. tab10 has only `N` distinct
# colours and an out-of-range integer index is clipped to the last one, so
# the index wraps around with a modulo instead.
tab10_palette = plt.cm.tab10
location_colors = {
    region: tab10_palette(i % tab10_palette.N)
    for i, region in enumerate(unique_locations)
}

# Assign color to each location label
def assign_colors(location_labels, location_colors):
    """Return the colour of every label, in the same order as the labels.

    Args:
        location_labels: Iterable of location labels.
        location_colors: Mapping from a location label to its colour.

    Returns:
        A list with one colour per entry of ``location_labels``.

    Raises:
        KeyError: If a label has no entry in ``location_colors``.
    """
    return [location_colors[label] for label in location_labels]


# Perform Hierarchical Clustering on Latent Representations
linked = linkage(X_encoded, method='ward')  # Ward's method minimizes variance

# Dendrogram
# Distance threshold shared by the branch colouring and the reference line
# drawn below, so the two can never disagree.
cutoff = 6
plt.figure(figsize=(12, 8))

dendrogram(linked,
           orientation='top',
           color_threshold=cutoff,
           above_threshold_color='grey',
           labels=location_labels,
           leaf_rotation=90,
           leaf_font_size=8)

# Get x-axis
ax = plt.gca()
x_labels = ax.get_xticklabels()  # Retrieve x-axis labels
# Iterate through the x labels and change their colors if they match a region
for label in x_labels:
    text = label.get_text()
    if text in location_colors:
        label.set_color(location_colors[text])

plt.title("Hierarchical Clustering Dendrogram")
plt.xlabel("Samples")
plt.ylabel("Distance")
# Mark the cut-off distance (previously hard-coded as 6, duplicating `cutoff`)
plt.axhline(y=cutoff, color='r', linestyle='--')

# Save image (before plt.show())
image5_path = '/content/drive/MyDrive/dendrogram_ward.png'
plt.savefig(image5_path, dpi=300, bbox_inches='tight')

plt.show()