# Meta-Atom Defect Clustering Pipeline

This notebook segments metasurface arrays into individual meta-atom images and applies multiple clustering techniques to categorize defect types:
- **Intact**: Normal, undamaged atoms
- **Fallen/Collapsed**: Atoms that have toppled over
- **Missing**: Empty or nearly empty positions
- **Misshapen/Irregular**: Deformed atoms

## 1. Setup and Imports

In [None]:
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import glob
from pathlib import Path

# ML/Clustering
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

# Seed NumPy's global RNG so random sampling later in the notebook is repeatable
np.random.seed(42)

# Project root: step up one level when the notebook is launched from notebooks/
_cwd = Path(os.getcwd())
BASE_DIR = _cwd.parent if _cwd.name == 'notebooks' else _cwd

# All tiles and exported results live under <root>/Meta_Atoms
OUTPUT_DIR = BASE_DIR.joinpath("Meta_Atoms")
OUTPUT_DIR.mkdir(exist_ok=True)

print(f"Base Directory: {BASE_DIR}")
print(f"Output Directory: {OUTPUT_DIR}")

## 2. Image Segmentation

Extract individual meta-atom tiles from the 21x21 grid arrays.

In [None]:
def save_grid_images_simple(image_path, output_dir, grid_size=21, tile_size=32, show_grid=True):
    """
    Cut a manually cropped array image into a regular grid of tiles and save them.

    The image is split into ``grid_size`` x ``grid_size`` equal cells. For each
    cell, a crop centred on the cell centre and reaching ``tile_size`` pixels in
    every direction is written as ``<base_name>_<row>,<col>.bmp`` (1-based
    indices) inside ``output_dir/<base_name>``. Crops are clipped to the image
    bounds, so tiles next to the border can be smaller than ``2 * tile_size``.

    Parameters:
    -----------
    image_path : str
        Path to manually cropped array image (21x21 grid).
    output_dir : str
        Directory to save output tiles.
    grid_size : int
        Number of grid rows/cols (default 21).
    tile_size : int
        Half-width of tile crop.
    show_grid : bool
        If True, display image with grid overlay.

    Raises:
    -------
    ValueError
        If ``grid_size`` is not positive or the image cannot be read.
    """
    if grid_size <= 0:
        raise ValueError(f"grid_size must be a positive integer, got {grid_size}")

    img = cv2.imread(str(image_path))
    if img is None:
        raise ValueError(f"Could not read {image_path}")

    H, W = img.shape[:2]
    base_name = os.path.splitext(os.path.basename(image_path))[0]

    # Output folder
    array_dir = os.path.join(output_dir, base_name)
    os.makedirs(array_dir, exist_ok=True)

    # Compute grid spacing
    x_spacing = W / grid_size
    y_spacing = H / grid_size

    # Cell centres, computed once and shared by the overlay and the cropping loop
    centers = [
        (r, c, int((c + 0.5) * x_spacing), int((r + 0.5) * y_spacing))
        for r in range(grid_size)
        for c in range(grid_size)
    ]

    # Show grid overlay
    if show_grid:
        img_disp = img.copy()

        # Vertical and horizontal cell boundaries
        for k in range(grid_size + 1):
            x = int(k * x_spacing)
            y = int(k * y_spacing)
            cv2.line(img_disp, (x, 0), (x, H), (0, 255, 0), 1)
            cv2.line(img_disp, (0, y), (W, y), (0, 255, 0), 1)

        # Grid centers
        for _, _, cx, cy in centers:
            cv2.circle(img_disp, (cx, cy), 3, (255, 0, 0), -1)

        plt.figure(figsize=(10, 10))
        plt.imshow(cv2.cvtColor(img_disp, cv2.COLOR_BGR2RGB))
        plt.title(f"Grid Overlay: {base_name}")
        plt.axis("off")
        plt.show()

    # Extract and save one tile per grid cell
    saved = 0
    failed = []

    for r, c, cx, cy in centers:
        # Clip the crop window to the image bounds
        x1 = max(0, cx - tile_size)
        y1 = max(0, cy - tile_size)
        x2 = min(W, cx + tile_size)
        y2 = min(H, cy + tile_size)

        tile = img[y1:y2, x1:x2]

        fname = f"{base_name}_{r+1},{c+1}.bmp"
        # cv2.imwrite reports failure through its return value, so only
        # successful writes are counted.
        if cv2.imwrite(os.path.join(array_dir, fname), tile):
            saved += 1
        else:
            failed.append(fname)

    if failed:
        print(f"Warning: failed to write {len(failed)} tiles (first: {failed[0]})")
    print(f"Saved {saved} tiles to {array_dir}")

In [None]:
# Tile every cropped array image that is present under the project root
arrays = [f"Array_{n}Crop.bmp" for n in (1, 2, 3)]

for arr in arrays:
    arr_path = BASE_DIR / arr

    # Missing inputs are reported and skipped rather than aborting the run
    if not arr_path.exists():
        print(f"Warning: {arr} not found at {arr_path}")
        continue

    print(f"\n--- Processing {arr} ---")
    save_grid_images_simple(
        image_path=str(arr_path),
        output_dir=str(OUTPUT_DIR),
        grid_size=21,
        tile_size=32,
        show_grid=True,
    )

## 3. Feature Extraction

Extract features from each meta-atom image for clustering.

In [None]:
def extract_features(img_gray):
    """
    Compute intensity, gradient, shape, texture and histogram descriptors
    for one grayscale meta-atom tile.

    The shape descriptors (area, perimeter, solidity, circularity, Hu moments)
    describe the largest external contour of an inverted adaptive-threshold
    mask; they stay at zero when no contour is found.

    Returns a dictionary of features for clustering.
    """
    # Sobel derivatives along x and y, combined into a gradient magnitude map
    grad_x = cv2.Sobel(img_gray, cv2.CV_64F, 1, 0, ksize=3)
    grad_y = cv2.Sobel(img_gray, cv2.CV_64F, 0, 1, ksize=3)
    grad_mag = np.sqrt(grad_x**2 + grad_y**2)

    # Inverted adaptive mean threshold (block size 21, offset 3) for shape analysis
    binary = cv2.adaptiveThreshold(
        img_gray, 255,
        cv2.ADAPTIVE_THRESH_MEAN_C,
        cv2.THRESH_BINARY_INV,
        21, 3
    )
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Defaults used when the mask contains no contour at all
    area = perimeter = solidity = circularity = 0
    hu_moments = np.zeros(7)

    if contours:
        main_contour = max(contours, key=cv2.contourArea)
        area = cv2.contourArea(main_contour)
        perimeter = cv2.arcLength(main_contour, True)

        # Solidity: contour area relative to its convex hull area
        hull_area = cv2.contourArea(cv2.convexHull(main_contour))
        if hull_area > 0:
            solidity = float(area) / hull_area

        # Circularity: 4*pi*A / P^2
        if perimeter > 0:
            circularity = (4 * np.pi * area) / (perimeter ** 2)

        # Hu moments of the filled largest contour
        filled = np.zeros_like(binary)
        cv2.drawContours(filled, [main_contour], -1, 255, -1)
        hu_moments = cv2.HuMoments(cv2.moments(filled)).flatten()

    # 16-bin intensity histogram, normalized to sum to one
    hist = cv2.calcHist([img_gray], [0], None, [16], [0, 256]).flatten()

    return {
        'mean_intensity': np.mean(img_gray),
        'std_intensity': np.std(img_gray),
        'max_gradient': np.max(grad_mag),
        'mean_gradient': np.mean(grad_mag),
        'area': area,
        'perimeter': perimeter,
        'solidity': solidity,
        'circularity': circularity,
        'hu_moments': hu_moments,
        # Variance of the Laplacian, used as a sharpness/texture measure
        'laplacian_var': cv2.Laplacian(img_gray, cv2.CV_64F).var(),
        'histogram': hist / hist.sum()
    }

In [None]:
def load_all_tiles():
    """
    Load all tile images and extract features.

    Reads every ``*.bmp`` tile under ``OUTPUT_DIR/<array_name>`` for the three
    cropped arrays, in sorted path order so the resulting list (and everything
    derived from it) does not depend on filesystem enumeration order.

    Returns a list with one dict per readable tile, holding the array name,
    file name/path, the 1-based grid ``row``/``col`` parsed from the file name
    (``-1, -1`` when the name does not end in ``_<row>,<col>``), the grayscale
    image and all entries returned by ``extract_features``.
    """
    all_data = []

    array_names = ["Array_1Crop", "Array_2Crop", "Array_3Crop"]

    for array_name in array_names:
        array_dir = OUTPUT_DIR / array_name
        if not array_dir.exists():
            print(f"Warning: {array_dir} not found")
            continue

        # Sorted (lexicographic) for a reproducible tile order
        files = sorted(array_dir.glob("*.bmp"))
        print(f"Loading {len(files)} tiles from {array_name}")

        for fpath in files:
            img = cv2.imread(str(fpath), cv2.IMREAD_GRAYSCALE)
            if img is None:
                # Keep going, but make the skipped file visible
                print(f"Warning: could not read {fpath}, skipping")
                continue

            # Parse row/col from the trailing "<row>,<col>" part of the file name.
            # Both a non-numeric field and a wrong field count raise ValueError.
            try:
                row, col = map(int, fpath.stem.split('_')[-1].split(','))
            except ValueError:
                row, col = -1, -1

            features = extract_features(img)

            all_data.append({
                'array': array_name,
                'filename': fpath.name,
                'filepath': str(fpath),
                'row': row,
                'col': col,
                'image': img,
                **features
            })

    print(f"\nTotal tiles loaded: {len(all_data)}")
    return all_data

# Load all tile data (one dict per tile: metadata, grayscale image, features)
tile_data = load_all_tiles()

In [None]:
def build_feature_matrix(tile_data, use_hu=True, use_hist=True):
    """
    Assemble the per-tile feature dicts into a 2-D array for clustering.

    Each row holds the nine scalar features, optionally followed by the
    log-scaled Hu moments (``use_hu``) and the histogram bins (``use_hist``).
    NaN and infinite entries are replaced by zero.
    """
    scalar_keys = (
        'mean_intensity',
        'std_intensity',
        'max_gradient',
        'mean_gradient',
        'area',
        'perimeter',
        'solidity',
        'circularity',
        'laplacian_var',
    )

    rows = []
    for tile in tile_data:
        vec = [tile[key] for key in scalar_keys]

        if use_hu:
            # Signed log scale: Hu moments span many orders of magnitude
            hu = tile['hu_moments']
            vec.extend(-np.sign(hu) * np.log10(np.abs(hu) + 1e-10))

        if use_hist:
            vec.extend(tile['histogram'])

        rows.append(vec)

    return np.nan_to_num(np.array(rows), nan=0, posinf=0, neginf=0)

# Build feature matrix (scalar features + log-scaled Hu moments + histogram bins)
X_raw = build_feature_matrix(tile_data, use_hu=True, use_hist=True)
print(f"Feature matrix shape: {X_raw.shape}")

# Standardize features so columns with large numeric ranges do not dominate
# the PCA and the distance-based clustering that follow
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_raw)

## 4. Dimensionality Reduction (for Clustering and Visualization)

In [None]:
# Project the standardized features onto their first 10 principal components;
# the leading components are reused below for t-SNE and for every clustering method
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)

variance_ratio = pca.explained_variance_ratio_
print(f"Explained variance ratio (first 10 components): {variance_ratio}")
print(f"Total variance explained: {variance_ratio.sum():.2%}")

# Left panel: variance per component; right panel: cumulative variance
component_ids = range(1, 11)
fig, (ax_each, ax_cum) = plt.subplots(1, 2, figsize=(10, 4))

ax_each.bar(component_ids, variance_ratio)
ax_each.set_xlabel('Principal Component')
ax_each.set_ylabel('Explained Variance Ratio')
ax_each.set_title('PCA Explained Variance')

ax_cum.plot(component_ids, np.cumsum(variance_ratio), 'bo-')
ax_cum.set_xlabel('Number of Components')
ax_cum.set_ylabel('Cumulative Explained Variance')
ax_cum.set_title('Cumulative Explained Variance')
ax_cum.axhline(y=0.9, color='r', linestyle='--', label='90% threshold')
ax_cum.legend()

plt.tight_layout()
plt.show()

In [None]:
# t-SNE for 2D visualization
print("Computing t-SNE (this may take a moment)...")
# The iteration count is left at the library default (1000). scikit-learn 1.5
# renamed `n_iter` to `max_iter` and later removed the old name, so passing
# either keyword explicitly would break on some installed versions.
tsne = TSNE(n_components=2, perplexity=30, random_state=42)
X_tsne = tsne.fit_transform(X_pca[:, :5])  # Use first 5 PCA components
print("t-SNE complete.")

# Plot t-SNE
plt.figure(figsize=(10, 8))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], alpha=0.5, s=10)
plt.xlabel('t-SNE 1')
plt.ylabel('t-SNE 2')
plt.title('t-SNE Visualization of Meta-Atoms')
plt.show()

## 5. Clustering Methods

We'll try multiple clustering approaches and compare their results.

In [None]:
def evaluate_clustering(X, labels, method_name):
    """
    Evaluate clustering quality using multiple metrics.

    Points labelled ``-1`` (noise) are excluded before scoring.

    Parameters
    ----------
    X : array-like, one row per sample
        Feature matrix the clustering was computed on.
    labels : array-like
        Cluster label per sample; ``-1`` marks noise. Lists are accepted.
    method_name : str
        Name used in the printed report and in the returned dict.

    Returns
    -------
    dict or None
        Dict with ``method``, ``n_clusters``, ``silhouette``,
        ``davies_bouldin`` and ``calinski_harabasz``; ``None`` (after printing
        the reason) when the metrics are undefined for the given labels.
    """
    # Coerce to arrays so the element-wise noise mask also works for plain lists
    X = np.asarray(X)
    labels = np.asarray(labels)

    # Filter out noise points for metrics
    mask = labels != -1
    X_valid = X[mask]
    labels_valid = labels[mask]
    n_clusters = len(np.unique(labels_valid))

    if n_clusters < 2:
        print(f"{method_name}: Only {n_clusters} cluster(s) found - cannot compute metrics")
        return None

    # The scores need fewer clusters than samples; bail out instead of letting
    # the metric functions raise.
    if n_clusters >= len(labels_valid):
        print(f"{method_name}: {n_clusters} clusters for {len(labels_valid)} "
              f"non-noise samples - cannot compute metrics")
        return None

    silhouette = silhouette_score(X_valid, labels_valid)
    davies_bouldin = davies_bouldin_score(X_valid, labels_valid)
    calinski = calinski_harabasz_score(X_valid, labels_valid)

    print(f"{method_name}:")
    print(f"  Clusters: {n_clusters}")
    print(f"  Silhouette Score: {silhouette:.3f} (higher is better, range [-1,1])")
    print(f"  Davies-Bouldin Index: {davies_bouldin:.3f} (lower is better)")
    print(f"  Calinski-Harabasz Index: {calinski:.1f} (higher is better)")

    return {
        'method': method_name,
        'n_clusters': n_clusters,
        'silhouette': silhouette,
        'davies_bouldin': davies_bouldin,
        'calinski_harabasz': calinski
    }

### 5.1 K-Means Clustering

In [None]:
# Sweep K and record inertia (elbow method) and silhouette score for each fit
k_range = range(2, 10)
inertias = []
silhouettes = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X_pca[:, :5])
    labels = kmeans.labels_
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X_pca[:, :5], labels))

# One panel per criterion, sharing the same K axis
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

curves = (
    (inertias, 'bo-', 'Inertia', 'Elbow Method'),
    (silhouettes, 'go-', 'Silhouette Score', 'Silhouette Analysis'),
)
for ax, (values, fmt, y_label, panel_title) in zip(axes, curves):
    ax.plot(k_range, values, fmt)
    ax.set_xlabel('Number of Clusters (K)')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

# K with the highest silhouette score
best_k = k_range[np.argmax(silhouettes)]
print(f"Best K by silhouette score: {best_k}")

In [None]:
# Fit K-Means with K=4, one cluster per expected defect type
kmeans_4 = KMeans(n_clusters=4, random_state=42, n_init=10)
labels_kmeans_4 = kmeans_4.fit_predict(X_pca[:, :5])

metrics_kmeans_4 = evaluate_clustering(X_pca[:, :5], labels_kmeans_4, "K-Means (K=4)")

# Attach each tile's cluster id to its record
for tile, cluster_id in zip(tile_data, labels_kmeans_4):
    tile['kmeans_4'] = cluster_id

### 5.2 DBSCAN Clustering

In [None]:
# DBSCAN (density-based): report cluster and noise counts for several radii
eps_values = [0.5, 0.7, 1.0, 1.5, 2.0]

print("DBSCAN parameter search:")
for eps in eps_values:
    dbscan = DBSCAN(eps=eps, min_samples=10)
    labels = dbscan.fit_predict(X_pca[:, :5])
    # -1 is the noise label, not a cluster
    n_clusters = len(set(labels) - {-1})
    n_noise = np.count_nonzero(labels == -1)
    print(f"  eps={eps}: {n_clusters} clusters, {n_noise} noise points")

In [None]:
# Final DBSCAN run with the chosen radius (eps=1.0)
dbscan = DBSCAN(eps=1.0, min_samples=10)
labels_dbscan = dbscan.fit_predict(X_pca[:, :5])

metrics_dbscan = evaluate_clustering(X_pca[:, :5], labels_dbscan, "DBSCAN (eps=1.0)")

# Attach each tile's label (-1 = noise) to its record
for tile, cluster_id in zip(tile_data, labels_dbscan):
    tile['dbscan'] = cluster_id

### 5.3 Gaussian Mixture Model

In [None]:
# Gaussian mixture with 4 full-covariance components; fit_predict returns
# one component label per tile
gmm = GaussianMixture(n_components=4, random_state=42, covariance_type='full')
labels_gmm = gmm.fit_predict(X_pca[:, :5])

metrics_gmm = evaluate_clustering(X_pca[:, :5], labels_gmm, "GMM (4 components)")

# Attach each tile's component label to its record
for tile, cluster_id in zip(tile_data, labels_gmm):
    tile['gmm'] = cluster_id

### 5.4 Agglomerative Clustering

In [None]:
# Hierarchical (agglomerative) clustering with Ward linkage, cut at 4 clusters
agg = AgglomerativeClustering(n_clusters=4, linkage='ward')
labels_agg = agg.fit_predict(X_pca[:, :5])

metrics_agg = evaluate_clustering(X_pca[:, :5], labels_agg, "Agglomerative (4 clusters)")

# Attach each tile's cluster id to its record
for tile, cluster_id in zip(tile_data, labels_agg):
    tile['agglomerative'] = cluster_id

### 5.5 Spectral Clustering

In [None]:
# Spectral clustering on a 10-nearest-neighbour affinity graph, 4 clusters
spectral = SpectralClustering(
    n_clusters=4,
    random_state=42,
    affinity='nearest_neighbors',
    n_neighbors=10,
)
labels_spectral = spectral.fit_predict(X_pca[:, :5])

metrics_spectral = evaluate_clustering(X_pca[:, :5], labels_spectral, "Spectral (4 clusters)")

# Attach each tile's cluster id to its record
for tile, cluster_id in zip(tile_data, labels_spectral):
    tile['spectral'] = cluster_id

## 6. Visualization of Clustering Results

In [None]:
def visualize_clusters_tsne(X_tsne, labels, title, ax=None):
    """
    Visualize clusters in t-SNE space.

    Draws one scatter series per distinct label, with label ``-1`` shown as
    'Noise', and adds a legend.

    Parameters
    ----------
    X_tsne : array-like with at least two columns
        2-D embedding; column 0 is plotted on x, column 1 on y.
    labels : array-like
        Cluster label per embedded point. Lists are accepted.
    title : str
        Axes title.
    ax : matplotlib Axes, optional
        Axes to draw on; a new 8x6 figure is created when omitted.

    Returns
    -------
    The Axes that was drawn on.
    """
    if ax is None:
        fig, ax = plt.subplots(figsize=(8, 6))

    # Coerce to arrays so `labels == label` is an element-wise mask even when
    # the caller passes plain Python lists
    X_tsne = np.asarray(X_tsne)
    labels = np.asarray(labels)

    unique_labels = np.unique(labels)  # sorted distinct labels
    colors = plt.cm.tab10(np.linspace(0, 1, len(unique_labels)))

    for label, color in zip(unique_labels, colors):
        mask = labels == label
        label_name = 'Noise' if label == -1 else f'Cluster {label}'
        ax.scatter(X_tsne[mask, 0], X_tsne[mask, 1],
                   c=[color], label=label_name, alpha=0.6, s=15)

    ax.set_xlabel('t-SNE 1')
    ax.set_ylabel('t-SNE 2')
    ax.set_title(title)
    ax.legend(loc='best', fontsize=8)

    return ax

In [None]:
# Compare all clustering methods
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

visualize_clusters_tsne(X_tsne, labels_kmeans_4, 'K-Means (K=4)', axes[0, 0])
visualize_clusters_tsne(X_tsne, labels_dbscan, 'DBSCAN', axes[0, 1])
visualize_clusters_tsne(X_tsne, labels_gmm, 'GMM', axes[0, 2])
visualize_clusters_tsne(X_tsne, labels_agg, 'Agglomerative', axes[1, 0])
visualize_clusters_tsne(X_tsne, labels_spectral, 'Spectral', axes[1, 1])

# Empty plot for summary
axes[1, 2].axis('off')

plt.tight_layout()
plt.savefig(str(OUTPUT_DIR / 'clustering_comparison.png'), dpi=150)
plt.show()

## 7. Sample Images from Each Cluster

In [None]:
def show_cluster_samples(tile_data, label_key, n_samples=8):
    """
    Display sample images from each cluster.

    Draws one row per distinct value of ``tile[label_key]`` with up to
    ``n_samples`` randomly chosen tile images (without replacement, using
    NumPy's global RNG). The first panel of each row carries the cluster
    name and size as its y-label.

    Parameters
    ----------
    tile_data : list of dict
        Tile records containing ``label_key`` and an ``'image'`` entry.
    label_key : str
        Key holding the cluster label (``-1`` is shown as 'Noise').
    n_samples : int
        Number of image columns per cluster row.
    """
    unique_labels = sorted({tile[label_key] for tile in tile_data})

    n_clusters = len(unique_labels)
    if n_clusters == 0:
        print(f"No tiles to display for {label_key}")
        return

    # squeeze=False always yields a 2-D axes array, so a single cluster or
    # n_samples == 1 needs no special-casing.
    fig, axes = plt.subplots(
        n_clusters, n_samples,
        figsize=(n_samples * 1.5, n_clusters * 1.5),
        squeeze=False,
    )

    for row_axes, label in zip(axes, unique_labels):
        cluster_tiles = [t for t in tile_data if t[label_key] == label]
        samples = np.random.choice(
            len(cluster_tiles), min(n_samples, len(cluster_tiles)), replace=False
        )

        label_name = 'Noise' if label == -1 else f'Cluster {label}'

        for j, ax in enumerate(row_axes):
            if j < len(samples):
                ax.imshow(cluster_tiles[samples[j]]['image'], cmap='gray')

            # Hide ticks and frame by hand: ax.axis('off') would also hide
            # the y-label that names the cluster.
            ax.set_xticks([])
            ax.set_yticks([])
            for spine in ax.spines.values():
                spine.set_visible(False)

            if j == 0:
                ax.set_ylabel(f'{label_name}\n(n={len(cluster_tiles)})', fontsize=9)

    plt.suptitle(f'Cluster Samples: {label_key}', fontsize=12)
    plt.tight_layout()
    plt.show()

In [None]:
# Sample tiles per cluster for the methods stored on the tile records
for heading, method_key in (
    ("K-Means Clusters:", 'kmeans_4'),
    ("\nGMM Clusters:", 'gmm'),
    ("\nAgglomerative Clusters:", 'agglomerative'),
):
    print(heading)
    show_cluster_samples(tile_data, method_key)

## 8. Cluster Statistics and Analysis

In [None]:
def analyze_clusters(tile_data, label_key):
    """
    Print per-cluster averages of the main features.

    For every distinct value of ``tile[label_key]`` (``-1`` is reported as
    'Noise' and not counted as a cluster) prints the member count and the
    mean intensity, area, solidity, circularity and max gradient.
    """
    distinct = sorted({tile[label_key] for tile in tile_data})
    real_clusters = sum(1 for lab in distinct if lab != -1)

    print(f"\n=== Cluster Analysis for {label_key} ===")
    print(f"Number of clusters: {real_clusters}")

    # (caption, feature key, format spec) for each reported average
    summary_rows = (
        ("Avg Intensity", 'mean_intensity', ".1f"),
        ("Avg Area", 'area', ".1f"),
        ("Avg Solidity", 'solidity', ".3f"),
        ("Avg Circularity", 'circularity', ".3f"),
        ("Avg Max Gradient", 'max_gradient', ".1f"),
    )

    for lab in distinct:
        members = [t for t in tile_data if t[label_key] == lab]
        heading = 'Noise' if lab == -1 else f'Cluster {lab}'

        print(f"\n{heading} (n={len(members)}):")
        for caption, key, spec in summary_rows:
            average = np.mean([t[key] for t in members])
            print(f"  {caption}: {average:{spec}}")

# Analyze K-Means clusters (prints per-cluster feature averages for the K=4 labels)
analyze_clusters(tile_data, 'kmeans_4')

In [None]:
def plot_cluster_features(tile_data, label_key):
    """
    Draw one box plot per feature, grouped by the cluster label in ``label_key``.

    Plotted features: mean intensity, area, solidity, circularity and
    max gradient.
    """
    import pandas as pd

    # DataFrame column -> key in the tile record
    column_sources = {
        'cluster': label_key,
        'intensity': 'mean_intensity',
        'area': 'area',
        'solidity': 'solidity',
        'circularity': 'circularity',
        'gradient': 'max_gradient',
    }
    records = [
        {column: tile[source] for column, source in column_sources.items()}
        for tile in tile_data
    ]
    df = pd.DataFrame(records)

    features = [column for column in column_sources if column != 'cluster']

    fig, axes = plt.subplots(1, len(features), figsize=(15, 4))

    for ax, feat in zip(axes, features):
        df.boxplot(column=feat, by='cluster', ax=ax)
        ax.set_title(feat.capitalize())
        ax.set_xlabel('Cluster')

    plt.suptitle(f'Feature Distributions by Cluster ({label_key})', fontsize=12)
    plt.tight_layout()
    plt.show()

# Box plots of the main features for the K-Means (K=4) clusters
plot_cluster_features(tile_data, 'kmeans_4')

## 9. Summary and Export

In [None]:
# Tabulate the quality metrics of every method that produced a result
print("=== Clustering Method Comparison ===")
print()

_metric_results = (
    metrics_kmeans_4,
    metrics_dbscan,
    metrics_gmm,
    metrics_agg,
    metrics_spectral,
)
# evaluate_clustering returns None when the metrics could not be computed
all_metrics = [result for result in _metric_results if result is not None]

if all_metrics:
    import pandas as pd
    df_metrics = pd.DataFrame(all_metrics)
    print(df_metrics.to_string(index=False))

In [None]:
# Write one CSV row per tile: identity, key features and every method's label
import csv

csv_path = OUTPUT_DIR / 'clustering_results.csv'

fieldnames = (
    ['array', 'filename', 'row', 'col']
    + ['mean_intensity', 'area', 'solidity', 'circularity', 'max_gradient']
    + ['kmeans_4', 'dbscan', 'gmm', 'agglomerative', 'spectral']
)

with csv_path.open('w', newline='') as f:
    # extrasaction='ignore' drops tile entries (image, histogram, ...) that are not listed columns
    writer = csv.DictWriter(f, fieldnames=fieldnames, extrasaction='ignore')
    writer.writeheader()
    writer.writerows(tile_data)

print(f"Results exported to: {csv_path}")

## 10. Manual Cluster Labeling Guide

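Based on the feature analysis in Section 8, use the following heuristics to interpret the clusters, and confirm each assignment against the sample images from Section 7: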

- **Low Area + Low Intensity**: Likely **Missing** atoms
- **High Solidity + Low Intensity**: Likely **Fallen/Collapsed** atoms  
- **Low Circularity + Normal Area**: Likely **Misshapen** atoms
- **High Area + High Circularity + Normal Intensity**: Likely **Intact** atoms
- **High Gradient**: May indicate **Stitching errors** (image artifacts)

In [None]:
# Cluster labeling helper: maps cluster ids to defect names chosen by the user
def create_labeled_clusters(tile_data, label_key, cluster_names):
    """
    Store a human-readable defect name on every tile and print the tally.

    cluster_names: dict mapping cluster index to label name
    Example: {0: 'Intact', 1: 'Missing', 2: 'Fallen', 3: 'Misshapen'}

    Each tile gets a ``'defect_type'`` entry; cluster ids missing from
    ``cluster_names`` are labelled 'Unknown'.
    """
    from collections import Counter

    tally = Counter()
    for tile in tile_data:
        defect_name = cluster_names.get(tile[label_key], 'Unknown')
        tile['defect_type'] = defect_name
        tally[defect_name] += 1

    # Report counts in alphabetical order of defect name
    print("Defect Type Counts:")
    for defect_name in sorted(tally):
        print(f"  {defect_name}: {tally[defect_name]}")

# Example: After analyzing cluster features, assign labels
# Uncomment and modify based on your cluster analysis:
# cluster_labels = {0: 'Intact', 1: 'Missing', 2: 'Fallen', 3: 'Misshapen'}
# create_labeled_clusters(tile_data, 'kmeans_4', cluster_labels)