In [None]:
import os
import shutil
import numpy as np
import hdbscan
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Define the paths
stl_folder = '/content/drive/MyDrive/5000 files/stliitm'  # Folder where STL files are stored
feature_folder = '/content/drive/MyDrive/feature_5000_15'  # Folder where feature files are stored
# NOTE: with an empty string, os.path.join(output_folder, name) is just `name`,
# so the cluster folders are created relative to the current working directory.
output_folder = ''  # Folder to store clustered files

# Get the .stl filenames (without the .stl extension).
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the loaded features (and therefore of the result rows) reproducible across runs.
stl_files = sorted(
    os.path.splitext(f)[0] for f in os.listdir(stl_folder) if f.endswith('.stl')
)

# Features and filenames are appended in lockstep, so index i of one list
# always describes index i of the other.
features = []
valid_filenames = []

# Iterate over the STL files and load the corresponding feature files
for stl_file in stl_files:
    feature_file = f"{stl_file}_features.npy"  # Add _features suffix
    feature_path = os.path.join(feature_folder, feature_file)  # Features are stored in .npy format
    if not os.path.exists(feature_path):
        print(f"Feature file not found for {stl_file}, skipping.")
        continue

    try:
        feature = np.load(feature_path)
    except Exception as e:
        # Best effort: one unreadable feature file must not abort the whole run.
        print(f"Error loading feature for {stl_file}: {e}")
    else:
        features.append(feature)
        valid_filenames.append(stl_file)



# Check if there are any valid features to cluster
if not features:
    raise ValueError("No valid features found for clustering.")

# All feature arrays must share one shape before they can be stacked into a
# single matrix. Checking here names the offending files instead of failing
# later with an opaque error about ragged or object arrays.
feature_shapes = [np.shape(f) for f in features]
distinct_shapes = set(feature_shapes)
if len(distinct_shapes) > 1:
    common_shape = max(distinct_shapes, key=feature_shapes.count)
    odd_files = [
        f"{name} {shape}"
        for name, shape in zip(valid_filenames, feature_shapes)
        if shape != common_shape
    ]
    raise ValueError(
        f"Feature arrays must all share one shape (most common: {common_shape}); "
        f"{len(odd_files)} file(s) differ, e.g. {', '.join(odd_files[:5])}"
    )

# Convert the list of features into a 2-D NumPy array with one row per file.
# Flattening each sample leaves 1-D features unchanged, while scalar or
# multi-dimensional features (e.g. shape (1, d)) become valid 2-D input for
# StandardScaler instead of raising.
features_array = np.array(features).reshape(len(features), -1)

# Standardize the features (optional but recommended)
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features_array)

# Cluster the standardized features with HDBSCAN.
# Adjust min_cluster_size as needed.
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=2,
    min_samples=1,
)
labels = clusterer.fit_predict(features_scaled)

# Pair every filename with the cluster label assigned to its feature row
results = pd.DataFrame({'filename': valid_filenames}).assign(
    hdbscan_cluster_id=labels
)

# Exclude noise (cluster_-1) for silhouette score calculation
valid_indices = results['hdbscan_cluster_id'] != -1
# Index the NumPy arrays with a plain boolean array rather than a pandas Series
noise_free_mask = valid_indices.to_numpy()
valid_features = features_scaled[noise_free_mask]
valid_labels = labels[noise_free_mask]

# Reset first so that re-running this cell never leaves a stale score from a
# previous run, and the name is always defined for later use.
silhouette_avg = None

# Calculate Silhouette Score (excluding cluster_-1)
if len(np.unique(valid_labels)) > 1:  # Ensure there is more than 1 cluster
    silhouette_avg = silhouette_score(valid_features, valid_labels)
    print(f"Silhouette Score (excluding cluster_-1): {silhouette_avg}")
else:
    print("Silhouette score cannot be calculated with less than 2 clusters.")

# Create one output directory per cluster and copy the STL files into it.
# The originals in stl_folder are left in place (files are copied, not moved).
unique_clusters = results['hdbscan_cluster_id'].unique()

# Drop the noise rows (label -1) once, then split the remaining rows by
# cluster in a single pass instead of re-filtering the whole table per cluster.
clustered = results[results['hdbscan_cluster_id'] != -1]
missing_sources = []

# sort=False keeps clusters in order of first appearance, as before
for cluster_id, cluster_rows in clustered.groupby('hdbscan_cluster_id', sort=False):
    # Create a directory for each cluster
    cluster_dir = os.path.join(output_folder, f"cluster_{cluster_id}")
    os.makedirs(cluster_dir, exist_ok=True)

    # Copy files into their respective cluster directories
    cluster_files = cluster_rows['filename']
    for file in cluster_files:
        src_path = os.path.join(stl_folder, f"{file}.stl")
        dst_path = os.path.join(cluster_dir, f"{file}.stl")
        if os.path.exists(src_path):
            shutil.copy(src_path, dst_path)  # Copy the file to the cluster folder
        else:
            # Record the gap instead of skipping silently
            missing_sources.append(src_path)

if missing_sources:
    print(
        f"Warning: {len(missing_sources)} source STL file(s) were missing and "
        f"not copied, e.g. {missing_sources[0]}"
    )

print("Clustering completed. Files organized into cluster folders.")
