In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, davies_bouldin_score
import hdbscan

# Read the combined CSV and remove the ' Label' column (note the leading
# space in the header), then discard every row that has a missing value.
df = pd.read_csv('combined_output.csv')
df2 = df.drop(columns=' Label').dropna()

# Map +/-inf to NaN so that a second dropna() pass removes those rows too.
df2 = df2.replace([np.inf, -np.inf], np.nan)
df2_cleaned = df2.dropna()

# Restrict to numeric columns; 'Label Num' is still among them here.
numeric_df = df2_cleaned.select_dtypes(include=[np.number])

# Split 'Label Num' off as the reference labels for the later comparison;
# pop() removes it from numeric_df, leaving only the clustering features.
labels = numeric_df.pop('Label Num')

# Standardise the feature matrix (fit the scaler, then apply it)
X = numeric_df.to_numpy()
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Optional: Apply PCA to reduce dimensionality.
# PCA rejects n_components larger than min(n_samples, n_features), so the
# target of 10 components is capped by the actual shape of the data instead
# of failing on narrow or small datasets.
n_components = min(10, X_scaled.shape[0], X_scaled.shape[1])
# random_state keeps the projection reproducible when a randomized SVD
# solver is selected; it has no effect on the exact solvers.
pca = PCA(n_components=n_components, random_state=42)
X_reduced = pca.fit_transform(X_scaled)
print(f"Reduced dimensionality to {X_reduced.shape[1]} dimensions.")

# Cluster the PCA-reduced points with HDBSCAN
clusterer = hdbscan.HDBSCAN(
    cluster_selection_method='eom',  # Excess of mass (default)
    metric='euclidean',              # Distance metric
    min_cluster_size=10,             # Minimum size of clusters
)
cluster_labels = clusterer.fit_predict(X_reduced)

# Summarise the result: label -1 is counted as noise, every other distinct
# label is counted as one cluster.
distinct_labels = np.unique(cluster_labels)
n_clusters = int((distinct_labels != -1).sum())
n_noise = int((cluster_labels == -1).sum())
print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")

# Evaluate clustering performance on the clustered (non-noise) points only.
# Label -1 is counted as noise, not as a cluster, so it is masked out before
# scoring; both metrics below then describe the same set of points.
non_noise_mask = cluster_labels != -1
X_clustered = X_reduced[non_noise_mask]
clustered_labels = cluster_labels[non_noise_mask]

# Upper bound on the number of points used for the silhouette score. The
# silhouette needs pairwise distances, so its cost grows quadratically with
# the number of points; scoring a random subset keeps it tractable on large
# datasets. Raise this value if many small clusters make the sample too
# sparse and the runtime is acceptable.
SILHOUETTE_SAMPLE_SIZE = 10_000

if n_clusters > 1:  # Both scores require at least 2 real (non-noise) clusters
    n_clustered = len(clustered_labels)
    # Only subsample when the data is larger than the cap; otherwise score
    # every clustered point exactly.
    sample_size = SILHOUETTE_SAMPLE_SIZE if n_clustered > SILHOUETTE_SAMPLE_SIZE else None
    try:
        silhouette_avg = silhouette_score(
            X_clustered,
            clustered_labels,
            sample_size=sample_size,
            random_state=42,  # Reproducible subsample
        )
    except ValueError as err:
        # Raised when the scored points do not contain a valid number of
        # distinct labels (e.g. an unlucky random sample).
        print(f"Silhouette Score could not be computed: {err}")
    else:
        print(f"Silhouette Score (non-noise): {silhouette_avg}")

    db_score = davies_bouldin_score(X_clustered, clustered_labels)
    print(f"Davies-Bouldin Score (non-noise): {db_score}")

# Pair each point's predicted cluster with its reference label.
# `labels` still carries the row index of the filtered DataFrame, whereas
# `cluster_labels` is a plain array, so the index is reset to line the two
# up by position.
actual_labels = labels.reset_index(drop=True)
data_with_predictions = pd.DataFrame(
    {
        'Cluster': cluster_labels,  # Predicted clusters
        'Actual': actual_labels,    # Original labels
    }
)

# Contingency table: rows are predicted clusters, columns are actual labels,
# cells are the number of points in each combination.
distribution = pd.crosstab(
    index=data_with_predictions['Cluster'],
    columns=data_with_predictions['Actual'],
)

# Show the distribution matrix
print(distribution)


Reduced dimensionality to 10 dimensions.
Estimated number of clusters: 49782
Estimated number of noise points: 786015
