In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import hdbscan        #install hdbscan

In [2]:
df = pd.read_csv('../data/train_preprocessed.csv', encoding='utf-8')

# Examine the data
df.head()

Unnamed: 0,offres_recues_pipeline__offresRecues,other_num_pipeline__montant,other_num_pipeline__dureeMois,other_num_pipeline__origineFrance,binary_pipeline__marcheInnovant,binary_pipeline__sousTraitanceDeclaree,binary_pipeline__idAccordCadre,cat_pipeline__nature_Marché,cat_pipeline__nature_Marché de défense ou de sécurité,cat_pipeline__nature_Marché de partenariat,...,cat_pipeline__codeCPV_2_80000000,cat_pipeline__codeCPV_2_85000000,cat_pipeline__codeCPV_2_90000000,cat_pipeline__codeCPV_2_92000000,cat_pipeline__codeCPV_2_98000000,cat_pipeline__tauxAvance_cat_large_advance,cat_pipeline__tauxAvance_cat_medium_advance,cat_pipeline__tauxAvance_cat_missing,cat_pipeline__tauxAvance_cat_no_advance,cat_pipeline__tauxAvance_cat_small_advance
0,-0.578746,-0.511996,0.153094,-0.088438,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.032831,2.35689,0.616194,-0.088438,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,-0.578746,1.642289,-0.502836,-0.088438,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.032831,1.704263,0.916718,-0.088438,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.032831,1.66424,0.616194,-0.088438,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [7]:
# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.9)  # Keep enough components to explain 90% of variance
df_pca = pca.fit_transform(df)

print(f"Original dimensions: {df.shape[1]}")
print(f"Reduced dimensions: {df_pca.shape[1]}")
print(f"Explained variance: {sum(pca.explained_variance_ratio_):.2f}")

Original dimensions: 74
Reduced dimensions: 15
Explained variance: 0.90


In [9]:
# HDBSCAN hyperparameter exploration
min_samples = 5  # Similar to DBSCAN's min_samples

results = []


# Apply HDBSCAN
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=20,
    min_samples=min_samples,
    metric='euclidean',
    gen_min_span_tree=True,
    cluster_selection_method='eom'  # 'eom' is usually better for variable density
)
clusterer.fit(df_pca)
labels = clusterer.labels_
# Calculate metrics
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
noise_ratio = 100 * n_noise / len(df_pca)
results.append({
    'min_cluster_size': 20,
    'num_clusters': n_clusters,
    'noise_points': n_noise,
    'noise_percentage': noise_ratio
})
print("min_cluster_size = 20")
print(f"  Number of clusters: {n_clusters}")
print(f"  Number of noise points: {n_noise}")
print(f"  Percentage of noise: {noise_ratio:.2f}%")
# Calculate silhouette score if more than one cluster and not all noise
if n_clusters > 1 and n_noise < len(df_pca):
    # Filter out noise points
    mask = labels != -1
    silhouette_avg = silhouette_score(df_pca[mask], labels[mask])
    print(f"  Silhouette Score: {silhouette_avg:.3f}")
print()

# Display results table
results_df = pd.DataFrame(results)
results_df


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



min_cluster_size = 20
  Number of clusters: 2172
  Number of noise points: 73008
  Percentage of noise: 32.07%
  Silhouette Score: 0.077



Unnamed: 0,min_cluster_size,num_clusters,noise_points,noise_percentage
0,20,2172,73008,32.068452


In [11]:
# 2D visualization using PCA
# For visualization, use the first 2 PCA components
vis_pca = PCA(n_components=2).fit_transform(df.drop('cluster', axis=1))

plt.figure(figsize=(12, 8))
scatter = plt.scatter(vis_pca[:, 0], vis_pca[:, 1], c=df['cluster'], cmap='viridis', alpha=0.5, s=5)
plt.colorbar(scatter, label='Cluster')
plt.title(f'HDBSCAN Clustering (min_cluster_size={best_min_cluster_size}, min_samples={min_samples})')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True, alpha=0.3)
plt.show()

KeyError: "['cluster'] not found in axis"

In [12]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"

# Create 3D PCA components
vis_pca_3d = PCA(n_components=3).fit_transform(df.drop('cluster', axis=1))

# Create DataFrame for plotly
pca_df = pd.DataFrame(vis_pca_3d, columns=['PCA1', 'PCA2', 'PCA3'])
pca_df['cluster'] = df['cluster'].astype(str)

# Create interactive 3D scatter plot
fig = px.scatter_3d(
    pca_df,
    x='PCA1',
    y='PCA2',
    z='PCA3',
    color='cluster',
    opacity=0.7
)

fig.show()

KeyError: "['cluster'] not found in axis"

# Adjust Clustering Parameters


In [13]:
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=20,
    min_samples=5,
    cluster_selection_epsilon=0.2,  # Try values between 0.1-0.5
    cluster_selection_method='eom'
)

In [15]:
# Test with different metrics
metrics = ['euclidean', 'manhattan', 'cosine']
for metric in metrics:
    print(f"\nTesting with {metric} distance:")
    clusterer = hdbscan.HDBSCAN(min_cluster_size=20, metric=metric)

    # Fit and add labels to dataframe
    clusterer.fit(df_pca)
    df['cluster'] = clusterer.labels_

    # Analyze clusters
    n_clusters = len(set(df['cluster'])) - (1 if -1 in df['cluster'] else 0)
    n_noise = list(df['cluster']).count(-1)

    print("HDBSCAN Results with min_cluster_size=20, min_samples=5:")
    print(f"Number of clusters: {n_clusters}")
    print(f"Noise points: {n_noise} ({100 * n_noise / len(df):.2f}%)")



Testing with euclidean distance:



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



HDBSCAN Results with min_cluster_size=20, min_samples=5:
Number of clusters: 968
Noise points: 101209 (44.46%)

Testing with manhattan distance:



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



HDBSCAN Results with min_cluster_size=20, min_samples=5:
Number of clusters: 914
Noise points: 112690 (49.50%)

Testing with cosine distance:



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



ValueError: Unrecognized metric 'cosine'