In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import hdbscan        #install hdbscan
from sklearn.model_selection import train_test_split

In [6]:
from scripts.data_cleaner import filter_top_cpv_categories
from scripts.preprocess_pipeline import create_pipeline_cat

In [3]:
data_path='../data/data_clean.csv'
model_save_path='../data'
top_n = 40
cpv_column = 'codeCPV_2'

In [4]:
df = pd.read_csv(data_path, encoding='utf-8')
# Examine the data
df.head()

Unnamed: 0,uid,id,nature,acheteur_id,acheteur_nom,acheteur_siren,titulaire_id,titulaire_typeIdentifiant,titulaire_nom,titulaire_siren,...,lieuExecution_typeCode,idAccordCadre,source_open_data,codeCPV_FR,codeCPV_2,codeCPV_3,codeCPV_4,codeCPV_5,codeCPV_2_3,annee
0,2154005160001320242024-LOT04,20242024-LOT04,Marché,21540051600013,COMMUNE DE BATILLY,215400516.0,78885470100018,SIRET,ACOMETAL,788854701.0,...,Code postal,,data.gouv.fr decp-2024.json,Travaux de charpente,45000000,45200000,45260000,45261000,45200000,2024
1,243500667002882021M226MO,2021M226MO,Marché,24350066700288,CC VAL D'ILLE-AUBIGNE,243500667.0,38373211200032,SIRET,UNIVERS,383732112.0,...,Code postal,,data.gouv.fr decp-2024.json,Services d'architecture,71000000,71200000,71200000,71200000,71200000,2021
2,249710047000472024SS_PRD_TRV,2024SS_PRD_TRV,Marché,24971004700047,COMMUNAUTE DE COMMUNES DE MARIE GALANTE,249710047.0,43387249600016,SIRET,COTRAM B.T.P.,433872496.0,...,Code postal,,data.gouv.fr decp-2024.json,Travaux de construction de stations d'épuratio...,45000000,45200000,45230000,45232000,45200000,2024
3,6254801990001124-0806-L2,24-0806-L2,Marché,62548019900011,"LA MAISON POUR TOUS, SOCIETE ANONYME COOPERATI...",625480199.0,55204695502544,SIRET,ENGIE ENERGIE SERVICES,552046955.0,...,Code département,,data.gouv.fr decp-2024.json,Services de réparation et d'entretien de chauf...,50000000,50700000,50720000,50720000,50000000,2024
4,20002563300013202424011BCR,202424011BCR,Marché,20002563300013,SI DE RESTAURATION COLLECTIVE,200025633.0,47698032100238,SIRET,POMONA EPISAVEURS,476980321.0,...,Code postal,,data.gouv.fr decp-2025-04.json,Produits alimentaires divers,15000000,15800000,15800000,15800000,15000000,2024


In [7]:
df_cpv = filter_top_cpv_categories(df, top_n=40, cpv_column=cpv_column)

Filtered from 73 to 40 CPV categories, keeping 283975 rows out of 286850


In [8]:
df_cpv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 283975 entries, 0 to 286849
Data columns (total 38 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   uid                        283975 non-null  object 
 1   id                         283975 non-null  object 
 2   nature                     283975 non-null  object 
 3   acheteur_id                283975 non-null  int64  
 4   acheteur_nom               283788 non-null  object 
 5   acheteur_siren             283810 non-null  float64
 6   titulaire_id               283975 non-null  object 
 7   titulaire_typeIdentifiant  283975 non-null  object 
 8   titulaire_nom              276865 non-null  object 
 9   titulaire_siren            282918 non-null  float64
 10  objet                      283975 non-null  object 
 11  montant                    283975 non-null  float64
 12  codeCPV                    283975 non-null  object 
 13  procedure                  283975 

In [11]:
X_train, X_test = train_test_split(df_cpv, test_size=0.2, random_state=0, stratify=df_cpv[cpv_column])

In [12]:
pipeline = create_pipeline_cat('marche_sim')
X_train_preproc = pipeline.fit_transform(X_train)
X_test_preproc = pipeline.transform(X_test)



In [14]:
X_train_preproc.shape, X_test_preproc.shape

((227180, 74), (56795, 74))

# PCA reduction

In [15]:
# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.9)  # Keep enough components to explain 90% of variance
X_train_pca = pca.fit_transform(X_train_preproc)

print(f"Original dimensions: {X_train_preproc.shape[1]}")
print(f"Reduced dimensions: {X_train_pca.shape[1]}")
print(f"Explained variance: {sum(pca.explained_variance_ratio_):.2f}")

Original dimensions: 74
Reduced dimensions: 16
Explained variance: 0.91


# HDBSCAN clustering 1st

In [16]:
# HDBSCAN hyperparameter exploration
min_samples = 5  # Similar to DBSCAN's min_samples

results = []


# Apply HDBSCAN
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=20,
    min_samples=min_samples,
    metric='euclidean',
    gen_min_span_tree=True,
    cluster_selection_method='eom'  # 'eom' is usually better for variable density
)
clusterer.fit(X_train_pca)
labels = clusterer.labels_
# Calculate metrics
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
noise_ratio = 100 * n_noise / len(X_train_pca)
results.append({
    'min_cluster_size': 20,
    'num_clusters': n_clusters,
    'noise_points': n_noise,
    'noise_percentage': noise_ratio
})
print("min_cluster_size = 20")
print(f"  Number of clusters: {n_clusters}")
print(f"  Number of noise points: {n_noise}")
print(f"  Percentage of noise: {noise_ratio:.2f}%")
# Calculate silhouette score if more than one cluster and not all noise
if n_clusters > 1 and n_noise < len(X_train_pca):
    # Filter out noise points
    mask = labels != -1
    silhouette_avg = silhouette_score(X_train_pca[mask], labels[mask])
    print(f"  Silhouette Score: {silhouette_avg:.3f}")
print()

# Display results table
results_df = pd.DataFrame(results)
results_df



min_cluster_size = 20
  Number of clusters: 2140
  Number of noise points: 70640
  Percentage of noise: 31.09%
  Silhouette Score: 0.084



Unnamed: 0,min_cluster_size,num_clusters,noise_points,noise_percentage
0,20,2140,70640,31.094286


In [11]:
# 2D visualization using PCA
# For visualization, use the first 2 PCA components
vis_pca = PCA(n_components=2).fit_transform(df.drop('cluster', axis=1))

plt.figure(figsize=(12, 8))
scatter = plt.scatter(vis_pca[:, 0], vis_pca[:, 1], c=df['cluster'], cmap='viridis', alpha=0.5, s=5)
plt.colorbar(scatter, label='Cluster')
plt.title(f'HDBSCAN Clustering (min_cluster_size={best_min_cluster_size}, min_samples={min_samples})')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True, alpha=0.3)
plt.show()

KeyError: "['cluster'] not found in axis"

In [12]:
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"

# Create 3D PCA components
vis_pca_3d = PCA(n_components=3).fit_transform(df.drop('cluster', axis=1))

# Create DataFrame for plotly
pca_df = pd.DataFrame(vis_pca_3d, columns=['PCA1', 'PCA2', 'PCA3'])
pca_df['cluster'] = df['cluster'].astype(str)

# Create interactive 3D scatter plot
fig = px.scatter_3d(
    pca_df,
    x='PCA1',
    y='PCA2',
    z='PCA3',
    color='cluster',
    opacity=0.7
)

fig.show()

KeyError: "['cluster'] not found in axis"

# Adjust Clustering Parameters


In [13]:
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=20,
    min_samples=5,
    cluster_selection_epsilon=0.2,  # Try values between 0.1-0.5
    cluster_selection_method='eom'
)

In [15]:
# Test with different metrics
metrics = ['euclidean', 'manhattan', 'cosine']
for metric in metrics:
    print(f"\nTesting with {metric} distance:")
    clusterer = hdbscan.HDBSCAN(min_cluster_size=20, metric=metric)

    # Fit and add labels to dataframe
    clusterer.fit(df_pca)
    df['cluster'] = clusterer.labels_

    # Analyze clusters
    n_clusters = len(set(df['cluster'])) - (1 if -1 in df['cluster'] else 0)
    n_noise = list(df['cluster']).count(-1)

    print("HDBSCAN Results with min_cluster_size=20, min_samples=5:")
    print(f"Number of clusters: {n_clusters}")
    print(f"Noise points: {n_noise} ({100 * n_noise / len(df):.2f}%)")



Testing with euclidean distance:



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



HDBSCAN Results with min_cluster_size=20, min_samples=5:
Number of clusters: 968
Noise points: 101209 (44.46%)

Testing with manhattan distance:



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



HDBSCAN Results with min_cluster_size=20, min_samples=5:
Number of clusters: 914
Noise points: 112690 (49.50%)

Testing with cosine distance:



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.



ValueError: Unrecognized metric 'cosine'