In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score

In [2]:
# Append the parent of the current working directory to sys.path so the
# `scripts` package below can be imported.
# NOTE(review): this relies on the kernel's working directory being one level
# below the directory that contains `scripts/` — confirm how the notebook is launched.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
# Project-local preprocessing helpers; must come after the sys.path tweak above.
from scripts.preprocess_categorielle import preprocess_categorielle, encoding_idAccordCadre
from scripts.preprocess_missing_values import clean_missing_values
from scripts.preprocess_cpv import add_cpv_hierarchy_column
from scripts.preprocess_numerique import log_transformation, scale_numerical_features

In [None]:
# Load the training table from a path relative to the notebook's working directory.
# Every column of `df` is later passed as a feature to NearestNeighbors / DBSCAN.
# NOTE(review): the displayed header starts with "Unnamed: 0" — if that is a real
# column (a saved index) rather than a rendering artefact, it will dominate the
# distance computations and should be dropped or read with index_col=0. TODO confirm.
df = pd.read_csv('../data/train_preprocessed.csv', encoding='utf-8')


In [5]:
# Show the loaded frame as the cell output (the rendering elides middle rows/columns with "...").
df

Unnamed: 0,montant,dureeMois,offresRecues,nature_Marché,nature_Marché de défense ou de sécurité,nature_Marché de partenariat,procedure_Appel d'offres ouvert,procedure_Appel d'offres restreint,procedure_Dialogue compétitif,procedure_Marché passé sans publicité ni mise en concurrence préalable,...,codeCPV_2_79000000,codeCPV_2_80000000,codeCPV_2_85000000,codeCPV_2_90000000,codeCPV_2_92000000,codeCPV_2_98000000,tauxAvance_cat_large_advance,tauxAvance_cat_medium_advance,tauxAvance_cat_no_advance,tauxAvance_cat_small_advance
0,-0.374605,0.163837,0.191779,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2.060021,0.623807,-1.097563,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1.453588,-0.487659,0.191779,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,1.506181,0.922299,-1.097563,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1.472217,0.623807,-1.097563,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237346,-0.828888,-0.487659,-1.097563,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
237347,0.481805,-0.766452,-1.097563,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
237348,-0.570988,-0.202597,-1.097563,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
237349,0.039790,0.207215,1.005263,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# 4. Parameter selection for DBSCAN
# Use K-distance plot to find optimal epsilon
k = 5  # Number of neighbors to consider

# Work on the feature columns only: a later cell writes a 'cluster' label column
# into `df`, and re-running this cell afterwards must not treat labels as a feature.
knn_features = df.drop(columns='cluster', errors='ignore')

neigh = NearestNeighbors(n_neighbors=k)
neigh.fit(knn_features)

# The query set is the fitted set, so each point is returned as its own nearest
# neighbour (distance 0). Column k-1 is therefore the distance to the (k-1)-th
# *other* point, which lines up with DBSCAN's min_samples=k (it counts the point itself).
distances, indices = neigh.kneighbors(knn_features)

# Sorted ascending so the "elbow" of the curve suggests a value for epsilon.
distances = np.sort(distances[:, k-1])


In [None]:

# Plot the sorted k-th neighbour distances; the bend of the curve is the
# usual candidate for DBSCAN's epsilon.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(distances)
ax.set_xlabel('Points sorted by distance')
ax.set_ylabel(f'Distance to {k}th nearest neighbor')
ax.set_title('K-distance Plot for DBSCAN Epsilon Selection')
ax.grid(True)
plt.show()


In [None]:

# 5. Apply DBSCAN (adjust epsilon based on k-distance plot)
# Start with an educated guess, then tune
epsilon = 0.5  # Replace with value from your k-distance plot
min_samples = 5  # Typical starting value

dbscan = DBSCAN(eps=epsilon, min_samples=min_samples)

# Fit on the feature columns only. Dropping any existing 'cluster' column keeps
# the cell idempotent: re-running it must not feed the previous labels back in.
dbscan_features = df.drop(columns='cluster', errors='ignore')
df['cluster'] = dbscan.fit_predict(dbscan_features)

# 6. Analyze clustering results
# DBSCAN labels noise as -1. Compare against the *values*: `-1 in series` would
# test the index labels instead and silently count noise as a cluster.
cluster_labels = df['cluster']
n_noise = int((cluster_labels == -1).sum())
n_clusters = int(cluster_labels.nunique()) - (1 if n_noise > 0 else 0)

print(f'Number of clusters: {n_clusters}')
print(f'Number of noise points: {n_noise}')
print(f'Percentage of noise points: {100 * n_noise / len(df):.2f}%')


In [None]:

# 7. Visualize clusters using PCA for dimensionality reduction
# Project the feature columns (labels excluded) onto two principal components.
pca = PCA(n_components=2)
df_pca = pca.fit_transform(df.drop(columns='cluster'))

# Colour every projected point by its DBSCAN label.
fig, ax = plt.subplots(figsize=(12, 8))
scatter = ax.scatter(
    df_pca[:, 0],
    df_pca[:, 1],
    c=df['cluster'],
    cmap='viridis',
    alpha=0.5,
)
fig.colorbar(scatter, ax=ax)
ax.set_title(f'DBSCAN Clustering Results (eps={epsilon}, min_samples={min_samples})')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()


In [None]:

# 8. Analyze cluster characteristics
# Mean and standard deviation of the three numeric features, per cluster label.
cluster_stats = (
    df.groupby('cluster')[['montant', 'dureeMois', 'offresRecues']]
    .agg(['mean', 'std'])
)

print(cluster_stats)


In [None]:

# 9. CPV composition of clusters
# The displayed frame carries the CPV division as one-hot columns named
# 'codeCPV_2_<code>'. Use a raw 'codeCPV_2' column when it exists; otherwise
# rebuild a single categorical label from the one-hot columns.
cpv_prefix = 'codeCPV_2_'
if 'codeCPV_2' in df.columns:
    cpv_labels = df['codeCPV_2']
else:
    cpv_columns = [col for col in df.columns if col.startswith(cpv_prefix)]
    if not cpv_columns:
        raise KeyError(
            f"Neither 'codeCPV_2' nor any '{cpv_prefix}*' one-hot column found in df"
        )
    cpv_onehot = df[cpv_columns]
    cpv_labels = (
        cpv_onehot.idxmax(axis=1)            # name of the active one-hot column
        .str[len(cpv_prefix):]               # keep only the CPV code
        # Rows with no active column would otherwise be assigned the first code.
        .where(cpv_onehot.sum(axis=1) > 0, 'unknown')
        .rename('codeCPV_2')
    )

# Contract counts per (cluster, CPV code), then row-wise percentages.
cpv_clusters = pd.crosstab(df['cluster'], cpv_labels)
cpv_clusters_pct = cpv_clusters.div(cpv_clusters.sum(axis=1), axis=0) * 100

# Display top CPV categories for each cluster
for cluster in sorted(df['cluster'].unique()):
    if cluster == -1:
        # Label -1 is DBSCAN noise: only report how many contracts fall there.
        print(f"Noise points (-1): {cpv_clusters.loc[-1].sum()} contracts")
    else:
        top_cpvs = cpv_clusters_pct.loc[cluster].nlargest(5)
        print(f"Cluster {cluster}: Top CPV categories")
        print(top_cpvs)
        print("-----\n")


In [None]:

# 10. Evaluate clustering with silhouette score (if there are at least 2 clusters)
# Filter out noise points (label -1) for silhouette calculation
df_clustered = df[df['cluster'] != -1]

# Count the real clusters on the filtered frame itself rather than trusting a
# counter computed in another cell: silhouette_score raises unless the number
# of distinct labels is between 2 and n_samples - 1.
n_labels_clustered = df_clustered['cluster'].nunique()

if 2 <= n_labels_clustered < len(df_clustered):
    # NOTE: the silhouette needs pairwise distances, so cost grows quadratically
    # with the number of rows; silhouette_score's `sample_size` / `random_state`
    # arguments can be used to estimate it on a subsample if this is too slow.
    silhouette_avg = silhouette_score(
        df_clustered.drop('cluster', axis=1),
        df_clustered['cluster']
    )
    print(f"Silhouette Score: {silhouette_avg:.3f}")
else:
    print(
        f"Silhouette Score not computed: {n_labels_clustered} cluster(s) "
        f"among {len(df_clustered)} non-noise points"
    )