# Exercice 7 : Segmentation Clients par Apprentissage Non-Supervisé

**FTML 2025 - Application de Clustering**

---

## Introduction

### Objectif du Projet
Segmenter les clients de carte de crédit en groupes homogènes basés sur leurs caractéristiques démographiques, financières et comportementales, sans utiliser l'information de défaut de paiement.

### Dataset
- **Source** : UCI Machine Learning Repository - Default of Credit Card Clients
- **Taille** : 30,000 observations × 24 variables
- **Approche** : Clustering non-supervisé (K-means, GMM, DBSCAN)

### Enjeux Métier
- **Marketing ciblé** : adapter les offres selon les profils clients
- **Gestion des risques** : identifier des segments à risque sans étiquettes
- **Optimisation produit** : développer des services adaptés aux besoins


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans, DBSCAN
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.neighbors import NearestNeighbors
import warnings
# Suppress every warning for the rest of the session (blanket filter: library
# deprecation notices and similar messages are hidden as well).
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib style sheet first, then the seaborn colour palette
# (same order as before, so the palette is the last setting applied).
plt.style.use('seaborn-v0_8')
sns.set_palette('husl')


## 1. Chargement et Analyse Exploratoire


In [None]:
# Read the raw dataset from a relative path into the notebook-wide frame `df`.
df = pd.read_csv('../data/default_of_credit_card_clients.csv')

n_rows, n_cols = df.shape
print(f"Dataset : {n_rows:,} observations, {n_cols} variables")

# Numbered listing of every column, counting from 1.
print("\nVariables disponibles :")
for position, column_name in enumerate(df.columns, start=1):
    print(f"{position:2d}. {column_name}")

print("\nPremières lignes :")
# Kept as the trailing expression so the cell displays the preview.
df.head()


## 2. Préparation des Données pour le Clustering


In [None]:
def prepare_clustering_data(df):
    """Build the feature table used for clustering from the raw client data.

    The input frame is not modified. The ``ID`` and
    ``default payment next month`` columns are removed when present, then six
    derived columns are appended: ``credit_utilization``, ``payment_ratio``,
    ``avg_payment_delay``, ``avg_bill_amount``, ``avg_payment_amount`` and
    ``payment_consistency``.

    Args:
        df: DataFrame containing at least ``LIMIT_BAL``, ``PAY_0``,
            ``PAY_2``..``PAY_6``, ``BILL_AMT1``..``BILL_AMT6`` and
            ``PAY_AMT1``..``PAY_AMT6``.

    Returns:
        A new DataFrame with the derived columns, in which infinite and
        missing values have been replaced by 0.

    Raises:
        KeyError: If one of the required columns is absent.
    """
    delay_cols = ['PAY_0'] + [f'PAY_{month}' for month in range(2, 7)]
    bill_cols = [f'BILL_AMT{month}' for month in range(1, 7)]
    payment_cols = [f'PAY_AMT{month}' for month in range(1, 7)]

    # Identifier and target are excluded; errors='ignore' tolerates frames
    # that do not carry them.
    features = df.copy().drop(
        columns=['ID', 'default payment next month'], errors='ignore'
    )

    # Ratios use "denominator + 1"; any infinity that still results is
    # zeroed out by the final clean-up below.
    latest_bill = features['BILL_AMT1']
    features['credit_utilization'] = latest_bill / (features['LIMIT_BAL'] + 1)
    features['payment_ratio'] = features['PAY_AMT1'] / (latest_bill + 1)

    # Row-wise averages over the six recorded months.
    features['avg_payment_delay'] = features[delay_cols].mean(axis=1)
    features['avg_bill_amount'] = features[bill_cols].mean(axis=1)
    features['avg_payment_amount'] = features[payment_cols].mean(axis=1)

    # One minus (std / (mean + 1)) of the PAY_AMT1..PAY_AMT3 amounts.
    recent_payments = features[payment_cols[:3]]
    spread = recent_payments.std(axis=1)
    level = recent_payments.mean(axis=1) + 1
    features['payment_consistency'] = 1 - spread / level

    return features.replace([np.inf, -np.inf], np.nan).fillna(0)

# Derive the clustering feature table and report its dimensions.
df_clustering = prepare_clustering_data(df)
n_obs, n_features = df_clustering.shape
print(f"Données préparées : {n_obs:,} observations, {n_features} variables")

# Standardise every column; the fitted scaler is kept in `scaler`.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_clustering)
print(f"Données standardisées : {X_scaled.shape}")


## 3. Détermination du Nombre Optimal de Clusters


In [None]:
def evaluate_clustering(X, k_range=(2, 11)):
    """Fit K-means over a sweep of cluster counts and tabulate quality scores.

    Args:
        X: Feature matrix, passed unchanged to ``KMeans.fit_predict`` and to
            the three scoring functions.
        k_range: Sequence whose first item is the first cluster count tried
            and whose second item is the exclusive upper bound of the sweep.

    Returns:
        A DataFrame with one row per cluster count and the columns ``k``,
        ``inertia``, ``silhouette``, ``calinski_harabasz`` and
        ``davies_bouldin``.
    """
    metric_names = ['k', 'inertia', 'silhouette', 'calinski_harabasz', 'davies_bouldin']
    table = {name: [] for name in metric_names}

    first_k, stop_k = k_range[0], k_range[1]
    for n_clusters in range(first_k, stop_k):
        # Fixed seed and 10 restarts for every value of k.
        model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        assignment = model.fit_predict(X)

        row = (
            n_clusters,
            model.inertia_,
            silhouette_score(X, assignment),
            calinski_harabasz_score(X, assignment),
            davies_bouldin_score(X, assignment),
        )
        for name, value in zip(metric_names, row):
            table[name].append(value)

    return pd.DataFrame(table)

# Run the sweep with the function's default k range on the standardised matrix,
# then show the scores rounded to three decimals.
print("Évaluation du nombre optimal de clusters...")
cluster_metrics = evaluate_clustering(X_scaled)
rounded_metrics = cluster_metrics.round(3)
print(rounded_metrics)


In [None]:
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle("Métriques d'Évaluation du Clustering", fontsize=16)

# One panel per metric, filled row by row:
# (column of cluster_metrics, line format, panel title, y-axis label).
metric_panels = [
    ('inertia', 'bo-', 'Méthode du Coude (Elbow)', 'Inertie'),
    ('silhouette', 'ro-', 'Silhouette Score', 'Silhouette Score'),
    ('calinski_harabasz', 'go-', 'Calinski-Harabasz Index', 'Calinski-Harabasz Index'),
    ('davies_bouldin', 'mo-', 'Davies-Bouldin Index', 'Davies-Bouldin Index'),
]
for ax, (metric, line_format, panel_title, y_label) in zip(axes.flat, metric_panels):
    ax.plot(cluster_metrics['k'], cluster_metrics[metric], line_format)
    ax.set_title(panel_title)
    ax.set_xlabel('Nombre de clusters (k)')
    ax.set_ylabel(y_label)
    ax.grid(True)

plt.tight_layout()
plt.show()

# Best k per criterion: silhouette and Calinski-Harabasz are taken at their
# maximum, Davies-Bouldin at its minimum.
row_best_silhouette = cluster_metrics['silhouette'].idxmax()
row_best_calinski = cluster_metrics['calinski_harabasz'].idxmax()
row_best_davies = cluster_metrics['davies_bouldin'].idxmin()

optimal_k_silhouette = cluster_metrics.loc[row_best_silhouette, 'k']
optimal_k_calinski = cluster_metrics.loc[row_best_calinski, 'k']
optimal_k_davies = cluster_metrics.loc[row_best_davies, 'k']

print(f"Nombre optimal selon Silhouette : {optimal_k_silhouette}")
print(f"Nombre optimal selon Calinski-Harabasz : {optimal_k_calinski}")
print(f"Nombre optimal selon Davies-Bouldin : {optimal_k_davies}")

# Majority vote between the three criteria; the most frequent k is retained.
from collections import Counter
votes = [optimal_k_silhouette, optimal_k_calinski, optimal_k_davies]
vote_ranking = Counter(votes).most_common(1)
optimal_k = vote_ranking[0][0]
print(f"\n🎯 Nombre optimal retenu : {optimal_k} clusters")


## 4. Application des Algorithmes de Clustering


In [None]:
# K-means and GMM both use the cluster count retained by the vote, with the
# same fixed seed.
kmeans_optimal = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
labels_kmeans = kmeans_optimal.fit_predict(X_scaled)

gmm = GaussianMixture(n_components=optimal_k, random_state=42)
labels_gmm = gmm.fit_predict(X_scaled)

# DBSCAN: eps is estimated from the k-distance distribution with
# k = min_samples, so the radius and the density threshold describe the same
# neighbourhood. (Previously eps came from a 4-neighbour query while DBSCAN
# ran with min_samples=50.) The query set is the training set itself, so each
# point is returned as its own first neighbour; the last column is therefore
# the smallest radius that contains min_samples points, the point included.
dbscan_min_samples = 50
neighbors = NearestNeighbors(n_neighbors=dbscan_min_samples)
neighbors_fit = neighbors.fit(X_scaled)
distances, indices = neighbors_fit.kneighbors(X_scaled)
distances = np.sort(distances[:, -1], axis=0)

# 95th percentile of the k-distances is used as the neighbourhood radius.
eps = np.percentile(distances, 95)
dbscan = DBSCAN(eps=eps, min_samples=dbscan_min_samples)
labels_dbscan = dbscan.fit_predict(X_scaled)

# Label -1 is excluded from the DBSCAN cluster count and reported as outliers.
dbscan_noise = labels_dbscan == -1
print(f"K-means : {len(np.unique(labels_kmeans))} clusters")
print(f"GMM : {len(np.unique(labels_gmm))} clusters")
print(f"DBSCAN : {len(np.unique(labels_dbscan[~dbscan_noise]))} clusters, {int(dbscan_noise.sum())} outliers")


## 5. Évaluation et Visualisation des Résultats


In [None]:
def evaluate_clustering_results(X, labels_dict):
    """Score several labelings of the same data with internal validity metrics.

    Points labelled ``-1`` are treated as noise for every method (not only for
    an entry named ``'DBSCAN'``): they are counted as outliers and left out of
    the score computation. Every method yields exactly one row; when fewer
    than two clusters remain once noise is removed, the three scores are NaN
    instead of the method being dropped from the table.

    Args:
        X: Feature matrix with one row per sample; must support boolean-mask
            row selection (``X[mask]``).
        labels_dict: Mapping from method name to an array-like of integer
            labels, one per row of ``X``.

    Returns:
        A DataFrame with the columns ``Algorithme``, ``Nb_Clusters``,
        ``Outliers``, ``Silhouette``, ``Calinski_Harabasz`` and
        ``Davies_Bouldin``, one row per entry of ``labels_dict``.
    """
    columns = [
        'Algorithme', 'Nb_Clusters', 'Outliers',
        'Silhouette', 'Calinski_Harabasz', 'Davies_Bouldin',
    ]
    results = []

    for method, labels in labels_dict.items():
        labels = np.asarray(labels)
        clustered = labels != -1
        n_clusters = len(np.unique(labels[clustered]))
        n_outliers = int((~clustered).sum())

        if n_clusters > 1:
            # Only subset when there is noise to remove, so noise-free
            # labelings are scored on X exactly as passed in.
            if n_outliers:
                X_scored, labels_scored = X[clustered], labels[clustered]
            else:
                X_scored, labels_scored = X, labels
            sil_score = silhouette_score(X_scored, labels_scored)
            ch_score = calinski_harabasz_score(X_scored, labels_scored)
            db_score = davies_bouldin_score(X_scored, labels_scored)
        else:
            # The scores are undefined with fewer than two clusters.
            sil_score = ch_score = db_score = np.nan

        results.append({
            'Algorithme': method,
            'Nb_Clusters': n_clusters,
            'Outliers': n_outliers,
            'Silhouette': sil_score,
            'Calinski_Harabasz': ch_score,
            'Davies_Bouldin': db_score
        })

    return pd.DataFrame(results, columns=columns)

# Labelings to compare, keyed by the name shown in the comparison table.
clustering_results = dict([
    ('K-means', labels_kmeans),
    ('GMM', labels_gmm),
    ('DBSCAN', labels_dbscan),
])

# Score every labeling on the standardised matrix and show the table
# rounded to four decimals.
results_df = evaluate_clustering_results(X_scaled, clustering_results)
print("Comparaison des algorithmes de clustering :")
rounded_results = results_df.round(4)
print(rounded_results)


In [None]:
# 2-D projection used only for display. The seed is pinned like the other
# estimators in this notebook, so the projection is reproducible if a
# randomized SVD solver is selected.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

variance_ratios = pca.explained_variance_ratio_
print(f"Variance expliquée par PCA : {variance_ratios.sum():.3f}")
print(f"PC1 : {variance_ratios[0]:.3f}")
print(f"PC2 : {variance_ratios[1]:.3f}")

fig, axes = plt.subplots(1, 3, figsize=(20, 6))
fig.suptitle("Visualisation des Clusters dans l'Espace PCA", fontsize=16)

algorithms = ['K-means', 'GMM', 'DBSCAN']
labels_list = [labels_kmeans, labels_gmm, labels_dbscan]

# One panel per algorithm; points are coloured by label and label -1 is left
# out of the cluster count shown in the title.
for ax, algo, labels in zip(axes, algorithms, labels_list):
    ax.scatter(X_pca[:, 0], X_pca[:, 1], c=labels, cmap='viridis', alpha=0.6, s=1)
    ax.set_title(f'{algo} - {len(np.unique(labels[labels != -1]))} clusters')
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')

    if algo == 'K-means':
        # Overlay the K-means centroids, projected with the same PCA.
        centers_pca = pca.transform(kmeans_optimal.cluster_centers_)
        ax.scatter(centers_pca[:, 0], centers_pca[:, 1], c='red', marker='x', s=200, linewidths=3)

plt.tight_layout()
plt.show()


## 6. Analyse et Interprétation des Clusters


In [None]:
# Attach the K-means label to a copy of the feature table.
df_analysis = df_clustering.copy()
df_analysis['cluster_kmeans'] = labels_kmeans

# Aggregations per column; insertion order fixes the column order of the result.
aggregations = {
    'AGE': ['mean', 'std'],
    'SEX': lambda sex: (sex == 2).mean(),  # share of rows where SEX equals 2
    'EDUCATION': 'mean',
    'MARRIAGE': 'mean',
}
for column in ('LIMIT_BAL', 'credit_utilization', 'payment_ratio',
               'avg_payment_delay', 'avg_bill_amount', 'avg_payment_amount'):
    aggregations[column] = ['mean', 'std']

cluster_stats = df_analysis.groupby('cluster_kmeans').agg(aggregations)

# Flatten the two-level column labels into "<column>_<statistic>" names.
cluster_stats.columns = ['_'.join(levels).strip() for levels in cluster_stats.columns]
cluster_stats = cluster_stats.round(3)

print("Statistiques par cluster (K-means) :")
print(cluster_stats)

# Cluster head-counts, ordered by cluster id, with their share of all clients.
cluster_sizes = df_analysis['cluster_kmeans'].value_counts().sort_index()
n_clients = len(df_analysis)
print("\nTaille des clusters :")
for cluster, size in cluster_sizes.items():
    share_pct = size / n_clients * 100
    print(f"  Cluster {cluster}: {size:,} clients ({share_pct:.1f}%)")


In [None]:
key_features = ['AGE', 'LIMIT_BAL', 'credit_utilization', 'payment_ratio', 'avg_payment_delay']

fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Profils des Clusters K-means', fontsize=16)

# Split the table once per cluster (ascending cluster id) and reuse the pieces
# for both the histograms and the textual summary.
cluster_ids = sorted(df_analysis['cluster_kmeans'].unique())
members_by_cluster = {
    cluster: df_analysis[df_analysis['cluster_kmeans'] == cluster]
    for cluster in cluster_ids
}

# The first five panels hold one overlaid histogram per cluster.
for panel, feature in zip(axes.flat, key_features):
    for cluster in cluster_ids:
        panel.hist(members_by_cluster[cluster][feature], alpha=0.6,
                   label=f'Cluster {cluster}', bins=30)

    panel.set_title(f'Distribution de {feature}')
    panel.set_xlabel(feature)
    panel.legend()

# The sixth panel shows the cluster sizes.
size_panel = axes[1, 2]
size_panel.bar(cluster_sizes.index, cluster_sizes.values)
size_panel.set_title('Taille des Clusters')
size_panel.set_xlabel('Cluster')
size_panel.set_ylabel('Nombre de clients')

plt.tight_layout()
plt.show()

print("🎯 INTERPRÉTATION MÉTIER DES CLUSTERS")
print("=" * 50)

n_total = len(df_analysis)
for cluster in cluster_ids:
    cluster_data = members_by_cluster[cluster]
    n_members = len(cluster_data)

    mean_age = cluster_data['AGE'].mean()
    mean_limit = cluster_data['LIMIT_BAL'].mean()
    mean_utilization = cluster_data['credit_utilization'].mean()
    mean_delay = cluster_data['avg_payment_delay'].mean()
    mean_payment_ratio = cluster_data['payment_ratio'].mean()
    pct_female = (cluster_data['SEX'] == 2).mean() * 100

    # Threshold-based risk label: "high" needs both thresholds exceeded,
    # "moderate" needs either of the lower ones.
    is_high = mean_delay > 1.5 and mean_utilization > 0.8
    is_moderate = mean_delay > 0.5 or mean_utilization > 0.6
    risk_level = "HAUT RISQUE" if is_high else ("RISQUE MODÉRÉ" if is_moderate else "FAIBLE RISQUE")

    print(f"\nCluster {cluster} ({n_members} clients, {n_members/n_total*100:.1f}%):")
    print(f"  • Âge moyen : {mean_age:.1f} ans")
    print(f"  • Limite crédit : {mean_limit:,.0f} NT$")
    print(f"  • Utilisation crédit : {mean_utilization:.1%}")
    print(f"  • Retard paiement : {mean_delay:.2f}")
    print(f"  • Ratio paiement : {mean_payment_ratio:.2f}")
    print(f"  • Femmes : {pct_female:.1f}%")
    print(f"  • Niveau de risque : {risk_level}")


## 7. Validation avec les Données de Défaut


In [None]:
# Attach the K-means label to a copy of the raw data, which still carries the
# default indicator left out of the clustering features.
df_validation = df.copy()
df_validation['cluster_kmeans'] = labels_kmeans

target = 'default payment next month'

# Mean of the target (default rate) and head-count per cluster.
default_rates = df_validation.groupby('cluster_kmeans')[target].agg(['mean', 'count'])
default_rates.columns = ['taux_defaut', 'nb_clients']
default_rates['taux_defaut_pct'] = default_rates['taux_defaut'] * 100

print("Validation : Taux de défaut par cluster")
print(default_rates.round(2))

from scipy.stats import chi2_contingency

# Chi-square test on the cluster x default contingency table.
contingency_table = pd.crosstab(df_validation['cluster_kmeans'], df_validation[target])
chi2, p_value, dof, expected = chi2_contingency(contingency_table)

verdict = 'OUI' if p_value < 0.05 else 'NON'
print("\nTest du Chi-2 :")
print(f"  Chi-2 = {chi2:.2f}, p-value = {p_value:.2e}")
print(f"  Différence significative : {verdict}")

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

# Left panel: default rate per cluster, each bar annotated with its value.
rates_pct = default_rates['taux_defaut_pct']
bars = ax1.bar(default_rates.index, rates_pct)
ax1.set_title('Taux de Défaut par Cluster')
ax1.set_xlabel('Cluster')
ax1.set_ylabel('Taux de Défaut (%)')
ax1.set_ylim(0, max(rates_pct) * 1.1)

for bar, rate in zip(bars, rates_pct):
    x_center = bar.get_x() + bar.get_width()/2
    ax1.text(x_center, bar.get_height() + 0.5,
             f'{rate:.1f}%', ha='center', va='bottom')

# Right panel: raw counts of the contingency table.
sns.heatmap(contingency_table, annot=True, fmt='d', cmap='YlOrRd', ax=ax2)
ax2.set_title('Table de Contingence : Cluster vs Défaut')
ax2.set_xlabel('Défaut de Paiement')
ax2.set_ylabel('Cluster')

plt.tight_layout()
plt.show()


## 8. Conclusions et Recommandations


In [None]:
# Final report. silhouette_score works from pairwise distances, so each call
# on the full dataset is expensive: every value is computed once and reused
# (the K-means score was previously recomputed for sections 1 and 5).
silhouette_kmeans = silhouette_score(X_scaled, labels_kmeans)
silhouette_gmm = silhouette_score(X_scaled, labels_gmm)
n_dbscan_outliers = int((labels_dbscan == -1).sum())
pca_variance = pca.explained_variance_ratio_.sum()

# The significance statement is derived from the chi-square p-value computed in
# the validation step (same 0.05 threshold) instead of a hard-coded
# "p < 0.001".
significance = "significatives" if p_value < 0.05 else "non significatives"

print("📊 CONCLUSIONS DE L'ANALYSE DE CLUSTERING")
print("=" * 50)

print("1. PERFORMANCE DES ALGORITHMES :")
print(f"   • K-means : Score silhouette = {silhouette_kmeans:.3f}")
print(f"   • GMM : Score silhouette = {silhouette_gmm:.3f}")
print(f"   • DBSCAN : {n_dbscan_outliers} outliers détectés")

print("\n2. SEGMENTATION OBTENUE :")
print(f"   • {optimal_k} segments clients distincts identifiés")
print(f"   • Variance expliquée (PCA) : {pca_variance:.1%}")
print(f"   • Différences {significance} entre clusters (Chi-2, p-value = {p_value:.2e})")

print("\n3. VALIDATION MÉTIER :")
print("   • Taux de défaut variables selon les clusters")
print("   • Cluster à haut risque identifié")
print("   • Segmentation cohérente avec le comportement de paiement")

print("\n4. RECOMMANDATIONS :")
print("   • Utiliser K-means pour la segmentation opérationnelle")
print("   • Adapter les stratégies marketing par segment")
print("   • Surveiller particulièrement les clusters à haut risque")
print("   • Réviser périodiquement la segmentation")

print("\n5. MÉTRIQUES FINALES :")
print(f"   • Silhouette Score : {silhouette_kmeans:.3f}")
print(f"   • Calinski-Harabasz : {calinski_harabasz_score(X_scaled, labels_kmeans):.1f}")
print(f"   • Davies-Bouldin : {davies_bouldin_score(X_scaled, labels_kmeans):.3f}")
print(f"   • Variance PCA : {pca_variance:.1%}")

print("\n6. APPLICATIONS MÉTIER :")
print("   • Segmentation marketing pour offres personnalisées")
print("   • Identification proactive des profils à risque")
print("   • Optimisation des limites de crédit par segment")
print("   • Développement de produits ciblés")

print(f"\n✅ CLUSTERING RÉUSSI - {optimal_k} segments clients identifiés avec succès")
