# 🚀 Advanced Credit Card Customer Clustering

**FTML 2025 - Exercice 7 : Apprentissage Non-Supervisé Optimisé**

---

## 🎯 **Objectifs**
Segmentation avancée des clients de carte de crédit utilisant les **techniques state-of-the-art** et exploitant **32 cœurs CPU + RTX 5060 GPU**.

## 📊 **Dataset**
- **Source** : UCI Machine Learning Repository, jeu de données « Default of Credit Card Clients » (Taïwan)
- **Taille** : 30,000 clients × 24 variables
- **Optimisations** : GPU RAPIDS cuML, UMAP, clustering parallèle

## 🔥 **Techniques Modernes Intégrées**
1. **GPU Acceleration** : cuML RAPIDS (accélération annoncée de 15x à 312x selon l'algorithme et la taille des données)
2. **UMAP** : Réduction dimensionnelle supérieure à PCA
3. **Feature Engineering Avancé** : 33 variables dérivées (plus de 50 variables au total avec les variables d'origine)
4. **Clustering Multiple** : K-means, Hierarchical, GMM, DBSCAN
5. **Visualisations Pro** : Yellowbrick, dendrogrammes interactifs
6. **Analyse Business** : Profils clients détaillés
7. **Parallélisation** : Utilisation des 32 cœurs


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from sklearn.neighbors import NearestNeighbors
from joblib import Parallel, delayed
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.stats import chi2_contingency
import time
from collections import Counter

# Optional dependencies. Each group is imported inside a try/except so that a
# missing package only flips the matching *_AVAILABLE flag instead of stopping
# the notebook (the flags are presumably consulted by later cells).
import os

# GPU libraries (RAPIDS cuML / cuDF / CuPy)
try:
    import cuml
    import cudf
    import cupy as cp
    from cuml.cluster import KMeans as cuKMeans, DBSCAN as cuDBSCAN
    from cuml.manifold import UMAP as cuUMAP
    from cuml.decomposition import PCA as cuPCA
    GPU_AVAILABLE = True
    print("🚀 GPU cuML RAPIDS detected - Using GPU acceleration!")
except ImportError:
    GPU_AVAILABLE = False
    print("⚠️  cuML not available - Using CPU fallback")

# CPU UMAP implementation (umap-learn)
try:
    import umap
    UMAP_AVAILABLE = True
    print("✅ UMAP available")
except ImportError:
    UMAP_AVAILABLE = False
    print("⚠️  UMAP not available - Using PCA fallback")

# Yellowbrick clustering visualizers
try:
    from yellowbrick.cluster import SilhouetteVisualizer, KElbowVisualizer
    from yellowbrick.features import ParallelCoordinates
    YELLOWBRICK_AVAILABLE = True
    print("✅ Yellowbrick available for advanced visualizations")
except ImportError:
    YELLOWBRICK_AVAILABLE = False
    print("⚠️  Yellowbrick not available - Using standard visualizations")

# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print(f"🔧 Configuration: GPU={'✅' if GPU_AVAILABLE else '❌'} | UMAP={'✅' if UMAP_AVAILABLE else '❌'} | Yellowbrick={'✅' if YELLOWBRICK_AVAILABLE else '❌'}")

# Report what was actually detected rather than a fixed hardware description:
# the previous banner announced "32 CPU cores + RTX 5060 GPU" even when the
# cuML import above had failed.
cpu_cores = os.cpu_count() or 1  # os.cpu_count() may return None
gpu_status = "cuML GPU acceleration" if GPU_AVAILABLE else "CPU only (cuML not available)"
print(f"💻 Ready to use {cpu_cores} CPU cores + {gpu_status}")


In [None]:
def load_and_explore_data(path='../data/default_of_credit_card_clients.csv'):
    """Load the credit-card CSV and print a short exploratory summary.

    Args:
        path: Location of the CSV file, passed to ``pandas.read_csv``.
            Defaults to the path the notebook originally hard-coded, so
            calling the function without arguments behaves as before.

    Returns:
        The loaded ``pandas.DataFrame``, unmodified.

    Raises:
        KeyError: If the ``AGE``, ``LIMIT_BAL`` or ``BILL_AMT1`` column is
            missing from the file.
    """
    df = pd.read_csv(path)

    print(f"📊 Dataset chargé : {df.shape[0]:,} observations × {df.shape[1]} variables")
    print(f"💾 Taille mémoire : {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

    # Basic structure: column counts per dtype family and total missing cells
    numeric_count = df.select_dtypes(include=[np.number]).shape[1]
    categorical_count = df.select_dtypes(include=['object']).shape[1]
    missing_count = df.isnull().sum().sum()
    print(f"\n🔍 Structure du dataset :")
    print(f"   • Variables numériques : {numeric_count}")
    print(f"   • Variables catégorielles : {categorical_count}")
    print(f"   • Valeurs manquantes : {missing_count}")

    # Default rate, only when the label column is present
    if 'default payment next month' in df.columns:
        default_rate = df['default payment next month'].mean()
        print(f"   • Taux de défaut : {default_rate:.1%}")

    # A zero credit limit would turn the ratio into +/-inf and make the mean
    # infinite; mask those rows so they are skipped like missing values.
    utilization = (df['BILL_AMT1'] / df['LIMIT_BAL']).replace([np.inf, -np.inf], np.nan)

    print(f"\n💳 Caractéristiques clés :")
    print(f"   • Âge moyen : {df['AGE'].mean():.1f} ans (écart-type : {df['AGE'].std():.1f})")
    print(f"   • Limite de crédit médiane : {df['LIMIT_BAL'].median():,.0f} NT$")
    print(f"   • Utilisation moyenne : {utilization.mean():.1%}")

    return df

# Load the dataset; the trailing bare expression previews the first rows
# when this is run as a notebook cell.
df = load_and_explore_data()
df.head()


In [None]:
def advanced_feature_engineering(df):
    """Derive clustering features from the raw credit-card columns.

    Works on a copy of ``df``. The ``ID`` column is dropped if present, and
    ``default payment next month`` (if present) is moved to a ``target``
    column that stays in the result. Ratio, aggregate, volatility, trend,
    interaction and risk-flag columns are then added, plus two ordinal
    category columns. Finally every +/-inf and NaN value is replaced by 0.

    Args:
        df: DataFrame containing ``LIMIT_BAL``, ``AGE``, ``EDUCATION``,
            ``BILL_AMT1``..``BILL_AMT6``, ``PAY_AMT1``..``PAY_AMT6`` and
            ``PAY_0``, ``PAY_2``..``PAY_6``.

    Returns:
        A new DataFrame with the kept input columns followed by the derived
        ones. ``utilization_category`` is encoded 0/1/2 (Low/Medium/High)
        and ``limit_category`` 0..3 (quartiles of ``LIMIT_BAL``; -1 when
        ``LIMIT_BAL`` is missing).

    Raises:
        KeyError: If a required column is missing.
        ValueError: From ``pd.qcut`` when the quartile edges of
            ``LIMIT_BAL`` are not unique.
    """
    print("🔧 Début du feature engineering avancé...")
    df_fe = df.copy()

    # Drop columns that must not drive the clustering; keep the label under
    # the name 'target' for later validation.
    columns_to_drop = ['ID']
    has_target = 'default payment next month' in df_fe.columns
    if has_target:
        df_fe['target'] = df_fe['default payment next month']
        columns_to_drop.append('default payment next month')

    df_fe = df_fe.drop(columns=columns_to_drop, errors='ignore')

    # Number of input feature columns kept (the saved label is not counted),
    # used for the summary printed at the end instead of a hard-coded 23.
    n_original = df_fe.shape[1] - int(has_target)

    bill_cols = ['BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6']
    pay_cols = ['PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6']
    delay_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

    # Snapshots of the three monthly series; none of these columns is
    # modified below, so the snapshots stay valid for the whole function.
    bills = df_fe[bill_cols]
    payments = df_fe[pay_cols]
    delays = df_fe[delay_cols]

    # "+ 1" keeps the denominator away from zero for a zero limit.
    limit_denom = df_fe['LIMIT_BAL'] + 1

    # === 1. BASIC FINANCIAL RATIOS ===
    print("   💰 Création des ratios financiers de base...")

    # Credit utilization (latest bill and six-month peak)
    df_fe['credit_utilization'] = df_fe['BILL_AMT1'] / limit_denom
    df_fe['max_utilization'] = bills.max(axis=1) / limit_denom

    # Payment ratios.
    # NOTE(review): bill amounts can be negative, so "BILL_AMT1 + 1" may be
    # zero or negative; the resulting inf/negative ratios are kept as-is
    # here and only the infinities are zeroed in the final cleanup.
    df_fe['payment_ratio'] = df_fe['PAY_AMT1'] / (df_fe['BILL_AMT1'] + 1)
    df_fe['payment_to_limit'] = df_fe['PAY_AMT1'] / limit_denom

    # === 2. SIX-MONTH AGGREGATES ===
    print("   📊 Agrégations temporelles (6 mois)...")

    # Means and medians
    df_fe['avg_bill'] = bills.mean(axis=1)
    df_fe['median_bill'] = bills.median(axis=1)
    df_fe['avg_payment'] = payments.mean(axis=1)
    df_fe['median_payment'] = payments.median(axis=1)

    # Delay statistics
    df_fe['avg_delay'] = delays.mean(axis=1)
    df_fe['max_delay'] = delays.max(axis=1)
    df_fe['delay_count'] = (delays > 0).sum(axis=1)

    # === 3. VOLATILITY AND STABILITY ===
    print("   📈 Calcul de la volatilité et stabilité...")

    # Coefficient-of-variation style volatility (std / (mean + 1))
    df_fe['bill_volatility'] = bills.std(axis=1) / (df_fe['avg_bill'] + 1)
    df_fe['payment_volatility'] = payments.std(axis=1) / (df_fe['avg_payment'] + 1)
    df_fe['delay_volatility'] = delays.std(axis=1)

    # Payment consistency is the complement of payment volatility
    df_fe['payment_consistency'] = 1 - df_fe['payment_volatility']
    df_fe['payment_frequency'] = (payments > 0).sum(axis=1) / 6

    # === 4. TRENDS (first three listed months vs last three) ===
    print("   📉 Analyse des tendances temporelles...")

    recent_bills = bills[bill_cols[:3]].mean(axis=1)
    older_bills = bills[bill_cols[3:]].mean(axis=1)
    df_fe['bill_trend'] = (recent_bills - older_bills) / (df_fe['avg_bill'] + 1)

    recent_payments = payments[pay_cols[:3]].mean(axis=1)
    older_payments = payments[pay_cols[3:]].mean(axis=1)
    df_fe['payment_trend'] = (recent_payments - older_payments) / (df_fe['avg_payment'] + 1)

    recent_delays = delays[delay_cols[:3]].mean(axis=1)
    older_delays = delays[delay_cols[3:]].mean(axis=1)
    df_fe['delay_trend'] = recent_delays - older_delays

    # === 5. ADVANCED RATIOS AND INTERACTIONS ===
    print("   🎯 Ratios avancés et interactions...")

    # Behavioural ratios
    df_fe['payment_bill_ratio'] = df_fe['avg_payment'] / (df_fe['avg_bill'] + 1)
    df_fe['limit_age_ratio'] = df_fe['LIMIT_BAL'] / (df_fe['AGE'] + 1)
    df_fe['bill_limit_ratio'] = df_fe['avg_bill'] / limit_denom

    # Interaction terms
    df_fe['age_limit_interaction'] = df_fe['AGE'] * df_fe['LIMIT_BAL'] / 1000
    df_fe['education_limit_interaction'] = df_fe['EDUCATION'] * df_fe['LIMIT_BAL'] / 1000
    df_fe['age_utilization'] = df_fe['AGE'] * df_fe['credit_utilization']

    # === 6. RISK INDICATORS ===
    print("   ⚠️  Création d'indicateurs de risque...")

    # Fixed thresholds (the original comment attributes them to the
    # exploratory analysis, which is not shown here). Comparisons involving
    # NaN are False, so such rows get a 0 flag.
    df_fe['high_utilization'] = (df_fe['credit_utilization'] > 0.8).astype(int)
    df_fe['frequent_delays'] = (df_fe['delay_count'] >= 3).astype(int)
    df_fe['low_payment_ratio'] = (df_fe['payment_ratio'] < 0.1).astype(int)
    df_fe['high_volatility'] = (df_fe['bill_volatility'] > 1.0).astype(int)
    df_fe['payment_issues'] = (df_fe['payment_frequency'] < 0.5).astype(int)

    # Composite risk score: number of raised flags (0..5)
    risk_indicators = ['high_utilization', 'frequent_delays', 'low_payment_ratio',
                       'high_volatility', 'payment_issues']
    df_fe['risk_score'] = df_fe[risk_indicators].sum(axis=1)

    # === 7. FINANCIAL PROFILES ===
    print("   👤 Création de profils financiers...")

    # Utilization band, encoded directly as 0=Low (<0.3), 1=Medium (<0.7),
    # 2=High (everything else, including NaN, as the former if/elif did).
    utilization = df_fe['credit_utilization']
    df_fe['utilization_category'] = np.select(
        [utilization < 0.3, utilization < 0.7], [0, 1], default=2
    ).astype('int8')

    # Credit-limit quartile, encoded 0..3 (Low, Medium, High, Premium).
    # The integer codes are taken *before* the cleanup below: filling a
    # pandas categorical column with 0 (not one of its categories) is
    # rejected by pandas and used to abort the function.
    limit_bands = pd.qcut(df_fe['LIMIT_BAL'], q=4,
                          labels=['Low', 'Medium', 'High', 'Premium'])
    df_fe['limit_category'] = limit_bands.cat.codes

    # === 8. FINAL CLEANUP ===
    print("   🧹 Nettoyage et finalisation...")

    # Replace infinities (from zero denominators) and NaN by 0
    df_fe = df_fe.replace([np.inf, -np.inf], np.nan).fillna(0)

    # Summary. As before, the saved 'target' column is included in the
    # "new variables" figure.
    print(f"✅ Feature engineering terminé : {df_fe.shape[1]} variables créées")
    print(f"   • Variables originales : {n_original}")
    print(f"   • Nouvelles variables : {df_fe.shape[1] - n_original}")

    return df_fe

# Apply the feature engineering to the loaded data, report the resulting
# shape, and preview the first rows (trailing bare expression, shown when
# this is run as a notebook cell).
df_enhanced = advanced_feature_engineering(df)
print(f"\n📊 Dataset enrichi : {df_enhanced.shape}")
df_enhanced.head()
