In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from pathlib import Path
import numpy as np
import matplotlib.pyplot as plt

# --- PATH DEFINITIONS ---
# Absolute location of the project on the original workspace.
PROJECT_ROOT_ABSOLUTE = "/home/onyxia/work/Gestion-portefeuille/"

# Building a Path from a string never raises, so a try/except around it can
# never trigger the fallback. Test for the directory explicitly and fall back
# to the current working directory when the configured root is absent.
ROOT_DIR = Path(PROJECT_ROOT_ABSOLUTE)
if not ROOT_DIR.is_dir():
    ROOT_DIR = Path.cwd()

# Input (interim) and output (processed) data folders, relative to the root.
INTERIM_DATA_PATH = ROOT_DIR / "data" / "interim"
PROCESSED_DATA_PATH = ROOT_DIR / "data" / "processed"
# Name of the CSV file read from INTERIM_DATA_PATH.
INPUT_FILENAME = "cac40_interim_features.csv"

# --- 1. DATA LOADING AND PREPARATION FUNCTION ---

def load_and_prepare_data():
    """Load the interim features and build the standardized feature matrix.

    Reads ``INTERIM_DATA_PATH / INPUT_FILENAME``, keeps only the rows whose
    ``Date`` equals the most recent date in the file, indexes them by
    ``Ticker`` and standardizes the selected feature columns.

    Returns:
        A ``(X_scaled, df_latest)`` tuple. ``X_scaled`` is a DataFrame holding
        the standardized features, indexed by ticker. ``df_latest`` holds
        every column of the file for the latest date, also indexed by ticker.
        ``(None, None)`` is returned when the input file does not exist.
    """
    filepath = INTERIM_DATA_PATH / INPUT_FILENAME
    if not filepath.exists():
        print(f"‚ùå Erreur : Fichier d'entr√©e non trouv√© √† {filepath}. Relancez l'√©tape 2.")
        return None, None

    df = pd.read_csv(filepath)
    df['Date'] = pd.to_datetime(df['Date'])

    # Keep a single cross-section: one row per ticker at the latest date.
    latest_date = df['Date'].max()
    df_latest = df[df['Date'] == latest_date].copy().set_index('Ticker')

    FEATURES = ['Volatility', 'Sharpe_Ratio_20D', 'Performance_20D', 'Volume', 'Dividends']

    X = df_latest[FEATURES].copy()

    # Cleaning and imputation: infinities become missing values, then each
    # missing value is replaced by its column mean.
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(X.mean())
    # A column with no finite value has a NaN mean, so the mean fill leaves it
    # untouched; fill the remainder with 0 so no NaN reaches the scaler.
    X = X.fillna(0.0)

    # Standardization of every feature column.
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=FEATURES, index=X.index)

    return X_scaled, df_latest

# --- Run the exploration ---
# Builds two diagnostic figures from the prepared features and saves them to
# PROCESSED_DATA_PATH: the elbow curve (K-Means inertia for K = 1..9) and a
# 2-D PCA projection of the assets coloured by their K-Means cluster.
X_scaled, df_latest = load_and_prepare_data()

if X_scaled is None:
    print("Impossible de continuer sans donn√©es pr√©par√©es.")
else:
    # savefig does not create missing folders; make sure the output
    # directory exists before writing any figure.
    PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

    # ----------------------------------------------------
    # I. DIAGNOSTIC FOR K (elbow method)
    # ----------------------------------------------------
    print("\nüî¨ Diagnostic : M√©thode du Coude pour K Optimal")

    sse = {}  # Sum of squared errors (K-Means inertia) for each K
    for k in range(1, 10):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X_scaled)
        sse[k] = kmeans.inertia_

    plt.figure(figsize=(10, 6))
    plt.plot(list(sse.keys()), list(sse.values()), marker='o')
    plt.title('M√©thode du Coude pour K Optimal')
    plt.xlabel("Nombre de Clusters (K)")
    plt.ylabel("Somme des Carr√©s des Erreurs (SSE)")

    # Save the plot to disk (no blocking show).
    elbow_path = PROCESSED_DATA_PATH / "diagnostic_elbow_method.png"
    plt.savefig(elbow_path, bbox_inches="tight")
    plt.close()

    # ----------------------------------------------------
    # II. CLUSTER VISUALIZATION (PCA)
    # ----------------------------------------------------

    # Refit K-Means with the chosen number of clusters.
    K_CHOICE = 4
    kmeans_final = KMeans(n_clusters=K_CHOICE, random_state=42, n_init=10)
    clusters = kmeans_final.fit_predict(X_scaled)
    df_latest['Cluster'] = clusters

    print(f"\nüìä Visualisation de la s√©paration pour K = {K_CHOICE}")

    # Dimensionality reduction to two components for plotting.
    pca = PCA(n_components=2)
    principal_components = pca.fit_transform(X_scaled)
    pca_df = pd.DataFrame(
        data=principal_components,
        columns=['PC1', 'PC2'],
        index=df_latest.index
    )

    # Attach the cluster labels used to colour the points.
    pca_df['Cluster'] = clusters

    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(
        pca_df['PC1'],
        pca_df['PC2'],
        c=pca_df['Cluster'],
        cmap='viridis',
        s=50,
        alpha=0.7
    )

    # Annotate each point with its ticker. The frame is indexed by ticker
    # labels, so integer keys on a column would rely on pandas' deprecated
    # positional fallback; walk the index and coordinate arrays in lockstep.
    for txt, pc1, pc2 in zip(pca_df.index, pca_df['PC1'].to_numpy(), pca_df['PC2'].to_numpy()):
        plt.annotate(txt, (pc1, pc2), fontsize=8, alpha=0.8)

    plt.title(f'Segmentation des Actifs du CAC 40 (K={K_CHOICE} - Visualisation PCA)')
    plt.xlabel("Composante Principale 1")
    plt.ylabel("Composante Principale 2")

    # Colour legend, one entry per cluster.
    legend1 = plt.legend(*scatter.legend_elements(),
                         loc="upper right", title="Clusters")
    plt.gca().add_artist(legend1)

    # Save the plot to disk (no blocking show).
    pca_path = PROCESSED_DATA_PATH / "visualization_pca_clusters.png"
    plt.savefig(pca_path, bbox_inches="tight")
    plt.close()

    print(f"\n‚úÖ Figures sauvegard√©es dans :\n - {elbow_path}\n - {pca_path}")



üî¨ Diagnostic : M√©thode du Coude pour K Optimal

üìä Visualisation de la s√©paration pour K = 4


  plt.annotate(txt, (pca_df['PC1'][i], pca_df['PC2'][i]), fontsize=8, alpha=0.8)



‚úÖ Figures sauvegard√©es dans :
 - /home/onyxia/work/Gestion-portefeuille/data/processed/diagnostic_elbow_method.png
 - /home/onyxia/work/Gestion-portefeuille/data/processed/visualization_pca_clusters.png
