In [3]:
import pandas as pd
import numpy as np
import os
from glob import glob
from pathlib import Path


# Absolute project root configured for the original development machine.
PROJECT_ROOT_ABSOLUTE = "/home/onyxia/work/Gestion-portefeuille/"

# Path() does not raise for a string argument, so wrapping it in try/except
# could never reach the fallback. Check for the directory explicitly and fall
# back to the current working directory when the configured root is absent.
ROOT_DIR = Path(PROJECT_ROOT_ABSOLUTE)
if not ROOT_DIR.is_dir():
    ROOT_DIR = Path.cwd()

# Data layout under the project root.
RAW_DATA_PATH = ROOT_DIR / "data" / "raw"  # input CSV files
INTERIM_DATA_PATH = ROOT_DIR / "data" / "interim"  # engineered features
PROCESSED_DATA_PATH = ROOT_DIR / "data" / "processed"
# Name of the CSV file written under INTERIM_DATA_PATH.
OUTPUT_FILENAME = "cac40_interim_features.csv"


def load_and_merge_data(raw_path: Path = RAW_DATA_PATH) -> pd.DataFrame:
    """Load every CSV file in ``raw_path`` and merge them into one frame.

    Each file is read with its ``Date`` column parsed as dates. Files that
    cannot be read are reported and skipped. ``Close`` and ``Volume`` are
    coerced to numbers (unparseable values become NaN) and rows without a
    ``Ticker`` or ``Date`` are dropped.

    Args:
        raw_path: Directory searched (non-recursively) for ``*.csv`` files.

    Returns:
        A frame indexed by ``(Ticker, Date)``, sorted by that index, with the
        columns ``Close`` and ``Volume``. An empty frame is returned when no
        file is found, when no file can be read, or when the merged data
        lacks one of the required columns.
    """
    # glob() returns paths in arbitrary order; sort them so that the merge
    # result does not depend on the filesystem.
    all_files = sorted(glob(str(raw_path / "*.csv")))

    if not all_files:
        print(f"‚ùå Erreur : Aucun fichier CSV trouv√© dans {raw_path}")
        return pd.DataFrame()

    list_df = []
    print(f"üì• Chargement de {len(all_files)} fichiers...")
    for filename in all_files:
        try:
            list_df.append(pd.read_csv(filename, parse_dates=['Date']))
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole load.
            print(f"‚ö†Ô∏è Erreur pour {filename}: {e}")

    if not list_df:
        return pd.DataFrame()

    # The per-file row labels carry no meaning (the index is rebuilt below),
    # so discard them to avoid duplicated labels in the merged frame.
    full_df = pd.concat(list_df, ignore_index=True)

    # Report missing columns the same way as the other failure modes instead
    # of letting a KeyError escape from the statements below.
    required_cols = ['Ticker', 'Date', 'Close', 'Volume']
    missing_cols = [col for col in required_cols if col not in full_df.columns]
    if missing_cols:
        print(f"Erreur : colonnes manquantes dans les fichiers CSV : {missing_cols}")
        return pd.DataFrame()

    for col in ('Close', 'Volume'):
        full_df[col] = pd.to_numeric(full_df[col], errors='coerce')

    full_df = full_df.dropna(subset=['Ticker', 'Date'])
    full_df = full_df.set_index(['Ticker', 'Date']).sort_index()

    return full_df.loc[:, ['Close', 'Volume']].copy()


def rsi(series: pd.Series, window: int = 14) -> pd.Series:
    """Return the Relative Strength Index of ``series``.

    Gains and losses are averaged with a simple rolling mean over ``window``
    observations; positions without a full window are NaN. A small constant
    is added to the average loss to avoid dividing by zero.
    """

    def _window_mean(values: pd.Series) -> pd.Series:
        # Simple moving average that requires a complete window.
        return values.rolling(window=window, min_periods=window).mean()

    change = series.diff()
    mean_up = _window_mean(change.clip(lower=0))
    mean_down = _window_mean(-change.clip(upper=0))

    relative_strength = mean_up / (mean_down + 1e-10)
    return 100 - (100 / (1 + relative_strength))


def macd(series: pd.Series, fast=12, slow=26, signal=9):
    """Return the MACD line and its signal line as a ``(macd, signal)`` tuple.

    The MACD line is the fast EMA minus the slow EMA of ``series``; the
    signal line is an EMA of the MACD line. All EMAs use ``adjust=False``.
    """

    def _ema(values: pd.Series, span) -> pd.Series:
        # Recursive (non-adjusted) exponential moving average.
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, fast) - _ema(series, slow)
    return macd_line, _ema(macd_line, signal)


def bollinger_bands(series: pd.Series, window=20, num_std=2):
    """Return the Bollinger bands of ``series`` as ``(upper, lower, middle)``.

    The middle band is the rolling mean over ``window`` observations; the
    upper and lower bands lie ``num_std`` rolling standard deviations above
    and below it.
    """
    roller = series.rolling(window=window)
    middle = roller.mean()
    half_width = roller.std() * num_std
    return middle + half_width, middle - half_width, middle

def compute_financial_features(df: pd.DataFrame) -> pd.DataFrame:
    """Compute per-ticker technical features on exponentially smoothed data.

    ``Close`` and ``Volume`` are first smoothed per ticker with an
    exponential moving average (alpha = 0.7, ``adjust=False``); every
    indicator is then derived from the smoothed series. The original author
    notes that this follows a scientific article (not identified here).

    Args:
        df: Frame whose index has the named levels ``Ticker`` and ``Date``
            and which holds the columns ``Close`` and ``Volume``. Rolling
            windows assume the rows of each ticker are in date order
            (``load_and_merge_data`` sorts the index). The frame is not
            modified.

    Returns:
        A frame with a fresh integer index holding ``Date``, ``Ticker``, the
        original and smoothed ``Close``/``Volume`` and the feature columns.
        Rows containing NaN or infinite values are removed. An empty input
        is returned unchanged.
    """
    if df.empty:
        return df

    # Work on a copy so the caller's frame does not gain the feature columns.
    df = df.copy()
    # The groupby object refers to ``df`` itself, so columns added below can
    # be selected through ``grouped`` once they exist.
    grouped = df.groupby(level='Ticker')

    print("üîß Calcul des features avec exponential smoothing (Œ±=0.7)...")

    # --- Step 1: exponential smoothing (alpha = 0.7) ---------------------
    # Smooth prices and volumes before any other computation.
    df['Close_Smooth'] = grouped['Close'].transform(
        lambda x: x.ewm(alpha=0.7, adjust=False).mean()
    )
    df['Volume_Smooth'] = grouped['Volume'].transform(
        lambda x: x.ewm(alpha=0.7, adjust=False).mean()
    )

    print("   ‚úÖ Smoothing appliqu√©")

    # --- Step 2: features on the smoothed data ---------------------------
    # Returns over 1, 5, 10 and 20 rows of the smoothed price.
    df['Returns_1D'] = grouped['Close_Smooth'].pct_change()
    df['Returns_5D'] = grouped['Close_Smooth'].pct_change(5)
    df['Returns_10D'] = grouped['Close_Smooth'].pct_change(10)
    df['Returns_20D'] = grouped['Close_Smooth'].pct_change(20)

    # Rolling volatility of the 1-row returns, scaled by sqrt(252)
    # (presumably annualisation over 252 trading days).
    df['Vol_5D'] = grouped['Returns_1D'].transform(
        lambda x: x.rolling(5).std() * np.sqrt(252)
    )
    df['Vol_20D'] = grouped['Returns_1D'].transform(
        lambda x: x.rolling(20).std() * np.sqrt(252)
    )

    # --- Momentum and trend (smoothed prices) ----------------------------
    df['RSI_14'] = grouped['Close_Smooth'].transform(lambda x: rsi(x, 14))
    df['RSI_7'] = grouped['Close_Smooth'].transform(lambda x: rsi(x, 7))

    # transform() must return one series per call, hence one pass per
    # element of the tuple returned by macd().
    df['MACD'] = grouped['Close_Smooth'].transform(lambda x: macd(x)[0])
    df['MACD_Signal'] = grouped['Close_Smooth'].transform(lambda x: macd(x)[1])
    df['MACD_Histogram'] = df['MACD'] - df['MACD_Signal']

    # Simple moving averages and the price relative to them.
    df['SMA_20'] = grouped['Close_Smooth'].transform(lambda x: x.rolling(20).mean())
    df['SMA_50'] = grouped['Close_Smooth'].transform(lambda x: x.rolling(50).mean())
    df['Price_to_SMA20'] = df['Close_Smooth'] / df['SMA_20']
    df['Price_to_SMA50'] = df['Close_Smooth'] / df['SMA_50']

    # --- Bollinger bands (smoothed prices) -------------------------------
    df['BB_Upper'] = grouped['Close_Smooth'].transform(lambda x: bollinger_bands(x)[0])
    df['BB_Lower'] = grouped['Close_Smooth'].transform(lambda x: bollinger_bands(x)[1])
    df['BB_Middle'] = grouped['Close_Smooth'].transform(lambda x: bollinger_bands(x)[2])
    band_range = df['BB_Upper'] - df['BB_Lower']
    df['BB_Width'] = band_range / df['BB_Middle']
    # Position of the price inside the band: 0 at the lower band, 1 at the
    # upper band.
    df['BB_Position'] = (df['Close_Smooth'] - df['BB_Lower']) / band_range

    # --- Volume (smoothed volume) ----------------------------------------
    df['Volume_Change'] = grouped['Volume_Smooth'].pct_change()
    df['Volume_SMA_20'] = grouped['Volume_Smooth'].transform(lambda x: x.rolling(20).mean())
    df['Volume_Ratio'] = df['Volume_Smooth'] / df['Volume_SMA_20']

    # --- Advanced features -----------------------------------------------
    # Rolling mean/std ratio of the 1-row returns; the epsilon guards
    # against a zero standard deviation.
    rolling_mean = grouped['Returns_1D'].transform(lambda x: x.rolling(20).mean())
    rolling_std = grouped['Returns_1D'].transform(lambda x: x.rolling(20).std())
    df['Sharpe_20D'] = rolling_mean / (rolling_std + 1e-10)

    # Relative distance of the price from its 20-row rolling maximum.
    rolling_max = grouped['Close_Smooth'].transform(lambda x: x.rolling(20).max())
    df['Drawdown_20D'] = (df['Close_Smooth'] - rolling_max) / rolling_max

    df['Momentum_Score'] = (df['Returns_5D'] + df['Returns_10D'] + df['Returns_20D']) / 3

    # --- Cleaning ---------------------------------------------------------
    # Ratios and percentage changes can yield +/-inf when a denominator is
    # zero; dropna() alone would keep those rows, so turn them into NaN first.
    df = df.replace([np.inf, -np.inf], np.nan).dropna()
    features_df = df.reset_index()

    # The original Close is kept alongside the smoothed one; the original
    # author reserves it for computing the target.
    feature_cols = [
        'Date', 'Ticker',
        'Close',           # original price (for the target)
        'Close_Smooth',    # smoothed price
        'Volume', 'Volume_Smooth',  # original and smoothed volume
        'Returns_1D', 'Returns_5D', 'Returns_10D', 'Returns_20D',
        'Vol_5D', 'Vol_20D',
        'RSI_14', 'RSI_7',
        'MACD', 'MACD_Signal', 'MACD_Histogram',
        'SMA_20', 'SMA_50', 'Price_to_SMA20', 'Price_to_SMA50',
        'BB_Upper', 'BB_Lower', 'BB_Middle', 'BB_Width', 'BB_Position',
        'Volume_Change', 'Volume_SMA_20', 'Volume_Ratio',
        'Sharpe_20D', 'Drawdown_20D', 'Momentum_Score'
    ]

    return features_df.loc[:, feature_cols]


def run_feature_engineering():
    """Run the whole pipeline: load raw data, build features, save to CSV.

    Progress is reported on stdout. The function returns early, without
    writing anything, when loading or feature computation yields an empty
    frame. On success the features are written to
    ``INTERIM_DATA_PATH / OUTPUT_FILENAME`` and a summary is printed.
    """
    banner = "=" * 60
    print(banner)
    print("FEATURE ENGINEERING AM√âLIOR√â - CAC40")
    print(banner)

    # Step 1: load and merge the raw CSV files.
    raw_frame = load_and_merge_data(raw_path=RAW_DATA_PATH)
    if raw_frame.empty:
        print("‚ùå Impossible de charger les donn√©es.")
        return
    print(f"‚úÖ {len(raw_frame)} observations charg√©es")

    # Step 2: derive the technical features.
    features_df = compute_financial_features(raw_frame)
    if features_df.empty:
        print("‚ùå Aucune donn√©e apr√®s calcul des features.")
        return

    row_count = len(features_df)
    # Four columns are not counted as features (presumably Date, Ticker,
    # Close and Volume -- confirm with the author).
    feature_count = len(features_df.columns) - 4
    print(f"‚úÖ {row_count} observations apr√®s feature engineering")
    print(f"‚úÖ {feature_count} features cr√©√©es")

    # Step 3: persist the result, creating the target directory if needed.
    os.makedirs(INTERIM_DATA_PATH, exist_ok=True)
    destination = INTERIM_DATA_PATH / OUTPUT_FILENAME
    features_df.to_csv(destination, index=False)

    print(f"\nüíæ Donn√©es sauvegard√©es : {destination}")
    print("\nüìä Aper√ßu des features :")
    summary = features_df.describe().T.round(3)
    print(summary)


# Script entry point: run the full pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_feature_engineering()

FEATURE ENGINEERING AMÉLIORÉ - CAC40
📥 Chargement de 38 fichiers...
✅ 106779 observations chargées
🔧 Calcul des features avec exponential smoothing (α=0.7)...
   ✅ Smoothing appliqué
✅ 104917 observations après feature engineering
✅ 28 features créées



💾 Données sauvegardées : /home/onyxia/work/Gestion-portefeuille/data/interim/cac40_interim_features.csv

📊 Aperçu des features :
                   count                           mean                  min  \
Date              104917  2020-08-01 17:23:00.028022016  2015-03-12 00:00:00   
Close           104917.0                      190.65764                1.313   
Close_Smooth    104917.0                     190.652867             1.329186   
Volume          104917.0                  2013376.67846                  0.0   
Volume_Smooth   104917.0                 2013430.042679           603.420299   
Returns_1D      104917.0                       0.000856            -0.692654   
Returns_5D      104917.0                       0.004373            -0.990564   
Returns_10D     104917.0                       0.008199            -0.997405   
Returns_20D     104917.0                       0.021378            -0.997033   
Vol_5D          104917.0                       0.188914      