In [None]:
# ============================================
# FEATURE ENGINEERING - CAC40
# ============================================

import pandas as pd
import numpy as np
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

# --- CONFIGURATION ---
# Absolute project location; the data directories below are derived from it.
PROJECT_ROOT = "/home/onyxia/work/Gestion-portefeuille/"
ROOT_DIR = Path(PROJECT_ROOT)
RAW_DATA_PATH = ROOT_DIR.joinpath("data", "raw")
PROCESSED_DATA_PATH = ROOT_DIR.joinpath("data", "processed")
# Create the output directory (and any missing parents) before anything is saved.
PROCESSED_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Script banner: rule, title, rule.
print("=" * 70, "FEATURE ENGINEERING - CAC40", "=" * 70, sep="\n")

# ============================================
# 1. LOAD THE GLOBAL DATASET
# ============================================

print("\n √âTAPE 1 : Chargement du dataset...")

dataset_path = RAW_DATA_PATH / "cac40_dataset.csv"

if not dataset_path.exists():
    print(f" Fichier introuvable : {dataset_path}")
    print(" Lance d'abord le script de t√©l√©chargement !")
    # Abort instead of carrying on: every later step needs `df`, and in a
    # notebook a stale `df` from an earlier run would otherwise be reused
    # silently (or the run would die later with an unrelated NameError).
    raise FileNotFoundError(f"Dataset not found: {dataset_path}")

# Load with a (date, ticker) MultiIndex
df = pd.read_csv(dataset_path, index_col=['date', 'ticker'], parse_dates=['date'])

print(f"‚úÖ Dataset charg√© : {df.shape}")
print(f"   Colonnes : {list(df.columns)}")
print(f"   Tickers : {df.index.get_level_values('ticker').nunique()}")
print(f"   P√©riode : {df.index.get_level_values('date').min().date()} ‚Üí {df.index.get_level_values('date').max().date()}")

print("\n Aper√ßu des donn√©es :")
display(df.head(10))

print("\n Info dataset :")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()

# ============================================
# 2. TECHNICAL INDICATOR FUNCTIONS
# ============================================

# Progress message announcing step 2 (definition of the indicator helpers).
print("\nüîß √âTAPE 2 : D√©finition des fonctions techniques...")

def rsi(series: pd.Series, window: int = 14) -> pd.Series:
    """Compute a Relative Strength Index based on simple rolling averages.

    Args:
        series: Values whose successive differences are measured.
        window: Number of differences in each rolling average; a full
            window is required before a value is produced.

    Returns:
        A Series aligned with ``series``. Entries are NaN until ``window``
        differences are available.
    """
    changes = series.diff()
    rolling_args = {"window": window, "min_periods": window}

    # Average size of the up moves and of the down moves (as positive numbers).
    mean_up = changes.clip(lower=0).rolling(**rolling_args).mean()
    mean_down = (-changes.clip(upper=0)).rolling(**rolling_args).mean()

    # The small constant keeps the ratio finite when there were no down moves.
    strength = mean_up / (mean_down + 1e-10)
    return 100 - (100 / (1 + strength))


def macd(series: pd.Series, fast=12, slow=26, signal=9):
    """Compute the MACD line and its signal line.

    Args:
        series: Input values.
        fast: Span of the fast exponential moving average.
        slow: Span of the slow exponential moving average.
        signal: Span of the exponential moving average applied to the MACD line.

    Returns:
        Tuple ``(macd_line, signal_line)``, both aligned with ``series``.
    """
    def _ema(values, span):
        # Recursive (adjust=False) exponential moving average.
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, fast) - _ema(series, slow)
    return macd_line, _ema(macd_line, signal)


def bollinger_bands(series: pd.Series, window=20, num_std=2):
    """Compute Bollinger Bands around a rolling mean.

    Args:
        series: Input values.
        window: Size of the rolling window used for the mean and the
            standard deviation.
        num_std: Number of standard deviations between the mean and each band.

    Returns:
        Tuple ``(upper, lower, middle)`` where ``middle`` is the rolling mean.
    """
    roller = series.rolling(window=window)
    middle = roller.mean()
    offset = roller.std() * num_std
    return middle + offset, middle - offset, middle


def garman_klass_volatility(group_df: pd.DataFrame) -> pd.Series:
    """Compute the row-by-row Garman-Klass volatility estimate.

    Formula: sqrt( ln(H/L)^2 / 2 - (2 ln 2 - 1) * ln(C/O)^2 )

    Args:
        group_df: Frame with ``high``, ``low``, ``close`` and ``open`` columns.

    Returns:
        One value per input row, computed from that row only.
    """
    log_range = np.log(group_df['high'] / group_df['low'])
    log_body = np.log(group_df['close'] / group_df['open'])

    range_term = log_range ** 2 / 2
    body_term = (2 * np.log(2) - 1) * log_body ** 2

    return np.sqrt(range_term - body_term)

# Confirmation message listing the four indicator helpers (RSI, MACD, Bollinger, GK volatility).
print("‚úÖ Fonctions techniques d√©finies (RSI, MACD, Bollinger, GK Volatility)")

# ============================================
# 3. FEATURE COMPUTATION
# ============================================

print("\nüîß √âTAPE 3 : Calcul des features techniques...")
print("-" * 70)

# Work on a copy to avoid pandas warnings when adding columns
df_features = df.copy()

# One group per ticker, so time-series features never mix two assets
grouped = df_features.groupby(level='ticker')

# --- 1. RETURNS (4 features) ---
print("üìä [1/10] Calcul des Returns...")
# Simple returns over 1, 5 and 20 rows of each ticker
for horizon_name, horizon in (('returns_1d', 1), ('returns_5d', 5), ('returns_20d', 20)):
    df_features[horizon_name] = grouped['close'].pct_change(horizon)
# One-row log return
df_features['log_returns'] = grouped['close'].transform(
    lambda prices: np.log(prices / prices.shift(1))
)

# --- 2. GARMAN-KLASS VOLATILITY (2 features) ---
print("üìä [2/10] Calcul de Garman-Klass Volatility...")
# The estimator only combines values of the same row, so no per-ticker
# grouping is needed. Calling it on the whole frame keeps the (date, ticker)
# index untouched, whereas groupby.apply(...).droplevel(0) only works when
# pandas prepends the group key to like-indexed results (behaviour that
# changed in pandas 2.0; before that the 'date' level would be dropped).
df_features['gk_volatility'] = garman_klass_volatility(df_features)
# The 20-row average does look across rows, so it is computed per ticker.
df_features['gk_vol_20d'] = df_features.groupby(level='ticker')['gk_volatility'].transform(
    lambda x: x.rolling(20, min_periods=5).mean()
)

# --- 3. RSI (1 feature) ---
print("üìä [3/10] Calcul du RSI...")
close_by_ticker = grouped['close']
df_features['rsi_14'] = close_by_ticker.transform(lambda prices: rsi(prices, 14))

# --- 4. BOLLINGER BANDS (2 features) ---
print("üìä [4/10] Calcul des Bollinger Bands...")
# bollinger_bands returns (upper, lower, middle); fetch each element per ticker
bb_upper, bb_lower, bb_middle = (
    close_by_ticker.transform(lambda prices, band=band: bollinger_bands(prices)[band])
    for band in range(3)
)

band_span = bb_upper - bb_lower
# Position of the close inside the band, and band width relative to the middle
df_features['bb_position'] = (df_features['close'] - bb_lower) / (band_span + 1e-10)
df_features['bb_width'] = band_span / (bb_middle + 1e-10)

# --- 5. MACD (2 features) ---
print("üìä [5/10] Calcul du MACD...")
# macd returns (macd_line, signal_line); fetch each element per ticker
macd_vals, macd_signal_vals = (
    close_by_ticker.transform(lambda prices, part=part: macd(prices)[part])
    for part in (0, 1)
)
df_features['macd'] = macd_vals
df_features['macd_histogram'] = macd_vals - macd_signal_vals

# --- 6. ATR (1 feature) ---
print("üìä [6/10] Calcul de l'ATR...")
# Previous close within each ticker (NaN on a ticker's first row)
prev_close = grouped['close'].shift(1)
high_low = df_features['high'] - df_features['low']
# True range = largest of the three spreads, computed column-wise instead of
# calling a Python lambda once per row.
true_range = pd.concat(
    [
        high_low,
        (df_features['high'] - prev_close).abs(),
        (df_features['low'] - prev_close).abs(),
    ],
    axis=1,
).max(axis=1)
# max() skips NaN, so a ticker's first row falls back to high - low exactly
# as the row-wise max did; a missing high/low must still give NaN.
true_range = true_range.where(high_low.notna())
df_features['atr_14'] = true_range.groupby(level='ticker').transform(
    lambda x: x.rolling(14, min_periods=1).mean()
)

def _rolling_by_ticker(column, window, min_periods, stat):
    """Apply the rolling statistic named ``stat`` to ``column`` within each ticker."""
    return grouped[column].transform(
        lambda values: getattr(values.rolling(window, min_periods=min_periods), stat)()
    )


# --- 7. MOVING AVERAGES (3 features) ---
print("üìä [7/10] Calcul des Moving Averages...")
df_features['sma_20'] = _rolling_by_ticker('close', 20, 5, 'mean')
df_features['sma_50'] = _rolling_by_ticker('close', 50, 10, 'mean')
df_features['price_to_sma20'] = df_features['close'] / (df_features['sma_20'] + 1e-10)

# --- 8. VOLUME (3 features) ---
print("üìä [8/10] Calcul des Volume features...")
df_features['volume_sma_20'] = _rolling_by_ticker('volume', 20, 5, 'mean')
df_features['volume_ratio'] = df_features['volume'] / (df_features['volume_sma_20'] + 1)
df_features['euro_volume'] = df_features['close'] * df_features['volume']

# --- 9. RISK METRICS (2 features) ---
print("üìä [9/10] Calcul des Risk Metrics...")
# Mean over standard deviation of the 1-row returns on a 20-row window
rolling_mean = _rolling_by_ticker('returns_1d', 20, 5, 'mean')
rolling_std = _rolling_by_ticker('returns_1d', 20, 5, 'std')
df_features['sharpe_20d'] = rolling_mean / (rolling_std + 1e-10)

# Distance of the close from its 20-row rolling maximum
rolling_max = _rolling_by_ticker('close', 20, 5, 'max')
df_features['drawdown_20d'] = (df_features['close'] - rolling_max) / (rolling_max + 1e-10)

# --- 10. MOMENTUM COMPOSITE (1 feature) ---
print("üìä [10/10] Calcul du Momentum Score...")
df_features['momentum_score'] = (df_features['returns_5d'] + df_features['returns_20d']) / 2

print("\n‚úÖ Toutes les features calcul√©es !")

# ============================================
# 4. CLEANING
# ============================================

print("\nüßπ √âTAPE 4 : Nettoyage des donn√©es...")

initial_rows = len(df_features)
print(f"   Lignes avant nettoyage : {initial_rows:,}")

# Drop incomplete rows. +/-inf (e.g. a return computed from a zero price) is
# treated as missing as well, otherwise it would survive dropna() and end up
# in the saved dataset.
df_features = df_features.replace([np.inf, -np.inf], np.nan).dropna()

final_rows = len(df_features)
dropped_rows = initial_rows - final_rows
# Guard the percentage against an empty input frame.
dropped_pct = dropped_rows / initial_rows * 100 if initial_rows else 0.0
print(f"   Lignes apr√®s nettoyage : {final_rows:,}")
print(f"   Lignes supprim√©es : {dropped_rows:,} ({dropped_pct:.1f}%)")

# ============================================
# 5. STATISTICS
# ============================================

print("\nüìä √âTAPE 5 : Statistiques des features...")
print("-" * 70)

# Every column that is not one of the five raw price/volume columns is a feature
raw_columns = {'close', 'high', 'low', 'open', 'volume'}
feature_cols = [name for name in df_features.columns if name not in raw_columns]

print("\n‚úÖ DATASET FINAL :")
print(f"   Shape : {df_features.shape}")
print(f"   Features techniques : {len(feature_cols)}")
ticker_level = df_features.index.get_level_values('ticker')
print(f"   Tickers : {ticker_level.nunique()}")
date_level = df_features.index.get_level_values('date')
print(f"   P√©riode : {date_level.min().date()} ‚Üí {date_level.max().date()}")

print(f"\nüìã Liste des {len(feature_cols)} features cr√©√©es :")
for position, name in enumerate(feature_cols, start=1):
    print(f"   {position:2d}. {name}")

print("\nüìä Statistiques descriptives :")
display(df_features[feature_cols].describe().T.round(3))

# ============================================
# 6. VISUALISATION (preview)
# ============================================

print("\nüìä √âTAPE 6 : Aper√ßu des donn√©es...")

print("\nüîç Premi√®res lignes :")
display(df_features.head(15))

print("\nüîç Exemple pour un ticker (AI.PA) :")
# Show the last rows of one sample ticker, if it is still present after cleaning
example_ticker = 'AI.PA'
available_tickers = df_features.index.get_level_values('ticker')
if example_ticker in available_tickers:
    display(df_features.xs(example_ticker, level='ticker').tail(10))

# ============================================
# 7. SAVE
# ============================================

print("\nüíæ √âTAPE 7 : Sauvegarde du dataset...")

output_file = PROCESSED_DATA_PATH / "cac40_features.csv"
df_features.to_csv(output_file)

print(f"‚úÖ Dataset sauvegard√© : {output_file}")
size_mb = output_file.stat().st_size / 1024 / 1024
print(f"   Taille du fichier : {size_mb:.2f} MB")

# Also save the list of feature names
features_list_path = PROCESSED_DATA_PATH / "features_list.csv"
features_list = pd.DataFrame({'feature': feature_cols})
features_list.to_csv(features_list_path, index=False)
print(f"‚úÖ Liste des features sauvegard√©e : {features_list_path}")

# Closing banner
print("\n" + "=" * 70, "‚úÖ FEATURE ENGINEERING TERMIN√â !", "=" * 70, sep="\n")

print("\nüìÅ Fichiers g√©n√©r√©s :")
for rank, written_path in enumerate((output_file, features_list_path), start=1):
    print(f"   {rank}. {written_path}")

print("\nüéØ Prochaines √©tapes :")
for next_step in (
    "   1. Train/Test split",
    "   2. Normalisation des features",
    "   3. Mod√®le ML (LSTM, XGBoost, etc.)",
):
    print(next_step)


FEATURE ENGINEERING AMÉLIORÉ - CAC40
📥 Chargement de 38 fichiers...
✅ 106779 observations chargées
🔧 Calcul de 15 features essentielles...
✅ 106019 observations après feature engineering
✅ 12 features créées

💾 Données sauvegardées : /home/onyxia/work/Gestion-portefeuille/data/interim/cac40_interim_features.csv

📊 Aperçu des features :
                   count                           mean                  min  \
Date              106019  2020-07-12 00:52:15.920920064  2015-01-30 00:00:00   
Close           106019.0                     190.063469                1.313   
Volume          106019.0                 2020907.569596                  0.0   
Returns_1D      106019.0                       0.001234            -0.985377   
Returns_5D      106019.0                       0.004826            -0.997102   
Returns_20D     106019.0                       0.022157            -0.997335   
Vol_20D         106019.0                       0.327736             0.04706

In [None]:
# Garman-Klass term per row, from log prices (no square root is taken here).
log_range = np.log(df['high']) - np.log(df['low'])
log_move = np.log(df['adj close']) - np.log(df['open'])
df['garman_klass_vol'] = (log_range**2)/2 - (2*np.log(2)-1)*(log_move**2)

# Adjusted close grouped by index level 1 (presumably the ticker level — confirm).
adj_close_by_group = df.groupby(level=1)['adj close']

df['rsi'] = adj_close_by_group.transform(lambda x: pandas_ta.rsi(close=x, length=20))

# Columns 0, 1 and 2 of the bbands output (computed on log1p of the price)
# are stored as bb_low, bb_mid and bb_high respectively.
for position, band_name in enumerate(('bb_low', 'bb_mid', 'bb_high')):
    df[band_name] = adj_close_by_group.transform(
        lambda x, position=position: pandas_ta.bbands(close=np.log1p(x), length=20).iloc[:, position]
    )

def compute_atr(stock_data):
    """Return the 14-period ATR of ``stock_data``, standardised over its own rows.

    ``stock_data`` must provide ``high``, ``low`` and ``close`` columns. The
    raw ATR from ``pandas_ta.atr`` is centred on its mean and divided by its
    standard deviation.
    """
    raw_atr = pandas_ta.atr(
        high=stock_data['high'],
        low=stock_data['low'],
        close=stock_data['close'],
        length=14,
    )
    centred = raw_atr - raw_atr.mean()
    return centred / raw_atr.std()

# Standardised ATR computed separately for each group of index level 1;
# group_keys=False keeps the group key out of the result index.
df['atr'] = df.groupby(level=1, group_keys=False).apply(compute_atr)

def compute_macd(close):
    """Return the first column of ``pandas_ta.macd`` for ``close``, standardised.

    The selected column is centred on its mean and divided by its standard
    deviation, both taken over the rows of ``close``.
    """
    # NOTE(review): pandas_ta.macd appears to be configured through
    # fast/slow/signal rather than length — confirm that length=20 has any effect.
    macd_line = pandas_ta.macd(close=close, length=20).iloc[:, 0]
    centred = macd_line - macd_line.mean()
    return centred / macd_line.std()

# Standardised MACD of the adjusted close, computed per group of index level 1
macd_by_group = df.groupby(level=1, group_keys=False)['adj close'].apply(compute_macd)
df['macd'] = macd_by_group

# Traded value (adjusted close times volume), expressed in millions
df['dollar_volume'] = df['adj close'].mul(df['volume']).div(1e6)

# Last expression of the cell: the notebook displays the resulting frame
df