In [15]:
import pandas as pd
import numpy as np
from statsmodels.tsa.stattools import adfuller

In [None]:
# 1. Load the raw training set, keyed by date_id and ordered by that key
df = (
    pd.read_csv('../data/raw/train.csv')
    .set_index('date_id')
    .sort_index()
)


In [None]:
# 2. Basic missing-value handling (forward fill)
# Nothing is dropped at this stage: each gap takes the last known value
# from the rows above it. Leading gaps (no earlier value) stay NaN.
df = df.ffill(axis='index')

In [None]:
# 3. Split out the feature columns
# Target columns are excluded from the feature list.
# NOTE(review): the next cell repeats these two assignments verbatim.
targets = ['forward_returns', 'risk_free_rate', 'market_forward_excess_returns']
features_cols = list(df.columns[~df.columns.isin(targets)])

In [None]:
# 3. Split the features into groups
# Every column that is not a target counts as a feature.
targets = [
    'forward_returns',
    'risk_free_rate',
    'market_forward_excess_returns',
]
features_cols = [name for name in df.columns if name not in targets]

# Group the features by the first letter of the column name (P, I, E, V).
p_features, i_features, e_features, v_features = (
    [name for name in features_cols if name.startswith(prefix)]
    for prefix in ('P', 'I', 'E', 'V')
)

# Report the overall count and the size of the P and I groups.
print(f"Total Features: {len(features_cols)}")
print(f"Prix/Valuation (P): {len(p_features)}")
print(f"Taux (I): {len(i_features)}")

Total Features: 94
Prix/Valuation (P): 13
Taux (I): 9


In [20]:
# 4. Stationarity test (ADF)
def get_adf_summary(df, cols, maxlag=1, alpha=0.05, min_obs=50):
    """Run an Augmented Dickey-Fuller test on each requested column.

    Each column is tested on its non-null values with a constant-only
    regression and a fixed lag count (no automatic lag selection).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the series to test.
    cols : iterable of column labels
        Columns of ``df`` to test. A label missing from ``df`` raises
        ``KeyError``.
    maxlag : int, default 1
        Number of lags passed to ``adfuller`` (used as-is because
        ``autolag=None``).
    alpha : float, default 0.05
        A series is flagged stationary when its p-value is strictly below
        this threshold.
    min_obs : int, default 50
        Columns with fewer non-null observations are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per successfully tested column with the columns
        ``feature``, ``p_value`` and ``is_stationary``. The columns are
        present even when no feature could be tested.
    """
    columns = ['feature', 'p_value', 'is_stationary']
    results = []
    skipped = {}  # feature -> reason, so that nothing disappears silently
    print(f"Test de {len(cols)} features...")
    for col in cols:
        series = df[col].dropna()
        if len(series) < min_obs:
            skipped[col] = f"{len(series)} obs < {min_obs}"
            continue
        # The ADF regression is undefined on a constant series: skip it
        # explicitly instead of relying on an exception from the test.
        if series.nunique() <= 1:
            skipped[col] = "série constante"
            continue

        try:
            res = adfuller(series, maxlag=maxlag, regression='c', autolag=None)
        except Exception as exc:
            # Best effort: one failing column must not abort the whole scan.
            # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit
            # through, and the failure is reported below.
            skipped[col] = f"{type(exc).__name__}: {exc}"
            continue

        p_val = float(res[1])
        results.append({
            'feature': col,
            'p_value': p_val,
            'is_stationary': p_val < alpha
        })

    if skipped:
        print(f"{len(skipped)} feature(s) ignorée(s): {skipped}")

    # Passing `columns` keeps the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=columns)

In [25]:
# Run the test on the P and I groups first (the main suspects), using at
# most the first 10 columns of each group.
res_p, res_i = (
    get_adf_summary(df, group[:10])
    for group in (p_features, i_features)
)

# Show up to 10 result rows per group, each under its own header.
print("\n--- RÉSULTATS PRIX (P) ---", res_p.head(10), sep="\n")
print("\n--- RÉSULTATS TAUX (I) ---", res_i.head(10), sep="\n")

Test de 10 features...
Test de 9 features...

--- RÉSULTATS PRIX (P) ---
  feature       p_value  is_stationary
0      P1  3.256254e-10           True
1     P10  6.036371e-02          False
2     P11  9.600409e-02          False
3     P12  0.000000e+00           True
4     P13  0.000000e+00           True
5      P2  7.828848e-03           True
6      P3  0.000000e+00           True
7      P4  0.000000e+00           True
8      P5  0.000000e+00           True
9      P6  0.000000e+00           True

--- RÉSULTATS TAUX (I) ---
  feature       p_value  is_stationary
0      I1  1.096091e-05           True
1      I2  1.798980e-02           True
2      I3  1.798807e-03           True
3      I4  3.205022e-08           True
4      I5  1.005218e-01          False
5      I6  6.484908e-04           True
6      I7  3.592869e-04           True
7      I8  1.002987e-01          False
8      I9  1.245318e-01          False
