In [1]:
import pandas as pd

In [125]:
# Effective federal funds rate: parse the observation dates.
EFFR = pd.read_csv("FEDFUNDS.csv", sep=",")
EFFR = EFFR.assign(observation_date=pd.to_datetime(EFFR['observation_date']))

# Fed funds futures: parse dates, drop the columns that are not used downstream
# and derive the implied rate as 100 minus the quoted price.
FFF = (
    pd.read_csv("Federal_Fund_Future.csv", sep=";")
    .assign(Date=lambda raw: pd.to_datetime(raw['Date']))
    .drop(columns=['Open', 'High', 'Low', 'Vol.', 'Change %'], errors='ignore')
    .assign(implicite_fund_rate=lambda fut: 100 - fut['Price'])
)

# Keep only the dates present in both sources, in chronological order.
merged_df = (
    FFF[['Date', 'implicite_fund_rate']]
    .merge(
        EFFR[['observation_date', 'FEDFUNDS']],
        left_on='Date',
        right_on='observation_date',
        how='inner',
    )
    .sort_values('Date')
    .reset_index(drop=True)
)
merged_df

Unnamed: 0,Date,implicite_fund_rate,observation_date,FEDFUNDS
0,2003-01-01,1.230,2003-01-01,1.24
1,2003-02-01,1.260,2003-02-01,1.26
2,2003-03-01,1.250,2003-03-01,1.25
3,2003-04-01,1.250,2003-04-01,1.26
4,2003-05-01,1.250,2003-05-01,1.26
...,...,...,...,...
268,2025-05-01,4.330,2025-05-01,4.33
269,2025-06-01,4.330,2025-06-01,4.33
270,2025-07-01,4.330,2025-07-01,4.33
271,2025-08-01,4.330,2025-08-01,4.33


In [128]:

import pandas as pd
import numpy as np


# Work on a copy so the merged frame from the loading cell stays untouched.
merged_df_ = merged_df.copy()

n_lags = 6  # number of previous months of the implied rate to use
n_lags_spread = 3  # number of previous months of the spread to use

# Month-over-month change of the effective rate (current month minus previous
# month). It is computed on the full series, BEFORE rows are dropped, so that
# the first retained row gets a real value instead of NaN (which would silently
# be labelled 0 by the `> 0` comparison below).
fedfunds_change = merged_df_['FEDFUNDS'].diff()

# Spread between the futures-implied rate and the effective rate.
merged_df_['spread'] = merged_df_['implicite_fund_rate'] - merged_df_['FEDFUNDS']

for i in range(1, n_lags + 1):
    merged_df_[f'implicite_fund_rate_lag_{i}'] = merged_df_['implicite_fund_rate'].shift(i)

for i in range(1, n_lags_spread + 1):
    merged_df_[f'spread_lag_{i}'] = merged_df_['spread'].shift(i)

# Clean the data: drop the leading rows whose lags are undefined.
merged_df_ = merged_df_.dropna()

# Assignment aligns on the index, so each surviving row receives its own change.
merged_df_['FEDFUNDS_change'] = fedfunds_change
# Only reachable when no lag is requested: the very first observation has no
# previous month and therefore no defined target.
merged_df_ = merged_df_.dropna(subset=['FEDFUNDS_change'])
merged_df_['target'] = (merged_df_['FEDFUNDS_change'] > 0).astype(int)  # 1 if the rate rose, 0 otherwise



In [129]:
# Class balance of the binary target.
# NOTE(review): the original note here said "use accuracy" — presumably because
# the two classes are close to balanced; confirm before relying on accuracy alone.
merged_df_["target"].value_counts()

target
0    147
1    120
Name: count, dtype: int64

In [145]:
import mlflow
import optuna
import mlflow.sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

def _elasticnet_logreg(l1_ratio):
    """Build the elastic-net logistic regression shared by tuning and final fit."""
    return LogisticRegression(
        penalty='elasticnet',
        solver='saga',            # only 'saga' supports elasticnet
        l1_ratio=l1_ratio,        # mix between L1 (lasso) and L2 (ridge), 0 <= l1_ratio <= 1
        class_weight='balanced',
        random_state=42,
    )


def objective(trial, X=None, y=None):
    """Optuna objective: training log-loss of an elastic-net logistic regression.

    Parameters
    ----------
    trial : optuna trial
        Used to sample ``l1_ratio`` in [0.1, 0.9].
    X, y : array-like, optional
        Scaled training features and training labels. When omitted, the
        notebook-level globals ``X_train_scaled`` / ``y_train`` are used, which
        is the behaviour older callers of ``objective(trial)`` relied on.

    Returns
    -------
    float
        Log-loss on the data the model was fitted on (lower is better).

    Notes
    -----
    NOTE(review): the score is measured on the training data itself, so it
    does not estimate out-of-sample performance.
    """
    if X is None:
        X = X_train_scaled  # legacy fallback to kernel globals
    if y is None:
        y = y_train  # legacy fallback to kernel globals

    # Define hyperparameter search space
    l1_ratio = trial.suggest_float("l1_ratio", 0.1, 0.9)

    model = _elasticnet_logreg(l1_ratio)
    model.fit(X, y)

    return log_loss(y, model.predict_proba(X))


def Logresmlflow(merged_df_, feature_columns, elnet=False, n_trials=10):
    """Train a logistic regression on a chronological 80/20 split and log it to MLflow.

    Parameters
    ----------
    merged_df_ : pandas.DataFrame
        Must contain every name in ``feature_columns`` plus a ``target``
        column. Rows are split by position, so they are assumed to already be
        in chronological order.
    feature_columns : list of str
        Columns used as predictors.
    elnet : bool, default False
        When True, tune ``l1_ratio`` with Optuna and fit an elastic-net model;
        otherwise fit scikit-learn's default logistic regression.
    n_trials : int, default 10
        Number of Optuna trials (only used when ``elnet`` is True).

    Raises
    ------
    ValueError
        If the frame has too few rows for both a train and a test part.

    Returns
    -------
    None
        Parameters, the test accuracy and the fitted model are logged to MLflow.
    """
    n_rows = len(merged_df_)
    split_idx = int(n_rows * 0.8)
    if split_idx == 0 or split_idx == n_rows:
        raise ValueError(
            f"Need at least 2 rows for an 80/20 train/test split, got {n_rows}."
        )

    with mlflow.start_run(run_name="LogisticRegression_FedFunds"):

        mlflow.log_param("model_type", "LogisticRegression")
        mlflow.log_param("variables", feature_columns)
        mlflow.log_param("scaler", "StandardScaler")
        mlflow.log_param("train_test_split", "80/20 chronological")
        # Record which variant was trained so runs can be told apart.
        mlflow.log_param("penalty", "elasticnet" if elnet else "l2")

        X = merged_df_[feature_columns]
        y = merged_df_['target']

        # Positional split: the first 80% of rows train, the last 20% test.
        X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
        y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

        # The scaler is fitted on the training part only.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Model
        if elnet:
            # NOTE(review): the study is persisted under a fixed name and
            # reloaded on later calls, so `best_params` can stem from trials
            # that were run on a different feature set.
            storage = "sqlite:///optuna_elnetlogres.db"
            study = optuna.create_study(
                direction="minimize", study_name="elnetlogres", storage=storage, load_if_exists=True
            )  # 'minimize' for loss functions

            # Hand this call's own training data to the objective instead of
            # relying on same-named globals left over in the kernel.
            study.optimize(
                lambda trial: objective(trial, X_train_scaled, y_train),
                n_trials=n_trials,
            )

            print("Best Hyperparameters:", study.best_params)

            best_l1_ratio = study.best_params["l1_ratio"]
            mlflow.log_param("l1_ratio", best_l1_ratio)
            model = _elasticnet_logreg(best_l1_ratio)
        else:
            model = LogisticRegression(random_state=42, class_weight='balanced')

        model.fit(X_train_scaled, y_train)

        # Predictions
        y_pred = model.predict(X_test_scaled)

        # Evaluate
        acc = accuracy_score(y_test, y_pred)

        # Log metrics
        mlflow.log_metric("accuracy", acc)

        # Log model
        mlflow.sklearn.log_model(model, name="model")

        print(f"Run logged to MLflow: accuracy={acc:.4f}")


In [131]:
# Baseline run: the lagged spreads are the only predictors.
feature_columns = ['spread_lag_' + str(lag) for lag in range(1, n_lags_spread + 1)]
Logresmlflow(merged_df_, feature_columns)



Run logged to MLflow: accuracy=0.6481


In [132]:
# TODO: extract the new features with an AI agent; finish that code later.
df = pd.read_csv("macro_features_monthly.csv")
df['DATE'] = pd.to_datetime(df['DATE'], dayfirst=True)

# Truncate every date to its month so it can be joined on merged_df_['Date'] below.
df['DATE'] = df['DATE'].values.astype('datetime64[M]')

# shift() lags by row position, so the rows must be in chronological order
# first (the rate/futures frame is sorted explicitly as well). A stable sort
# leaves an already-sorted file untouched.
df_ = df.sort_values('DATE', kind='stable')

n_lags = 4  # NOTE: rebinds the notebook-level n_lags set in the feature-engineering cell

# Every column except the first one is lagged (assumes DATE is the first column).
macro_columns = df_.columns[1:].tolist()

# Build all lag columns up front and attach them in a single concat, rather
# than inserting them one by one into the frame.
lagged_columns = [
    df_[el].shift(i).rename(f'{el}_{i}')
    for el in macro_columns
    for i in range(1, n_lags + 1)
]
df_ = pd.concat([df_, *lagged_columns], axis=1).dropna()

final = pd.merge(merged_df_, df_,
                 left_on='Date', right_on='DATE', how='inner')

In [143]:
# Use every column whose name ends in "_<digit>" as a predictor.
feature_columns = list(
    filter(lambda col: col[-2] == '_' and col[-1].isdigit(), final.columns)
)
Logresmlflow(final, feature_columns)



Run logged to MLflow: accuracy=0.8148


In [146]:
# Same "_<digit>"-suffixed predictors, this time with the Optuna-tuned elastic net.
feature_columns = [
    name
    for name in final.columns
    if name[-2] == '_' and name[-1].isdigit()
]
Logresmlflow(final, feature_columns, elnet=True)

[I 2025-11-04 14:03:31,867] A new study created in RDB with name: elnetlogres
[I 2025-11-04 14:03:32,228] Trial 0 finished with value: 0.6675765316270827 and parameters: {'l1_ratio': 0.4408586012564829}. Best is trial 0 with value: 0.6675765316270827.
[I 2025-11-04 14:03:32,475] Trial 1 finished with value: 0.6676027873537315 and parameters: {'l1_ratio': 0.508665112102722}. Best is trial 0 with value: 0.6675765316270827.
[I 2025-11-04 14:03:32,872] Trial 2 finished with value: 0.6677223762673321 and parameters: {'l1_ratio': 0.7897956556080005}. Best is trial 0 with value: 0.6675765316270827.
[I 2025-11-04 14:03:33,127] Trial 3 finished with value: 0.6677318808170154 and parameters: {'l1_ratio': 0.8101110148286794}. Best is trial 0 with value: 0.6675765316270827.
[I 2025-11-04 14:03:33,451] Trial 4 finished with value: 0.6677749395780589 and parameters: {'l1_ratio': 0.8994653840153611}. Best is trial 0 with value: 0.6675765316270827.
[I 2025-11-04 14:03:33,843] Trial 5 finished with val

Best Hyperparameters: {'l1_ratio': 0.3364606212611263}




Run logged to MLflow: accuracy=0.8519


In [None]:
"""

TODO list:


- Build clean sklearn pipelines   (goal: clean code)


- Repeat this with multiple models (e.g. random forest) and update the Optuna objectives accordingly   (goal: maximise accuracy)

for model_name, model in [("LogReg", logres), ("MLP", mlp), ("RF", forest)]:
    with mlflow.start_run(run_name=model_name):
        mlflow.log_param("model_type", model_name)
        ...
        mlflow.log_metric("accuracy", acc)
        mlflow.sklearn.log_model(model, artifact_path=model_name)


- Build a gen-AI agent that extracts the data (once), as was done for macro_features_monthly.csv   (goal: clean code)

- Build a gen-AI agent that extracts news etc.   (goal: maximise accuracy)


- optional : do 3 class prediction (hard, as everything may change)

"""