In [59]:
import pandas as pd
import mlflow
import optuna
import mlflow.sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.inspection import permutation_importance
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [77]:
EFFR = pd.read_csv("FEDFUNDS.csv", sep=",", )
FFF = pd.read_csv("Federal_Fund_Future.csv", sep=";")

# Convertir les colonnes en type date
EFFR['observation_date'] = pd.to_datetime(EFFR['observation_date'])  
FFF['Date'] = pd.to_datetime(FFF['Date'])    # Remplace 'date' par le nom de la colonne appropriée
FFF = FFF.drop(columns=['Open', 'High', 'Low', 'Vol.', 'Change %'], errors='ignore')
FFF['implicite_fund_rate'] = 100 - FFF['Price']
merged_df = pd.merge(FFF[['Date', 'implicite_fund_rate']], EFFR[['observation_date', 'FEDFUNDS']], 
                      left_on='Date', right_on='observation_date', how='inner')
merged_df = merged_df.sort_values('Date').reset_index(drop=True)
merged_df

Unnamed: 0,Date,implicite_fund_rate,observation_date,FEDFUNDS
0,2003-01-01,1.230,2003-01-01,1.24
1,2003-02-01,1.260,2003-02-01,1.26
2,2003-03-01,1.250,2003-03-01,1.25
3,2003-04-01,1.250,2003-04-01,1.26
4,2003-05-01,1.250,2003-05-01,1.26
...,...,...,...,...
268,2025-05-01,4.330,2025-05-01,4.33
269,2025-06-01,4.330,2025-06-01,4.33
270,2025-07-01,4.330,2025-07-01,4.33
271,2025-08-01,4.330,2025-08-01,4.33


In [78]:



merged_df_=merged_df.copy()

n_lags = 6  # number of previous months to use

merged_df_['spread'] = merged_df_['implicite_fund_rate'] - merged_df_['FEDFUNDS']


for i in range(1, n_lags + 1):
    merged_df_[f'implicite_fund_rate_lag_{i}'] = merged_df_['implicite_fund_rate'].shift(i)

n_lags_spread = 3
for i in range(1, n_lags_spread + 1):
    merged_df_[f'spread_lag_{i}'] = merged_df_['spread'].shift(i)

# Nettoyer données
merged_df_ = merged_df_.dropna()


merged_df_['FEDFUNDS_change'] = merged_df_['FEDFUNDS'].diff() # variation au prochain pas de temps
merged_df_['target'] = (merged_df_['FEDFUNDS_change'] > 0).astype(int)  # 1 si hausse, 0 sinon



In [79]:
merged_df_["target"].value_counts()#use accuracy

target
0    147
1    120
Name: count, dtype: int64

In [80]:



def check_params(allowed,kwargs):


    for key,val in kwargs.items():

                if key not in allowed:
                    raise ValueError(f"{key} unrecongized, use : {allowed.keys()}")
                if val not in allowed[key]:
                    raise ValueError(f"{val} unrecongized, for {key} use : {allowed[key]}")

class MlFlowModel:

    def __init__(self,exper_name,model_instance,**kwargs):

        """
        supports two models: logistic regression and random forest only 

        """
        self.model_instance=model_instance
        self.exper_name=exper_name
        self.kwargs=kwargs
        self.n_trials=10

        if isinstance(self.model_instance,type):
            raise ValueError(f"dont forget parantheses in your model_instance")
        

        if self.model_instance.__class__ is not   LogisticRegression and  self.model_instance.__class__ is not RandomForestClassifier:
            raise ValueError('sorry, we can use logistic regression and random forest only for the moment ')


        if self.model_instance.__class__ is  LogisticRegression:
            allowed={
                "penalty" : ["l1", "l2", "elasticnet", None],
                "solver" : "saga"
            }

            if "penalty" not in kwargs.keys() or "solver" not in kwargs.keys():
                raise ValueError("need to specify both penalty and solver for logistic regression ")

        if self.model_instance.__class__ is  RandomForestClassifier:
            allowed={
                "max_features" : ["sqrt", "log2", None],
                "criterion" : ["gini", "entropy", "log_loss"]
            }
            if "max_features" not in kwargs.keys() or "criterion" not in kwargs.keys():
                raise ValueError("need to specify both max_features and criterion for randm forest ")

        
        check_params(allowed,self.kwargs)

    
    def objective(self,trial):
                

                if self.model_instance.__class__ is  LogisticRegression:

                    if "elasticnet" == self.kwargs["penalty"]:
                        # Define hyperparameter search space
                        params_candidate_space={
                            "l1_ratio": trial.suggest_float("l1_ratio", 0.1, 0.9),
                            "C":  trial.suggest_float("C", 0.1, 10),
                        }
                    elif "l1" == self.kwargs["penalty"] or "l2" == self.kwargs["penalty"]:
                        # Define hyperparameter search space
                        params_candidate_space={
                            "C":  trial.suggest_float("C", 0.1, 10),
                        }
                    
                if self.model_instance.__class__ is  RandomForestClassifier:

                    params_candidate_space={
                            'max_depth': trial.suggest_int("max_depth", 10, 50),
                            "min_samples_split":  trial.suggest_int("min_samples_split", 2, 10),
                            "min_samples_leaf":  trial.suggest_int("min_samples_leaf", 1, 4),
                 
                        }

                

                model = self.model_instance.__class__(
                    **params_candidate_space, 
                    **self.fixed_params,
                    **self.kwargs  
                )
            

                model.fit(self.X_train, self.y_train)


                #log_loss is sklearn is negative loglikelihood so we minimise it
                #in randomforest if we use gini, then we need to modify this line, but we do simple:
                score = log_loss(self.y_train, model.predict_proba(self.X_train))

                return score



    def run_optuna_study(self):

        storage = f"sqlite:///optuna_{self.exper_name}.db"
        study = optuna.create_study(
                            direction="minimize", 
                            study_name=f"{self.exper_name}",
                            storage=storage,
                            load_if_exists=True
        )  # 'minimize' for loss functions
        study.optimize(self.objective, n_trials=self.n_trials)
        study_best_params=study.best_params
        return study_best_params

    def train(self,merged_df_,feature_columns):
        
        with mlflow.start_run(run_name=f"{self.exper_name}_FedFunds"):
        
                mlflow.log_param("model_type", self.model_instance.__class__.__name__)
                mlflow.log_param("variables", feature_columns)
                mlflow.log_param("scaler", "StandardScaler")
                mlflow.log_param("train_test_split", "80/20 time series")
                mlflow.log_param("kwargs",self.kwargs)
                
                #do easy splits and transforms:
                X = merged_df_[feature_columns]
                y = merged_df_['target']
                split_idx = int(len(merged_df_) * 0.8)
                self.X_train, self.X_test = X.iloc[:split_idx], X.iloc[split_idx:]
                self.y_train, self.y_test = y.iloc[:split_idx], y.iloc[split_idx:]
                #always standartise variables for logit 
                if self.model_instance.__class__ is  LogisticRegression:
                    scaler = StandardScaler()
                    self.X_train = scaler.fit_transform(self.X_train)
                    self.X_test = scaler.transform(self.X_test)

                # Model penalisations

                if self.model_instance.__class__ is  LogisticRegression:
                    print("normalising data")
                    self.fixed_params={
                        
                        "class_weight" : 'balanced',  
                        "random_state" : 42,
                    }
                

                    if self.kwargs["penalty"] is not None:
                        #in this case need to search space for best hyperparameter
                        study_best_params=self.run_optuna_study()
                        print("Best Hyperparameters:",study_best_params)
                    else:
                        study_best_params={}

                if self.model_instance.__class__ is  RandomForestClassifier:
                    self.fixed_params={
                        "n_estimators" : 150,
                        "class_weight" : 'balanced',  
                       
                    }
                    study_best_params=self.run_optuna_study()
                    print("Best Hyperparameters:",study_best_params)
                    

                        
                model = self.model_instance.__class__(
                     **self.fixed_params,
                     **study_best_params,
                     **self.kwargs                
                )
                print("-----------")
                

               
                model.fit(self.X_train, self.y_train)
                # Predictions
                y_pred = model.predict(self.X_test)
                
                acc = accuracy_score(self.y_test, y_pred)
                mlflow.log_metric("accuracy", acc)
                mlflow.sklearn.log_model(model, name="model")

                print(f"Run logged to MLflow: accuracy={acc:.4f}")


def var_selection_with_permutation(model,X,y,threshold_below_which_to_drop=0.01):

    '''
    supports any classification model, models implying gradient descent (parametric models) require dataset to be normalised 
    models such as random forests or other non gradient tree models do not require variable standartisation 
    '''
    
    split_idx = int(len(X) * 0.8)

    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    if model.__class__.__name__ in ["LogisticRegression",]:
        
        scaler = StandardScaler().set_output(transform="pandas")
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)



    model.fit(X_train, y_train)
    result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42,scoring="accuracy")

    perm_importances = pd.DataFrame({
        'feature': X_test.columns,
        'importance_mean': result.importances_mean,
        'importance_std': result.importances_std
    })
    
    return perm_importances[perm_importances["importance_mean"]>threshold_below_which_to_drop]["feature"].tolist()


Gradient based models

In [81]:
model = LogisticRegression(random_state=42, class_weight='balanced')
feature_columns =  [f'spread_lag_{i}' for i in range(1, n_lags_spread + 1)] 
X = merged_df_[feature_columns]
y = merged_df_['target']
feature_columns_after_permutation_test=var_selection_with_permutation(model,X,y)
print(
{
    "nb variables before permutation selection " : len(feature_columns),
    "nb variables after permutation selection " : len(feature_columns_after_permutation_test)

}
)
model=MlFlowModel("lrsimplest",LogisticRegression(),penalty=None,solver="saga")
model.train(merged_df_,feature_columns)



{'nb variables before permutation selection ': 3, 'nb variables after permutation selection ': 3}
normalising data
-----------




Run logged to MLflow: accuracy=0.6481


In [82]:
#extract new features with AI AGENT -> finish code after
df = pd.read_csv("macro_features_monthly.csv")
df['DATE'] = pd.to_datetime(df['DATE'], dayfirst=True)


df['DATE'] = df['DATE'].values.astype('datetime64[M]')
df_=df.copy()
n_lags=4
for el in df_.columns[1:].tolist():
    for i in range(1, n_lags + 1):
        df_[f'{el}_{i}'] = df_[el].shift(i)
df_ = df_.dropna()
final=pd.merge(merged_df_, df_, 
                      left_on='Date', right_on='DATE', how='inner')

In [84]:
df.columns

Index(['DATE', 'CPIAUCSL', 'PCEPI', 'UNRATE', 'PAYEMS', 'GDP', 'INDPRO',
       'FinStress'],
      dtype='object')

In [73]:
feature_columns = [ s for s in final.columns if s[-2] == '_' and s[-1].isdigit()]


In [75]:

model=MlFlowModel("lr-all-vars",LogisticRegression(),penalty=None,solver="saga")
model.train(final,feature_columns)

normalising data
-----------




Run logged to MLflow: accuracy=0.6111


In [16]:

model=MlFlowModel("lr-all-vars-elnet",LogisticRegression(),penalty="elasticnet",solver="saga")
model.train(final,feature_columns)

normalising data


[I 2025-11-13 21:36:22,674] A new study created in RDB with name: lr-all-vars-elnet
[I 2025-11-13 21:36:22,838] Trial 0 finished with value: 0.5259614301045616 and parameters: {'l1_ratio': 0.6518244308407477, 'C': 8.726561244029567}. Best is trial 0 with value: 0.5259614301045616.
[I 2025-11-13 21:36:22,989] Trial 1 finished with value: 0.5259045274812394 and parameters: {'l1_ratio': 0.4022262885635456, 'C': 8.02802110164171}. Best is trial 1 with value: 0.5259045274812394.
[I 2025-11-13 21:36:23,117] Trial 2 finished with value: 0.5276832404314976 and parameters: {'l1_ratio': 0.5401095795699872, 'C': 5.449078283412642}. Best is trial 1 with value: 0.5259045274812394.
[I 2025-11-13 21:36:23,262] Trial 3 finished with value: 0.5684442893535757 and parameters: {'l1_ratio': 0.5848653662095064, 'C': 0.5322278093855329}. Best is trial 1 with value: 0.5259045274812394.
[I 2025-11-13 21:36:23,385] Trial 4 finished with value: 0.5299672560177435 and parameters: {'l1_ratio': 0.40749396759304224

Best Hyperparameters: {'l1_ratio': 0.4022262885635456, 'C': 8.02802110164171}
-----------




Run logged to MLflow: accuracy=0.8148


In [17]:
model=MlFlowModel("lr-all-vars-l1",LogisticRegression(),penalty="l1",solver="saga")
model.train(final,feature_columns)

normalising data


[I 2025-11-13 21:36:43,406] A new study created in RDB with name: lr-all-vars-l1
[I 2025-11-13 21:36:43,597] Trial 0 finished with value: 0.5330379585010842 and parameters: {'C': 3.00657035841268}. Best is trial 0 with value: 0.5330379585010842.
[I 2025-11-13 21:36:43,701] Trial 1 finished with value: 0.5336040421656751 and parameters: {'C': 2.8476418651802975}. Best is trial 0 with value: 0.5330379585010842.
[I 2025-11-13 21:36:43,834] Trial 2 finished with value: 0.569874988640743 and parameters: {'C': 0.6376796339358232}. Best is trial 0 with value: 0.5330379585010842.
[I 2025-11-13 21:36:43,939] Trial 3 finished with value: 0.5672601263757869 and parameters: {'C': 0.6898050241395346}. Best is trial 0 with value: 0.5330379585010842.
[I 2025-11-13 21:36:44,083] Trial 4 finished with value: 0.5263818271523549 and parameters: {'C': 8.756330034724071}. Best is trial 4 with value: 0.5263818271523549.
[I 2025-11-13 21:36:44,203] Trial 5 finished with value: 0.5356721908354839 and paramete

Best Hyperparameters: {'C': 9.606003882764288}
-----------




Run logged to MLflow: accuracy=0.8148


In [18]:
model = LogisticRegression(random_state=42, class_weight='balanced')
feature_columns =  [ s for s in final.columns if s[-2] == '_' and s[-1].isdigit()]
X = final[feature_columns]
y = final['target']
feature_columns_after_permutation_test=var_selection_with_permutation(model,X,y)
print(
{
    "nb variables before permutation selection " : len(feature_columns),
    "nb variables after permutation selection " : len(feature_columns_after_permutation_test)

}
)
model=MlFlowModel("lr-per-vars0.01-elnet",LogisticRegression(),penalty="elasticnet",solver="saga")
model.train(final,feature_columns_after_permutation_test)

{'nb variables before permutation selection ': 37, 'nb variables after permutation selection ': 11}
normalising data


[I 2025-11-13 21:37:59,855] A new study created in RDB with name: lr-per-vars0.01-elnet
[I 2025-11-13 21:38:00,015] Trial 0 finished with value: 0.6340546017231301 and parameters: {'l1_ratio': 0.46645173945990737, 'C': 0.21945811587525393}. Best is trial 0 with value: 0.6340546017231301.
[I 2025-11-13 21:38:00,157] Trial 1 finished with value: 0.5645381584513496 and parameters: {'l1_ratio': 0.17205941459952462, 'C': 7.575706047975876}. Best is trial 1 with value: 0.5645381584513496.
[I 2025-11-13 21:38:00,308] Trial 2 finished with value: 0.5717693813694031 and parameters: {'l1_ratio': 0.21120937483670563, 'C': 3.076176934629406}. Best is trial 1 with value: 0.5645381584513496.
[I 2025-11-13 21:38:00,502] Trial 3 finished with value: 0.5612342427182211 and parameters: {'l1_ratio': 0.7844675733909673, 'C': 9.53618313612696}. Best is trial 3 with value: 0.5612342427182211.
[I 2025-11-13 21:38:00,645] Trial 4 finished with value: 0.5645708905775546 and parameters: {'l1_ratio': 0.102640235

Best Hyperparameters: {'l1_ratio': 0.7844675733909673, 'C': 9.53618313612696}
-----------




Run logged to MLflow: accuracy=0.8889


In [19]:
model = LogisticRegression(random_state=42, class_weight='balanced')
feature_columns =  [ s for s in final.columns if s[-2] == '_' and s[-1].isdigit()]
X = final[feature_columns]
y = final['target']
feature_columns_after_permutation_test=var_selection_with_permutation(model,X,y,0.02)
print(
{
    "nb variables before permutation selection " : len(feature_columns),
    "nb variables after permutation selection " : len(feature_columns_after_permutation_test)

}
)
model=MlFlowModel("lr-per-vars0.02-elnet",LogisticRegression(),penalty="elasticnet",solver="saga")
model.train(final,feature_columns_after_permutation_test)

{'nb variables before permutation selection ': 37, 'nb variables after permutation selection ': 10}
normalising data


[I 2025-11-13 21:38:36,181] A new study created in RDB with name: lr-per-vars0.02-elnet
[I 2025-11-13 21:38:36,326] Trial 0 finished with value: 0.5625799457796318 and parameters: {'l1_ratio': 0.6924280386143199, 'C': 8.293790572683685}. Best is trial 0 with value: 0.5625799457796318.
[I 2025-11-13 21:38:36,458] Trial 1 finished with value: 0.6115209634002232 and parameters: {'l1_ratio': 0.21525778385165442, 'C': 0.4272315713993765}. Best is trial 0 with value: 0.5625799457796318.
[I 2025-11-13 21:38:36,589] Trial 2 finished with value: 0.5646687202353088 and parameters: {'l1_ratio': 0.1495823409752366, 'C': 8.400189546594419}. Best is trial 0 with value: 0.5625799457796318.
[I 2025-11-13 21:38:36,782] Trial 3 finished with value: 0.5656112825728503 and parameters: {'l1_ratio': 0.1768549667965914, 'C': 6.939857703652378}. Best is trial 0 with value: 0.5625799457796318.
[I 2025-11-13 21:38:36,938] Trial 4 finished with value: 0.5656831785208424 and parameters: {'l1_ratio': 0.24579334157

Best Hyperparameters: {'l1_ratio': 0.6924280386143199, 'C': 8.293790572683685}
-----------




Run logged to MLflow: accuracy=0.9259


In [20]:
model = LogisticRegression(random_state=42, class_weight='balanced')
feature_columns =  [ s for s in final.columns if s[-2] == '_' and s[-1].isdigit()]
X = final[feature_columns]
y = final['target']
feature_columns_after_permutation_test=var_selection_with_permutation(model,X,y,0.02)
print(
{
    "nb variables before permutation selection " : len(feature_columns),
    "nb variables after permutation selection " : len(feature_columns_after_permutation_test)

}
)
model=MlFlowModel("lr-per-vars0.02-l1",LogisticRegression(),penalty="l1",solver="saga")
model.train(final,feature_columns_after_permutation_test)

{'nb variables before permutation selection ': 37, 'nb variables after permutation selection ': 10}
normalising data


[I 2025-11-13 21:38:59,011] A new study created in RDB with name: lr-per-vars0.02-l1
[I 2025-11-13 21:38:59,223] Trial 0 finished with value: 0.5665365502330372 and parameters: {'C': 2.7244807514662597}. Best is trial 0 with value: 0.5665365502330372.
[I 2025-11-13 21:38:59,345] Trial 1 finished with value: 0.5628350025819375 and parameters: {'C': 4.987497787647181}. Best is trial 1 with value: 0.5628350025819375.
[I 2025-11-13 21:38:59,463] Trial 2 finished with value: 0.5913303586898634 and parameters: {'C': 0.6926224282170393}. Best is trial 1 with value: 0.5628350025819375.
[I 2025-11-13 21:38:59,610] Trial 3 finished with value: 0.5609561847354283 and parameters: {'C': 9.741431163041298}. Best is trial 3 with value: 0.5609561847354283.
[I 2025-11-13 21:38:59,699] Trial 4 finished with value: 0.5675685301423785 and parameters: {'C': 2.4147884959961474}. Best is trial 3 with value: 0.5609561847354283.
[I 2025-11-13 21:38:59,806] Trial 5 finished with value: 0.568858656729954 and par

Best Hyperparameters: {'C': 9.741431163041298}
-----------




Run logged to MLflow: accuracy=0.9259


Tree models

In [24]:
feature_columns =  [ s for s in final.columns if s[-2] == '_' and s[-1].isdigit()]
model=MlFlowModel("rf-all-vars-sqrt-gini",RandomForestClassifier(),max_features="sqrt",criterion="gini")
model.train(final,feature_columns)

[I 2025-11-13 21:43:36,491] A new study created in RDB with name: rf-all-vars-sqrt-gini
[I 2025-11-13 21:43:36,980] Trial 0 finished with value: 0.31180734629285534 and parameters: {'max_depth': 13, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.31180734629285534.
[I 2025-11-13 21:43:37,428] Trial 1 finished with value: 0.27003947065336487 and parameters: {'max_depth': 13, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 1 with value: 0.27003947065336487.
[I 2025-11-13 21:43:37,903] Trial 2 finished with value: 0.18240230400532625 and parameters: {'max_depth': 23, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.18240230400532625.
[I 2025-11-13 21:43:38,361] Trial 3 finished with value: 0.21448407334194788 and parameters: {'max_depth': 40, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 2 with value: 0.18240230400532625.
[I 2025-11-13 21:43:38,859] Trial 4 finished with value: 0.27355673839178396 and

Best Hyperparameters: {'max_depth': 23, 'min_samples_split': 4, 'min_samples_leaf': 1}
-----------




Run logged to MLflow: accuracy=0.6296


In [25]:
feature_columns =  [ s for s in final.columns if s[-2] == '_' and s[-1].isdigit()]
model=MlFlowModel("rf-all-vars-sqrt-entropy",RandomForestClassifier(),max_features="sqrt",criterion="entropy")
model.train(final,feature_columns)

[I 2025-11-13 21:44:21,908] A new study created in RDB with name: rf-all-vars-sqrt-entropy
[I 2025-11-13 21:44:22,382] Trial 0 finished with value: 0.2955613610782683 and parameters: {'max_depth': 47, 'min_samples_split': 7, 'min_samples_leaf': 4}. Best is trial 0 with value: 0.2955613610782683.
[I 2025-11-13 21:44:22,850] Trial 1 finished with value: 0.2943127983110874 and parameters: {'max_depth': 24, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.2943127983110874.
[I 2025-11-13 21:44:23,339] Trial 2 finished with value: 0.20340014105745388 and parameters: {'max_depth': 31, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 2 with value: 0.20340014105745388.
[I 2025-11-13 21:44:23,797] Trial 3 finished with value: 0.23016930971452096 and parameters: {'max_depth': 45, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 2 with value: 0.20340014105745388.
[I 2025-11-13 21:44:24,299] Trial 4 finished with value: 0.2787517747244563 and p

Best Hyperparameters: {'max_depth': 31, 'min_samples_split': 3, 'min_samples_leaf': 2}
-----------




Run logged to MLflow: accuracy=0.6296


In [29]:
model = LogisticRegression( random_state=42,class_weight='balanced')
feature_columns =  [ s for s in final.columns if s[-2] == '_' and s[-1].isdigit()]
X = final[feature_columns]
y = final['target']
feature_columns_after_permutation_test=var_selection_with_permutation(model,X,y,0.02)
print(
{
    "nb variables before permutation selection " : len(feature_columns),
    "nb variables after permutation selection " : len(feature_columns_after_permutation_test)

}
)
model=MlFlowModel("rf-per-vars0.02-sqrt-logloss",RandomForestClassifier(),max_features="sqrt",criterion="log_loss")
model.train(final,feature_columns_after_permutation_test)

{'nb variables before permutation selection ': 37, 'nb variables after permutation selection ': 10}


[I 2025-11-13 21:47:46,167] A new study created in RDB with name: rf-per-vars0.02-sqrt-logloss
[I 2025-11-13 21:47:46,614] Trial 0 finished with value: 0.27360833536187656 and parameters: {'max_depth': 23, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.27360833536187656.
[I 2025-11-13 21:47:47,025] Trial 1 finished with value: 0.26467643383733996 and parameters: {'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 1}. Best is trial 1 with value: 0.26467643383733996.
[I 2025-11-13 21:47:47,501] Trial 2 finished with value: 0.32152465624486354 and parameters: {'max_depth': 46, 'min_samples_split': 4, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.26467643383733996.
[I 2025-11-13 21:47:47,957] Trial 3 finished with value: 0.2839479950075895 and parameters: {'max_depth': 19, 'min_samples_split': 9, 'min_samples_leaf': 2}. Best is trial 1 with value: 0.26467643383733996.
[I 2025-11-13 21:47:48,390] Trial 4 finished with value: 0.237893104413201

Best Hyperparameters: {'max_depth': 26, 'min_samples_split': 3, 'min_samples_leaf': 1}
-----------




Run logged to MLflow: accuracy=0.6111


In [38]:


import mlflow 
experiments = mlflow.search_experiments()
runs=mlflow.search_runs(experiment_ids=[exp.experiment_id for exp in experiments])

In [57]:
best=runs.sort_values("metrics.accuracy",ascending=False)[["status","run_id","metrics.accuracy","params.model_type","params.variables","params.kwargs","tags.mlflow.runName"]].reset_index(drop=True)
champion_model_id=best["run_id"][0]
champion_model_variables=best["params.variables"][0]
best.head()

Unnamed: 0,status,run_id,metrics.accuracy,params.model_type,params.variables,params.kwargs,tags.mlflow.runName
0,FINISHED,463ad99c2bbb42d2bd44f4290e2632f1,0.925926,LogisticRegression,"['implicite_fund_rate_lag_1', 'implicite_fund_...","{'penalty': 'l1', 'solver': 'saga'}",lr-per-vars0.02-l1_FedFunds
1,FINISHED,bb0460e0e5424f0d89d024fd7ebe0dc3,0.925926,LogisticRegression,"['implicite_fund_rate_lag_1', 'implicite_fund_...","{'penalty': 'elasticnet', 'solver': 'saga'}",lr-per-vars0.02-elnet_FedFunds
2,FINISHED,d59f138ec0b84ef1bb32cf63b2f2cfb2,0.888889,LogisticRegression,"['implicite_fund_rate_lag_1', 'implicite_fund_...","{'penalty': 'elasticnet', 'solver': 'saga'}",lr-per-vars0.01-elnet_FedFunds
3,FINISHED,1a9faa98a77c4a07967372918d007932,0.814815,LogisticRegression,"['implicite_fund_rate_lag_1', 'implicite_fund_...","{'penalty': None, 'solver': 'saga'}",lr-all-vars_FedFunds
4,FINISHED,e48e00ddea284b2ca52294c47dfda620,0.814815,LogisticRegression,"['implicite_fund_rate_lag_1', 'implicite_fund_...","{'penalty': 'elasticnet', 'solver': 'saga'}",lr-all-vars-elnet_FedFunds


In [55]:
champion_model=mlflow.pyfunc.load_model(f"runs:/{champion_model_id}/model")

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]
Downloading artifacts: 100%|██████████| 5/5 [00:00<00:00, 1055.91it/s] 


In [58]:
champion_model_variables

"['implicite_fund_rate_lag_1', 'implicite_fund_rate_lag_2', 'implicite_fund_rate_lag_4', 'implicite_fund_rate_lag_5', 'implicite_fund_rate_lag_6', 'PAYEMS_1', 'PAYEMS_4', 'GDP_2', 'INDPRO_1', 'FinStress_3']"

In [None]:
"""

TO DO LIST::




do rolling training and plot curves

do mlflow UI

reassemble data in data folder, find ressources

target is right? 

do clean notebook and folder structure 

push to main 


"""