In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime


# Load Federal Funds Effective Rate (actual historical rates)
EFFR = pd.read_csv("FEDFUNDS.csv", sep=",")


# Load Federal Funds Futures (market expectations)
FFF = pd.read_csv("Federal_Fund_Future.csv", sep=";")


# Convert date columns to datetime
EFFR['observation_date'] = pd.to_datetime(EFFR['observation_date'])
FFF['Date'] = pd.to_datetime(FFF['Date'])

# Clean futures data - keep only necessary columns
FFF = FFF.drop(columns=['Open', 'High', 'Low', 'Vol.', 'Change %'], errors='ignore')

# Calculate implicit fund rate from futures price
# Futures price = 100 - implied rate (standard convention)
FFF['implicite_fund_rate'] = 100 - FFF['Price']

# Merge datasets on date
merged_df = pd.merge(
    FFF[['Date', 'implicite_fund_rate']], 
    EFFR[['observation_date', 'FEDFUNDS']], 
    left_on='Date', 
    right_on='observation_date', 
    how='inner'
)


# Load macro features
macro_df = pd.read_csv("macro_features_monthly.csv")
macro_df['DATE'] = pd.to_datetime(macro_df['DATE'], dayfirst=True)


# Convert to monthly period for proper alignment
macro_df['DATE'] = macro_df['DATE'].values.astype('datetime64[M]')



merged_all = pd.merge(
    merged_df, 
    macro_df, 
    left_on='Date', 
    right_on='DATE', 
    how='inner'
)





In [24]:



# Sort by date and reset index
merged_all = merged_all.sort_values('Date').reset_index(drop=True)


# Create working copy
df_features = merged_all.copy()

# Calculate spread (market expectation vs actual rate)
df_features['spread_change'] = df_features['implicite_fund_rate'] - df_features['FEDFUNDS']

#Calculate CHANGES (first differences) for stationarity

for el in ["implicite_fund_rate",'CPIAUCSL', 'PCEPI', 'UNRATE', 'PAYEMS', 'GDP', 'INDPRO','FinStress']:
       df_features[f'{el}_change']=df_features[el].diff()


n_lags = 6  # Use past 6 months

# Lagged variables creation

for el in ["spread","implicite_fund_rate",'CPIAUCSL', 'PCEPI', 'UNRATE', 'PAYEMS', 'GDP', 'INDPRO','FinStress']:
    for i in range(1, n_lags + 1):
        df_features[f'{el}_change_lag_{i}'] = df_features[f'{el}_change'].shift(i)


df_features['future_fedfunds_change'] = df_features['FEDFUNDS'].diff().shift(-1)

# Binary classification: 1 if rate increases, 0 otherwise
df_features['target'] = (df_features['future_fedfunds_change'] > 0).astype(int)




In [25]:
df_final=df_features.dropna()
print("balanced dataset for accuracy : ",df_final['target'].value_counts())



balanced dataset for accuracy :  target
0    146
1    119
Name: count, dtype: int64


In [28]:
import pandas as pd
import mlflow
import optuna
import mlflow.sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.inspection import permutation_importance
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)


def check_params(allowed,kwargs):


    for key,val in kwargs.items():

                if key not in allowed:
                    raise ValueError(f"{key} unrecongized, use : {allowed.keys()}")
                if val not in allowed[key]:
                    raise ValueError(f"{val} unrecongized, for {key} use : {allowed[key]}")

class MlFlowModel:

    def __init__(self,exper_name,model_instance,**kwargs):

        """
        supports two models: logistic regression and random forest only 

        """
        self.model_instance=model_instance
        self.exper_name=exper_name
        self.kwargs=kwargs
        self.n_trials=10

        if isinstance(self.model_instance,type):
            raise ValueError(f"dont forget parantheses in your model_instance")
        

        if self.model_instance.__class__ is not   LogisticRegression and  self.model_instance.__class__ is not RandomForestClassifier:
            raise ValueError('sorry, we can use logistic regression and random forest only for the moment ')


        if self.model_instance.__class__ is  LogisticRegression:
            allowed={
                "penalty" : ["l1", "l2", "elasticnet", None],
                "solver" : "saga"
            }

            if "penalty" not in kwargs.keys() or "solver" not in kwargs.keys():
                raise ValueError("need to specify both penalty and solver for logistic regression ")

        if self.model_instance.__class__ is  RandomForestClassifier:
            allowed={
                "max_features" : ["sqrt", "log2", None],
                "criterion" : ["gini", "entropy", "log_loss"]
            }
            if "max_features" not in kwargs.keys() or "criterion" not in kwargs.keys():
                raise ValueError("need to specify both max_features and criterion for randm forest ")

        
        check_params(allowed,self.kwargs)

    
    def objective(self,trial):
                

                if self.model_instance.__class__ is  LogisticRegression:

                    if "elasticnet" == self.kwargs["penalty"]:
                        # Define hyperparameter search space
                        params_candidate_space={
                            "l1_ratio": trial.suggest_float("l1_ratio", 0.1, 0.9),
                            "C":  trial.suggest_float("C", 0.1, 10),
                        }
                    elif "l1" == self.kwargs["penalty"] or "l2" == self.kwargs["penalty"]:
                        # Define hyperparameter search space
                        params_candidate_space={
                            "C":  trial.suggest_float("C", 0.1, 10),
                        }
                    
                if self.model_instance.__class__ is  RandomForestClassifier:

                    params_candidate_space={
                            'max_depth': trial.suggest_int("max_depth", 10, 50),
                            "min_samples_split":  trial.suggest_int("min_samples_split", 2, 10),
                            "min_samples_leaf":  trial.suggest_int("min_samples_leaf", 1, 4),
                 
                        }

                

                model = self.model_instance.__class__(
                    **params_candidate_space, 
                    **self.fixed_params,
                    **self.kwargs  
                )
            

                model.fit(self.X_train, self.y_train)


                #log_loss is sklearn is negative loglikelihood so we minimise it
                #in randomforest if we use gini, then we need to modify this line, but we do simple:
                score = log_loss(self.y_train, model.predict_proba(self.X_train))

                return score



    def run_optuna_study(self):

        storage = f"sqlite:///optuna_{self.exper_name}.db"
        study = optuna.create_study(
                            direction="minimize", 
                            study_name=f"{self.exper_name}",
                            storage=storage,
                            load_if_exists=True
        )  # 'minimize' for loss functions
        study.optimize(self.objective, n_trials=self.n_trials)
        study_best_params=study.best_params
        return study_best_params

    def train(self,merged_df_,feature_columns):
        
        with mlflow.start_run(run_name=f"{self.exper_name}_FedFunds"):
        
                mlflow.log_param("model_type", self.model_instance.__class__.__name__)
                mlflow.log_param("variables", feature_columns)
                mlflow.log_param("scaler", "StandardScaler")
                mlflow.log_param("train_test_split", "80/20 time series")
                mlflow.log_param("kwargs",self.kwargs)
                
                #do easy splits and transforms:
                X = merged_df_[feature_columns]
                y = merged_df_['target']
                split_idx = int(len(merged_df_) * 0.8)
                self.X_train, self.X_test = X.iloc[:split_idx], X.iloc[split_idx:]
                self.y_train, self.y_test = y.iloc[:split_idx], y.iloc[split_idx:]
                #always standartise variables for logit 
                if self.model_instance.__class__ is  LogisticRegression:
                    scaler = StandardScaler()
                    self.X_train = scaler.fit_transform(self.X_train)
                    self.X_test = scaler.transform(self.X_test)

                # Model penalisations

                if self.model_instance.__class__ is  LogisticRegression:
                    print("normalising data")
                    self.fixed_params={
                        
                        "class_weight" : 'balanced',  
                        "random_state" : 42,
                    }
                

                    if self.kwargs["penalty"] is not None:
                        #in this case need to search space for best hyperparameter
                        study_best_params=self.run_optuna_study()
                        print("Best Hyperparameters:",study_best_params)
                    else:
                        study_best_params={}

                if self.model_instance.__class__ is  RandomForestClassifier:
                    self.fixed_params={
                        "n_estimators" : 150,
                        "class_weight" : 'balanced',  
                       
                    }
                    study_best_params=self.run_optuna_study()
                    print("Best Hyperparameters:",study_best_params)
                    

                        
                model = self.model_instance.__class__(
                     **self.fixed_params,
                     **study_best_params,
                     **self.kwargs                
                )
                print("-----------")
                

               
                model.fit(self.X_train, self.y_train)
                # Predictions
                y_pred = model.predict(self.X_test)
                
                acc = accuracy_score(self.y_test, y_pred)
                mlflow.log_metric("accuracy", acc)
                mlflow.sklearn.log_model(model, name="model")

                print(f"Run logged to MLflow: accuracy={acc:.4f}")


def var_selection_with_permutation(model,X,y,threshold_below_which_to_drop=0.01):

    '''
    supports any classification model, models implying gradient descent (parametric models) require dataset to be normalised 
    models such as random forests or other non gradient tree models do not require variable standartisation 
    '''
    
    split_idx = int(len(X) * 0.8)

    X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
    y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

    if model.__class__.__name__ in ["LogisticRegression",]:
        
        scaler = StandardScaler().set_output(transform="pandas")
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)



    model.fit(X_train, y_train)
    result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42,scoring="accuracy")

    perm_importances = pd.DataFrame({
        'feature': X_test.columns,
        'importance_mean': result.importances_mean,
        'importance_std': result.importances_std
    })
    
    return perm_importances[perm_importances["importance_mean"]>threshold_below_which_to_drop]["feature"].tolist()


In [9]:
model = LogisticRegression(random_state=42, class_weight='balanced')
feature_columns =  [f'spread_change_lag_{i}' for i in range(1, 3 + 1)] 
X = final_df[feature_columns]
y = final_df['target']
feature_columns_after_permutation_test=var_selection_with_permutation(model,X,y)
print(
{
    "nb variables before permutation selection " : len(feature_columns),
    "nb variables after permutation selection " : len(feature_columns_after_permutation_test)

}
)
model=MlFlowModel("lrsimplest",LogisticRegression(),penalty=None,solver="saga")
model.train(final_df,feature_columns)

{'nb variables before permutation selection ': 3, 'nb variables after permutation selection ': 3}
normalising data
-----------


  return FileStore(store_uri, store_uri)


Run logged to MLflow: accuracy=0.6038


In [32]:
feature_columns=[col for col in df_final.columns if 'spread' in col.lower()  and col[-1].isdigit() ]


model=MlFlowModel("lrsimplest",LogisticRegression(),penalty=None,solver="saga")
model.train(df_final,feature_columns)


normalising data
-----------




Run logged to MLflow: accuracy=0.6226


In [35]:
feature_columns=[col for col in df_final.columns if 'change' in col.lower()  and col[-1].isdigit() ]


model=MlFlowModel("lr-all-vars",LogisticRegression(),penalty=None,solver="saga")
model.train(df_final,feature_columns)

normalising data
-----------




Run logged to MLflow: accuracy=0.6604


In [36]:
feature_columns=[col for col in df_final.columns if 'change' in col.lower()  and col[-1].isdigit() ]


model=MlFlowModel("lr-all-vars",LogisticRegression(),penalty="elasticnet",solver="saga")
model.train(df_final,feature_columns)

normalising data


[I 2025-11-13 22:55:27,376] A new study created in RDB with name: lr-all-vars
[I 2025-11-13 22:55:27,541] Trial 0 finished with value: 0.5051579097197709 and parameters: {'l1_ratio': 0.6085026294298703, 'C': 6.05591792396805}. Best is trial 0 with value: 0.5051579097197709.
[I 2025-11-13 22:55:27,682] Trial 1 finished with value: 0.5051176951185505 and parameters: {'l1_ratio': 0.3140243690147676, 'C': 4.397897380859839}. Best is trial 1 with value: 0.5051176951185505.
[I 2025-11-13 22:55:27,831] Trial 2 finished with value: 0.5105740347495147 and parameters: {'l1_ratio': 0.8079833866071162, 'C': 1.939630114780993}. Best is trial 1 with value: 0.5051176951185505.
[I 2025-11-13 22:55:27,997] Trial 3 finished with value: 0.5062370193997643 and parameters: {'l1_ratio': 0.5435535037559394, 'C': 3.575123780844866}. Best is trial 1 with value: 0.5051176951185505.
[I 2025-11-13 22:55:28,170] Trial 4 finished with value: 0.5050139301039904 and parameters: {'l1_ratio': 0.16777888921494605, 'C': 

Best Hyperparameters: {'l1_ratio': 0.16777888921494605, 'C': 3.7414640543944437}
-----------




Run logged to MLflow: accuracy=0.6604


In [None]:
model = LogisticRegression(random_state=42, class_weight='balanced')
feature_columns=[col for col in df_final.columns if 'change' in col.lower()  and col[-1].isdigit() ]
X = df_final[feature_columns]
y = df_final['target']
feature_columns_after_permutation_test=var_selection_with_permutation(model,X,y)
print(
{
    "nb variables before permutation selection " : len(feature_columns),
    "nb variables after permutation selection " : len(feature_columns_after_permutation_test)

}
)
model=MlFlowModel("lr-per-vars0.01-elnet",LogisticRegression(),penalty="elasticnet",solver="saga")
model.train(df_final,feature_columns_after_permutation_test)

[I 2025-11-13 22:57:01,717] Using an existing study with name 'lr-per-vars0.01-elnet' instead of creating a new one.


{'nb variables before permutation selection ': 54, 'nb variables after permutation selection ': 13}
normalising data


[I 2025-11-13 22:57:02,384] Trial 10 finished with value: 0.5485562006802969 and parameters: {'l1_ratio': 0.5345677177674106, 'C': 9.692397403325243}. Best is trial 10 with value: 0.5485562006802969.
[I 2025-11-13 22:57:02,535] Trial 11 finished with value: 0.5485509180132122 and parameters: {'l1_ratio': 0.5309322206305545, 'C': 9.936698549812881}. Best is trial 11 with value: 0.5485509180132122.
[I 2025-11-13 22:57:02,660] Trial 12 finished with value: 0.5485518911853527 and parameters: {'l1_ratio': 0.5551783486995054, 'C': 9.968375676833631}. Best is trial 11 with value: 0.5485509180132122.
[I 2025-11-13 22:57:02,785] Trial 13 finished with value: 0.5486016179394072 and parameters: {'l1_ratio': 0.43253833872312986, 'C': 7.711986725941861}. Best is trial 11 with value: 0.5485509180132122.
[I 2025-11-13 22:57:02,904] Trial 14 finished with value: 0.5487350181468039 and parameters: {'l1_ratio': 0.5863154920008881, 'C': 5.576269618877306}. Best is trial 11 with value: 0.5485509180132122.

Best Hyperparameters: {'l1_ratio': 0.5309322206305545, 'C': 9.936698549812881}
-----------




Run logged to MLflow: accuracy=0.7170


In [None]:
model = LogisticRegression(random_state=42, class_weight='balanced')
feature_columns=[col for col in df_final.columns if 'change' in col.lower()  and col[-1].isdigit() ]
X = df_final[feature_columns]
y = df_final['target']
feature_columns_after_permutation_test=var_selection_with_permutation(model,X,y)
print(
{
    "nb variables before permutation selection " : len(feature_columns),
    "nb variables after permutation selection " : len(feature_columns_after_permutation_test)

}
)
model=MlFlowModel("lr-per-vars0.01-elnet",LogisticRegression(),penalty="elasticnet",solver="saga")
model.train(df_final,feature_columns_after_permutation_test)

In [39]:
model = LogisticRegression( random_state=42,class_weight='balanced')
feature_columns=[col for col in df_final.columns if 'change' in col.lower()  and col[-1].isdigit() ]
X = df_final[feature_columns]
y = df_final['target']
feature_columns_after_permutation_test=var_selection_with_permutation(model,X,y,0.02)
print(
{
    "nb variables before permutation selection " : len(feature_columns),
    "nb variables after permutation selection " : len(feature_columns_after_permutation_test)

}
)
model=MlFlowModel("rf-per-vars0.02-sqrt-logloss",RandomForestClassifier(),max_features="sqrt",criterion="log_loss")
model.train(df_final,feature_columns_after_permutation_test)

[I 2025-11-13 22:58:19,174] Using an existing study with name 'rf-per-vars0.02-sqrt-logloss' instead of creating a new one.


{'nb variables before permutation selection ': 54, 'nb variables after permutation selection ': 8}


[I 2025-11-13 22:58:19,604] Trial 10 finished with value: 0.1682347534295147 and parameters: {'max_depth': 31, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 5 with value: 0.16018747069960343.
[I 2025-11-13 22:58:20,019] Trial 11 finished with value: 0.16829147929633806 and parameters: {'max_depth': 31, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 5 with value: 0.16018747069960343.
[I 2025-11-13 22:58:20,441] Trial 12 finished with value: 0.16920938820941034 and parameters: {'max_depth': 37, 'min_samples_split': 2, 'min_samples_leaf': 1}. Best is trial 5 with value: 0.16018747069960343.
[I 2025-11-13 22:58:20,842] Trial 13 finished with value: 0.18926948630812726 and parameters: {'max_depth': 28, 'min_samples_split': 3, 'min_samples_leaf': 1}. Best is trial 5 with value: 0.16018747069960343.
[I 2025-11-13 22:58:21,262] Trial 14 finished with value: 0.25716567185937006 and parameters: {'max_depth': 44, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best 

Best Hyperparameters: {'max_depth': 26, 'min_samples_split': 3, 'min_samples_leaf': 1}
-----------




Run logged to MLflow: accuracy=0.8302


In [None]:
"""
experiment with this ntb more (first delete other experiments old)


"""