# **Random Forest Model**

## 1. Import libraries

In [None]:
# Library imports
import numpy as np
import pandas as pd
import os
from sklearn.ensemble import RandomForestRegressor
from config.paths import dir_input_raw, dir_input_cleaned
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
# helper function, generate lagged datasets for testing on vintages
def gen_lagged_data(metadata, data, last_date, lag):
    """Build an artificial data vintage ending at ``last_date``.

    The rows of ``data`` dated on or before ``last_date`` are copied, and for
    every column after the first the most recent observations are replaced by
    NaN to mimic values that would not have been published yet.

    Parameters
    ----------
    metadata : DataFrame with a ``series`` column (matching the column names
        of ``data``) and a ``months_lag`` column (publication lag).
    data : DataFrame whose first column is ``date``; every other column needs
        a matching row in ``metadata`` (otherwise an IndexError is raised).
    last_date : last date to keep (inclusive).
    lag : offset added to the first masked row; lower values blank out more
        rows at the end of each series.

    Returns
    -------
    A new DataFrame with a fresh 0..n-1 index; ``data`` is left untouched.
    """
    within_window = data.date <= last_date
    vintage = data.loc[within_window, :].reset_index(drop=True)
    n_rows = len(vintage)

    for series_name in vintage.columns[1:]:
        # publication lag of this particular variable (first matching row)
        lag_lookup = metadata.loc[metadata.series == series_name, "months_lag"]
        publication_lag = lag_lookup.values[0]
        # Go back as far as the publication lag requires, shift by the
        # requested lag, and one more row (the extra -1).
        first_unavailable = n_rows - publication_lag + lag - 1
        vintage.loc[first_unavailable:, series_name] = np.nan

    return vintage

# helper function, flatten a dataset for methods that don't do timeseries, extra columns for each lag
def flatten_data(data, target_variable, n_lags):
    """Turn a time series table into a flat table with one column per lag.

    Only rows where ``target_variable`` is present are kept. For each lag
    ``k`` in ``1..n_lags`` the row located ``k`` index positions earlier is
    attached to the kept row as extra columns named ``<column>_<k>``.

    Parameters
    ----------
    data : DataFrame with a ``date`` column; the lookup of earlier rows uses
        index labels, so a 0..n-1 index is expected.
    target_variable : name of the target column; it is not lagged.
    n_lags : number of lagged copies of every other column to add.

    Returns
    -------
    DataFrame with the original columns followed by the lagged columns. Rows
    without enough history get NaN in the lagged columns.
    """
    has_target = data[target_variable].notna()
    result = data.loc[has_target, :]
    base_positions = result.index

    for shift in range(1, n_lags + 1):
        # rows sitting `shift` positions before each kept row (drop negatives)
        source_rows = base_positions - shift
        source_rows = source_rows[source_rows >= 0]

        shifted = data.loc[source_rows, :].drop(columns=[target_variable])
        # move the dates forward so they line up with the rows they belong to
        shifted = shifted.assign(date=shifted["date"] + pd.DateOffset(months=shift))
        shifted = shifted.rename(
            columns=lambda name: name if name == "date" else name + "_" + str(shift)
        )
        result = result.merge(shifted, how="left", on="date")

    return result

# helper function fill missings in a dataset with the mean from the training set
def mean_fill_dataset(training, test):
    """Fill missing values in ``test`` with column means taken from ``training``.

    Parameters
    ----------
    training : DataFrame whose first column is skipped (the date column in
        this notebook); the NaN-ignoring mean of every other column is used
        as the fill value.
    test : DataFrame containing at least the same non-first columns.

    Returns
    -------
    A filled copy of ``test``; neither input is modified.
    """
    value_columns = training.columns[1:]
    column_means = {name: np.nanmean(training[name]) for name in value_columns}

    filled = test.copy()
    for name, mean_value in column_means.items():
        missing = filled[name].isna()
        filled.loc[missing, name] = mean_value
    return filled

## 2. Data set up

In [None]:
# Parameters
# ==============================================================================
# Name of the column to predict
target = "gdpc1"
gdp_lag = 1

# Artificial vintages to evaluate: lags of -2, -1, 0, 1 and 2 months
lags_to_test = [-2, -1, 0, 1, 2]

# First date of the training, validation and test periods
start_train = "1947-01-01"
start_val = "2005-03-01"
start_test = "2010-04-01"

# Data read and preprocessing
# ==============================================================================
metadata = pd.read_csv(os.path.join(dir_input_raw, "meta_data.csv"))
data     = pd.read_csv(os.path.join(dir_input_cleaned, "data_tf.csv"), parse_dates=["date"])
# index by date and force a regular month-start frequency
data     = data.set_index("date").asfreq("MS")

# Split data into training, validation, and test sets
# Each period ends right before the next one starts, so the boundaries are
# derived from the start_* parameters rather than repeated as literals.
# Label slices are inclusive at both ends, hence the one-day step back.
one_day    = pd.Timedelta(days=1)
data_train = data.loc[pd.Timestamp(start_train): pd.Timestamp(start_val) - one_day]
data_val   = data.loc[pd.Timestamp(start_val): pd.Timestamp(start_test) - one_day]
data_test  = data.loc[pd.Timestamp(start_test):]

# Plot the target series for the training and validation sets
# ==============================================================================
plt.figure(figsize=(10, 6))
for subset, legend_name in ((data_train, "Training"), (data_val, "Validation")):
    # the target has missing months; drop them so the line is continuous
    plt.plot(subset[target].dropna(), label=legend_name)
plt.title('GDP growth for different sets', fontsize=16)
plt.xlabel('Time', fontsize=14)
plt.ylabel('GDP Growth', fontsize=14)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

## 3. Training the model

The model is retrained on a rolling basis: for each prediction date it is fit only on data up to three months earlier. For example, if we are predicting 2000-03-01, the model is trained on data as it would have appeared on 1999-12-01, right before the beginning of the prediction period.

In [None]:
# Monthly data for training and validation, with `date` moved from the index
# back to a regular first column
data_train_val = pd.concat([data_train, data_val]).reset_index()
data_train_val.tail()

In [None]:
# This cell is for information only. The same process is repeated below for each test date.
# Step 1: fill any missing values with the column mean
filled_train = mean_fill_dataset(data_train_val, data_train_val)
# Step 2: add 4 additional lags of each variable as extra columns
flattened_train = flatten_data(filled_train, target, 4)
# Step 3: only keep quarterly observations (months 3, 6, 9, 12) and drop early
# observations with not enough history for the lagged variables
is_quarter_month = flattened_train.date.dt.month.isin([3, 6, 9, 12])
transformed_train = flattened_train.loc[is_quarter_month, :].dropna(axis=0, how="any").reset_index(drop=True)

# e.g. the variable `payems` now has additional columns in the data, one for
# each of the lags, and the data now has one row per quarter
payems_columns = [True] + list(transformed_train.columns[1:].str.contains("payems"))
transformed_train.loc[:, payems_columns].tail()

## 4. Rolling nowcast on artificial data vintages

In [None]:
# Dates and actual values for validation
# ==============================================================================
# Both lists come from the same series (rows where the target is observed), so
# they always have the same length even if another column has gaps.
observed_target = data_val[target].dropna()
dates         = observed_target.index.strftime("%Y-%m-%d").to_list()  # Dates with an observed target value
actual_values = observed_target.values                                # Target values on those dates

In [None]:
# Rolling nowcast: for every validation date, refit the models on the data
# available three months earlier, then predict from each artificial vintage.
n_models = 10  # number of forests whose predictions are averaged

pred_dict = {k: [] for k in lags_to_test}
for date in dates:
    # preparing the data for the model
    # data as it would have appeared at beginning of prediction period
    cutoff = pd.to_datetime(date) - pd.DateOffset(months=3)
    train = data_train_val.loc[data_train_val.date <= cutoff, :]
    transformed_train = mean_fill_dataset(train, train) # fill any missing values with the mean
    transformed_train = flatten_data(transformed_train, target, 4) # 4 means include 4 additional lags of each variable
    # only keep quarterly observations and drop early observations with not enough history for lagged variables
    transformed_train = transformed_train.loc[transformed_train.date.dt.month.isin([3,6,9,12]),:].dropna(axis=0, how="any").reset_index(drop=True)

    # features and target are the same for every model, so build them once
    x_train = transformed_train.drop(["date", target], axis=1)
    y_train = transformed_train[target]

    # train several models to average outputs because of stochasticity
    models = []
    for _ in range(n_models):
        model = RandomForestRegressor(
            n_estimators = 100,
            criterion = "squared_error",
            max_depth = None,
            min_samples_split = 0.01,
            min_samples_leaf = 0.01,
            max_features = "sqrt",
            bootstrap = False
        )

        # fitting the actual models
        model.fit(x_train, y_train)
        models.append(model)

    for lag in lags_to_test:
        # the data available for this date at this artificial vintage
        tmp_data = gen_lagged_data(metadata, data_train_val, date, lag)
        # get data in format necessary for model
        tmp_data = mean_fill_dataset(train, tmp_data) # fill with the mean of the training set
        tmp_data = flatten_data(tmp_data, target, 4)
        x = tmp_data.loc[tmp_data.date == date, :].drop(["date", target], axis=1)
        # average the predictions of all fitted models
        preds = [fitted.predict(x)[0] for fitted in models]

        pred_dict[lag].append(np.nanmean(preds))

In [None]:
# One row per validation date: the actual value followed by the prediction
# made at each vintage (columns follow lags_to_test).
final_df = pd.DataFrame(pred_dict)
final_df["Actual GDP Growth"] = actual_values
final_df = final_df[["Actual GDP Growth", *lags_to_test]]
final_df

## 5. Assess and visualize model performance

In [None]:
# table of RMSE by vintage
# Built in one step from a list of rows instead of concatenating onto an empty
# frame, which keeps the columns numeric so that round() applies to them.
actual = np.asarray(actual_values)
performance = pd.DataFrame(
    [
        {
            "Vintage": lag,
            "RMSE": np.sqrt(np.mean((actual - np.asarray(pred_dict[lag])) ** 2)),
        }
        for lag in lags_to_test
    ],
    columns=["Vintage", "RMSE"],
)
performance.round(4)

In [None]:
plt.figure(figsize=(15, 10))

# x-axis: the dates on which the target is observed (the same rows that the
# actual values are taken from); computed once, outside the loop
plot_dates = data_val[target].dropna().index

# Plot each lag series using a loop
for label in final_df.columns:
    linestyle = '-' if label == "Actual GDP Growth" else '--'  # Solid line for actual, dashed for lags
    plt.plot(plot_dates, final_df[label], label=label, marker='o', markersize=10, linestyle=linestyle, linewidth=3)

# Add labels and title
plt.title('Predictions vs Actual GDP', fontsize=16)
plt.xlabel('Time', fontsize=14)
plt.ylabel('GDP Growth', fontsize=14)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()