In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 1. Data loading and preprocessing (kept consistent with the previous steps)
# Read the raw CSV, parse the date column, use it as the index, and give the
# series column the shorter name 'M2'.
df = (
    pd.read_csv('../data/raw/M2SL.csv')
    .assign(observation_date=lambda frame: pd.to_datetime(frame['observation_date']))
    .set_index('observation_date')
    .rename(columns={'M2SL': 'M2'})
)

# Growth rate in percent as a log-difference: 100 * (ln M2_t - ln M2_{t-1})
df['M2_Growth'] = 100 * np.log(df['M2']).diff()
# Remove rows with any missing value; the first row always goes because
# diff() has no previous observation to subtract there.
df_clean = df.dropna(axis=0, how='any')

# 2. Feature Engineering: Create Lag Features
# We transform the time series problem into a supervised learning problem.
# Use past 12 months (lags) to predict the current month.
def create_lag_features(data, n_lags=12):
    """Turn a single series into a table of lagged predictors plus target.

    Rows are shifted by position, so the input is assumed to already be in
    chronological order (not checked here).

    Args:
        data: Single-column data accepted by ``pd.DataFrame`` (for example a
            pandas Series).
        n_lags: Number of lag columns to build (t-1 through t-n_lags).

    Returns:
        DataFrame with columns ``Lag_1`` ... ``Lag_{n_lags}`` followed by
        ``Target`` (the unshifted value). Rows containing NaN, such as the
        leading rows emptied by shifting, are removed.
    """
    frame = pd.DataFrame(data)
    offsets = range(1, n_lags + 1)

    # One shifted copy of the input per lag, placed side by side
    lagged = pd.concat([frame.shift(periods=k) for k in offsets], axis=1)
    lagged.columns = [f'Lag_{k}' for k in offsets]

    # The value at time t is what the lag columns are meant to predict
    lagged['Target'] = frame

    return lagged.dropna()

# Build the lagged dataset from the growth-rate series
n_lags = 12
data_ml = create_lag_features(df_clean['M2_Growth'], n_lags)

# 3. Train-Test Split (Chronological)
# The test window is sized as 20% of the observations *before* lagging, so
# that it matches the 80/20 split used in ARIMA.
n_obs_original = len(df_clean)
n_test_obs = int(0.2 * n_obs_original)
# First timestamp of the test window; everything strictly earlier is training
split_date = df_clean.index[-n_test_obs]

train_ml = data_ml.loc[data_ml.index < split_date]
test_ml = data_ml.loc[data_ml.index >= split_date]

# Features (the lag columns) and target for each partition
X_train, y_train = train_ml.drop(columns='Target'), train_ml['Target']
X_test, y_test = test_ml.drop(columns='Target'), test_ml['Target']

print("Data preparation complete.")
print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples:     {X_test.shape[0]}")

FileNotFoundError: [Errno 2] No such file or directory: 'M2SL.csv'

In [None]:
# 2. Train Random Forest Model
print("Training Random Forest...")

# n_estimators=100: number of trees
# random_state=42: ensures reproducibility
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

print("Training completed.")

# Check Feature Importance (optional, but good for the paper):
# shows which lags the fitted forest ranks as most important.
importances = rf_model.feature_importances_
# Column positions ordered from highest to lowest importance
indices = np.argsort(importances)[::-1]

print("\nTop 5 Most Important Lags:")
# Slice rather than loop over range(5): with fewer than five lag features
# the fixed range would index past the end and raise IndexError.
for idx in indices[:5]:
    print(f"{X_train.columns[idx]}: {importances[idx]:.4f}")