In [1]:
# Enable auto-reloading of external modules - useful during development
%load_ext autoreload
%autoreload 2

# Configure Python path to find our custom modules
import sys
from pathlib import Path

# Add the project root (the parent of the notebook's working directory) to the
# import path so that project-local packages can be imported.
project_root = Path.cwd().parent
# sys.path holds strings, so membership must be tested on the string form.
# Testing the Path object itself never matches, which would prepend a duplicate
# entry every time this cell is re-run.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

In [2]:
# Import necessary libraries
import src.processing as processing
import src.config as lists

In [3]:
# Load and process data
# The raw CSV is resolved relative to `project_root` instead of a user-specific
# absolute path, so the notebook runs on any checkout and does not embed a
# local home directory.
# NOTE(review): assumes `project_root` is the folder that contains `data/raw/`
# (the same folder that holds `src/`) — confirm against the repository layout.
RAW_DATA_PATH = project_root / "data" / "raw" / "nvzfxcoxdvh1at7i.csv"

# Each stage consumes the previous stage's frame and yields a new one, so
# re-running this cell always starts again from the raw file.
df = processing.load_data(str(RAW_DATA_PATH))  # str() keeps the argument type the loader received before
df_prepared = processing.prepare_data(df)
df_added_features = processing.create_all_model_features_orchestrated(df_prepared)
df_missing = processing.drop_missing_final_vars_streamlined(df_added_features, lists.final_set_A_predictor_names_and_dependent)
df_final = processing.annual_winsorize_variables(df_missing, lists.columns_to_winsorize)

  data = pd.read_csv(file_path)


Data loaded successfully from /Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/data/raw/nvzfxcoxdvh1at7i.csv
Original number of observations: 317304
Number of columns after selection: 30
Observations after year filter (2000-2023): 302751
Observations after excluding financial and utility firms: 170598
Starting feature construction. Initial df shape: (170598, 30)
  Creating lags for: ['at', 'ni', 'rect', 'invt', 'ap', 'sale']

Performing pre-calculation validity checks & preparations...
  Missing 'xrd' values filled with 0.
  'ipo_year' created from 'ipodate'.

Constructing dependent variable...
  OCF_Scaled_t_plus_1 created.

Constructing Set A (OLS) predictors...
  Set A predictors constructed.

Constructing control dummy variables...
  Dummy variables constructed.

Constructing Set B (additional ML) predictors...
  Set B predictors constructed.

Selecting final model variables and dropping intermediate columns...
  Shape of DataFrame 

In [4]:
# Chronological split on fiscal year: per the split helper's own log line,
# predictor years <= 2018 form the training set and years > 2018 the test set.
train_df, test_df = processing.split_data_chronologically(df_final, 'fyear', split_year=2018)

# Report the size of each partition first, then the fiscal-year span it covers.
for label, frame in (("Training data", train_df), ("Test data", test_df)):
    print(f"{label} shape: {frame.shape}")
for label, frame in (("Training", train_df), ("Test", test_df)):
    print(f"{label} period: {frame['fyear'].min()} - {frame['fyear'].max()}")

Training set: 102567 obs (Predictor years <= 2018)
Test set: 20882 obs (Predictor years > 2018)
Training data shape: (102567, 30)
Test data shape: (20882, 30)
Training period: 2001.0 - 2018.0
Test period: 2019.0 - 2022.0


In [5]:
# Build the design matrices and targets for the stacking regression.
# The column selection and the dummy exclusion are each defined once and then
# applied identically to the training and test frames.
candidate_columns = lists.SET_A_FEATURES + lists.CONTROL_DUMMY_FEATURES

# Dummy variables that caused multicollinearity in OLS; excluded here as well.
collinear_dummies = ['ASC842_dummy', 'COVID_dummy']

X_train = train_df[candidate_columns].drop(columns=collinear_dummies)
X_test = test_df[candidate_columns].drop(columns=collinear_dummies)

y_train = train_df[lists.DEPENDENT_VARIABLE]
y_test = test_df[lists.DEPENDENT_VARIABLE]

# Numbered listing of the predictors that actually enter the model.
print("Features included in the model:")
for position, column_name in enumerate(X_train.columns, start=1):
    print(f"{position:2d}. {column_name}")

print(f"\nDependent variable: {lists.DEPENDENT_VARIABLE}")
print(f"Training observations: {len(X_train)}")
print(f"Test observations: {len(X_test)}")

Features included in the model:
 1. OCF_Scaled_Lag_t
 2. NI_Scaled_t
 3. Accruals_Scaled_t
 4. Delta_Rec_Scaled_t
 5. Delta_Inv_Scaled_t
 6. Delta_AP_Scaled_t
 7. DP_Scaled_t
 8. ln_at_t
 9. ASC606_dummy
10. TCJA_dummy

Dependent variable: OCF_Scaled_t_plus_1
Training observations: 102567
Test observations: 20882


In [6]:
# =============================================================================
# STACKING ENSEMBLE MODEL
# =============================================================================

from sklearn.ensemble import StackingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
import time

# Number of cross-validation folds used to generate the meta-features.
# Defined once so the estimator configuration and the log line below cannot
# drift apart.
CV_FOLDS = 5

# Define base models (estimators) for the stack.
# These are unfitted prototypes: StackingRegressor fits its own clones.
base_models = [
    ('decision_tree', DecisionTreeRegressor(max_depth=5, min_samples_split=20, min_samples_leaf=2, random_state=42)),
    ('random_forest', RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)),
    ('xgboost', xgb.XGBRegressor(max_depth=3, learning_rate=0.1, n_estimators=200, subsample=1.0, colsample_bytree=0.8, random_state=42)),
    ('linear_reg', LinearRegression())
]

# Define meta-learner (final estimator) that combines the base predictions
meta_learner = LinearRegression()

# Create stacking regressor
# NOTE(review): an integer `cv` selects a plain K-fold split; with a
# chronologically organised panel, confirm that mixing fiscal years across
# folds is acceptable for the meta-feature generation.
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=CV_FOLDS,  # K-fold cross-validation for generating meta-features
    n_jobs=-1
)

print("Training Stacking Ensemble Model...")
print(f"Base models: {len(base_models)}")
print(f"Meta-learner: {type(meta_learner).__name__}")
print(f"Cross-validation folds: {CV_FOLDS}")

# perf_counter is monotonic, so the measured duration cannot be distorted by
# system clock adjustments during the (long) fit.
start_time = time.perf_counter()

# Fit the stacking model
stacking_regressor.fit(X_train, y_train)

elapsed_time = time.perf_counter() - start_time
print(f"\nStacking model training completed in {elapsed_time:.2f} seconds")

Training Stacking Ensemble Model...
Base models: 4
Meta-learner: LinearRegression
Cross-validation folds: 5

Stacking model training completed in 94.11 seconds


In [7]:
# =============================================================================
# STACKING MODEL PERFORMANCE
# =============================================================================

# Score both partitions with the fitted stacking model (training first, then test)
y_train_pred_stack, y_test_pred_stack = map(stacking_regressor.predict, (X_train, X_test))

# Simple metrics reporter usable with any model's predictions
def print_model_performance(y_true, y_pred, model_name, dataset):
    """Print a one-line R² / RMSE / MAE summary for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    model_name : str
        Label of the model, shown at the start of the line.
    dataset : str
        Label of the data partition (e.g. "Train" or "Test"), shown in
        parentheses after the model name.

    Returns
    -------
    None
        The summary is written to stdout; nothing is returned.
    """
    # Insertion order of this mapping determines the order of the printed metrics.
    scores = {
        "R²": r2_score(y_true, y_pred),
        "RMSE": np.sqrt(mean_squared_error(y_true, y_pred)),  # root of the MSE
        "MAE": mean_absolute_error(y_true, y_pred),
    }
    summary = " | ".join(f"{label}={value:.4f}" for label, value in scores.items())
    print(f"{model_name} ({dataset}): {summary}")

# Print stacking results
print("STACKING ENSEMBLE PERFORMANCE:")
print_model_performance(y_train, y_train_pred_stack, "Stacking", "Train")
print_model_performance(y_test, y_test_pred_stack, "Stacking", "Test")

# Compare with individual base models.
# StackingRegressor.fit() has already fitted a clone of every base estimator
# on the full training data and exposes them by name via `named_estimators_`.
# Reusing those avoids training every model a second time and evaluates the
# exact estimators that feed the ensemble. The entries of `base_models` stay
# unfitted prototypes.
print("\nINDIVIDUAL BASE MODEL PERFORMANCE:")
for name, _prototype in base_models:
    model = stacking_regressor.named_estimators_[name]
    y_test_pred_individual = model.predict(X_test)
    print_model_performance(y_test, y_test_pred_individual, name.replace('_', ' ').title(), "Test")

STACKING ENSEMBLE PERFORMANCE:
Stacking (Train): R²=0.8028 | RMSE=0.4706 | MAE=0.1609
Stacking (Test): R²=0.6615 | RMSE=0.5674 | MAE=0.1921

INDIVIDUAL BASE MODEL PERFORMANCE:
Decision Tree (Test): R²=0.6246 | RMSE=0.5976 | MAE=0.2162
Random Forest (Test): R²=0.6619 | RMSE=0.5671 | MAE=0.1925
Xgboost (Test): R²=0.6473 | RMSE=0.5792 | MAE=0.1963
Linear Reg (Test): R²=0.6072 | RMSE=0.6112 | MAE=0.2260


In [8]:
# =============================================================================
# STACKING MODEL ANALYSIS
# =============================================================================

import pandas as pd

# Report how the meta-learner combines the base models' predictions
print("META-LEARNER ANALYSIS:")
print("="*50)

# Coefficients of the fitted final estimator, paired positionally with the
# base-model names in the order they were passed to the stack.
meta_coefficients = stacking_regressor.final_estimator_.coef_
base_model_names = [pair[0] for pair in base_models]

# One row per base model, ordered by absolute weight (largest first)
meta_weights = (
    pd.DataFrame({'base_model': base_model_names, 'weight': meta_coefficients})
    .sort_values('weight', ascending=False, key=abs)
)

print("Base Model Weights in Final Ensemble:")
for rank, entry in enumerate(meta_weights.itertuples(index=False), start=1):
    print(f"{rank:2d}. {entry.base_model:<15} {entry.weight:>8.4f}")

print(f"\nMeta-learner intercept: {stacking_regressor.final_estimator_.intercept_:.4f}")

# Reading guide for the weights printed above.
# NOTE(review): negative weights in a linear meta-learner can also arise from
# strongly correlated base predictions — confirm the "contrarian" wording.
print("\nMODEL INTERPRETATION:")
print("="*50)
for guidance in (
    "Positive weights: Model contributes positively to final prediction",
    "Negative weights: Model provides contrarian signal",
    "Larger absolute weights: Model has more influence on final prediction",
):
    print(guidance)

META-LEARNER ANALYSIS:
Base Model Weights in Final Ensemble:
 1. xgboost           0.4914
 2. random_forest     0.4896
 3. linear_reg        0.1025
 4. decision_tree    -0.0769

Meta-learner intercept: 0.0021

MODEL INTERPRETATION:
Positive weights: Model contributes positively to final prediction
Negative weights: Model provides contrarian signal
Larger absolute weights: Model has more influence on final prediction
