In [87]:
# Enable auto-reloading of external modules - useful during development.
# With `%autoreload 2`, edits to imported project modules are picked up live,
# without restarting the kernel.
# Note: re-running this cell in a live kernel makes `%load_ext` print an
# "already loaded" notice; that notice is harmless.
%load_ext autoreload
%autoreload 2

# Configure Python path to find our custom modules
import sys
from pathlib import Path

# Add project root to the Python path for proper imports.
# The parent of the current working directory is treated as the project root.
project_root = Path.cwd().parent

# sys.path holds plain strings, so membership must be tested with the string
# form. Comparing the Path object itself never matches, which would prepend a
# duplicate entry every time this cell is re-run.
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [88]:
# Import necessary libraries
import src.processing as processing
import src.config as lists

In [89]:
# Force a fresh import of the project modules so that configuration edits
# take effect even if stale copies are cached in this kernel.
import importlib
import sys

# Evict any cached copies; the imports below then re-execute the module code.
modules_to_reload = ['src.config', 'src.processing']
for module in modules_to_reload:
    sys.modules.pop(module, None)

# Re-import. Keep the `import pkg.mod as name` form: it goes through
# sys.modules, whereas `from src import ...` could hand back the stale
# attribute still bound on the `src` package.
import src.processing as processing
import src.config as lists

# Sanity-check the freshly imported configuration
print(
    "After fresh import:",
    f"DEPENDENT_VARIABLE type: {type(lists.DEPENDENT_VARIABLE)}",
    f"DEPENDENT_VARIABLE value: {lists.DEPENDENT_VARIABLE}",
    sep="\n",
)

# Load and process data.
# The raw extract is located relative to `project_root` instead of through a
# user-specific absolute path, so the notebook runs on any machine/checkout.
# NOTE(review): assumes the CSV lives at <project_root>/data/raw/ - confirm
# against the repository layout.
raw_data_path = project_root / "data" / "raw" / "nvzfxcoxdvh1at7i.csv"
if not raw_data_path.is_file():
    # Fail early with a clear message rather than deep inside the loader.
    raise FileNotFoundError(f"Raw data file not found: {raw_data_path}")

# Each stage consumes the previous stage's frame and yields a new one, so
# intermediate results stay inspectable and the cell is safe to re-run.
df = processing.load_data(str(raw_data_path))
df_prepared = processing.prepare_data(df)
df_added_features = processing.create_all_model_features_orchestrated(df_prepared)
df_missing = processing.drop_missing_final_vars_streamlined(
    df_added_features,
    lists.final_set_A_predictor_names_and_dependent,
)
df_final = processing.annual_winsorize_variables(df_missing, lists.columns_to_winsorize)

After fresh import:
DEPENDENT_VARIABLE type: <class 'str'>
DEPENDENT_VARIABLE value: OCF_Scaled_t_plus_1


  data = pd.read_csv(file_path)


Data loaded successfully from /Users/luis.m/Library/Mobile Documents/com~apple~CloudDocs/Documents ☁️/VSC Projects/Master_Thesis/data/raw/nvzfxcoxdvh1at7i.csv
Original number of observations: 317304
Number of columns after selection: 30
Observations after year filter (2000-2023): 302751
Observations after excluding financial and utility firms: 170598
Starting feature construction. Initial df shape: (170598, 30)
  Creating lags for: ['at', 'ni', 'rect', 'invt', 'ap', 'sale']

Performing pre-calculation validity checks & preparations...
  Missing 'xrd' values filled with 0.
  'ipo_year' created from 'ipodate'.

Constructing dependent variable...
  OCF_Scaled_t_plus_1 created.

Constructing Set A (OLS) predictors...
  Set A predictors constructed.

Constructing control dummy variables...
  Dummy variables constructed.

Constructing Set B (additional ML) predictors...
  Set B predictors constructed.

Selecting final model variables and dropping intermediate columns...
  Shape of DataFrame 

In [90]:
# Chronological train/test split on fiscal year with split_year=2018.
# Per the helper's own log output, predictor years <= 2018 go to training
# and predictor years > 2018 go to test.
train_df, test_df = processing.split_data_chronologically(
    df_final,
    'fyear',
    split_year=2018,
)

# Report the size and fiscal-year coverage of each split
print(
    f"Training data shape: {train_df.shape}",
    f"Test data shape: {test_df.shape}",
    f"Training period: {train_df['fyear'].min()} - {train_df['fyear'].max()}",
    f"Test period: {test_df['fyear'].min()} - {test_df['fyear'].max()}",
    sep="\n",
)


Training set: 102567 obs (Predictor years <= 2018)
Test set: 20882 obs (Predictor years > 2018)
Training data shape: (102567, 30)
Test data shape: (20882, 30)
Training period: 2001.0 - 2018.0
Test period: 2019.0 - 2022.0


In [91]:
# Assemble the design matrices and targets for the OLS regression.
# Both splits select from a single column list so they cannot drift apart.
feature_columns = lists.SET_A_FEATURES + lists.CONTROL_DUMMY_FEATURES
target_column = lists.DEPENDENT_VARIABLE  # plain string column name

X_train, y_train = train_df[feature_columns], train_df[target_column]
X_test, y_test = test_df[feature_columns], test_df[target_column]

# Numbered listing of the regressors, followed by the sample sizes
print("Features included in the model:")
for i, feature in enumerate(X_train.columns, start=1):
    print(f"{i:2d}. {feature}")

print(f"\nDependent variable: {lists.DEPENDENT_VARIABLE}")
print(f"Training observations: {len(X_train)}")
print(f"Test observations: {len(X_test)}")

Features included in the model:
 1. OCF_Scaled_Lag_t
 2. NI_Scaled_t
 3. Accruals_Scaled_t
 4. Delta_Rec_Scaled_t
 5. Delta_Inv_Scaled_t
 6. Delta_AP_Scaled_t
 7. DP_Scaled_t
 8. ln_at_t
 9. ASC606_dummy
10. ASC842_dummy
11. TCJA_dummy
12. COVID_dummy

Dependent variable: OCF_Scaled_t_plus_1
Training observations: 102567
Test observations: 20882


In [93]:
# =============================================================================
# SCIKIT-LEARN OLS - For ML Model Comparisons
# =============================================================================

from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

# Fit the baseline OLS model (`fit` returns the estimator itself)
ols_model = LinearRegression().fit(X_train, y_train)

# In-sample and out-of-sample predictions from the same fitted model
y_train_pred, y_test_pred = (
    ols_model.predict(features) for features in (X_train, X_test)
)

# Model-agnostic metrics printer, reusable for every model in the comparison
def print_model_performance(y_true, y_pred, model_name, dataset):
    """Print R², RMSE and MAE for one model on one data split.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label for the model, shown at the start of the line.
        dataset: Label for the split (e.g. "Train" or "Test").

    Returns:
        None. The metrics are only printed, formatted to four decimals.
    """
    r2 = r2_score(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)  # RMSE is in the units of the target
    mae = mean_absolute_error(y_true, y_pred)
    report = f"{model_name} ({dataset}): R²={r2:.4f} | RMSE={rmse:.4f} | MAE={mae:.4f}"
    print(report)

# Report baseline performance on both splits, train first
print("BASELINE OLS PERFORMANCE:")
for split_label, observed, predicted in (
    ("Train", y_train, y_train_pred),
    ("Test", y_test, y_test_pred),
):
    print_model_performance(observed, predicted, "OLS", split_label)

BASELINE OLS PERFORMANCE:
OLS (Train): R²=0.6372 | RMSE=0.6383 | MAE=0.2176
OLS (Test): R²=0.6072 | RMSE=0.6112 | MAE=0.2260


In [94]:
# =============================================================================
# STATSMODELS REGRESSION TABLE - For Thesis Appendix
# =============================================================================

import statsmodels.api as sm
import pandas as pd

# Fit statsmodels OLS for the detailed regression table.
# statsmodels does not add an intercept by itself, hence add_constant.
import warnings

X_train_sm = sm.add_constant(X_train)
regression_model = sm.OLS(y_train, X_train_sm).fit()

# Surface rank deficiency explicitly. If the fitted model's degrees of freedom
# are lower than the number of regressors, some columns are collinear or have
# no variation in the training sample, and their coefficients are not
# identified.
# NOTE(review): a likely cause is control dummies that never switch on inside
# the training window - confirm by inspecting their training-sample variation.
n_regressors = X_train.shape[1]
if regression_model.df_model < n_regressors:
    warnings.warn(
        f"Design matrix is rank-deficient: Df Model = {regression_model.df_model:.0f} "
        f"for {n_regressors} regressors; coefficients of collinear or constant "
        "columns are not identified."
    )

# Build the summary once and reuse it for both renderings
regression_summary = regression_model.summary()

# Print publication-ready summary
print("REGRESSION TABLE FOR THESIS APPENDIX:")
print("="*80)
print(regression_summary)

# LaTeX table for appendix
print("\nLATEX CODE FOR APPENDIX:")
print("="*80)
print(regression_summary.as_latex())

REGRESSION TABLE FOR THESIS APPENDIX:
                             OLS Regression Results                            
Dep. Variable:     OCF_Scaled_t_plus_1   R-squared:                       0.637
Model:                             OLS   Adj. R-squared:                  0.637
Method:                  Least Squares   F-statistic:                 2.002e+04
Date:                 Wed, 28 May 2025   Prob (F-statistic):               0.00
Time:                         19:31:17   Log-Likelihood:                -99485.
No. Observations:               102567   AIC:                         1.990e+05
Df Residuals:                   102557   BIC:                         1.991e+05
Df Model:                            9                                         
Covariance Type:             nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------

  return np.sqrt(eigvals[0]/eigvals[-1])
