In [4]:
import numpy as np
import os
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 1) POINT THIS TO THE FOLDER CONTAINING THE 6 NPZ FILES:
DATA_DIR = "./usa_soybeans_data"

# 2) HYPERPARAMETERS & REPRODUCIBILITY SETTINGS
N_TREES = 100
RANDOM_SEED = 0
N_JOBS = 1  # Force single-thread for more reproducible results


def _load_npz_array(file_name, key="data"):
    """Return the array stored under *key* in ``DATA_DIR/file_name``.

    ``np.load`` on an ``.npz`` archive returns a lazy ``NpzFile`` that keeps
    the underlying file open. Indexing it reads the array into memory, and
    the ``with`` block then closes the archive so no file handle is leaked.
    """
    with np.load(os.path.join(DATA_DIR, file_name)) as archive:
        return archive[key]


# Load each split from the pre-split NPZ files
train_X = _load_npz_array("train_hists.npz")
train_y = _load_npz_array("train_yields.npz")

val_X = _load_npz_array("dev_hists.npz")
val_y = _load_npz_array("dev_yields.npz")

test_X = _load_npz_array("test_hists.npz")
test_y = _load_npz_array("test_yields.npz")

print(f"Train set: {train_X.shape}, Validation set: {val_X.shape}, Test set: {test_X.shape}")

# Flatten 32x32x9 histograms -> 9216 features
# (one row per sample; every trailing axis is collapsed into a single one)
train_X_flat = train_X.reshape(train_X.shape[0], -1)
val_X_flat = val_X.reshape(val_X.shape[0], -1)
test_X_flat = test_X.reshape(test_X.shape[0], -1)

# 3) TRAIN RANDOM FOREST
rf = RandomForestRegressor(
    n_estimators=N_TREES,
    random_state=RANDOM_SEED,
    n_jobs=N_JOBS
)
print("Fitting RandomForestRegressor...")
rf.fit(train_X_flat, train_y)


def _evaluate(model, features, targets):
    """Predict *features* with *model* and score the result against *targets*.

    Returns a ``(predictions, rmse, r2)`` tuple.
    """
    predictions = model.predict(features)
    # `mean_squared_error(..., squared=False)` was deprecated in
    # scikit-learn 1.4 and removed in 1.6. Taking the square root of each
    # output's MSE and averaging gives the same RMSE on every version.
    per_output_mse = mean_squared_error(
        targets, predictions, multioutput="raw_values"
    )
    rmse = float(np.mean(np.sqrt(per_output_mse)))
    return predictions, rmse, r2_score(targets, predictions)


# 4) EVALUATE ON VALIDATION SET
val_preds, val_rmse, val_r2 = _evaluate(rf, val_X_flat, val_y)
print(f"[Validation] RMSE = {val_rmse:.3f} t/ha, R² = {val_r2:.3f}")

# 5) EVALUATE ON TEST SET
test_preds, test_rmse, test_r2 = _evaluate(rf, test_X_flat, test_y)
print(f"[Test] RMSE = {test_rmse:.3f} t/ha, R² = {test_r2:.3f}")


Train set: (5792, 32, 32, 9), Validation set: (1448, 32, 32, 9), Test set: (1809, 32, 32, 9)
Fitting RandomForestRegressor...
[Validation] RMSE = 0.356 t/ha, R² = 0.732
[Test] RMSE = 0.344 t/ha, R² = 0.746
