In [1]:
%pip install pandas numpy matplotlib seaborn scikit-learn scipy

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, precision_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

Defaulting to user installation because normal site-packages is not writeable
Collecting matplotlib
  Downloading matplotlib-3.10.3-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp313-cp313-win_amd64.whl.metadata (15 kB)
Collecting scipy
  Downloading scipy-1.15.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.2-cp313-cp313-win_amd64.whl.metadata (5.5 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.58.0-cp313-cp313-win_amd64.whl.metadata (106 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.8-cp313-cp313-win_amd64.whl.metadata (6.3 kB)
Collecting pillow>=8 (from matplotlib)
  Downloading pillow-11.2.1-cp313-cp313-win_a

In [2]:
# Read the greenhouse measurements CSV; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='basil_greenhouse_real_world_simulated.csv')


In [3]:
# Model inputs: the four sensor columns. Target: the `delta_soil_humidity_per_hour` column.
X = df.loc[:, ['temperature_C', 'humidity_air_percent', 'light_lux', 'soil_humidity_percent']]
y = df.loc[:, 'delta_soil_humidity_per_hour']

In [4]:
# Two-stage split: hold out 20% of the rows as the test set, then take 25% of the
# remaining 80% as the validation set (roughly 60/20/20 train/validation/test).
# Both splits are seeded so the partition is repeatable.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval,
    test_size=0.25,
    random_state=42,
)

In [5]:
# Baseline random forest with hand-picked hyperparameters, fitted on the training split.
# max_features=None considers every feature at each split. The earlier value of 10 was
# larger than the 4 columns in X, so it could only ever mean "all features" and is not
# a valid setting on every scikit-learn release; None states that intent explicitly.
forest = RandomForestRegressor(n_estimators=200, max_depth=10, min_samples_split=2,
                                max_features=None, random_state=42)
forest.fit(X_train, y_train)

In [6]:
# Evaluate the baseline forest on the two held-out splits.
# Each split is predicted once and all three metrics are derived from those stored
# predictions. For a regressor, `forest.score(X, y)` is the R2 of `forest.predict(X)`
# against `y`, so `r2_score` on the stored predictions yields the same number without
# a second prediction pass, and matches how the tuned models are scored later on.

# Validation set
y_val_pred = forest.predict(X_val)
val_r2 = r2_score(y_val, y_val_pred)
val_mse = mean_squared_error(y_val, y_val_pred)
val_mae = mean_absolute_error(y_val, y_val_pred)

print(f"Validation R2 score: {val_r2:.4f}")
print(f"Validation MSE: {val_mse:.4f}")
print(f"Validation MAE: {val_mae:.4f}")

# Test set
y_test_pred = forest.predict(X_test)
test_r2 = r2_score(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print(f"Test R2 score: {test_r2:.4f}")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test MAE: {test_mae:.4f}")


Validation R2 score: 0.5259
Validation MSE: 8.5463
Validation MAE: 0.3659
Test R2 score: 0.6278
Test MSE: 3.3768
Test MAE: 0.2369


In [7]:
# Sampling distributions for the random-forest hyperparameter search.
# scipy's randint(low, high) draws integers from low up to, but not including, high.
param_dist = dict(
    n_estimators=randint(100, 500),        # how many trees to grow
    max_depth=randint(4, 20),              # depth limit for each tree
    min_samples_split=randint(2, 10),      # samples needed before a node may split
    min_samples_leaf=randint(1, 10),       # samples required in every leaf
    max_features=['sqrt', 'log2', None],   # features considered at each split
)

# Unfitted, seeded estimator that the search configures with each sampled setting.
rf = RandomForestRegressor(random_state=42)


In [8]:
# Randomized hyperparameter search for the random forest:
# 50 sampled configurations, each scored by 5-fold cross-validated R2.
random_search = RandomizedSearchCV(
    rf,
    param_dist,
    n_iter=50,
    scoring='r2',
    cv=5,
    random_state=42,   # fixes which configurations get sampled
    n_jobs=-1,         # run the fits on all available cores
    verbose=2,         # log progress for the fits
)

# Only the training split is searched; the validation and test rows are not used here.
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [9]:
# Search summary: the winning configuration and its mean cross-validated R2.
print("Best Params:", random_search.best_params_)
print("Best CV R2 Score:", random_search.best_score_)

# Model refitted by the search with the winning configuration.
best_model = random_search.best_estimator_

# Predict both held-out splits first, then derive the metrics from those predictions.
y_val_pred = best_model.predict(X_val)
y_test_pred = best_model.predict(X_test)

val_r2, val_mse, val_mae = (
    r2_score(y_val, y_val_pred),
    mean_squared_error(y_val, y_val_pred),
    mean_absolute_error(y_val, y_val_pred),
)
test_r2, test_mse, test_mae = (
    r2_score(y_test, y_test_pred),
    mean_squared_error(y_test, y_test_pred),
    mean_absolute_error(y_test, y_test_pred),
)

# Report: validation split first, then test split.
print(f"Validation R2 score: {val_r2:.4f}")
print(f"Validation MSE: {val_mse:.4f}")
print(f"Validation MAE: {val_mae:.4f}")

print(f"Test R2 score: {test_r2:.4f}")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test MAE: {test_mae:.4f}")

Best Params: {'max_depth': 8, 'max_features': None, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 104}
Best CV R2 Score: 0.9571095925762609
Validation R2 score: 0.5264
Validation MSE: 8.5375
Validation MAE: 0.3651
Test R2 score: 0.6345
Test MSE: 3.3158
Test MAE: 0.2360


In [10]:
%pip install xgboost

from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from scipy.stats import randint, uniform

# Hyperparameter search space for XGBoost.
# Kept under its own name so it does not overwrite `param_dist`, which the
# random-forest search reads; sharing one name meant re-running that search after
# this cell would feed it XGBoost-only parameters.
# scipy's uniform(loc, scale) samples from [loc, loc + scale].
xgb_param_dist = {
    'n_estimators': randint(100, 500),         # Number of boosting rounds
    'max_depth': randint(3, 10),               # Maximum tree depth
    'learning_rate': uniform(0.01, 0.3),       # Learning rate (eta), in [0.01, 0.31]
    'subsample': uniform(0.7, 0.3),            # Fraction of samples per tree, in [0.7, 1.0]
    'colsample_bytree': uniform(0.7, 0.3),     # Fraction of features per tree, in [0.7, 1.0]
    'min_child_weight': randint(1, 10),        # Minimum child weight for a split
    'gamma': uniform(0, 0.5)                   # Minimum loss reduction to make a split
}

# Initialize XGBoost Regressor
# NOTE(review): both the estimator and the search below request all cores
# (n_jobs=-1); nested parallelism like this can cause thread contention - confirm
# whether one of the two should be limited.
xgb = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1)

# RandomizedSearchCV for hyperparameter tuning
random_search_xgb = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=xgb_param_dist,
    n_iter=50,                # Number of random parameter combinations to try
    cv=5,                     # 5-fold cross-validation
    scoring='r2',             # Optimize R2 score
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Run the randomized search on the training split only.
random_search_xgb.fit(X_train, y_train)

# Search summary: the winning configuration and its mean cross-validated R2.
print("Best Params:", random_search_xgb.best_params_)
print("Best CV R2 Score:", random_search_xgb.best_score_)

# Model refitted by the search with the winning configuration.
best_xgb = random_search_xgb.best_estimator_

# Predict both held-out splits first, then derive the metrics from those predictions.
y_val_pred = best_xgb.predict(X_val)
y_test_pred = best_xgb.predict(X_test)

val_r2, val_mse, val_mae = (
    r2_score(y_val, y_val_pred),
    mean_squared_error(y_val, y_val_pred),
    mean_absolute_error(y_val, y_val_pred),
)
test_r2, test_mse, test_mae = (
    r2_score(y_test, y_test_pred),
    mean_squared_error(y_test, y_test_pred),
    mean_absolute_error(y_test, y_test_pred),
)

# Report: validation split first, then test split.
print(f"Validation R2 score: {val_r2:.4f}")
print(f"Validation MSE: {val_mse:.4f}")
print(f"Validation MAE: {val_mae:.4f}")

print(f"Test R2 score: {test_r2:.4f}")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test MAE: {test_mae:.4f}")

import joblib
from pathlib import Path

# Persist the tuned XGBoost model under ./models (relative to the working directory).
model_dir = Path("./models")
model_path = model_dir.joinpath("best_xgb.joblib")

# Create the folder on first use; an existing folder is left untouched.
model_dir.mkdir(exist_ok=True)

# Serialize `best_xgb` with joblib and report where it was written.
joblib.dump(best_xgb, model_path)
print(f"Saved best model to {model_path}")

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Using cached xgboost-3.0.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Using cached xgboost-3.0.1-py3-none-win_amd64.whl (150.0 MB)
Installing collected packages: xgboost
Successfully installed xgboost-3.0.1
Note: you may need to restart the kernel to use updated packages.
Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best Params: {'colsample_bytree': np.float64(0.74442607898602), 'gamma': np.float64(0.49887024252447093), 'learning_rate': np.float64(0.09003430428258549), 'max_depth': 4, 'min_child_weight': 2, 'n_estimators': 319, 'subsample': np.float64(0.7154436253749967)}
Best CV R2 Score: 0.8609084260975445
Validation R2 score: 0.6061
Validation MSE: 7.0993
Validation MAE: 0.6208
Test R2 score: 0.7009
Test MSE: 2.7139
Test MAE: 0.5387
Saved best model to models\best_xgb.joblib


In [11]:
# Single hand-written observation, using the same column names and order as the
# training features.
example_input = pd.DataFrame([
    {
        'temperature_C': 30.0,            # Temperature in Celsius
        'humidity_air_percent': 50.0,     # Air humidity in %
        'light_lux': 400.0,               # Light intensity in lux
        'soil_humidity_percent': 50.0,    # Current soil humidity in %
    }
])

# The models in the cells shown here are fitted on unscaled features, so the raw
# values go straight in. If a scaler is ever fitted on X before training, transform
# this frame with that same fitted scaler first, e.g.:
# example_input_scaled = scaler.transform(example_input)

# Predict with the tuned XGBoost model; [0] extracts the single prediction.
predicted_delta = best_xgb.predict(example_input)[0]

print(f"Predicted delta soil humidity per hour: {predicted_delta:.4f}")

Predicted delta soil humidity per hour: 0.4720
