In [1]:
import optuna
import pandas as pd
import xgboost as xgb
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from xgboost.callback import EarlyStopping

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv(filepath_or_buffer="../data/crop_yield_train.csv")
df = df[["soil_moisture", "total_rainfall", "fertilizer_amount", "sunlight_hours", "field_id", "yield_tpha"]]

In [3]:
df

Unnamed: 0,soil_moisture,total_rainfall,fertilizer_amount,sunlight_hours,field_id,yield_tpha
0,37.192725,145.627849,124.830027,1911.116978,F0138,3.790277
1,23.715684,599.005355,120.168428,2011.488102,F0393,5.660778
2,20.481400,333.247698,270.799112,1929.725597,F0066,7.098251
3,42.446504,523.610747,99.013588,2231.228584,F0150,5.461535
4,16.614817,1005.931705,169.955045,2826.831668,F0498,6.336988
...,...,...,...,...,...,...
4795,17.025059,878.536336,158.700801,1285.428969,F0223,6.546929
4796,16.138273,273.478409,151.216406,1333.150567,F0091,7.003236
4797,38.105084,575.764272,184.379434,2009.721557,F0164,5.834223
4798,10.438546,946.630937,109.656487,1155.118005,F0391,4.761456


In [4]:
def encodecategorical(data):
    
    '''
    Input
    data: Pandas data frame
    
    Output
    1. categorical value map keys
    2. Updated data using map

    '''
    # get all the columns with object data type
    allcols = data.columns.to_list()
    objectcols = []
    allunq_mapper = []
    for col in allcols:
        if data[col].dtype=="O":
            objectcols.append(col)

    # count the total unique object value and convert them to numeric
    # map data to df
    for col in objectcols:
        unq_mapper = {unq:unq_id+1 for unq_id, unq in enumerate(data[col].unique())}
        data[col] = data[col].map(unq_mapper)
        allunq_mapper.append(unq_mapper)

    return (allunq_mapper, data)

In [5]:
df_mapper, df_update = encodecategorical(df)

In [6]:
df_update

Unnamed: 0,soil_moisture,total_rainfall,fertilizer_amount,sunlight_hours,field_id,yield_tpha
0,37.192725,145.627849,124.830027,1911.116978,1,3.790277
1,23.715684,599.005355,120.168428,2011.488102,2,5.660778
2,20.481400,333.247698,270.799112,1929.725597,3,7.098251
3,42.446504,523.610747,99.013588,2231.228584,4,5.461535
4,16.614817,1005.931705,169.955045,2826.831668,5,6.336988
...,...,...,...,...,...,...
4795,17.025059,878.536336,158.700801,1285.428969,210,6.546929
4796,16.138273,273.478409,151.216406,1333.150567,243,7.003236
4797,38.105084,575.764272,184.379434,2009.721557,360,5.834223
4798,10.438546,946.630937,109.656487,1155.118005,180,4.761456


In [7]:
x = df_update.iloc[:, :-1]
y = df_update.iloc[:, -1]

In [8]:
xtrainval, xeval, ytrainval, yeval = train_test_split(x, y, train_size=0.8, random_state=42)

In [9]:
xtrainval.shape, ytrainval.shape, xeval.shape, yeval.shape

((3840, 5), (3840,), (960, 5), (960,))

In [10]:
data = xtrainval
target = ytrainval

In [16]:
def objective(trial, data=data, target=target):

    # split data and target into train and validation
    xtrain, xval, ytrain, yval = train_test_split(data, target, test_size = 0.4, random_state=42)

    # Dmatrix
    dtrain = xgb.DMatrix(xtrain, ytrain)
    dval = xgb.DMatrix(xval, yval)
    
    # all parameters
    param = {'objective': "reg:squarederror",
             "tree_method": "hist",
            'learning_rate': trial.suggest_categorical('learning_rate', [0.008,0.009,0.01,0.012,0.014,0.016,0.018, 0.02]),
            'max_depth': trial.suggest_int("max_depth", 3, 20),
             "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
             "gamma": trial.suggest_float("gamma", 0, 5),
             "subsample": trial.suggest_float("subsample", 0.6, 1.0),
             "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
             "reg_alpha": trial.suggest_float("alpha", 0, 10),
             "reg_lambda": trial.suggest_float("lambda", 0, 10),
            'seed': 42,
            }

    # model
    model = xgb.train(params=param, dtrain=dtrain, num_boost_round=1000, evals=[(dval, "val")], early_stopping_rounds=20, verbose_eval=False)
    preds = model.predict(dval)
    rmse = root_mean_squared_error(yval, preds)
    return rmse

In [17]:
study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=10)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2025-11-30 23:06:24,649] A new study created in memory with name: no-name-8e3426f5-d74d-4be0-8553-1c1ef5fa5cad
[I 2025-11-30 23:06:25,623] Trial 0 finished with value: 0.7174282391600432 and parameters: {'learning_rate': 0.009, 'max_depth': 13, 'min_child_weight': 15, 'gamma': 0.10292247147901223, 'subsample': 0.9879639408647978, 'colsample_bytree': 0.9329770563201687, 'alpha': 2.1233911067827616, 'lambda': 1.8182496720710062}. Best is trial 0 with value: 0.7174282391600432.
[I 2025-11-30 23:06:26,073] Trial 1 finished with value: 0.7062299597962516 and parameters: {'learning_rate': 0.016, 'max_depth': 9, 'min_child_weight': 10, 'gamma': 3.925879806965068, 'subsample': 0.6798695128633439, 'colsample_bytree': 0.8056937753654446, 'alpha': 5.924145688620425, 'lambda': 0.46450412719997725}. Best is trial 1 with value: 0.7062299597962516.
[I 2025-11-30 23:06:26,868] Trial 2 finished with value: 0.7048912423024576 and parameters: {'learning_rate': 0.014, 'max_depth': 15, 'min_child_weight

Number of finished trials: 10
Best trial: {'learning_rate': 0.009, 'max_depth': 3, 'min_child_weight': 11, 'gamma': 2.087055015743895, 'subsample': 0.6888431241882921, 'colsample_bytree': 0.6479461469334731, 'alpha': 3.3761517140362796, 'lambda': 9.429097039125192}


In [14]:
study.best_trial.params

{'learning_rate': 0.009,
 'max_depth': 3,
 'min_child_weight': 11,
 'gamma': 2.087055015743895,
 'subsample': 0.6888431241882921,
 'colsample_bytree': 0.6479461469334731,
 'alpha': 3.3761517140362796,
 'lambda': 9.429097039125192}

In [98]:
study.best_trial

FrozenTrial(number=8, state=<TrialState.COMPLETE: 1>, values=[0.7033982135936252], datetime_start=datetime.datetime(2025, 11, 30, 19, 53, 21, 372462), datetime_complete=datetime.datetime(2025, 11, 30, 19, 53, 22, 202437), params={'learning_rate': 0.009, 'max_depth': 3, 'min_child_weight': 11, 'gamma': 2.087055015743895, 'subsample': 0.6888431241882921, 'colsample_bytree': 0.6479461469334731, 'alpha': 3.3761517140362796, 'lambda': 9.429097039125192}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'learning_rate': CategoricalDistribution(choices=(0.008, 0.009, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02)), 'max_depth': IntDistribution(high=20, log=False, low=3, step=1), 'min_child_weight': IntDistribution(high=20, log=False, low=1, step=1), 'gamma': FloatDistribution(high=5.0, log=False, low=0.0, step=None), 'subsample': FloatDistribution(high=1.0, log=False, low=0.6, step=None), 'colsample_bytree': FloatDistribution(high=1.0, log=False, low=0.6, step=None), 'alpha': F

In [26]:
time_format = "%Y-%m-%d %H:%M:%S"
st_ = study.best_trial.datetime_start.strftime(format=time_format)
et_ = study.best_trial.datetime_complete.strftime(format=time_format)
optuna_start_time_dt = datetime.strptime(st_, time_format)
optuna_end_time_dt = datetime.strptime(et_, time_format)

In [27]:
f"xgb_base_model_{et_.replace(':', '-')}"

'xgb_base_model_2025-11-30 23-06-30'

In [122]:
st_, et_

('2025-11-30 19:53:21', '2025-11-30 19:53:22')

In [121]:
(optuna_end_time_dt - optuna_start_time_dt).total_seconds()

1.0

In [99]:
study.best_trial.values

[0.7033982135936252]

In [117]:
st_ = study.best_trial.datetime_start.strftime(format=time_format)
st_

NameError: name 'time_format' is not defined

In [115]:
et_ = study.best_trial.datetime_complete.strftime(format=time_format)
et_

'2025-11-30 19:53:22'

In [116]:
et_ - st_

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [18]:
best_p = study.best_trial.params
best_p

{'learning_rate': 0.009,
 'max_depth': 3,
 'min_child_weight': 11,
 'gamma': 2.087055015743895,
 'subsample': 0.6888431241882921,
 'colsample_bytree': 0.6479461469334731,
 'alpha': 3.3761517140362796,
 'lambda': 9.429097039125192}

In [19]:
xtrain, xval, ytrain, yval = train_test_split(data, target, test_size = 0.4, random_state=42)
dtrain = xgb.DMatrix(xtrain, ytrain)
dval = xgb.DMatrix(xval, yval)
param = best_p.copy()
param.update({"objective": "reg:squarederror", "seed": 42, "tree_method": "hist"})
param

{'learning_rate': 0.009,
 'max_depth': 3,
 'min_child_weight': 11,
 'gamma': 2.087055015743895,
 'subsample': 0.6888431241882921,
 'colsample_bytree': 0.6479461469334731,
 'alpha': 3.3761517140362796,
 'lambda': 9.429097039125192,
 'objective': 'reg:squarederror',
 'seed': 42,
 'tree_method': 'hist'}

In [37]:
model_xgb = xgb.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=1000,
    evals=[(dval, "val")],
    early_stopping_rounds=20,
    verbose_eval=False
)

In [38]:
model_xgb.save_model("xgb.json")

In [39]:
pred_model_xgb = model_xgb.predict(dval)

In [40]:
pred_model_xgb

array([7.612388 , 6.635779 , 4.790819 , ..., 6.1285224, 7.209784 ,
       5.1529393], shape=(1536,), dtype=float32)

In [41]:
pred_model_xgb.tolist()

[7.6123881340026855,
 6.635778903961182,
 4.79081916809082,
 6.691402912139893,
 6.4496283531188965,
 7.539228439331055,
 4.488142490386963,
 6.667664527893066,
 7.259749889373779,
 6.79990291595459,
 6.597284317016602,
 5.195976257324219,
 6.7155046463012695,
 4.911404132843018,
 5.510222434997559,
 5.885545253753662,
 5.831955432891846,
 6.426097869873047,
 7.379449367523193,
 6.668776512145996,
 5.489181041717529,
 6.771294593811035,
 5.274528980255127,
 5.475515842437744,
 7.122891902923584,
 5.508407115936279,
 6.306341648101807,
 5.417660713195801,
 6.538536071777344,
 4.799254417419434,
 6.0505452156066895,
 6.787131309509277,
 4.927353382110596,
 5.595654010772705,
 7.018255233764648,
 6.005708694458008,
 5.08219051361084,
 7.2934370040893555,
 7.351672649383545,
 6.040790557861328,
 6.447099208831787,
 7.099109172821045,
 6.935304164886475,
 7.12477445602417,
 6.326260566711426,
 7.046944618225098,
 4.77875280380249,
 5.638068675994873,
 6.715355396270752,
 7.243086814880371,


In [109]:
root_mean_squared_error(yval, pred_model_xgb)

0.7033982135936252

In [110]:
dtest = xgb.DMatrix(xeval, label=None)

In [111]:
pred_test = model_xgb.predict(dtest)

In [112]:
root_mean_squared_error(yeval, pred_test)

0.7425669097016274

In [33]:
import pandas as pd
import re

In [34]:
pattern = (
    r"^(.*?) - INFO - Optuna Training Summary \| Start: (.*?) \| End: (.*?) "
    r"\| Duration: (.*?) seconds \| OPTUNA RMSE: (.*?) \| TEST RMSE: (.*?) "
    r"\| Model: (.*)$"
)

records = []

with open("../train/training logs/train.log", "r") as f:
    for line in f:
        match = re.match(pattern, line.strip())
        if match:
            records.append(match.groups())

df = pd.DataFrame(records, columns=[
    "log_time",
    "optuna_start",
    "optuna_end",
    "duration_sec",
    "optuna_rmse",
    "test_rmse",
    "model_name"
])

# Convert numerical fields
df["duration_sec"] = df["duration_sec"].astype(float)
df["optuna_rmse"] = df["optuna_rmse"].astype(float)
df["test_rmse"] = df["test_rmse"].astype(float)

In [35]:
df

Unnamed: 0,log_time,optuna_start,optuna_end,duration_sec,optuna_rmse,test_rmse,model_name
0,"2025-12-01 01:17:26,138",2025-12-01 01:17:21,2025-12-01 01:17:22,1.0,0.703,0.7425,xgb_base_model_2025-12-01 01-17-22
1,"2025-12-01 01:20:30,830",2025-12-01 01:20:27,2025-12-01 01:20:28,1.0,0.7026,0.7415,xgb_base_model_2025-12-01 01-20-28
2,"2025-12-01 02:20:07,706",2025-12-01 02:19:35,2025-12-01 02:19:35,0.0,0.7015,0.7415,xgb_base_model_2025-12-01 02-19-35


In [None]:
2025-12-01 01:17:26,138 - INFO - Optuna Training Summary | Start: 2025-12-01 01:17:21 | End: 2025-12-01 01:17:22 | Duration: 1.00 seconds | OPTUNA RMSE: 0.7030 | TEST RMSE: 0.7425 | Model: xgb_base_model_2025-12-01 01-17-22
2025-12-01 01:20:30,830 - INFO - Optuna Training Summary | Start: 2025-12-01 01:20:27 | End: 2025-12-01 01:20:28 | Duration: 1.00 seconds | OPTUNA RMSE: 0.7026 | TEST RMSE: 0.7415 | Model: xgb_base_model_2025-12-01 01-20-28
2025-12-01 02:20:07,706 - INFO - Optuna Training Summary | Start: 2025-12-01 02:19:35 | End: 2025-12-01 02:19:35 | Duration: 0.00 seconds | OPTUNA RMSE: 0.7015 | TEST RMSE: 0.7415 | Model: xgb_base_model_2025-12-01 02-19-35
