In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the table written by the EDA stage (path is relative to this notebook).
df = pd.read_csv(
    filepath_or_buffer="../Biomass_data/processed/after_eda.csv",
)

In [3]:
# Preview the first five rows to confirm the columns used below are present.
df.head(n=5)

Unnamed: 0,sample_id,image_path,Sampling_Date,State,Species,Pre_GSHH_NDVI,Height_Ave_cm,target_name,target
0,ID1011485656__Dry_Clover_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Clover_g,0.0
1,ID1011485656__Dry_Dead_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Dead_g,31.9984
2,ID1011485656__Dry_Green_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Green_g,16.2751
3,ID1011485656__Dry_Total_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,Dry_Total_g,48.2735
4,ID1011485656__GDM_g,train/ID1011485656.jpg,2015/9/4,Tas,Ryegrass_Clover,0.62,4.6667,GDM_g,16.275


In [4]:
# Columns handled as categoricals by the model.
cat_cols = ['State', 'Species', 'target_name']

# Model inputs: the two numeric measurements followed by the categorical columns.
features = ['Pre_GSHH_NDVI', 'Height_Ave_cm', *cat_cols]

# Cast every categorical column to pandas' "category" dtype in one call.
# This runs before the train/test split, so both splits share the same categories.
df = df.astype({col: "category" for col in cat_cols})

## Splitting Data

In [5]:
from sklearn.model_selection import train_test_split
import numpy as np

In [6]:
# Split on distinct images rather than on rows: the frame holds several rows
# per image_path (one per target_name), and the split is done on the images.
unique_imgs = df.loc[:, 'image_path'].unique()

# 80/20 hold-out with a fixed seed so the partition is reproducible.
train_imgs, test_imgs = train_test_split(unique_imgs, random_state=42, test_size=0.2)

In [7]:
# Materialise the row-level splits from the image-level assignment.
# All rows of an image land in the same split; indices are renumbered from 0.
train_df = df.loc[df['image_path'].isin(train_imgs)].reset_index(drop=True)
test_df = df.loc[df['image_path'].isin(test_imgs)].reset_index(drop=True)

In [8]:
# Separate the model inputs (feature columns) from the regression target.
X_train, y_train = train_df[features], train_df['target']
X_test, y_test = test_df[features], test_df['target']

## XGBoost Training

In [9]:
from xgboost import XGBRegressor

In [10]:
# Regressor configuration. enable_categorical is switched on because the
# feature set includes columns cast to the pandas "category" dtype.
xgb_model = XGBRegressor(
    tree_method="hist",
    enable_categorical=True,
    max_depth=6,
    learning_rate=0.05,
    n_estimators=1000,
    random_state=42,  # fixed seed for repeatable fits
    n_jobs=-1,
)

In [11]:
# Fit on the full training split; this fitted instance is the one used later
# for the hold-out predictions.
print("XGBoost model training...")
xgb_model.fit(X=X_train, y=y_train)

XGBoost model training...


0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,True


In [12]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_validate, GroupKFold

In [13]:
def evaluate_model_cv(scores):
    """Print the fold-averaged timings and metrics of a cross-validation run.

    Parameters
    ----------
    scores : mapping of str -> array-like
        Must provide the keys ``fit_time``, ``score_time``,
        ``test_neg_root_mean_squared_error``, ``test_neg_mean_absolute_error``,
        ``test_neg_mean_squared_error`` and ``test_r2``; each value is
        averaged with ``np.mean``.

    Returns
    -------
    None
        The report is written to stdout only.
    """
    # (label, key in `scores`, flip sign?) - the "neg_" metrics are stored
    # negated, so their mean is negated again before printing.
    rows = (
        ("Fit Time: ", "fit_time", False),
        ("Score Time: ", "score_time", False),
        ("RMSE:", "test_neg_root_mean_squared_error", True),
        ("MAE:", "test_neg_mean_absolute_error", True),
        ("MSE:", "test_neg_mean_squared_error", True),
        ("R-squared (R2):", "test_r2", False),
    )

    print("---------------------EVALUATION----------------------")
    for label, key, negate in rows:
        average = np.mean(scores[key])
        if negate:
            average = -average
        print(f"{label}{average}")

In [14]:
# Grouped 5-fold cross-validation on the training split. Folds are formed by
# image_path, the same key used for the hold-out split.
groups = train_df['image_path']
gkf = GroupKFold(n_splits=5)

# Each scorer is registered under its own name, so the result keys are
# "test_<scorer name>" - exactly what evaluate_model_cv looks up.
scoring = {
    name: name
    for name in (
        'neg_root_mean_squared_error',
        'neg_mean_absolute_error',
        'neg_mean_squared_error',
        'r2',
    )
}

scores = cross_validate(
    estimator=xgb_model,
    X=X_train,
    y=y_train,
    groups=groups,
    scoring=scoring,
    cv=gkf,
    return_train_score=False,
)

evaluate_model_cv(scores)

---------------------EVALUATION----------------------
Fit Time: 0.40343427658081055
Score Time: 0.006232643127441406
RMSE:11.782826032955702
MAE:7.959912847799534
MSE:142.16549102053366
R-squared (R2):0.7842623023955392


In [15]:
def evaluate_model(y_true, y_pred):
    """Print RMSE, MAE and R2 for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, passed to the metric functions together
        with ``y_true``.

    Returns
    -------
    None
        The three metrics are printed with four decimals.
    """
    # All metrics are computed before anything is printed.
    squared_error = mean_squared_error(y_true, y_pred)
    report = {
        "RMSE: ": np.sqrt(squared_error),
        "MAE:  ": mean_absolute_error(y_true, y_pred),
        "R2:   ": r2_score(y_true, y_pred),
    }

    print("---------------------TEST RESULTS----------------------")
    for label, value in report.items():
        print(f"{label}{value:.4f}")

In [16]:
# Predict the held-out rows with xgb_model and print the aggregate metrics.
y_pred = xgb_model.predict(X_test)

evaluate_model(y_true=y_test, y_pred=y_pred)

---------------------TEST RESULTS----------------------
RMSE: 12.5621
MAE:  8.0845
R2:   0.7910


In [None]:
# Per-target breakdown of the hold-out predictions: one R2 / RMSE row for each
# distinct value of target_name.
results_df = X_test.assign(Actual=y_test, Predicted=y_pred)

unique_targets = results_df['target_name'].unique()

print(f"{'Target Name':<20} | {'R2 Score':<10} | {'RMSE':<10}")
print("-" * 45)

for target in unique_targets:
    subset = results_df[results_df['target_name'] == target]

    r2 = r2_score(subset['Actual'], subset['Predicted'])
    rmse = np.sqrt(mean_squared_error(subset['Actual'], subset['Predicted']))

    # Pad R2 with the same width spec as the header column instead of a fixed
    # run of spaces, so a negative R2 (one character wider) stays aligned.
    print(f"{target:<20} | {r2:<10.4f} | {rmse:.4f}")

Target Name          | R2 Score   | RMSE      
---------------------------------------------
Dry_Clover_g         | 0.2859     | 7.6969
Dry_Dead_g           | 0.3384     | 10.4290
Dry_Green_g          | 0.7332     | 14.5318
Dry_Total_g          | 0.7216     | 16.0053
GDM_g                | 0.7997     | 12.3968
