## Importing Libraries

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline

## Loading the Data

In [2]:
# Read the three input tables from CSV files in the working directory:
#   ds <- TB_ModelingdataV1.csv
#   da <- Yield_05-05-25.csv
#   db <- Yield_04-15-25.csv
# NOTE(review): the Yield_* file names look like sampling dates (MM-DD-YY) - confirm.
ds, da, db = (
    pd.read_csv(csv_name)
    for csv_name in ("TB_ModelingdataV1.csv", "Yield_05-05-25.csv", "Yield_04-15-25.csv")
)

In [3]:
# Remove the rows whose Experiment is "SHTFG" from ds.
# Comment this cell out to keep the SHTFG rows in the modeling data.
not_shtfg = ds["Experiment"] != "SHTFG"
ds = ds[not_shtfg]

In [4]:
# Give ds the column names shared by the other tables so that the later
# inner-join concat lines the columns up. rename() leaves any name that is
# not present in the frame untouched.
ds = ds.rename(
    {
        "ndvi_mean": "NDVI_mean",
        "gndvi_mean": "GNDVI_mean",
        "savi_mean": "SAVI_mean",
        "msavi_mean": "MSAVI_mean",
        "PT Height (mm)": "MeanHeight(mm)",
        "Total Biomass (kg/ha)": "Biomass(kg/ha)",
    },
    axis="columns",
)

In [5]:
# Rename db's height and biomass columns to the common names
# ("MeanHeight(mm)", "Biomass(kg/ha)") used when the tables are combined.
db = db.rename(
    {
        "PT_Height(mm)": "MeanHeight(mm)",
        "Total Biomass (kg/ha)": "Biomass(kg/ha)",
    },
    axis="columns",
)

In [6]:
# Rename da's height and biomass columns to the common names
# ("MeanHeight(mm)", "Biomass(kg/ha)") used when the tables are combined.
da = da.rename(
    {
        "PT_Height(mm)": "MeanHeight(mm)",
        "Total Biomass (kg/ha)": "Biomass(kg/ha)",
    },
    axis="columns",
)

In [7]:
# Stack the three tables row-wise. join='inner' keeps only the columns that
# are present in all three frames; a column missing from any one of them is
# dropped without warning.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each source keeps its own row labels, so the same label appears on several
# rows of df and label-based lookups or index-aligned assignments become
# ambiguous.
df = pd.concat([da, db, ds], join='inner', ignore_index=True)

df


Unnamed: 0,Experiment,Date,Plot,Strip,MeanHeight(mm),NDVI_mean,GNDVI_mean,SAVI_mean,MSAVI_mean,Biomass(kg/ha)
0,TB,2025-05-05,14.1,14.0,227.62,0.692,0.632,1.038,0.818,1549.29
1,TB,2025-05-05,14.1,15.0,223.68,0.700,0.642,1.050,0.824,1591.26
2,TB,2025-05-05,14.1,16.0,176.82,0.708,0.652,1.062,0.829,1455.70
3,TB,2025-05-05,14.1,17.0,192.36,0.708,0.657,1.062,0.829,2020.84
4,TB,2025-05-05,14.1,18.0,209.77,0.713,0.653,1.070,0.833,1759.82
...,...,...,...,...,...,...,...,...,...,...
295,Testbed,11/1/2024,28.4,18.0,34.88,0.440,0.460,0.670,0.610,1663.27
296,Testbed,11/1/2024,28.4,19.0,40.84,0.440,0.460,0.650,0.610,1356.82
297,Testbed,11/1/2024,28.4,20.0,41.15,0.440,0.460,0.650,0.610,1431.21
298,Testbed,11/1/2024,28.4,21.0,44.63,0.420,0.460,0.630,0.590,1476.08


## Parsing the Date Column and Deriving Day of Year

In [10]:
# Parse 'Date' into datetime values. The combined table holds dates in more
# than one text layout (e.g. "2025-05-05" and "11/1/2024"), so format='mixed'
# lets pandas infer the layout of each value individually.
date_values = pd.to_datetime(df["Date"], format="mixed")
df["Date"] = date_values

# 'JulianDate' stores the ordinal day of the year of each parsed date.
# NOTE(review): day-of-year restarts every January, so 2025-05-05 maps to 125
# while the earlier 2024-11-01 maps to 306 - confirm this ordering is what the
# model should see.
df["JulianDate"] = date_values.dt.dayofyear

df

Unnamed: 0,Experiment,Date,Plot,Strip,MeanHeight(mm),NDVI_mean,GNDVI_mean,SAVI_mean,MSAVI_mean,Biomass(kg/ha),JulianDate
0,TB,2025-05-05,14.1,14.0,227.62,0.692,0.632,1.038,0.818,1549.29,125
1,TB,2025-05-05,14.1,15.0,223.68,0.700,0.642,1.050,0.824,1591.26,125
2,TB,2025-05-05,14.1,16.0,176.82,0.708,0.652,1.062,0.829,1455.70,125
3,TB,2025-05-05,14.1,17.0,192.36,0.708,0.657,1.062,0.829,2020.84,125
4,TB,2025-05-05,14.1,18.0,209.77,0.713,0.653,1.070,0.833,1759.82,125
...,...,...,...,...,...,...,...,...,...,...,...
295,Testbed,2024-11-01,28.4,18.0,34.88,0.440,0.460,0.670,0.610,1663.27,306
296,Testbed,2024-11-01,28.4,19.0,40.84,0.440,0.460,0.650,0.610,1356.82,306
297,Testbed,2024-11-01,28.4,20.0,41.15,0.440,0.460,0.650,0.610,1431.21,306
298,Testbed,2024-11-01,28.4,21.0,44.63,0.420,0.460,0.630,0.590,1476.08,306


## Train, Validate and Test the Model

In [24]:
# Predictor columns and the response column used by the biomass model.
features = ["MeanHeight(mm)", "NDVI_mean", "GNDVI_mean", "SAVI_mean", "JulianDate"]
target = 'Biomass(kg/ha)'

# Keep only the rows that have a value for every feature and for the target.
# Ridge cannot be fitted on NaN inputs, so incomplete rows are removed here;
# the count is reported so that any loss of data is visible. On a table with
# no missing values this step changes nothing.
model_rows = df.dropna(subset=features + [target])
n_dropped = len(df) - len(model_rows)
if n_dropped:
    print(f"Dropped {n_dropped} rows with missing feature/target values")

X = model_rows[features]
y = model_rows[target]

# Hold out 25% of the rows as the test set; random_state fixes the shuffle so
# the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Preprocessing pipeline - include every preprocessing step here so the same
# fitted transformations can be applied to new data.
preprocessor = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Keep the *_preprocessed variable names used below.
# The scaler is fitted on the training split only and then reused, unchanged,
# on the test split.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

# Ridge regression with the regularisation strength fixed at alpha=0.2.
# The grid holds a single candidate, so GridSearchCV does not compare alphas
# here: it yields a 9-fold cross-validated RMSE for this one setting and
# refits it on the whole training split. Add more values to the list to
# actually search.
# NOTE(review): the scaler above was fitted on all of X_train before the CV
# folds are formed, so each fold's validation rows contributed to the scaling
# statistics.
param_grid = {'alpha': [0.2]}
ridge = Ridge()
grid_search = GridSearchCV(ridge, param_grid, cv=9, scoring='neg_root_mean_squared_error')
grid_search.fit(X_train_preprocessed, y_train)

# Always register your model into variable named "model"
model = grid_search.best_estimator_

# Predict on the held-out test set
y_pred = model.predict(X_test_preprocessed)

# Evaluation metrics on the test set, in the units of the target column
test_mse = mean_squared_error(y_test, y_pred)
test_rmse = test_mse ** 0.5  # RMSE is the square root of the MSE
test_mae = mean_absolute_error(y_test, y_pred)  # MAE
test_r2 = r2_score(y_test, y_pred)  # R² Score

# Print evaluation results
print(f"Best Alpha: {grid_search.best_params_['alpha']}")
print(f"Test MSE: {test_mse:.4f}")
print(f"Test RMSE: {test_rmse:.4f}")
print(f"Test MAE: {test_mae:.4f}")
print(f"Test R² Score: {test_r2:.4f}")


Best Alpha: 0.2
Test MSE: 262819.3063
Test RMSE: 512.6591
Test MAE: 363.0429
Test R² Score: 0.8905
