In [93]:
# Import libraries
import numpy as np
import pandas as pd
from sklearn.base import clone
# enable_iterative_imputer must be imported before IterativeImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.metrics import r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from xgboost import XGBRegressor

In [94]:
# Read the training set from CSV and preview its first five rows
df = pd.read_csv(filepath_or_buffer="Training_Dataset.csv")
df.head(n=5)

Unnamed: 0,Latitude,Longitude,Sample Date,nir,green,swir16,swir22,NDMI,MNDWI,pet,Total Alkalinity,Electrical Conductance,Dissolved Reactive Phosphorus
0,-28.760833,17.730278,02-01-2011,11190.0,11426.0,7687.5,7645.0,0.185538,0.195595,174.2,128.912,555.0,10.0
1,-26.861111,28.884722,03-01-2011,17658.5,9550.0,13746.5,10574.0,0.124566,-0.180134,124.1,74.72,162.9,163.0
2,-26.45,28.085833,03-01-2011,15210.0,10720.0,17974.0,14201.0,-0.083293,-0.252805,127.5,89.254,573.0,80.0
3,-27.671111,27.236944,03-01-2011,14887.0,10943.0,13522.0,11403.0,0.048048,-0.105416,129.7,82.0,203.6,101.0
4,-27.356667,27.286389,03-01-2011,16828.5,9502.5,12665.5,9643.0,0.141147,-0.142683,129.2,56.1,145.1,151.0


In [95]:
# Number of missing values in each column
df.isna().sum(axis=0)

Latitude                            0
Longitude                           0
Sample Date                         0
nir                              1085
green                            1085
swir16                           1085
swir22                           1085
NDMI                             1085
MNDWI                            1085
pet                                 0
Total Alkalinity                    0
Electrical Conductance              0
Dissolved Reactive Phosphorus       0
dtype: int64

In [96]:
# Extract features and targets
def preprocess_data(data, drop_cols=None):
    """Split a raw frame into a feature table and the three target series.

    The input frame is not modified; ``DataFrame.drop`` returns new frames.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns to drop and the three target columns
        'Total Alkalinity', 'Electrical Conductance' and
        'Dissolved Reactive Phosphorus'.
    drop_cols : list of str, optional
        Non-target columns to exclude from the features. When omitted, the
        columns 'Latitude', 'Longitude', 'Sample Date', 'nir', 'green' and
        'swir16' are dropped.

    Returns
    -------
    tuple
        ``(X, y_ta, y_ec, y_drp)``: the feature frame followed by the
        Total Alkalinity, Electrical Conductance and Dissolved Reactive
        Phosphorus series.

    Raises
    ------
    KeyError
        If any column to drop or any target column is missing from ``data``.
    """
    if drop_cols is None:
        drop_cols = ['Latitude', 'Longitude', 'Sample Date', 'nir', 'green', 'swir16']
    target_cols = ['Total Alkalinity', 'Electrical Conductance', 'Dissolved Reactive Phosphorus']

    data = data.drop(columns=list(drop_cols))

    # Pass explicit labels to drop(); handing it a whole DataFrame only works
    # because iterating a frame happens to yield its column names.
    X = data.drop(columns=target_cols)

    y_ta = data['Total Alkalinity']
    y_ec = data['Electrical Conductance']
    y_drp = data['Dissolved Reactive Phosphorus']
    return X, y_ta, y_ec, y_drp

# Apply the function to the data and assign them to variables
X, y_ta, y_ec, y_drp = preprocess_data(df)

# Map each target's display name to its series. The evaluation cells further
# down iterate over `target_dict`, so it has to be defined here for the
# notebook to run top to bottom on a fresh kernel.
target_dict = {
    'Total Alkalinity': y_ta,
    'Electrical Conductance': y_ec,
    'Dissolved Reactive Phosphorus': y_drp,
}

In [97]:
# Split data for all targets to X variable
def split_data(X, y, test_size=0.3, random_state=42):
    """Split features and one target into train and test partitions.

    Thin wrapper around ``train_test_split`` that fixes the notebook-wide
    defaults in one place while still allowing them to be overridden.

    Parameters
    ----------
    X : features to split.
    y : target values aligned with ``X``.
    test_size : float, default 0.3
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` as ``random_state`` so the split is
        repeatable.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test

In [98]:
"""
Build a baseline pipeline for scaling and model training with a baseline
XGBRegressor, without hyperparameter tuning
"""

# Two-step estimator: standardise the features, then regress with an
# XGBRegressor seeded with random_state=42.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('xgb', XGBRegressor(random_state=42)),
])

In [99]:
"""
Iterates through targets, splits data, fits the pipeline,
and returns train and test tables of RMSE and R2 scores.
"""

def fit_and_evaluate(pipe, X, targets_dict):
    """Fit the pipeline once per target and collect train/test RMSE and R2.

    For every ``(name, y)`` pair in ``targets_dict`` the data is split with
    ``split_data``, ``pipe`` is fitted on the training part only, and
    predictions for both parts are scored.

    ``pipe`` is fitted in place, so after the call it holds the fit for the
    last target in ``targets_dict``.

    Returns
    -------
    tuple
        ``(train_scores, test_scores)``: two DataFrames indexed by target
        name with the columns 'RMSE' and 'R2'.
    """
    def _score(y_true, y_hat):
        # Same metric pair for either partition.
        return {
            'RMSE': root_mean_squared_error(y_true, y_hat),
            'R2': r2_score(y_true, y_hat),
        }

    train_scores, test_scores = {}, {}

    for target_name, target_values in targets_dict.items():
        X_tr, X_te, y_tr, y_te = split_data(X, target_values)

        # Fit on the training part only; the test part is used purely for scoring.
        pipe.fit(X_tr, y_tr)

        pred_tr = pipe.predict(X_tr)
        pred_te = pipe.predict(X_te)

        train_scores[target_name] = _score(y_tr, pred_tr)
        test_scores[target_name] = _score(y_te, pred_te)

    # Transpose so that targets become rows and metrics become columns.
    return pd.DataFrame(train_scores).T, pd.DataFrame(test_scores).T



# Run the function
baseline_results = fit_and_evaluate(pipe, X, target_dict)

# fit_and_evaluate returns a (train, test) pair of DataFrames; show each one
# under its own heading instead of printing the raw tuple.
baseline_train, baseline_test = baseline_results
print("Baseline - train set")
print(baseline_train)
print("\nBaseline - test set")
print(baseline_test)


(                                     RMSE        R2
Total Alkalinity                32.718098  0.806352
Electrical Conductance         139.004270  0.834816
Dissolved Reactive Phosphorus   22.482179  0.804518,                                      RMSE        R2
Total Alkalinity                54.865048  0.471504
Electrical Conductance         231.821926  0.539704
Dissolved Reactive Phosphorus   37.108000  0.476175)


In [100]:
def run_pipeline(pipe, X, y, param_name="Parameter"):
    """Train a copy of ``pipe`` on one target and report train/test metrics.

    The data is split with ``split_data``, an unfitted clone of ``pipe`` is
    fitted on the training part, and R2 and RMSE are printed and collected
    for both parts.

    ``pipe`` is used only as a template: fitting happens on a clone, so
    calling this function in a loop yields one independent fitted pipeline
    per call rather than the same shared object refitted each time. The
    object passed in is left unfitted by this function.

    Parameters
    ----------
    pipe : estimator
        Pipeline (or any estimator accepted by ``sklearn.base.clone``) used
        as the template.
    X : features passed to ``split_data``.
    y : target values passed to ``split_data``.
    param_name : str, default "Parameter"
        Label for the target; shown in the printed banner and stored in the
        'Parameter' column of the result.

    Returns
    -------
    tuple
        ``(fitted_pipe, results)`` where ``fitted_pipe`` is the fitted clone
        and ``results`` is a one-row DataFrame with the columns 'Parameter',
        'R2_Train', 'RMSE_Train', 'R2_Test' and 'RMSE_Test'.
    """
    print(f"\n{'='*60}")
    print(f"Training Model for {param_name}")
    print(f"{'='*60}")

    # 1. Split data
    X_train, X_test, y_train, y_test = split_data(X, y)

    # 2. Train a fresh copy of the pipeline on the raw training features, so
    #    the returned model is not overwritten by a later call.
    model = clone(pipe)
    model.fit(X_train, y_train)

    # 3. Helper that scores one partition and prints the result
    def get_metrics(y_true, X_input, split_name):
        y_pred = model.predict(X_input)
        rmse = root_mean_squared_error(y_true, y_pred)
        r2 = r2_score(y_true, y_pred)
        print(f"{split_name} Set -> R2: {r2:.4f}, RMSE: {rmse:.4f}")
        return r2, rmse

    # 4. Evaluate (in-sample / train)
    r2_train, rmse_train = get_metrics(y_train, X_train, "Train")

    # 5. Evaluate (out-of-sample / test)
    r2_test, rmse_test = get_metrics(y_test, X_test, "Test")

    # 6. One-row summary for this target
    results = {
        "Parameter": param_name,
        "R2_Train": r2_train,
        "RMSE_Train": rmse_train,
        "R2_Test": r2_test,
        "RMSE_Test": rmse_test
    }

    # Return the fitted clone and the results table
    return model, pd.DataFrame([results])

# --- Usage: run the pipeline once per target and tabulate the metrics ---

# Collect the one-row result frame returned for each entry of target_dict
all_results = []

# fitted_pipe is rebound on every iteration, so after the loop it refers to
# the pipeline returned by the last call only.
for name, target in target_dict.items():
    fitted_pipe, result_df = run_pipeline(pipe, X, target, param_name=name)
    all_results.append(result_df)

# Stack the per-target rows into one table with a fresh 0..n-1 index
final_summary = pd.concat(all_results, ignore_index=True)
print("\nFinal Summary:")
print(final_summary)


Training Model for Total Alkalinity
Train Set -> R2: 0.8064, RMSE: 32.7181
Test Set -> R2: 0.4715, RMSE: 54.8650

Training Model for Electrical Conductance
Train Set -> R2: 0.8348, RMSE: 139.0043
Test Set -> R2: 0.5397, RMSE: 231.8219

Training Model for Dissolved Reactive Phosphorus
Train Set -> R2: 0.8045, RMSE: 22.4822
Test Set -> R2: 0.4762, RMSE: 37.1080

Final Summary:
                       Parameter  R2_Train  RMSE_Train   R2_Test   RMSE_Test
0               Total Alkalinity  0.806352   32.718098  0.471504   54.865048
1         Electrical Conductance  0.834816  139.004270  0.539704  231.821926
2  Dissolved Reactive Phosphorus  0.804518   22.482179  0.476175   37.108000
