IMPORTS

In [1]:
import sys 
from pathlib import Path

import pandas as pd
import numpy as np


import warnings

# Silence every warning so cell outputs stay readable.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# raised by later cells are hidden as well.
warnings.filterwarnings('ignore')  # To keep output clean

# Remove the column limit when a DataFrame is displayed. The option is set via
# the public `pd.set_option` entry point rather than the `pd.pandas` alias,
# which is not part of the documented pandas API.
pd.set_option('display.max_columns', None)

# Make the directory above the current working directory importable so that
# the `src` package can be imported by the following lines.
# Assumes the kernel is started one level below the project root - TODO confirm.
project_root = Path().resolve().parent
# Guarded so that re-running this cell does not append duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from src.dataset import load_data
from src.config import PROCESSED_DATA_DIR, logger
from src.features import build_preprocessor
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from src.modeling.train import evaluate_model, save_model


[32m2025-07-11 23:30:53.681[0m | [1mINFO    [0m | [36msrc.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /Users/omorinsolamakinde/nutrient-gap[0m


LOAD DATA

In [2]:
# Read the cleaned training table through the project loader, then preview
# its first five rows as the cell output.
train_df = load_data("clean_train.csv", data_type="processed")
train_df.head(n=5)

[32m2025-07-11 23:31:00.520[0m | [1mINFO    [0m | [36msrc.dataset[0m:[36mload_data[0m:[36m25[0m - [1mloading data from/Users/omorinsolamakinde/nutrient-gap/data/processed/clean_train.csv[0m


Unnamed: 0,lon,lat,pH,alb,bio1,bio12,bio15,bio7,bp,cec20,dows,ecec20,hp20,ls,lstd,mb1,mb2,mb3,mb7,para,parv,ph20,slope,snd20,soc20,tim,wp,BulkDensity,N,P,K,Ca,Mg,S,Fe,Mn,Zn,Cu,B
0,70.603761,46.173798,7.75,176,248,920,108,190,0.581573,22.0,21.500278,11.00779,0.00779,0.03,44.908058,2006.000488,3182.000732,855.000244,2363.000732,20.544283,126.83548,7.05,1.962921,39.0,9.75,7.962668,0.016853,1.46,1300,0.34,147,6830,2310,5.66,75.2,85.0,0.82,2.98,0.24
1,70.590479,46.078924,7.1,181,250,1080,113,191,0.707011,24.0,21.389599,14.0235,0.0235,0.03,44.985626,1637.000122,2839.000488,707.000061,2039.000488,18.869566,109.835541,6.975,0.162065,40.0,8.0,8.4395,0.018321,1.52,1400,11.7,151,1180,235,19.4,96.2,409.0,2.57,4.32,0.1
2,70.582553,46.04882,6.95,188,250,1109,111,191,0.362439,15.25,18.900057,16.062401,0.0624,0.03,44.167717,1639.999634,2903.0,758.999939,2003.999878,24.719807,214.385269,6.725,0.744845,46.0,9.25,8.289246,0.020588,1.46,3500,21.8,151,1890,344,11.0,76.7,65.0,1.95,1.24,0.22
3,70.573267,46.02191,7.83,174,250,1149,112,191,0.531739,22.0,17.022963,18.030899,0.0309,0.03,43.281063,1325.000122,2413.000244,631.999939,1961.0,27.230274,255.713043,6.625,0.708708,43.75,10.0,8.666523,0.016913,1.48,2300,39.9,201,6660,719,14.9,81.9,73.0,4.9,3.08,0.87
4,70.58533,46.204336,8.07,188,250,869,114,191,0.039202,14.75,23.103102,11.0,0.0,0.155324,45.654484,1628.999512,2685.999023,732.999939,2427.0,20.434782,86.220909,6.7,0.634153,49.25,7.0,15.139549,0.019791,1.43,940,1.0,90,7340,1160,8.66,69.4,149.0,0.55,3.03,0.31


TARGET COLUMNS

In [3]:
# Names of the eleven target columns; the list is handed to build_preprocessor
# and used to select y from the training frame.
target_cols = "N P K Ca Mg S Fe Mn Zn Cu B".split()

PREPROCESSOR PIPELINE

In [4]:
# Create the preprocessing transformer together with the list of feature
# column names it was built for.
preprocessor, feature_cols = build_preprocessor(
    df=train_df,
    target_cols=target_cols,
    impute_strategy="mean",
    scale=True,
    cap_outliers=True,
    power_transformer=True,
)

# Separate model inputs from the targets.
X, y = train_df[feature_cols], train_df[target_cols]

# Fit the preprocessor on all of X and report the transformed shape.
# NOTE(review): this fit uses the full training frame; whether evaluate_model
# refits the preprocessor inside each CV fold is not visible here - confirm.
X_preprocessed = preprocessor.fit_transform(X)
print(X_preprocessed.shape)

(7744, 28)


BASELINE MODEL (DUMMY REGRESSOR)

In [5]:
# Reference model: a regressor that always predicts the mean of the training
# targets, wrapped so that one copy is fitted per target column.
dummy = DummyRegressor(strategy="mean")
multi_dummy = MultiOutputRegressor(estimator=dummy)


MODEL PIPELINE

In [6]:
# Score the dummy baseline with 5 folds; the call prints per-fold and average
# train/validation RMSE and returns the pipeline plus both score collections.
baseline_pipeline, train_rmse, val_rmse = evaluate_model(
    X=X,
    y=y,
    preprocessor=preprocessor,
    model=multi_dummy,
    cv_folds=5,
)

Training RMSE: [490.85048603 489.88861575 489.72734397 490.42028733 485.87743715]
Validation RMSE: [483.10120617 486.31832091 487.65862859 484.28928517 502.36044272]
Avg Train RMSE: 489.35
Avg Val RMSE: 488.75
Gap (Val - Train): -0.61


SAVE BASELINE MODEL 

In [7]:
# Write the pipeline returned by the baseline evaluation to the models folder.
baseline_model_filename = "baseline_model.joblib"
save_model(baseline_pipeline, baseline_model_filename)

Model saved to /Users/omorinsolamakinde/nutrient-gap/models/baseline_model.joblib


LINEAR REGRESSION 


In [8]:
# Linear model: one ordinary least-squares regressor fitted per target column.
multi_linear = MultiOutputRegressor(LinearRegression())

# The scores are stored under linear-specific names so that the baseline's
# train_rmse / val_rmse from the earlier cell are not overwritten and the two
# models can still be compared afterwards. cv_folds is passed explicitly to
# match the baseline call, so both models are scored with the same fold count.
linear_pipeline, linear_train_rmse, linear_val_rmse = evaluate_model(
    preprocessor=preprocessor,
    model=multi_linear,
    X=X,
    y=y,
    cv_folds=5,
)

Training RMSE: [319.77645922 316.11663796 316.70885886 318.60719882 313.98274277]
Validation RMSE: [308.16333168 320.45588113 318.68928252 311.7973517  330.97461375]
Avg Train RMSE: 317.04
Avg Val RMSE: 318.02
Gap (Val - Train): 0.98


SAVE LINEAR MODEL

In [9]:
# Write the pipeline returned by the linear-regression evaluation to the models folder.
linear_model_filename = "01_linear_model.joblib"
save_model(linear_pipeline, linear_model_filename)

Model saved to /Users/omorinsolamakinde/nutrient-gap/models/01_linear_model.joblib
