# Boron Prediction Model
This notebook demonstrates a robust machine learning workflow that predicts boron content (ppm) from spectral channel readings (F1–F8, Clear, and NIR) using a random forest regressor. It supports both small and large datasets.

In [1]:
import joblib
import pandas as pd
import numpy as np
from scipy.stats import randint
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the calibration sheet and keep only complete, numeric rows.
# NOTE(review): the path is absolute and machine-specific; consider a configurable
# data directory so the notebook can run on another machine.
# The identifier / free-text columns are removed BEFORE dropna() so that a sample with
# an empty "Notes" (or "Sample ID") cell is not discarded along with genuinely
# incomplete measurements. errors='ignore' keeps this working when the sheet lacks
# those columns.
df = (
    pd.read_csv(r"C:\Users\Prabakaran\Desktop\SOIL TESTING KIT\randomforest\model training datasheet\boron_template.csv")
    .drop(columns=["Sample ID", "Notes"], errors='ignore')
    .dropna()
)


In [4]:
# Peek at the first five cleaned rows to sanity-check the column layout.
df.head(n=5)

Unnamed: 0,Boron(ppm),F1 (415nm),F2 (445nm),F3 (480nm),F4 (515nm),F5 (555nm),F6 (590nm),F7 (630nm),F8 (680nm),Clear,NIR
0,10.0,456.0,654.0,2132.0,546.0,3511.0,513.0,655.0,6515.0,65.0,51.0
1,20.0,774.0,1355.0,115.0,3154.0,164.0,215.0,1654.0,15.0,16.0,63.0
2,30.0,855.0,5151.0,3165.0,2035.0,1542.0,5416.0,6221.0,16.0,0.0,21.0
3,40.0,922.0,1651.0,225.0,1652.0,1215.0,6541.0,5652.0,215.0,1.0,315.0
4,50.0,1102.0,1354.0,1655.0,1352.0,1515.0,6635.0,5452.0,152.0,61.0,355.0


In [5]:
# Target: the boron concentration column; features: every remaining column.
y = df["Boron(ppm)"]
X = df.drop("Boron(ppm)", axis="columns")

In [6]:
# Display the feature matrix (pandas abbreviates long frames) to confirm the target column is gone.
X


Unnamed: 0,F1 (415nm),F2 (445nm),F3 (480nm),F4 (515nm),F5 (555nm),F6 (590nm),F7 (630nm),F8 (680nm),Clear,NIR
0,456.000000,654.000000,2132.000000,546.000000,3511.000000,513.000000,655.000000,6515.000000,65.000000,51.000000
1,774.000000,1355.000000,115.000000,3154.000000,164.000000,215.000000,1654.000000,15.000000,16.000000,63.000000
2,855.000000,5151.000000,3165.000000,2035.000000,1542.000000,5416.000000,6221.000000,16.000000,0.000000,21.000000
3,922.000000,1651.000000,225.000000,1652.000000,1215.000000,6541.000000,5652.000000,215.000000,1.000000,315.000000
4,1102.000000,1354.000000,1655.000000,1352.000000,1515.000000,6635.000000,5452.000000,152.000000,61.000000,355.000000
...,...,...,...,...,...,...,...,...,...,...
83,1489.732237,3183.857754,3651.607896,4449.841547,219.338642,526.331135,548.205608,430.747155,596.521593,140.416089
84,777.208560,1601.845574,231.673837,1753.529513,1160.602030,6588.893417,6072.842217,200.650517,1.004139,345.438718
85,949.898600,1818.801971,222.402619,1392.012569,1095.400894,7274.251492,4256.325595,200.029444,1.125791,291.975875
86,896.875581,1997.373227,213.375374,1545.538011,1319.330645,5887.676120,5588.244219,210.404690,1.029125,274.763400


In [7]:
# Standardise every feature column; `scaler` keeps the fitted statistics so the
# same transform can be re-applied to new readings.
# NOTE(review): the scaler is fitted on all rows, before the train/test split in the
# next cell, so held-out rows contribute to the scaling statistics.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [8]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    random_state=42,
    test_size=0.3,
)

In [9]:
# Baseline forest with library-default hyperparameters, fitted on the training split.
# The fixed seed makes the fit repeatable; the trailing expression shows the estimator.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
model

0,1,2
,n_estimators,100
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [10]:
# R² on the training rows (in-sample) and on the held-out rows.
# A regressor's .score(X, y) is r2_score(y, predict(X)), so the values are unchanged.
Train_r2 = model.score(X_train, y_train)
Test_r2 = model.score(X_test, y_test)

In [11]:
# R² of the baseline forest on the rows it was trained on.
Train_r2

0.9977441909916877

In [12]:
# R² of the baseline forest on the held-out 30% split.
Test_r2

0.9901921195652174

In [13]:
# 5-fold cross-validated R² of the baseline configuration over the full scaled dataset.
# NOTE(review): an integer `cv` with a regressor gives unshuffled K-fold, so fold
# membership follows row order; confirm the rows are not sorted by the target.
cv_scores = cross_val_score(model, X_scaled, y, scoring='r2', cv=5)
# Mean across folds; `cv` is the name the following cell displays.
cv = np.mean(cv_scores)

In [14]:
# Mean cross-validated R² across the five folds.
cv

np.float64(0.9804356239688632)

In [15]:
# Randomized hyperparameter search for the random forest.
# `RandomizedSearchCV` and `randint` come from the notebook's top import cell, so the
# redundant mid-notebook imports are gone.

# Search space: scipy's randint(low, high) samples integers from low up to high - 1.
param_dist = {
    'n_estimators': randint(50, 300),
    'max_depth': randint(5, 50),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2'],
}

# `model` only serves as the template estimator here; the search is fitted on the
# full scaled dataset and scored by cross-validated R².
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,        # number of sampled parameter combinations
    cv=5,             # 5-fold cross-validation per combination
    scoring='r2',
    verbose=2,
    random_state=42,  # repeatable sampling of the combinations
    n_jobs=-1,        # use all available cores
)

search.fit(X_scaled, y)
print("Best R2:", search.best_score_)
print("Best Params:", search.best_params_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best R2: 0.9900168668229032
Best Params: {'max_depth': 43, 'max_features': 'log2', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 170}


In [16]:
# Persist the artefacts needed for inference (`joblib` is imported in the top import cell).
# The forest was trained on standardized features, so the fitted scaler is saved too;
# without it the model cannot be applied to raw readings.
joblib.dump(scaler, 'soil_nutrient_scaler.pkl')
# NOTE(review): this saves `model`, the default-parameter forest fitted on the training
# split, not the tuned `search.best_estimator_`. Confirm which one should be shipped.
# Save the model to a file (kept as the last expression so the cell output is unchanged).
joblib.dump(model, 'soil_nutrient_model.pkl')

['soil_nutrient_model.pkl']