In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report


In [2]:
# Load the dataset
df = pd.read_excel("soil_data.xlsx")

# Quick inspection
df.head()


Unnamed: 0,ID,Sand %,Clay %,Silt %,pH,EC mS/cm,O.M. %,CACO3 %,N_NO3 ppm,P ppm,K ppm,Mg ppm,Fe ppm,Zn ppm,Mn ppm,Cu ppm,B ppm
0,1,42.0,38.0,20.0,5.162,0.274,3.783414,0.0,14.462366,18.60884,147,1115,91.32,0.894,27.06,1.768,0.4
1,2,54.0,26.0,20.0,6.071,0.355,1.451688,0.0,14.587814,32.742137,180,1055,60.04,3.66,17.58,1.684,0.41
2,3,28.0,36.0,36.0,6.921,0.675,1.767599,0.4,14.910394,19.024525,183,1215,61.24,3.06,18.0,1.84,0.46
3,4,58.0,20.0,22.0,7.704,0.355,0.718322,6.8,4.21147,6.276846,116,327,19.54,1.64,5.04,0.844,0.5
4,5,33.2,41.2,25.6,7.609,0.392,0.996625,4.39,6.111111,6.55397,228,939,25.84,0.18,7.16,1.216,0.4


In [3]:
df.shape



(781, 17)

In [4]:
df = df.drop(columns=["ID"])
df.head()


Unnamed: 0,Sand %,Clay %,Silt %,pH,EC mS/cm,O.M. %,CACO3 %,N_NO3 ppm,P ppm,K ppm,Mg ppm,Fe ppm,Zn ppm,Mn ppm,Cu ppm,B ppm
0,42.0,38.0,20.0,5.162,0.274,3.783414,0.0,14.462366,18.60884,147,1115,91.32,0.894,27.06,1.768,0.4
1,54.0,26.0,20.0,6.071,0.355,1.451688,0.0,14.587814,32.742137,180,1055,60.04,3.66,17.58,1.684,0.41
2,28.0,36.0,36.0,6.921,0.675,1.767599,0.4,14.910394,19.024525,183,1215,61.24,3.06,18.0,1.84,0.46
3,58.0,20.0,22.0,7.704,0.355,0.718322,6.8,4.21147,6.276846,116,327,19.54,1.64,5.04,0.844,0.5
4,33.2,41.2,25.6,7.609,0.392,0.996625,4.39,6.111111,6.55397,228,939,25.84,0.18,7.16,1.216,0.4


In [21]:
def fertility_score(row):
    score = 0

    if 6.0 <= row["ph"] <= 7.5:
        score += 1

    if row["om"] > 2.0:
        score += 1

    if row["n_no3_ppm"] > 50:
        score += 1

    if row["p_ppm"] > 15:
        score += 1

    if row["k_ppm"] > 150:
        score += 1

    if row["ec_ms_cm"] < 1.0:
        score += 1

    return score


In [22]:
df.columns = (
    df.columns
    .str.strip()
    .str.lower()
    .str.replace('%', '', regex=False)
    .str.replace('/', '_', regex=False)
    .str.replace('.', '', regex=False)
    .str.replace(' ', '_', regex=False)
)


In [23]:
df.columns


Index(['sand', 'clay', 'silt', 'ph', 'ec_ms_cm', 'om', 'caco3', 'n_no3_ppm',
       'p_ppm', 'k_ppm', 'mg_ppm', 'fe_ppm', 'zn_ppm', 'mn_ppm', 'cu_ppm',
       'b_ppm'],
      dtype='object')

In [24]:
df = df.rename(columns={
    "sand_": "sand",
    "clay_": "clay",
    "silt_": "silt",
    "om_()": "om",
    "caco3_": "caco3"
})


In [25]:
df.columns


Index(['sand', 'clay', 'silt', 'ph', 'ec_ms_cm', 'om', 'caco3', 'n_no3_ppm',
       'p_ppm', 'k_ppm', 'mg_ppm', 'fe_ppm', 'zn_ppm', 'mn_ppm', 'cu_ppm',
       'b_ppm'],
      dtype='object')

In [26]:
df["fertility_score"] = df.apply(fertility_score, axis=1)
df[["fertility_score"]].head()


Unnamed: 0,fertility_score
0,3
1,4
2,4
3,1
4,2


In [27]:
def fertility_class(score):
    if score <= 2:
        return "Low"
    elif score <= 4:
        return "Medium"
    else:
        return "High"

df["fertility_class"] = df["fertility_score"].apply(fertility_class)
df[["fertility_score", "fertility_class"]].head()


Unnamed: 0,fertility_score,fertility_class
0,3,Medium
1,4,Medium
2,4,Medium
3,1,Low
4,2,Low


In [28]:
df["fertility_class"].value_counts()


fertility_class
Medium    445
Low       290
High       46
Name: count, dtype: int64

In [29]:
# Select features (X) and target (y)
features = [
    "ph",
    "om",
    "n_no3_ppm",
    "p_ppm",
    "k_ppm",
    "ec_ms_cm"
]

X = df[features]
y = df["fertility_class"]

X.head(), y.head()


(      ph        om  n_no3_ppm      p_ppm  k_ppm  ec_ms_cm
 0  5.162  3.783414  14.462366  18.608840    147     0.274
 1  6.071  1.451688  14.587814  32.742137    180     0.355
 2  6.921  1.767599  14.910394  19.024525    183     0.675
 3  7.704  0.718322   4.211470   6.276846    116     0.355
 4  7.609  0.996625   6.111111   6.553970    228     0.392,
 0    Medium
 1    Medium
 2    Medium
 3       Low
 4       Low
 Name: fertility_class, dtype: object)

In [30]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Check class order
le.classes_


array(['High', 'Low', 'Medium'], dtype=object)

In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42
)

X_train.shape, X_test.shape


((624, 6), (157, 6))

In [32]:
from sklearn.linear_model import LogisticRegression

baseline_model = LogisticRegression(max_iter=1000)
baseline_model.fit(X_train, y_train)


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [33]:
y_pred = baseline_model.predict(X_test)


In [34]:
from sklearn.metrics import confusion_matrix, classification_report

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))


Confusion Matrix:
[[ 0  0  9]
 [ 0 51  7]
 [ 3 10 77]]

Classification Report:
              precision    recall  f1-score   support

        High       0.00      0.00      0.00         9
         Low       0.84      0.88      0.86        58
      Medium       0.83      0.86      0.84        90

    accuracy                           0.82       157
   macro avg       0.55      0.58      0.57       157
weighted avg       0.78      0.82      0.80       157



In [35]:
X.describe()


Unnamed: 0,ph,om,n_no3_ppm,p_ppm,k_ppm,ec_ms_cm
count,781.0,781.0,781.0,781.0,781.0,781.0
mean,7.133668,2.028909,11.623384,13.840659,296.226633,0.452793
std,0.897823,0.627244,11.052054,14.586017,194.461073,0.304841
min,4.49,0.37,0.18,1.07,31.0,0.134
25%,6.66,1.59,5.28,5.73,169.0,0.338
50%,7.55,1.96,8.85,9.49,255.0,0.404
75%,7.78,2.37,14.31,15.98,358.0,0.497
max,8.2,4.98,120.64,152.42,1665.0,5.621


In [36]:
from sklearn.ensemble import RandomForestClassifier


In [37]:
rf_model = RandomForestClassifier(
    n_estimators=200,
    max_depth=8,
    min_samples_leaf=5,
    random_state=42
)

rf_model.fit(X_train, y_train)


In [38]:
y_pred_rf = rf_model.predict(X_test)


In [39]:
from sklearn.metrics import confusion_matrix, classification_report

print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

print("\nRandom Forest Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=le.classes_))


Random Forest Confusion Matrix:
[[ 5  0  4]
 [ 0 57  1]
 [ 0  0 90]]

Random Forest Classification Report:
              precision    recall  f1-score   support

        High       1.00      0.56      0.71         9
         Low       1.00      0.98      0.99        58
      Medium       0.95      1.00      0.97        90

    accuracy                           0.97       157
   macro avg       0.98      0.85      0.89       157
weighted avg       0.97      0.97      0.96       157



In [40]:
from sklearn.metrics import accuracy_score

print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred))
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))


Logistic Regression Accuracy: 0.8152866242038217
Random Forest Accuracy: 0.9681528662420382


In [41]:
import joblib


In [42]:
model_bundle = {
    "model": rf_model,
    "label_encoder": le,
    "features": features
}


In [43]:
joblib.dump(model_bundle, "soil_fertility_model.joblib")


['soil_fertility_model.joblib']

In [44]:
loaded_bundle = joblib.load("soil_fertility_model.joblib")

loaded_model = loaded_bundle["model"]
loaded_le = loaded_bundle["label_encoder"]
loaded_features = loaded_bundle["features"]


In [45]:
# Take one test sample
sample = X_test.iloc[0:1]

# Predict
pred_encoded = loaded_model.predict(sample)
pred_label = loaded_le.inverse_transform(pred_encoded)

pred_label


array(['Low'], dtype=object)