In [7]:
# STEP 1: Load the dataset
import pandas as pd
import os

# List the dataset folder's contents. os.listdir raises FileNotFoundError when
# the folder is missing, so a wrong path fails here, before the CSV is read.
dataset_path = "../soil_dataset"
print("Files in soil_dataset folder:", os.listdir(dataset_path))

# Load the CSV file
csv_file = os.path.join(dataset_path, "soil_fertility.csv")
df = pd.read_csv(csv_file)

# Show first 5 rows
print("\nFirst 5 rows of the dataset:")
print(df.head())

# Check data types and missing values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
print("\nDataset info:")
df.info()


Files in soil_dataset folder: ['soil_fertility.csv']

First 5 rows of the dataset:
     N    P    K    pH    EC    OC     S    Zn    Fe    Cu    Mn     B  Output
0  138  8.6  560  7.46  0.62  0.70   5.9  0.24  0.31  0.77  8.71  0.11       0
1  213  7.5  338  7.62  0.75  1.06  25.4  0.30  0.86  1.54  2.89  2.29       0
2  163  9.6  718  7.59  0.51  1.11  14.3  0.30  0.86  1.57  2.70  2.03       0
3  157  6.8  475  7.64  0.58  0.94  26.0  0.34  0.54  1.53  2.65  1.82       0
4  270  9.9  444  7.63  0.40  0.86  11.8  0.25  0.76  1.69  2.43  2.26       1

Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 880 entries, 0 to 879
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   N       880 non-null    int64  
 1   P       880 non-null    float64
 2   K       880 non-null    int64  
 3   pH      880 non-null    float64
 4   EC      880 non-null    float64
 5   OC      880 non-null    float64
 6   S       880 non-null  

In [8]:
# STEP 2: Separate features and target
TARGET_COLUMN = "Output"

# Target: the label column on its own
y = df[TARGET_COLUMN]

# Features: every column that is not the target
X = df.drop(columns=[TARGET_COLUMN])

# Quick check
print("Feature columns:", list(X.columns))
print("Target column unique values:", y.unique())


Feature columns: ['N', 'P', 'K', 'pH', 'EC', 'OC', 'S', 'Zn', 'Fe', 'Cu', 'Mn', 'B']
Target column unique values: [0 1 2]


In [9]:
# STEP 3: Split the dataset
from sklearn.model_selection import train_test_split

# 80% train / 20% test, seeded for repeatability and stratified on the target
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Quick check: report the (rows, columns) of each split
train_shapes = (X_train.shape, y_train.shape)
test_shapes = (X_test.shape, y_test.shape)
print("Training set shape:", *train_shapes)
print("Test set shape:", *test_shapes)


Training set shape: (704, 12) (704,)
Test set shape: (176, 12) (176,)


In [10]:
# STEP 4: Feature scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Quick check
print("First 5 rows of scaled features (train):")
print(X_train_scaled[:5])


First 5 rows of scaled features (train):
[[ 6.99460089e-01 -2.08808131e-01 -1.14216140e+00 -1.31077499e+01
  -1.83152796e-01  2.79188410e-01  1.28810860e-01 -2.83647234e-02
   6.79420565e-01  6.76419731e-01  7.22933086e-02 -4.69884791e-01]
 [-8.38544332e-01 -2.91064306e-01  6.94090121e-02 -5.63332270e-01
   1.30134881e+00  2.03820539e-01  5.87998368e-01 -5.67294468e-02
  -6.06014035e-01  1.27106344e+00 -1.36203743e+00  2.59257902e+00]
 [-1.08410806e+00 -2.22517493e-01  1.79671548e+00  1.50094326e-01
  -2.53843349e-01  5.26825701e-01  1.50191525e+00 -8.98216241e-02
  -1.01896766e+00  1.38118264e+00 -1.37358599e+00  2.53977792e+00]
 [ 1.34568043e+00 -2.08808131e-01  5.85145066e-01 -2.26436377e-01
  -1.31420164e+00  6.38516359e-02  4.65399762e-01 -7.56392624e-02
  -1.09227716e-01  8.52610459e-01  1.01696525e+00 -4.87485158e-01]
 [-5.15434159e-01 -3.82460055e-01 -3.64464176e-01 -2.26436377e-01
  -8.19367771e-01 -5.39091333e-01 -1.12838212e+00 -6.61843546e-02
  -6.57589135e-02 -1.38462387e-

In [11]:
# STEP 5: Train a Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the model (fixed random_state so the run is repeatable)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train_scaled, y_train)

# Predict on test set
y_pred = rf_model.predict(X_test_scaled)

# Evaluate
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:")
# zero_division=0: when a class is never predicted its precision is undefined.
# Stating the fallback explicitly keeps the reported 0.00 values while avoiding
# the repeated undefined-metric warnings that the default setting emits.
print(classification_report(y_test, y_pred, zero_division=0))


Test Accuracy: 0.8920454545454546

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.93      0.92        80
           1       0.87      0.94      0.91        88
           2       0.00      0.00      0.00         8

    accuracy                           0.89       176
   macro avg       0.60      0.62      0.61       176
weighted avg       0.85      0.89      0.87       176



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [12]:
# STEP 6: Retrain the Random Forest with class weights
import numpy as np
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Compute one 'balanced' weight per label found in the training target
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = {label: weight for label, weight in zip(classes, weights)}
print("Class weights:", class_weights)

# Fit a fresh forest that uses those weights (this rebinds rf_model)
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight=class_weights,
).fit(X_train_scaled, y_train)

# Predict & evaluate
y_pred = rf_model.predict(X_test_scaled)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_accuracy)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Class weights: {0: 0.731048805815161, 1: 0.6666666666666666, 2: 7.56989247311828}
Test Accuracy: 0.8977272727272727

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93        80
           1       0.88      0.93      0.91        88
           2       0.67      0.25      0.36         8

    accuracy                           0.90       176
   macro avg       0.82      0.70      0.73       176
weighted avg       0.89      0.90      0.89       176



In [13]:
# STEP 7: Persist the trained model and the fitted scaler
import joblib
import os

# Destination folder and the two artifact paths inside it
model_folder = "../soil_dataset/models"
model_file = os.path.join(model_folder, "soil_rf_model.pkl")
scaler_file = os.path.join(model_folder, "soil_scaler.pkl")

# Create the folder if needed, then write the model first and the scaler second
os.makedirs(model_folder, exist_ok=True)
for artifact, destination in ((rf_model, model_file), (scaler, scaler_file)):
    joblib.dump(artifact, destination)

print("✅ Model saved to:", model_file)
print("✅ Scaler saved to:", scaler_file)


✅ Model saved to: ../soil_dataset/models\soil_rf_model.pkl
✅ Scaler saved to: ../soil_dataset/models\soil_scaler.pkl


In [14]:
# STEP 8: Predict soil quality from user input
import numpy as np
import pandas as pd
import joblib

# Load the saved model and scaler.
# NOTE: joblib.load unpickles arbitrary objects -- only load files you trust.
model = joblib.load("../soil_dataset/models/soil_rf_model.pkl")
scaler = joblib.load("../soil_dataset/models/soil_scaler.pkl")

# Example: manually input features
user_input = {
    "N": 150,
    "P": 10,
    "K": 300,
    "pH": 6.5,
    "EC": 0.5,
    "OC": 0.8,
    "S": 10,
    "Zn": 0.3,
    "Fe": 0.4,
    "Cu": 0.2,
    "Mn": 2.0,
    "B": 0.1
}

# Lay the values out in the column order the scaler recorded when it was fit,
# instead of trusting the dict's insertion order to match the training columns.
# A scaler fit on a plain array has no recorded names; only then fall back to
# the dict's own order.
expected_features = getattr(scaler, "feature_names_in_", None)
feature_order = list(user_input) if expected_features is None else list(expected_features)

# Fail loudly on an incomplete sample rather than feeding a misaligned row
missing_features = [name for name in feature_order if name not in user_input]
if missing_features:
    raise KeyError(f"user_input is missing required features: {missing_features}")

# Convert input to a single-row array in that order
input_array = np.array([[user_input[name] for name in feature_order]])

# Scale features. When the scaler knows its feature names, hand it a named
# DataFrame so scikit-learn can verify the columns (and does not warn about
# missing feature names).
if expected_features is None:
    input_scaled = scaler.transform(input_array)
else:
    input_scaled = scaler.transform(pd.DataFrame(input_array, columns=feature_order))

# Predict
pred_class = model.predict(input_scaled)[0]
print("Predicted Soil Fertility Class:", pred_class)


Predicted Soil Fertility Class: 0




In [15]:
# STEP 9: Predict from a named-column input and print a readable label
import pandas as pd
import numpy as np
import joblib

# Reload the persisted scaler and model
scaler = joblib.load("../soil_dataset/models/soil_scaler.pkl")
model = joblib.load("../soil_dataset/models/soil_rf_model.pkl")

# One soil sample keyed by feature name
sample_reading = {
    "N": 150,
    "P": 10,
    "K": 300,
    "pH": 6.5,
    "EC": 0.5,
    "OC": 0.8,
    "S": 10,
    "Zn": 0.3,
    "Fe": 0.4,
    "Cu": 0.2,
    "Mn": 2.0,
    "B": 0.1
}

# Wrap it in a one-row DataFrame so the column names travel with the values
user_input = pd.DataFrame([sample_reading])

# Scale features
input_scaled = scaler.transform(user_input)

# Predict and take the single result
predictions = model.predict(input_scaled)
pred_class = predictions[0]

# Translate the numeric class into its display label
class_map = {0: "Low", 1: "Medium", 2: "High"}
predicted_label = class_map[pred_class]
print("Predicted Soil Fertility Class:", predicted_label)


Predicted Soil Fertility Class: Low
