In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import json

In [2]:
# Read the soil/climate CSV into the working DataFrame
soil_climate_csv = "../data/soil_climate.csv"
df = pd.read_csv(soil_climate_csv)

In [3]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,Crop_Type,Soil_Type,Farm_Size_Acres,Irrigation_Available,Soil_pH,Soil_Nitrogen,Soil_Organic_Matter,Temperature,Rainfall,Humidity,Compatible
0,summer paddy,Red and Yellow soils,79.838232,0,5.17829,198.870486,1.45855,20.172143,1861.635725,57.924332,0
1,Kulthi,Alluvial soils,33.932796,1,4.862699,27.78168,2.530317,33.646919,1400.435779,53.26191,0
2,Arhar,Laterite soils,59.673206,1,7.691357,62.500094,4.656399,22.535805,574.308028,65.547263,0
3,Gram,Alluvial soils,50.000261,0,6.482151,134.655093,2.268048,25.672081,1900.397115,34.972994,0
4,summer paddy,Red and Yellow soils,94.628058,0,6.054078,69.894889,1.448071,14.366488,1568.615247,25.953544,0


In [4]:
# Report the (rows, columns) dimensions of the dataset
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)

Dataset Shape: (10000, 11)


In [5]:
# Show every column's dtype; `object` columns hold strings and are the
# ones that need encoding before modelling
column_dtypes = df.dtypes
print(column_dtypes)

Crop_Type                object
Soil_Type                object
Farm_Size_Acres         float64
Irrigation_Available      int64
Soil_pH                 float64
Soil_Nitrogen           float64
Soil_Organic_Matter     float64
Temperature             float64
Rainfall                float64
Humidity                float64
Compatible                int64
dtype: object


In [6]:
# List the column labels
column_index = df.columns
print("Columns:", column_index)

Columns: Index(['Crop_Type', 'Soil_Type', 'Farm_Size_Acres', 'Irrigation_Available',
       'Soil_pH', 'Soil_Nitrogen', 'Soil_Organic_Matter', 'Temperature',
       'Rainfall', 'Humidity', 'Compatible'],
      dtype='object')


In [7]:
# Find unique soil types (the distinct values of the Soil_Type column)
print("Unique Soil Types:", df['Soil_Type'].unique())

Unique Soil Types: ['Red and Yellow soils' 'Alluvial soils' 'Laterite soils' 'Black soils']


In [8]:
# Find unique crop types (the distinct values of the Crop_Type column)
print("Unique Crop Types:", df['Crop_Type'].unique())

Unique Crop Types: ['summer paddy' 'Kulthi' 'Arhar' 'Gram' 'Pea' 'Soybean' 'Maize' 'Rice'
 'Mustard' 'Urd' 'Wheat' 'Masoor' 'Millets' 'Groundnut' 'Niger' 'Tiwra'
 'Til' 'Moong' 'Jwar']


In [9]:
# Tally how many rows fall under each soil type (most frequent first)
soil_type_counts = df['Soil_Type'].value_counts()
print("Soil Type Counts:\n", soil_type_counts)

Soil Type Counts:
 Laterite soils          2539
Alluvial soils          2528
Red and Yellow soils    2474
Black soils             2459
Name: Soil_Type, dtype: int64


In [10]:
# Label-encode the categorical feature columns, keeping each fitted encoder
# so the same string -> integer mapping can be reused at prediction time.
# NOTE: this overwrites the columns of `df` in place; re-running the cell
# after the column is already encoded would refit the encoder on integers.
categorical_features = ["Crop_Type"]
label_encoders = {name: LabelEncoder() for name in categorical_features}
for name, encoder in label_encoders.items():
    df[name] = encoder.fit_transform(df[name])

In [11]:
# Split the frame into model inputs and the label to predict:
# every column other than Soil_Type is a feature, Soil_Type is the target.
target_column = "Soil_Type"
y = df[target_column]
X = df.drop(columns=[target_column])

In [12]:
# Fit a dedicated encoder on the Soil_Type target, then swap the string
# labels for their integer codes. The encoder is stored next to the feature
# encoders so predicted codes can be mapped back to soil names later.
soil_encoder = LabelEncoder().fit(y)
y = soil_encoder.transform(y)
label_encoders["Soil_Type"] = soil_encoder

In [13]:
# Standardise the features (zero mean, unit variance per column).
# The statistics are computed over every row of X.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [14]:
# Train-test split
# Split the *unscaled* feature frame so the scaler can be fit on training
# rows only. X_scaled was produced by a scaler fit on every row, which leaks
# test-set statistics into preprocessing. The seed, test size and
# stratification are unchanged, so the row partition is the same as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Re-fit the scaler on the training rows and apply it to both splits.
# This replaces the earlier all-rows scaler, so it is also the one persisted
# with the model and used at prediction time.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [15]:
# Random forest configuration. class_weight="balanced" weights each class
# inversely to its frequency in the training labels; random_state pins the
# bootstrap/feature sampling so runs are repeatable.
rf_params = {
    "n_estimators": 200,
    "max_depth": 15,
    "class_weight": "balanced",
    "random_state": 42,
}
model = RandomForestClassifier(**rf_params)

In [16]:
# Fit the forest on the training split
model.fit(X=X_train, y=y_train)

In [17]:
# Predict encoded soil classes for the held-out split
y_pred = model.predict(X=X_test)

In [18]:
# Evaluate on the held-out split: overall accuracy, then per-class metrics.
# NOTE: the recorded run scores ~0.25 across four near-equally sized
# classes, i.e. no better than random guessing.
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(per_class_report)

Accuracy: 0.2495
              precision    recall  f1-score   support

           0       0.26      0.28      0.27       505
           1       0.24      0.23      0.23       492
           2       0.25      0.25      0.25       508
           3       0.25      0.24      0.24       495

    accuracy                           0.25      2000
   macro avg       0.25      0.25      0.25      2000
weighted avg       0.25      0.25      0.25      2000



In [19]:
# Persist everything inference needs: the fitted model, the label encoders,
# the scaler, and the ordered list of feature column names.
artifacts = {
    "../models/soil_recommendation_model.pkl": model,
    "../models/soil_label_encoders.pkl": label_encoders,
    "../models/soil_scaler.pkl": scaler,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)

# Column order matters: prediction inputs are rebuilt in this exact order.
feature_names = X.columns.tolist()
with open("../models/soil_feature_columns.json", "w") as columns_file:
    json.dump(feature_names, columns_file)

print("✅ Soil Recommendation Model saved successfully!")

✅ Soil Recommendation Model saved successfully!


In [20]:
# Load Necessary Libraries
import joblib
import json
import numpy as np
import pandas as pd

In [21]:
# Reload the persisted artifacts: model, label encoders, scaler, and the
# ordered feature column list.
# joblib files are pickles: only load them from a trusted location.
model, label_encoders, scaler = (
    joblib.load(artifact_path)
    for artifact_path in (
        "../models/soil_recommendation_model.pkl",
        "../models/soil_label_encoders.pkl",
        "../models/soil_scaler.pkl",
    )
)
with open("../models/soil_feature_columns.json") as columns_file:
    feature_columns = json.load(columns_file)

In [22]:
# PREDICTION FUNCTION
def predict_best_soil(input_data: dict) -> dict:
    """Predict the most likely soil type for a single observation.

    Relies on the module-level ``model``, ``scaler``, ``label_encoders`` and
    ``feature_columns`` loaded from disk.

    Args:
        input_data: Mapping of feature name -> raw value. Every name in
            ``feature_columns`` must be present; extra keys are ignored.
            ``Crop_Type`` is given as its original string label.

    Returns:
        A dict with ``"Best_Soil"`` (the predicted soil name) and
        ``"Probabilities"`` (soil name -> class probability as a float).

    Raises:
        ValueError: If a required feature is missing or ``Crop_Type`` is not
            one of the labels the encoder was fitted on.
    """
    # Fail loudly on missing features; building the frame would otherwise
    # fill them with NaN and the error (or a silent prediction) would show
    # up far from the cause.
    missing = [col for col in feature_columns if col not in input_data]
    if missing:
        raise ValueError(f"Missing required feature(s): {missing}")

    # One-row frame with columns in the exact training order
    df_input = pd.DataFrame([input_data], columns=feature_columns)

    # Encode categorical features with the encoder fitted at training time
    if "Crop_Type" in df_input.columns:
        crop_encoder = label_encoders["Crop_Type"]
        crop = input_data["Crop_Type"]
        known_crops = set(crop_encoder.classes_)
        if crop not in known_crops:
            raise ValueError(
                f"Unknown Crop_Type {crop!r}; expected one of {sorted(known_crops)}"
            )
        df_input["Crop_Type"] = crop_encoder.transform(df_input["Crop_Type"])

    # Scale features
    X_scaled = scaler.transform(df_input)

    # A single inference pass: the predicted class is the arg-max of the
    # class probabilities, so a separate predict() call is redundant.
    prob = model.predict_proba(X_scaled)[0]

    # Probabilities are ordered like model.classes_ (encoded labels), so
    # decode exactly those codes rather than assuming the encoder's class
    # list lines up with them.
    soil_names = label_encoders["Soil_Type"].inverse_transform(model.classes_)
    soil_label = soil_names[int(np.argmax(prob))]

    return {
        "Best_Soil": soil_label,
        "Probabilities": {
            soil: float(p)
            for soil, p in zip(soil_names, prob)
        }
    }

In [23]:
# Smoke-test the prediction function with one hand-written observation.
# NOTE(review): Soil_Nitrogen=0.25 and Rainfall=120 are far below the values
# visible in the df.head() preview (roughly 28-199 and 574-1900) -- confirm
# the units match the training data.
test_input = dict(
    Crop_Type="Rice",
    Farm_Size_Acres=5,
    Irrigation_Available=1,
    Soil_pH=6.8,
    Soil_Nitrogen=0.25,
    Soil_Organic_Matter=1.5,
    Temperature=28,
    Rainfall=120,
    Humidity=70,
    Compatible=1,
)

soil_prediction = predict_best_soil(test_input)
print("\n🌱 Best Soil Prediction:", soil_prediction)


🌱 Best Soil Prediction: {'Best_Soil': 'Black soils', 'Probabilities': {'Alluvial soils': 0.27871204027100654, 'Black soils': 0.31524102729785425, 'Laterite soils': 0.18389501137070755, 'Red and Yellow soils': 0.22215192106043177}}
