In [1]:
# Import necessary libraries
import json

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Read the soil/climate CSV (path is relative to the notebook's working directory)
dataset_path = "../data/soil_climate.csv"
df = pd.read_csv(dataset_path)

In [3]:
# Preview the first five rows as the cell's rich output
df.head(n=5)

Unnamed: 0,Crop_Type,Soil_Type,Farm_Size_Acres,Irrigation_Available,Soil_pH,Soil_Nitrogen,Soil_Organic_Matter,Temperature,Rainfall,Humidity,Compatible
0,summer paddy,Red and Yellow soils,79.838232,0,5.17829,198.870486,1.45855,20.172143,1861.635725,57.924332,0
1,Kulthi,Alluvial soils,33.932796,1,4.862699,27.78168,2.530317,33.646919,1400.435779,53.26191,0
2,Arhar,Laterite soils,59.673206,1,7.691357,62.500094,4.656399,22.535805,574.308028,65.547263,0
3,Gram,Alluvial soils,50.000261,0,6.482151,134.655093,2.268048,25.672081,1900.397115,34.972994,0
4,summer paddy,Red and Yellow soils,94.628058,0,6.054078,69.894889,1.448071,14.366488,1568.615247,25.953544,0


In [4]:
# Report the (rows, columns) dimensions of the loaded frame
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)

Dataset Shape: (10000, 11)


In [5]:
# List each column's dtype; `object` columns are the categorical ones to encode
column_dtypes = df.dtypes
print(column_dtypes)

Crop_Type                object
Soil_Type                object
Farm_Size_Acres         float64
Irrigation_Available      int64
Soil_pH                 float64
Soil_Nitrogen           float64
Soil_Organic_Matter     float64
Temperature             float64
Rainfall                float64
Humidity                float64
Compatible                int64
dtype: object


In [6]:
# Show the column index so feature and target names are visible
column_index = df.columns
print("Columns:", column_index)

Columns: Index(['Crop_Type', 'Soil_Type', 'Farm_Size_Acres', 'Irrigation_Available',
       'Soil_pH', 'Soil_Nitrogen', 'Soil_Organic_Matter', 'Temperature',
       'Rainfall', 'Humidity', 'Compatible'],
      dtype='object')


In [7]:
# Encode categorical columns
# Each listed column in `df` is replaced in place by integer codes, and the
# fitted encoder is stored under the column name so it can be saved and reused.
label_encoders = {}
for col in ["Crop_Type", "Soil_Type"]:
    # Guard against re-running this cell on an already-encoded frame: that
    # would fit the encoder on integer codes and silently discard the
    # original string -> code mapping that is pickled later in the notebook.
    if pd.api.types.is_numeric_dtype(df[col]):
        raise RuntimeError(
            f"Column {col!r} is already numeric; re-run the data loading cell "
            "before encoding again."
        )
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [8]:
# Separate the predictors from the label column
target_column = "Compatible"
X = df.drop(columns=[target_column])
y = df[target_column]

In [9]:
# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [10]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [11]:
# Random Forest configured with balanced class weights
rf_params = {
    "n_estimators": 200,
    "max_depth": 15,
    "class_weight": "balanced",
    "random_state": 42,  # fixed seed so repeated runs build the same forest
}
model = RandomForestClassifier(**rf_params)

In [12]:
# Fit the forest on the training split
model.fit(X=X_train, y=y_train)

In [13]:
# Predict class labels for the held-out test split
y_pred = model.predict(X=X_test)

In [14]:
# Evaluate on the test split: overall accuracy, then per-class metrics
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

per_class_report = classification_report(y_test, y_pred)
print(per_class_report)

Accuracy: 0.998
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1858
           1       1.00      0.97      0.99       142

    accuracy                           1.00      2000
   macro avg       1.00      0.99      0.99      2000
weighted avg       1.00      1.00      1.00      2000



In [19]:
# Persist everything needed to reproduce predictions outside this notebook.
# `joblib` and `json` are imported in the notebook's top import cell so a
# fresh Restart & Run All does not depend on imports scattered mid-notebook.
artifacts = {
    "soil_climate_model.pkl": model,        # trained RandomForestClassifier
    "label_encoders.pkl": label_encoders,   # {column name: fitted LabelEncoder}
    "scaler.pkl": scaler,                   # fitted StandardScaler
}
for filename, obj in artifacts.items():
    joblib.dump(obj, filename)

# Save feature column names in training order; they are taken from the
# DataFrame, which still carries column labels.
feature_columns = df.drop("Compatible", axis=1).columns.tolist()
with open("feature_columns.json", "w") as f:
    json.dump(feature_columns, f)

print("✅ Model and related objects saved successfully!")

NameError: name 'best_model' is not defined