In [3]:
import os
import json
import joblib
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

FEATURES = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
TARGET = 'label'


In [5]:
df = pd.read_csv('Crop_recommendation.csv')
print("Shape:", df.shape)
df.head()

Shape: (2200, 8)


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [6]:
# Basic null check and class distribution
print(df.isna().sum())
print("\nClass counts:")
print(df[TARGET].value_counts().head(10))

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64

Class counts:
label
rice           100
maize          100
chickpea       100
kidneybeans    100
pigeonpeas     100
mothbeans      100
mungbean       100
blackgram      100
lentil         100
pomegranate    100
Name: count, dtype: int64


In [7]:
# Copy to avoid SettingWithCopy warnings
data = df.copy()

# Ensure dtypes
data[FEATURES] = data[FEATURES].astype(float)
data[TARGET] = data[TARGET].astype(str)

# Encode labels
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data[TARGET].values)

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(data[FEATURES].values)

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train:", X_train.shape, "Test:", X_test.shape, "Classes:", len(label_encoder.classes_))


Train: (1760, 7) Test: (440, 7) Classes: 22


In [8]:
candidates = {
    "RandomForest": RandomForestClassifier(
        n_estimators=200, max_depth=None, min_samples_leaf=1, random_state=42
    ),
    "SVM_RBF": SVC(kernel='rbf', probability=True, C=3.0, gamma='scale', random_state=42)
}

trained = {}
scores = {}

for name, model in candidates.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    scores[name] = acc
    trained[name] = model
    print(f"{name} accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))


RandomForest accuracy: 0.9955
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00        20
           2       1.00      0.95      0.97        20
           3       1.00      1.00      1.00        20
           4       1.00      1.00      1.00        20
           5       1.00      1.00      1.00        20
           6       1.00      1.00      1.00        20
           7       1.00      1.00      1.00        20
           8       0.95      1.00      0.98        20
           9       1.00      1.00      1.00        20
          10       1.00      1.00      1.00        20
          11       0.95      1.00      0.98        20
          12       1.00      1.00      1.00        20
          13       1.00      1.00      1.00        20
          14       1.00      1.00      1.00        20
          15       1.00      1.00      1.00        20
          16       1.00      1.00      1.00        

In [9]:
best_name = max(scores, key=scores.get)
best_model = trained[best_name]

print("Selected model:", best_name, "with accuracy:", scores[best_name])

BEST_MODEL_PATH = os.path.join(MODEL_DIR, "best_model.pkl")
SCALER_PATH = os.path.join(MODEL_DIR, "scaler.pkl")
ENCODER_PATH = os.path.join(MODEL_DIR, "label_encoder.pkl")

joblib.dump(best_model, BEST_MODEL_PATH)
joblib.dump(scaler, SCALER_PATH)
joblib.dump(label_encoder, ENCODER_PATH)

print("Saved:", BEST_MODEL_PATH)
print("Saved:", SCALER_PATH)
print("Saved:", ENCODER_PATH)
print("All scores:", json.dumps(scores, indent=2))


Selected model: RandomForest with accuracy: 0.9954545454545455
Saved: models\best_model.pkl
Saved: models\scaler.pkl
Saved: models\label_encoder.pkl
All scores: {
  "RandomForest": 0.9954545454545455,
  "SVM_RBF": 0.9886363636363636
}


In [20]:
class CropRecommender:
    def __init__(self, model_path, scaler_path, encoder_path):
        self.model = joblib.load(model_path)
        self.scaler = joblib.load(scaler_path)
        self.encoder = joblib.load(encoder_path)
        self.order = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']

    def predict_one(self, features_dict):
        x = np.array([[features_dict[k] for k in self.order]], dtype=float)
        xs = self.scaler.transform(x)
        proba = self.model.predict_proba(xs)[0]  # Extract the first row
        idx = np.argmax(proba)
        return {
            "crop": self.encoder.classes_[idx],
            "confidence": float(proba[idx])
        }

    def top_n(self, features_dict, n=5):
        x = np.array([[features_dict[k] for k in self.order]], dtype=float)
        xs = self.scaler.transform(x)
        proba = self.model.predict_proba(xs)[0]  # Extract the first row
        idx = np.argsort(proba)[::-1][:n]
        return [
            {"crop": self.encoder.classes_[i], "score": float(proba[i])}
            for i in idx
        ]

recommender = CropRecommender(BEST_MODEL_PATH, SCALER_PATH, ENCODER_PATH)


In [21]:
example = {
    "N": 90,
    "P": 42,
    "K": 43,
    "temperature": 24.0,
    "humidity": 80.0,
    "ph": 6.5,
    "rainfall": 200.0,
}

best = recommender.predict_one(example)
top5 = recommender.top_n(example, n=5)
best, top5


({'crop': 'rice', 'confidence': 0.52},
 [{'crop': 'rice', 'score': 0.52},
  {'crop': 'jute', 'score': 0.475},
  {'crop': 'watermelon', 'score': 0.005},
  {'crop': 'pomegranate', 'score': 0.0},
  {'crop': 'papaya', 'score': 0.0}])