In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import joblib

# Train a random-forest crop classifier from crop_data.csv: engineer two
# derived features, tune a small hyperparameter grid with cross-validation,
# report metrics on a held-out split, and persist the fitted model together
# with the label encoder.

# One seed shared by every stochastic step (the train/test split and the
# forest itself) so that Restart & Run All reproduces the same selected
# hyperparameters, the same report and the same saved model.
SEED = 42

# Load and verify data
df = pd.read_csv('crop_data.csv')
print("Original Crop Distribution:")
print(df['label'].value_counts())

# Feature engineering.
# nutrient_balance divides by K, so a zero K would put inf/NaN into the
# feature matrix. Stop here with an explicit message instead of handing
# non-finite values on to the estimator.
zero_k_rows = int((df['K'] == 0).sum())
if zero_k_rows:
    raise ValueError(
        f"{zero_k_rows} row(s) have K == 0; "
        "nutrient_balance = (N + P) / K is undefined for them"
    )
df['nutrient_balance'] = (df['N'] + df['P']) / df['K']
df['temp_humidity_index'] = (df['temperature'] * df['humidity']) / 100

# Create feature matrix: the seven raw columns plus the two derived ones.
X = df[['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall',
        'nutrient_balance', 'temp_humidity_index']]

# Encode the string crop labels as integers; `le` is saved below so the
# integer predictions can be mapped back to crop names later.
le = LabelEncoder()
y = le.fit_transform(df['label'])

# Train-test split: 20% held out, stratified on the encoded label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

model = GridSearchCV(
    # random_state pins the forest's internal randomness; without it the
    # cross-validation scores and therefore best_params_ can vary per run.
    RandomForestClassifier(class_weight='balanced', random_state=SEED),
    param_grid,
    cv=5,
    scoring='f1_weighted'
)
model.fit(X_train, y_train)

# Evaluate the selected configuration on the held-out split.
print("\nBest Parameters:", model.best_params_)
print(classification_report(y_test, model.predict(X_test),
                            target_names=le.classes_))

# Save artifacts: the best estimator found by the search and the label encoder.
joblib.dump(model.best_estimator_, 'model.pkl')
joblib.dump(le, 'label_encoder.pkl')
print(f"\nSaved {len(le.classes_)} crops: {list(le.classes_)}")

Original Crop Distribution:
label
rice           100
maize          100
chickpea       100
kidneybeans    100
pigeonpeas     100
mothbeans      100
mungbean       100
blackgram      100
lentil         100
pomegranate    100
banana         100
mango          100
grapes         100
watermelon     100
muskmelon      100
apple          100
orange         100
papaya         100
coconut        100
cotton         100
jute           100
coffee         100
Name: count, dtype: int64

Best Parameters: {'max_depth': 20, 'min_samples_split': 2, 'n_estimators': 100}
              precision    recall  f1-score   support

       apple       1.00      1.00      1.00        20
      banana       1.00      1.00      1.00        20
   blackgram       1.00      0.95      0.97        20
    chickpea       1.00      1.00      1.00        20
     coconut       1.00      1.00      1.00        20
      coffee       1.00      1.00      1.00        20
      cotton       1.00      1.00      1.00        20
      gr