In [4]:
# ----------------- Imports -----------------
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.exceptions import ConvergenceWarning
import warnings

warnings.filterwarnings("ignore", category=ConvergenceWarning)
# 2. Load your 6600+ row dataset
df = pd.read_csv(r'../data/crop_recommendation_6600_realistic.csv')  # Adjust path if needed
X = df.drop('label', axis=1)
y = df['label']
df.head()






Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,93.113649,91.516982,46.173434,27.783575,83.949361,6.080523,117.102538,banana
1,21.995568,35.747756,20.798377,27.519434,86.402826,6.606874,37.612247,mungbean
2,27.0,30.0,5.0,32.717485,90.546083,7.656978,113.328978,orange
3,60.866144,55.120913,36.749537,26.071922,80.1555,7.122667,150.817129,jute
4,24.0,44.0,17.0,29.859691,80.034996,6.666381,50.664875,mungbean


In [2]:
# ----------------- Encode Target -----------------
le = LabelEncoder()
y_encoded = le.fit_transform(y)
joblib.dump(le, '../app/label_encoder.pkl')

['../app/label_encoder.pkl']

In [3]:
# ----------------- Train/Test Split -----------------
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# ----------------- Scale Data -----------------
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, '../app/scaler.pkl')



['../app/scaler.pkl']

In [4]:
model_scores = {}

In [5]:

# ----------------- Random Forest -----------------
rf_params = {'n_estimators': [100], 'max_depth': [None], 'min_samples_split': [2]}
rf_grid = GridSearchCV(RandomForestClassifier(), rf_params, cv=3, scoring='accuracy')
rf_grid.fit(X_train, y_train)
rf_model = rf_grid.best_estimator_
model_scores["Random Forest"] = accuracy_score(y_test, rf_model.predict(X_test))
joblib.dump(rf_model, '../app/rf_model.pkl')

['../app/rf_model.pkl']

In [6]:

# ----------------- Logistic Regression -----------------
lr_params = {'C': [1.0], 'solver': ['lbfgs'], 'max_iter': [500]}
lr_grid = GridSearchCV(LogisticRegression(), lr_params, cv=3, scoring='accuracy')
lr_grid.fit(X_train_scaled, y_train)
lr_model = lr_grid.best_estimator_
model_scores["Logistic Regression"] = accuracy_score(y_test, lr_model.predict(X_test_scaled))
joblib.dump(lr_model, '../app/lr_model.pkl')



['../app/lr_model.pkl']

In [7]:
# ----------------- KNN -----------------
knn_params = {'n_neighbors': [5]}
knn_grid = GridSearchCV(KNeighborsClassifier(), knn_params, cv=3, scoring='accuracy')
knn_grid.fit(X_train_scaled, y_train)
knn_model = knn_grid.best_estimator_
model_scores["KNN"] = accuracy_score(y_test, knn_model.predict(X_test_scaled))
joblib.dump(knn_model, '../app/knn_model.pkl')

['../app/knn_model.pkl']

In [8]:
# ----------------- XGBoost -----------------
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb.fit(X_train, y_train)
model_scores["XGBoost"] = accuracy_score(y_test, xgb.predict(X_test))
joblib.dump(xgb, '../app/xgb_model.pkl')


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


['../app/xgb_model.pkl']

In [9]:
# ----------------- Show Results -----------------
for model, score in model_scores.items():
    print(f"{model} Accuracy: {round(score*100, 2)}%")

Random Forest Accuracy: 99.85%
Logistic Regression Accuracy: 98.41%
KNN Accuracy: 98.64%
XGBoost Accuracy: 99.39%


In [10]:
model_scores = {
    "Random Forest": 0.997,
    "Logistic Regression": 0.9841,
    "KNN": 0.9864,
    "XGBoost": 0.9939
}
joblib.dump(model_scores, "../app/model_accuracies.pkl")

['../app/model_accuracies.pkl']