In [None]:
# Step 1: load the dataset and take a first look at it
import pandas as pd

df = pd.read_csv("final_balanced_crop_dataset_4600_all_districts.csv")
# One newline-separated print: the (rows, columns) shape first, then the first five rows.
print(df.shape, df.head(), sep="\n")

(4600, 16)
           state   district      N     P      K    pH  organic_carbon  \
0  Uttar Pradesh  Chandauli  213.9   3.3  114.2  6.38            0.96   
1    Maharashtra       Beed  165.0  26.1  165.4  5.08            0.93   
2          Bihar    Bhojpur  191.1  28.2   46.4  5.24            1.09   
3      Rajasthan  Rajsamand  186.4  17.5   55.9  6.04            0.55   
4      Rajasthan      Churu  203.7  15.5  194.4  5.87            1.03   

   soil_moisture     soil_type  temperature_c  humidity_pct  rainfall_mm  \
0           30.1        Clayey           24.0            46        217.6   
1           16.4  Black Cotton           27.5            49         37.7   
2           16.6      Laterite           30.5            61          0.0   
3           17.3         Sandy           26.9            64        239.5   
4           12.0         Sandy           29.6            48        142.9   

   wind_speed_ms  solar_radiation_wm2  evapotranspiration_mm         crop  
0           0.99 

In [2]:
# Step 2: show structure info and quick stats
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.isnull().sum())           # missing-value count per column
print(df['crop'].value_counts())   # number of rows per target class
print(df.describe())               # summary statistics of the numeric columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   state                  4600 non-null   object 
 1   district               4600 non-null   object 
 2   N                      4600 non-null   float64
 3   P                      4600 non-null   float64
 4   K                      4600 non-null   float64
 5   pH                     4600 non-null   float64
 6   organic_carbon         4600 non-null   float64
 7   soil_moisture          4600 non-null   float64
 8   soil_type              4600 non-null   object 
 9   temperature_c          4600 non-null   float64
 10  humidity_pct           4600 non-null   int64  
 11  rainfall_mm            4600 non-null   float64
 12  wind_speed_ms          4600 non-null   float64
 13  solar_radiation_wm2    4600 non-null   float64
 14  evapotranspiration_mm  4600 non-null   float64
 15  crop

In [4]:
# Step 3: drop obvious identifier columns
# `district` is removed only when the column is present and holds more than
# 50 distinct values; otherwise the frame is kept unchanged.
df = df.drop(columns='district') if ('district' in df and df['district'].nunique() > 50) else df
print(df.columns)

Index(['state', 'N', 'P', 'K', 'pH', 'organic_carbon', 'soil_moisture',
       'soil_type', 'temperature_c', 'humidity_pct', 'rainfall_mm',
       'wind_speed_ms', 'solar_radiation_wm2', 'evapotranspiration_mm',
       'crop'],
      dtype='object')


In [5]:
# Step 4: prepare X and y, show the label mapping
from sklearn.preprocessing import LabelEncoder

X = df.drop(columns=['crop'])   # features: every column except the target
y = df['crop'].astype(str)      # target labels as strings

le = LabelEncoder()
y_enc = le.fit_transform(y)     # integer class id for each row

# LabelEncoder encodes each class as its position in `le.classes_`, so enumerating
# that array gives the same mapping as le.transform(le.classes_) without the extra
# transform call, and with plain Python ints instead of np.int64(...) wrappers.
print({label: code for code, label in enumerate(le.classes_)})


{'Bajra': np.int64(0), 'Banana': np.int64(1), 'Bengal Gram': np.int64(2), 'Coconut': np.int64(3), 'Cotton': np.int64(4), 'Groundnut': np.int64(5), 'Maize': np.int64(6), 'Mirchi': np.int64(7), 'Moong': np.int64(8), 'Mustard': np.int64(9), 'Onion': np.int64(10), 'Potato': np.int64(11), 'Ragi': np.int64(12), 'Rice': np.int64(13), 'Sorghum': np.int64(14), 'Soybean': np.int64(15), 'Sugarcane': np.int64(16), 'Sunflower': np.int64(17), 'Tobacco': np.int64(18), 'Tomato': np.int64(19), 'Toor': np.int64(20), 'Urad': np.int64(21), 'Wheat': np.int64(22)}


In [6]:
# Step 5: split the feature columns by dtype
# Numeric dtypes go to num_cols; object/category dtypes go to cat_cols.
num_cols = list(X.select_dtypes(include='number').columns)
cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)
print("Numeric:", num_cols)
print("Categorical:", cat_cols)



Numeric: ['N', 'P', 'K', 'pH', 'organic_carbon', 'soil_moisture', 'temperature_c', 'humidity_pct', 'rainfall_mm', 'wind_speed_ms', 'solar_radiation_wm2', 'evapotranspiration_mm']
Categorical: ['state', 'soil_type']


In [None]:
# Step 6: build the preprocessing transformer (impute, encode, scale).
# It is only defined here; it is fitted further down, after the split, on the
# training rows only.
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numeric columns: median imputation followed by standardisation.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])
# Categorical columns: most-frequent imputation followed by dense one-hot encoding.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at transform time.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

preprocessor = ColumnTransformer(
    [('num', num_pipe, num_cols),
     ('cat', cat_pipe, cat_cols)],
    remainder='drop',      # columns outside num_cols/cat_cols are discarded
    sparse_threshold=0,    # always return a dense array
)

# Step 7: stratified 80/20 hold-out split.
# X and y_enc are the feature frame and encoded target prepared in Step 4; they are
# reused here rather than being rebuilt with a second LabelEncoder.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.20, random_state=42, stratify=y_enc
)

# Sanity check: fit on the training rows only and inspect a few transformed rows.
preprocessor.fit(X_train)
sample = preprocessor.transform(X_train.head(5))
print(sample)
print(sample.shape)


[[ 0.19893147 -1.59278251  0.02423594 -1.39499377  0.46281453 -0.94179839
  -0.42245864  0.2532529  -0.95653505 -0.06845166 -1.50092018  0.65263974
   0.          0.          0.          0.          0.          0.
   0.          0.          0.          1.          1.          0.
   0.          0.          0.          0.          0.        ]
 [ 0.73748152 -0.0393822   1.74440944  0.9701105  -0.80677731 -2.3025157
   0.12415643  1.38291433  0.61966241  1.69580719 -0.59768735 -0.11308822
   0.          0.          0.          0.          0.          0.
   0.          1.          0.          0.          0.          0.
   0.          0.          0.          1.          0.        ]
 [-0.20124114 -0.13143555  1.41737478 -1.76946862  0.42313978  1.09927758
  -1.81384246 -0.08564553  1.04251377 -0.82677345  0.25705527 -0.94688088
   0.          0.          1.          0.          0.          0.
   0.          0.          0.          0.          0.          0.
   0.          1.          0.      

In [11]:
# Step 8: train and evaluate a first model (logistic regression)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
# cross_val_score stays imported for any other cell that still calls it.
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, f1_score, classification_report

# `multi_class` is deprecated in recent scikit-learn releases; with the default
# solver a multiclass target is already fitted as a multinomial model, so the
# argument is dropped to avoid the deprecation warning.
pipe = Pipeline([('pre', preprocessor), ('clf', LogisticRegression(max_iter=1000))])

# Cross-validate on TRAIN only. One cross_validate call scores both metrics on the
# same five folds, so each fold is fitted once instead of once per metric.
cv_results = cross_validate(pipe, X_train, y_train, cv=5,
                            scoring=['accuracy', 'f1_macro'], n_jobs=1)
scores_acc = cv_results['test_accuracy']
scores_f1 = cv_results['test_f1_macro']
print("CV acc mean/std:", scores_acc.mean(), scores_acc.std())
print("CV f1_macro mean:", scores_f1.mean())

# NOTE(review): the recorded run of this cell reports ~0.047 CV accuracy, which is
# about 1/23 -- chance level for 23 equally frequent classes. Check whether the
# features carry any signal for `crop` before comparing models.

# Fit on the full training split and evaluate on the held-out TEST split
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print("Test accuracy:", accuracy_score(y_test, y_pred))
print("Test f1_macro:", f1_score(y_test, y_pred, average='macro'))
print(classification_report(y_test, y_pred, zero_division=0, digits=4))




CV acc mean/std: 0.0470108695652174 0.006291215708038164
CV f1_macro mean: 0.045877421239571656
Test accuracy: 0.05
Test f1_macro: 0.046329229069181514
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000        40
           1     0.0000    0.0000    0.0000        40
           2     0.0556    0.0500    0.0526        40
           3     0.0513    0.0500    0.0506        40
           4     0.0417    0.0250    0.0312        40
           5     0.0484    0.0750    0.0588        40
           6     0.1111    0.1250    0.1176        40
           7     0.0556    0.1000    0.0714        40
           8     0.0698    0.0750    0.0723        40
           9     0.0000    0.0000    0.0000        40
          10     0.0667    0.0750    0.0706        40
          11     0.0000    0.0000    0.0000        40
          12     0.0492    0.0750    0.0594        40
          13     0.0741    0.1000    0.0851        40
          14     0.0690    0.0500    



In [12]:
# Decision tree: cross-validate on the training split, then score on the hold-out split
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
# cross_val_score stays imported for any other cell that still calls it.
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Build pipeline
dt_pipe = Pipeline([
    ('preprocess', preprocessor),   # use same preprocessor
    ('model', DecisionTreeClassifier(random_state=42))
])

# Cross-validation on the train set: one cross_validate call scores accuracy and
# macro-F1 on the same five folds, so each fold is fitted only once.
dt_cv = cross_validate(dt_pipe, X_train, y_train, cv=5, scoring=['accuracy', 'f1_macro'])
dt_cv_acc = dt_cv['test_accuracy']
dt_cv_f1 = dt_cv['test_f1_macro']

print("Decision Tree CV Accuracy:", dt_cv_acc.mean())
print("Decision Tree CV F1:", dt_cv_f1.mean())

# Fit and test on test set
dt_pipe.fit(X_train, y_train)
y_pred_dt = dt_pipe.predict(X_test)

print("Decision Tree Test Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Decision Tree Test F1:", f1_score(y_test, y_pred_dt, average='macro'))
# zero_division=0 reports 0 for a class that is never predicted, without a warning
# (same setting as the logistic-regression report).
print(classification_report(y_test, y_pred_dt, zero_division=0))


Decision Tree CV Accuracy: 0.04891304347826087
Decision Tree CV F1: 0.048895988969093106
Decision Tree Test Accuracy: 0.04673913043478261
Decision Tree Test F1: 0.04747818267862513
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.02      0.03      0.02        40
           2       0.00      0.00      0.00        40
           3       0.00      0.00      0.00        40
           4       0.04      0.03      0.03        40
           5       0.05      0.05      0.05        40
           6       0.00      0.00      0.00        40
           7       0.16      0.12      0.14        40
           8       0.02      0.03      0.02        40
           9       0.04      0.05      0.05        40
          10       0.06      0.05      0.05        40
          11       0.02      0.03      0.02        40
          12       0.06      0.07      0.07        40
          13       0.05      0.05      0.05        40
        

In [13]:
# Random forest: cross-validate on the training split, then score on the hold-out split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

rf_pipe = Pipeline([
    ('preprocess', preprocessor),
    ('model', RandomForestClassifier(n_estimators=200, random_state=42))
])

# One cross_validate call scores accuracy and macro-F1 on the same five folds,
# so the 200-tree forest is fitted once per fold instead of twice.
rf_cv = cross_validate(rf_pipe, X_train, y_train, cv=5, scoring=['accuracy', 'f1_macro'])
rf_cv_acc = rf_cv['test_accuracy']
rf_cv_f1 = rf_cv['test_f1_macro']

print("Random Forest CV Accuracy:", rf_cv_acc.mean())
print("Random Forest CV F1:", rf_cv_f1.mean())

# Fit on the full training split and evaluate on the held-out test split
rf_pipe.fit(X_train, y_train)
y_pred_rf = rf_pipe.predict(X_test)

print("Random Forest Test Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest Test F1:", f1_score(y_test, y_pred_rf, average='macro'))
# zero_division=0 reports 0 for a class that is never predicted, without a warning.
print(classification_report(y_test, y_pred_rf, zero_division=0))


Random Forest CV Accuracy: 0.04646739130434782
Random Forest CV F1: 0.04599726163331714
Random Forest Test Accuracy: 0.04782608695652174
Random Forest Test F1: 0.04758085321375871
              precision    recall  f1-score   support

           0       0.05      0.05      0.05        40
           1       0.02      0.03      0.02        40
           2       0.11      0.10      0.11        40
           3       0.04      0.05      0.04        40
           4       0.00      0.00      0.00        40
           5       0.02      0.03      0.02        40
           6       0.03      0.03      0.03        40
           7       0.06      0.07      0.07        40
           8       0.02      0.03      0.02        40
           9       0.06      0.05      0.05        40
          10       0.07      0.10      0.08        40
          11       0.09      0.07      0.08        40
          12       0.02      0.03      0.02        40
          13       0.00      0.00      0.00        40
         

In [14]:
# Gradient boosting: cross-validate on the training split, then score on the hold-out split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_validate

gb_pipe = Pipeline([
    ('preprocess', preprocessor),
    ('model', GradientBoostingClassifier(random_state=42))
])

# One cross_validate call scores accuracy and macro-F1 on the same five folds,
# so the model is fitted once per fold instead of once per metric.
gb_cv = cross_validate(gb_pipe, X_train, y_train, cv=5, scoring=['accuracy', 'f1_macro'])
gb_cv_acc = gb_cv['test_accuracy']
gb_cv_f1 = gb_cv['test_f1_macro']

print("Gradient Boosting CV Accuracy:", gb_cv_acc.mean())
print("Gradient Boosting CV F1:", gb_cv_f1.mean())

# Fit on the full training split and evaluate on the held-out test split
gb_pipe.fit(X_train, y_train)
y_pred_gb = gb_pipe.predict(X_test)

print("Gradient Boosting Test Accuracy:", accuracy_score(y_test, y_pred_gb))
print("Gradient Boosting Test F1:", f1_score(y_test, y_pred_gb, average='macro'))
# zero_division=0 reports 0 for a class that is never predicted, without a warning.
print(classification_report(y_test, y_pred_gb, zero_division=0))


Gradient Boosting CV Accuracy: 0.04076086956521739
Gradient Boosting CV F1: 0.04009842120531465
Gradient Boosting Test Accuracy: 0.03152173913043478
Gradient Boosting Test F1: 0.031924179313963914
              precision    recall  f1-score   support

           0       0.12      0.10      0.11        40
           1       0.06      0.05      0.05        40
           2       0.05      0.05      0.05        40
           3       0.07      0.07      0.07        40
           4       0.03      0.03      0.03        40
           5       0.00      0.00      0.00        40
           6       0.00      0.00      0.00        40
           7       0.02      0.03      0.02        40
           8       0.04      0.05      0.05        40
           9       0.03      0.03      0.03        40
          10       0.00      0.00      0.00        40
          11       0.00      0.00      0.00        40
          12       0.04      0.05      0.05        40
          13       0.03      0.03      0.03   

In [15]:
# Extra trees: cross-validate on the training split, then score on the hold-out split
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import cross_validate

et_pipe = Pipeline([
    ('preprocess', preprocessor),
    ('model', ExtraTreesClassifier(n_estimators=300, random_state=42))
])

# One cross_validate call scores accuracy and macro-F1 on the same five folds,
# so the 300-tree ensemble is fitted once per fold instead of twice.
et_cv = cross_validate(et_pipe, X_train, y_train, cv=5, scoring=['accuracy', 'f1_macro'])
et_cv_acc = et_cv['test_accuracy']
et_cv_f1 = et_cv['test_f1_macro']

print("Extra Trees CV Accuracy:", et_cv_acc.mean())
print("Extra Trees CV F1:", et_cv_f1.mean())

# Fit on the full training split and evaluate on the held-out test split
et_pipe.fit(X_train, y_train)
y_pred_et = et_pipe.predict(X_test)

print("Extra Trees Test Accuracy:", accuracy_score(y_test, y_pred_et))
print("Extra Trees Test F1:", f1_score(y_test, y_pred_et, average='macro'))
# zero_division=0 reports 0 for a class that is never predicted, without a warning.
print(classification_report(y_test, y_pred_et, zero_division=0))


Extra Trees CV Accuracy: 0.04755434782608696
Extra Trees CV F1: 0.046874636193819416
Extra Trees Test Accuracy: 0.04891304347826087
Extra Trees Test F1: 0.04959036650845489
              precision    recall  f1-score   support

           0       0.02      0.03      0.02        40
           1       0.04      0.05      0.04        40
           2       0.07      0.07      0.07        40
           3       0.02      0.03      0.02        40
           4       0.07      0.05      0.06        40
           5       0.04      0.05      0.04        40
           6       0.11      0.07      0.09        40
           7       0.00      0.00      0.00        40
           8       0.03      0.03      0.03        40
           9       0.03      0.03      0.03        40
          10       0.09      0.10      0.10        40
          11       0.07      0.05      0.06        40
          12       0.07      0.07      0.07        40
          13       0.05      0.05      0.05        40
          14    

In [16]:
# k-nearest neighbours: cross-validate on the training split, then score on the hold-out split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_validate

knn_pipe = Pipeline([
    ('preprocess', preprocessor),
    ('model', KNeighborsClassifier(n_neighbors=5))
])

# One cross_validate call scores accuracy and macro-F1 on the same five folds,
# so each fold is fitted and predicted once instead of once per metric.
knn_cv = cross_validate(knn_pipe, X_train, y_train, cv=5, scoring=['accuracy', 'f1_macro'])
knn_cv_acc = knn_cv['test_accuracy']
knn_cv_f1 = knn_cv['test_f1_macro']

print("KNN CV Accuracy:", knn_cv_acc.mean())
print("KNN CV F1:", knn_cv_f1.mean())

# Fit on the full training split and evaluate on the held-out test split
knn_pipe.fit(X_train, y_train)
y_pred_knn = knn_pipe.predict(X_test)

print("KNN Test Accuracy:", accuracy_score(y_test, y_pred_knn))
print("KNN Test F1:", f1_score(y_test, y_pred_knn, average='macro'))
# zero_division=0 reports 0 for a class that is never predicted, without a warning.
print(classification_report(y_test, y_pred_knn, zero_division=0))


KNN CV Accuracy: 0.037228260869565225
KNN CV F1: 0.02946739870349337
KNN Test Accuracy: 0.042391304347826085
KNN Test F1: 0.037067906658821856
              precision    recall  f1-score   support

           0       0.02      0.07      0.03        40
           1       0.05      0.15      0.08        40
           2       0.03      0.07      0.04        40
           3       0.06      0.15      0.09        40
           4       0.03      0.05      0.04        40
           5       0.04      0.07      0.05        40
           6       0.03      0.03      0.03        40
           7       0.04      0.05      0.05        40
           8       0.12      0.07      0.09        40
           9       0.04      0.03      0.03        40
          10       0.00      0.00      0.00        40
          11       0.18      0.07      0.11        40
          12       0.00      0.00      0.00        40
          13       0.07      0.03      0.04        40
          14       0.00      0.00      0.00   

In [17]:
# Gaussian naive Bayes: cross-validate on the training split, then score on the hold-out split
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_validate

nb_pipe = Pipeline([
    ('preprocess', preprocessor),
    ('model', GaussianNB())
])

# One cross_validate call scores accuracy and macro-F1 on the same five folds,
# so each fold is fitted once instead of once per metric.
nb_cv = cross_validate(nb_pipe, X_train, y_train, cv=5, scoring=['accuracy', 'f1_macro'])
nb_cv_acc = nb_cv['test_accuracy']
nb_cv_f1 = nb_cv['test_f1_macro']

print("Naive Bayes CV Accuracy:", nb_cv_acc.mean())
print("Naive Bayes CV F1:", nb_cv_f1.mean())

# Fit on the full training split and evaluate on the held-out test split
nb_pipe.fit(X_train, y_train)
y_pred_nb = nb_pipe.predict(X_test)

print("Naive Bayes Test Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Naive Bayes Test F1:", f1_score(y_test, y_pred_nb, average='macro'))
# zero_division=0 reports 0 for a class that is never predicted, without a warning.
print(classification_report(y_test, y_pred_nb, zero_division=0))


Naive Bayes CV Accuracy: 0.050815217391304346
Naive Bayes CV F1: 0.048038095437600535
Naive Bayes Test Accuracy: 0.043478260869565216
Naive Bayes Test F1: 0.03957860105003538
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.02      0.03      0.02        40
           2       0.05      0.05      0.05        40
           3       0.03      0.03      0.03        40
           4       0.00      0.00      0.00        40
           5       0.02      0.03      0.02        40
           6       0.08      0.10      0.09        40
           7       0.04      0.05      0.04        40
           8       0.00      0.00      0.00        40
           9       0.00      0.00      0.00        40
          10       0.06      0.12      0.08        40
          11       0.00      0.00      0.00        40
          12       0.05      0.10      0.07        40
          13       0.05      0.07      0.06        40
          14  

In [18]:
# RBF-kernel SVM: cross-validate on the training split, then score on the hold-out split
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate

svm_pipe = Pipeline([
    ('preprocess', preprocessor),
    ('model', SVC(kernel='rbf'))
])

# One cross_validate call scores accuracy and macro-F1 on the same five folds,
# so the SVM is fitted once per fold instead of once per metric.
svm_cv = cross_validate(svm_pipe, X_train, y_train, cv=5, scoring=['accuracy', 'f1_macro'])
svm_cv_acc = svm_cv['test_accuracy']
svm_cv_f1 = svm_cv['test_f1_macro']

print("SVM CV Accuracy:", svm_cv_acc.mean())
print("SVM CV F1:", svm_cv_f1.mean())

# Fit on the full training split and evaluate on the held-out test split
svm_pipe.fit(X_train, y_train)
y_pred_svm = svm_pipe.predict(X_test)

print("SVM Test Accuracy:", accuracy_score(y_test, y_pred_svm))
print("SVM Test F1:", f1_score(y_test, y_pred_svm, average='macro'))
# zero_division=0 reports 0 for a class that is never predicted, without a warning.
print(classification_report(y_test, y_pred_svm, zero_division=0))


SVM CV Accuracy: 0.04592391304347826
SVM CV F1: 0.044547801797251055
SVM Test Accuracy: 0.041304347826086954
SVM Test F1: 0.03967665883411355
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.00      0.00      0.00        40
           2       0.09      0.10      0.09        40
           3       0.04      0.05      0.05        40
           4       0.06      0.07      0.07        40
           5       0.02      0.03      0.02        40
           6       0.00      0.00      0.00        40
           7       0.05      0.07      0.06        40
           8       0.06      0.07      0.06        40
           9       0.03      0.03      0.03        40
          10       0.04      0.05      0.04        40
          11       0.05      0.05      0.05        40
          12       0.07      0.07      0.07        40
          13       0.03      0.03      0.03        40
          14       0.04      0.03      0.03    

In [19]:
# MLP neural network: cross-validate on the training split, then score on the hold-out split
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_validate

mlp_pipe = Pipeline([
    ('preprocess', preprocessor),
    ('model', MLPClassifier(hidden_layer_sizes=(128,64), max_iter=500, random_state=42))
])

# One cross_validate call scores accuracy and macro-F1 on the same five folds,
# so the network is trained once per fold instead of once per metric.
mlp_cv = cross_validate(mlp_pipe, X_train, y_train, cv=5, scoring=['accuracy', 'f1_macro'])
mlp_cv_acc = mlp_cv['test_accuracy']
mlp_cv_f1 = mlp_cv['test_f1_macro']

print("MLP NN CV Accuracy:", mlp_cv_acc.mean())
print("MLP NN CV F1:", mlp_cv_f1.mean())

# Fit on the full training split and evaluate on the held-out test split
mlp_pipe.fit(X_train, y_train)
y_pred_mlp = mlp_pipe.predict(X_test)

print("MLP Test Accuracy:", accuracy_score(y_test, y_pred_mlp))
print("MLP Test F1:", f1_score(y_test, y_pred_mlp, average='macro'))
# zero_division=0 reports 0 for a class that is never predicted, without a warning.
print(classification_report(y_test, y_pred_mlp, zero_division=0))




MLP NN CV Accuracy: 0.049184782608695646
MLP NN CV F1: 0.04872765873455245
MLP Test Accuracy: 0.03695652173913044
MLP Test F1: 0.036223998890237065
              precision    recall  f1-score   support

           0       0.05      0.05      0.05        40
           1       0.07      0.07      0.07        40
           2       0.02      0.03      0.02        40
           3       0.00      0.00      0.00        40
           4       0.03      0.03      0.03        40
           5       0.03      0.03      0.03        40
           6       0.06      0.07      0.07        40
           7       0.05      0.05      0.05        40
           8       0.03      0.03      0.03        40
           9       0.02      0.03      0.02        40
          10       0.02      0.03      0.02        40
          11       0.02      0.03      0.02        40
          12       0.03      0.03      0.03        40
          13       0.00      0.00      0.00        40
          14       0.12      0.15      0.

