In [1]:
#  IMPORT LIBRARIES

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, confusion_matrix, classification_report
)

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier


In [5]:
#  LOAD DATA

# Raw string: the Windows path contains "\P" and "\d", which are invalid
# escape sequences in a normal string literal and trigger a SyntaxWarning.
# The resulting path value is unchanged.
file_path = r"E:\Projects\data2_Crop_data.csv" # dataset file path
df = pd.read_csv(file_path)

print("Columns:", df.columns.tolist())
print("Shape:", df.shape)
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line.
df.info()

Columns: ['STATE', 'SOIL_TYPE', 'N_SOIL', 'P_SOIL', 'K_SOIL', 'TEMPERATURE', 'HUMIDITY', 'ph', 'RAINFALL', 'CROP_PRICE', 'CROP']
Shape: (2200, 11)
                 STATE   SOIL_TYPE  N_SOIL  P_SOIL  K_SOIL  TEMPERATURE  \
0  Andaman and Nicobar  Sandy soil      90      42      43    20.879744   
1  Andaman and Nicobar  Sandy soil      85      58      41    21.770462   
2  Andaman and Nicobar  Sandy soil      60      55      44    23.004459   
3  Andaman and Nicobar  Sandy soil      74      35      40    26.491096   
4  Andaman and Nicobar  Sandy soil      78      42      42    20.130175   

    HUMIDITY        ph    RAINFALL  CROP_PRICE           CROP  
0  82.002744  6.502985  202.935536        7000     Amaranthus  
1  80.319644  7.038096  226.655537        5000   Green Banana  
2  82.320763  7.840207  263.964248        7000  Ladies Finger  
3  80.158363  6.980401  242.864034        7000   Bitter Gourd  
4  81.604873  7.628473  262.717340      120000   Black pepper  
<class 'pandas.cor

  file_path = "E:\Projects\data2_Crop_data.csv" # dataset file path


In [7]:

#  BASIC CLEANING & MISSING VALUES

# Per-column count of missing entries, printed for inspection.
print("Missing values:\n", df.isna().sum())

# Numeric columns; each one is median-imputed below if it contains NaNs.
numeric_cols = [
    "N_SOIL", "P_SOIL", "K_SOIL",
    "TEMPERATURE", "HUMIDITY", "ph",
    "RAINFALL", "CROP_PRICE"
]

# Fill gaps in each numeric column with that column's median.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on df[col]: that form operates on an intermediate
# selection, raises a FutureWarning in pandas 2.x, and does not modify
# df at all once Copy-on-Write is enabled.
for col in numeric_cols:
    if df[col].isna().any():
        df[col] = df[col].fillna(df[col].median())


Missing values:
 STATE          0
SOIL_TYPE      0
N_SOIL         0
P_SOIL         0
K_SOIL         0
TEMPERATURE    0
HUMIDITY       0
ph             0
RAINFALL       0
CROP_PRICE     0
CROP           0
dtype: int64


In [9]:

#  TARGET ENCODING (CROP)

# Name of the label column; it is overwritten in place with integer codes.
target_col = "CROP"

# Fit the encoder on the crop names and replace the column with the codes.
# NOTE(review): the printed mapping contains labels that differ only by case
# (e.g. 'Cluster Beans' / 'Cluster beans'); they are encoded as separate
# classes here - confirm whether they should be merged upstream.
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(df[target_col])
df[target_col] = encoded_target

# Mapping of crop name -> integer code, shown for reference.
crop_names = label_encoder.classes_
crop_to_code = dict(zip(crop_names, label_encoder.transform(crop_names)))

print("\nUnique crops (encoded) BEFORE filtering:")
print(crop_to_code)



Unique crops (encoded) BEFORE filtering:
{'Amaranthus': 0, 'Amla': 1, 'Amphophalus': 2, 'Apple': 3, 'Arecanut': 4, 'Ash Gourd': 5, 'Bajra': 6, 'Banana': 7, 'Barley': 8, 'Beans': 9, 'Beetroot': 10, 'Bengal Gram': 11, 'Betal Leaves': 12, 'Bitter Gourd': 13, 'Black Gram': 14, 'Black pepper': 15, 'Bottle Gourd': 16, 'Brinjal': 17, 'Broken Rice': 18, 'Cabbage': 19, 'Capsicum': 20, 'Carrot': 21, 'Cashewnuts': 22, 'Castor Seed': 23, 'Cauliflower': 24, 'Chana Dal': 25, 'Chholia': 26, 'Chilly Capsicum': 27, 'Cluster Beans': 28, 'Cluster beans': 29, 'Coconut': 30, 'Coconut Oil': 31, 'Coconut Seed': 32, 'Colacasia': 33, 'Copra': 34, 'Coriander': 35, 'Corriander seed': 36, 'Cotton': 37, 'Cowpea': 38, 'Cucumber': 39, 'Cumbu': 40, 'Drumstick': 41, 'Dry Chillies': 42, 'Duster Beans': 43, 'Elephat Yam': 44, 'Field Pea': 45, 'Fish': 46, 'French Beans': 47, 'Garlic': 48, 'Ghee': 49, 'Gingelly Oil': 50, 'Ginger': 51, 'Grapes': 52, 'Green Avare': 53, 'Green Banana': 54, 'Green Chilli': 55, 'Green Gram': 

In [11]:

#  REMOVE RARE CLASSES (COUNT < 2)

# Number of rows per encoded crop label.
class_counts = df[target_col].value_counts()
print("\nClass counts before filtering:\n", class_counts)

# Keep only the labels that occur at least twice, then drop every other row.
has_enough_rows = class_counts >= 2
valid_classes = class_counts.loc[has_enough_rows].index
keep_row = df[target_col].isin(valid_classes)
df = df.loc[keep_row].copy()

print("\nShape after removing rare classes:", df.shape)
print("Class counts after filtering:\n", df[target_col].value_counts())

# Sorted codes of the classes still present, plus their original crop names
# (same order), for use as labels / target_names in the evaluation reports.
present_classes = np.unique(df[target_col].to_numpy())
present_class_names = label_encoder.inverse_transform(present_classes)

print("\nNumber of classes after filtering:", len(present_classes))



Class counts before filtering:
 CROP
105    109
137    100
94     100
92      96
17      96
      ... 
36       1
59       1
132      1
31       1
103      1
Name: count, Length: 149, dtype: int64

Shape after removing rare classes: (2169, 11)
Class counts after filtering:
 CROP
105    109
94     100
137    100
92      96
17      96
      ... 
75       2
87       2
15       2
122      2
22       2
Name: count, Length: 118, dtype: int64

Number of classes after filtering: 118


In [15]:

#  FEATURE SELECTION

# Features: the numeric columns listed in numeric_cols.
# Target: the integer-encoded crop label.
X, y = df[numeric_cols], df[target_col]


In [17]:
# TRAIN-TEST SPLIT (stratified)

# 80/20 split with a fixed seed; stratify=y asks for the class proportions
# of y to be preserved in both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Train shape: {X_train.shape}")
print(f"Test shape : {X_test.shape}")

Train shape: (1735, 8)
Test shape : (434, 8)


In [21]:
# FEATURE SCALING

# Learn the scaling statistics from the training split only, then apply the
# same fitted transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_test)
)

In [25]:
#  DEFINE MODELS

# Candidate classifiers keyed by display name; insertion order is the order
# in which they are trained and reported. Seeded estimators use 42.
models = dict([
    ("DecisionTree", DecisionTreeClassifier(random_state=42)),
    ("RandomForest", RandomForestClassifier(n_estimators=200, random_state=42)),
    ("NaiveBayes", GaussianNB()),
    ("SVM", SVC(kernel="rbf", probability=True, random_state=42)),
    (
        "ANN",
        MLPClassifier(
            hidden_layer_sizes=(64, 32),
            activation="relu",
            max_iter=500,
            random_state=42,
        ),
    ),
])

In [27]:
#  TRAIN & EVALUATE EACH MODEL

# One summary record per model; turned into a comparison table afterwards.
results = []

# Shared settings for the class-weighted metrics; zero_division=0 makes
# undefined per-class scores count as 0.
weighted_kwargs = {"average": "weighted", "zero_division": 0}

for name, model in models.items():
    # Fit on the scaled training split, predict on the scaled test split.
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)

    acc = accuracy_score(y_test, y_pred)
    prec, rec, f1 = (
        metric(y_test, y_pred, **weighted_kwargs)
        for metric in (precision_score, recall_score, f1_score)
    )
    # Rows/columns follow present_classes (the classes kept after filtering).
    cm = confusion_matrix(y_test, y_pred, labels=present_classes)

    results.append(
        dict(Model=name, Accuracy=acc, Precision=prec, Recall=rec, F1_score=f1)
    )

    print("\n".join([
        f"\n========== {name} ==========",
        f"Accuracy : {acc:.4f}",
        f"Precision: {prec:.4f}",
        f"Recall   : {rec:.4f}",
        f"F1-score : {f1:.4f}",
    ]))
    print("Confusion Matrix (subset classes):\n", cm)

    # Per-class report restricted to the present labels, shown with their
    # original crop names.
    report = classification_report(
        y_test,
        y_pred,
        labels=present_classes,
        target_names=present_class_names,
        zero_division=0,
    )
    print("\nClassification Report:")
    print(report)


Accuracy : 0.1406
Precision: 0.1378
Recall   : 0.1406
F1-score : 0.1347
Confusion Matrix (subset classes):
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Classification Report:
                 precision    recall  f1-score   support

     Amaranthus       0.00      0.00      0.00         2
    Amphophalus       0.00      0.00      0.00         1
          Apple       0.17      0.17      0.17         6
       Arecanut       1.00      0.50      0.67         2
      Ash Gourd       0.00      0.00      0.00         2
          Bajra       0.00      0.00      0.00         3
         Banana       0.20      0.13      0.16        15
         Barley       0.50      0.50      0.50         2
          Beans       0.00      0.00      0.00         1
       Beetroot       0.00      0.00      0.00         3
    Bengal Gram       0.00      0.00      0.00         3
   Bitter Gourd       0.00      0.00      0.00         6
     



In [32]:
# MODEL COMPARISON TABLE

# One row per model, ordered from highest to lowest test accuracy.
results_df = pd.DataFrame(results).sort_values(by="Accuracy", ascending=False)

print("\n=== Model Comparison (Sorted by Accuracy) ===")
print(results_df)

# After sorting, the first row is the most accurate model.
best_model_name = results_df["Model"].iloc[0]
print("\nBest model based on Accuracy:", best_model_name)


=== Model Comparison (Sorted by Accuracy) ===
          Model  Accuracy  Precision    Recall  F1_score
1  RandomForest  0.198157   0.143997  0.198157  0.159201
4           ANN  0.158986   0.130451  0.158986  0.139011
0  DecisionTree  0.140553   0.137778  0.140553  0.134653
3           SVM  0.112903   0.076896  0.112903  0.079641
2    NaiveBayes  0.099078   0.074827  0.099078  0.072853

Best model based on Accuracy: RandomForest
