In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

df = pd.read_csv('Crop_recommendation.csv')

df.sample(5)

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
434,36,61,21,34.538239,39.044689,5.617008,168.594832,pigeonpeas
985,39,25,36,18.90223,94.998975,5.567805,107.610321,pomegranate
1478,113,28,48,28.87726,92.488397,6.170521,24.442676,muskmelon
1919,117,56,15,25.992374,77.054355,7.368258,89.118821,cotton
523,28,48,15,25.161254,55.254358,9.254089,40.897328,mothbeans


## Data Exploration and Pre-processing

In [3]:
df.size

17600

In [4]:
df.shape

(2200, 8)

In [5]:
df['label'].unique()

array(['rice', 'maize', 'chickpea(mbaazi)', 'kidneybeans', 'pigeonpeas',
       'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
       'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
       'orange', 'pawpaw', 'coconut', 'cotton', 'jute(mrenda)', 'coffee'],
      dtype=object)

In [6]:
df.duplicated().sum()

0

In [7]:
df.isnull().sum()

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64

In [8]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

# df['label'] = encoder.fit_transform(df['label'])

label_fit = encoder.fit(df['label'])
df['label'] = label_fit.transform(df['label'])

# Key value pairs of the transformed/encoded labels
labels = dict(zip(label_fit.classes_, label_fit.transform(label_fit.classes_)))

In [9]:
# labels
# {'apple': 0, 'banana': 1, 'blackgram': 2, 'chickpea(mbaazi)': 3, 'coconut': 4, 'coffee': 5, 'cotton': 6, 'grapes': 7, 'jute(mrenda)': 8, 'kidneybeans': 9, 'lentil': 10, 'maize': 11, 'mango': 12, 'mothbeans': 13, 'mungbean': 14, 'muskmelon': 15, 'orange': 16, 'pawpaw': 17, 'pigeonpeas': 18, 'pomegranate': 19, 'rice': 20, 'watermelon': 21}

In [10]:
X = df.drop(['label'], axis = 1)
y = df['label']

## Model Training

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [12]:
# Base models: Decision Tree, SVM , KNN

svm = SVC()
dt = DecisionTreeClassifier()
knn = KNeighborsClassifier(n_neighbors=3)

### SVM

In [13]:
# Define a parameter grid for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear'],
    'gamma': ['auto']
}

# Create a GridSearchCV object for hyperparameter tuning
svm = GridSearchCV(svm, param_grid, cv=5, scoring='accuracy', verbose=1)

In [14]:
models = [svm, dt, knn]

for model in models:
  model.fit(X_train, y_train)
  model_predict = model.predict(X_test)
  score = metrics.accuracy_score(y_test, model_predict)
  print("-------" * 4, model, "-------" * 4)
  print("Accuracy score: ", score)
  print(classification_report(y_test, model_predict))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
---------------------------- GridSearchCV(cv=5, estimator=SVC(),
             param_grid={'C': [0.1, 1, 10], 'gamma': ['auto'],
                         'kernel': ['linear']},
             scoring='accuracy', verbose=1) ----------------------------
Accuracy score:  0.9795454545454545
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        21
           2       0.95      1.00      0.98        20
           3       1.00      1.00      1.00        26
           4       1.00      1.00      1.00        27
           5       0.94      1.00      0.97        17
           6       0.94      1.00      0.97        17
           7       1.00      1.00      1.00        14
           8       0.87      0.87      0.87        23
           9       1.00      1.00      1.00        20
          10       1.00      1.00      1.00        11
      

## Ensemble Stacking

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

In [16]:
# Define the base models
base_models = [
    ('svm', svm),
    ('dt', dt),
    ('knn', knn)
]

# Define the meta-learner (classifier)
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1900)

# Create the stacking classifier
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=lr)

# Fit the stacking classifier on the training data
stacking_classifier.fit(X_train, y_train)

# Make predictions using the stacking classifier
stack_predictions = stacking_classifier.predict(X_test)


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [17]:
# Evaluate the stacking classifier's performance

print("Accuracy score: ", metrics.accuracy_score(y_test, stack_predictions))
print(classification_report(y_test, stack_predictions))

Accuracy score:  0.9818181818181818
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        21
           2       0.95      1.00      0.98        20
           3       1.00      1.00      1.00        26
           4       1.00      1.00      1.00        27
           5       0.94      1.00      0.97        17
           6       0.94      1.00      0.97        17
           7       1.00      1.00      1.00        14
           8       0.91      0.87      0.89        23
           9       1.00      1.00      1.00        20
          10       0.92      1.00      0.96        11
          11       1.00      0.95      0.98        21
          12       1.00      1.00      1.00        19
          13       1.00      0.96      0.98        24
          14       1.00      1.00      1.00        19
          15       1.00      1.00      1.00        17
          16       1.00      1.00      1.00  

## Export Model

In [18]:
import joblib

joblib.dump(stacking_classifier, 'model.pkl')

['model.pkl']