In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd

df = pd.read_csv('Crop_recommendation.csv')

df.sample(5)

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
2130,98,27,27,24.713841,51.291425,7.23811,197.643971,coffee
211,43,66,79,19.46234,15.22539,7.976608,74.585651,wheat
1424,102,25,50,28.204808,92.914404,6.099662,20.360011,potato
1705,70,68,45,33.835086,92.854702,6.991626,203.404403,pawpaw
1179,0,17,30,35.474783,47.972305,6.279134,97.790725,mango


## Data Exploration and Pre-processing

In [3]:
df.size

17600

In [4]:
df.shape

(2200, 8)

In [5]:
df['label'].unique()

array(['rice', 'maize', 'wheat', 'beans', 'pigeonpeas(mbaazi)', 'sorghum',
       'cassava', 'blackgram', 'lentil(kamande)', 'avocado', 'banana',
       'mango', 'cabbage', 'watermelon', 'potato', 'tea', 'orange',
       'pawpaw', 'tomato', 'millet', 'groundnut', 'coffee'], dtype=object)

In [6]:
df.duplicated().sum()

0

In [7]:
df.isnull().sum()

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64

In [8]:
# Label encoding
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

label_fit = encoder.fit(df['label'])
df['label'] = label_fit.transform(df['label'])

# Key value pairs of the transformed/encoded labels
labels = dict(zip(label_fit.classes_, label_fit.transform(label_fit.classes_)))

In [9]:
# labels
# {'apple': 0, 'banana': 1, 'blackgram': 2, 'chickpea(mbaazi)': 3, 'coconut': 4, 'coffee': 5, 'cotton': 6, 'grapes': 7, 'jute(mrenda)': 8, 'kidneybeans': 9, 'lentil': 10, 'maize': 11, 'mango': 12, 'mothbeans': 13, 'mungbean': 14, 'muskmelon': 15, 'orange': 16, 'pawpaw': 17, 'pigeonpeas': 18, 'pomegranate': 19, 'rice': 20, 'watermelon': 21}

In [10]:
X = df.drop(['label'], axis = 1)
y = df['label']

## Model Training

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

In [12]:
# Base models: Decision Tree, SVM , KNN

svm = SVC()
dt = DecisionTreeClassifier()
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')

### SVM

In [13]:
# Define a parameter grid for hyperparameter tuning
svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear'],
    'gamma': ['auto']
}

# Create a GridSearchCV object for hyperparameter tuning
svm = GridSearchCV(svm, svm_param_grid, cv=5, scoring='accuracy', verbose=1)

### Decision Tree

In [14]:
dt_param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2, 5, 10, 20]
}

dt = GridSearchCV(dt, dt_param_grid, cv=5)

### Training models and checking accuracy

In [15]:
models = [svm, dt, knn]

for model in models:
  model.fit(X_train, y_train)
  model_predict = model.predict(X_test)
  score = metrics.accuracy_score(y_test, model_predict)
  print("-------" * 4, model, "-------" * 4) if model == knn else print("-------" * 4, model.best_estimator_.__class__.__name__ , "-------" * 4)
  print("Accuracy score: ", score)
  print(classification_report(y_test, model_predict))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
---------------------------- SVC ----------------------------
Accuracy score:  0.9795454545454545
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        20
           3       0.95      1.00      0.98        20
           4       1.00      1.00      1.00        14
           5       1.00      1.00      1.00        19
           6       0.94      1.00      0.97        17
           7       0.87      0.87      0.87        23
           8       1.00      1.00      1.00        11
           9       1.00      0.95      0.98        21
          10       0.95      1.00      0.97        19
          11       0.94      1.00      0.97        17
          12       1.00      1.00      1.00        14
          13       0.96      1.00      0.98        23
          14       1.00      0.

## Ensemble Stacking

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

In [17]:
# Define the base models
base_models = [
    ('svm', svm),
    ('dt', dt),
    ('knn', knn)
]

# Define the meta-learner (classifier)
lr = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1900)

# Create the stacking classifier
stacking_classifier = StackingClassifier(estimators=base_models, final_estimator=lr)

# Fit the stacking classifier on the training data
stacking_classifier.fit(X_train, y_train)

# Make predictions using the stacking classifier
stack_predictions = stacking_classifier.predict(X_test)


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [18]:
# check if the stacking ensemble classifier is overfitting

# Perform k-fold cross-validation with k=5 (5 folds)
stacking_scores = cross_val_score(dt, X, y, cv=5)

# Print the cross-validation scores
print("Cross-validation scores:", stacking_scores)

# Calculate and print the mean and standard deviation of the scores
print("Mean accuracy: {:.2f}".format(stacking_scores.mean()))
print("Standard deviation: {:.2f}".format(stacking_scores.std()))


Cross-validation scores: [0.98636364 0.98409091 0.98863636 0.98863636 0.98409091]
Mean accuracy: 0.99
Standard deviation: 0.00


In [19]:
# Evaluate the stacking classifier's performance

print("Accuracy score: ", metrics.accuracy_score(y_test, stack_predictions))
print(classification_report(y_test, stack_predictions))

Accuracy score:  0.9818181818181818
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        23
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        20
           3       0.95      1.00      0.98        20
           4       1.00      1.00      1.00        14
           5       1.00      1.00      1.00        19
           6       0.94      1.00      0.97        17
           7       0.91      0.87      0.89        23
           8       0.92      1.00      0.96        11
           9       1.00      0.95      0.98        21
          10       1.00      1.00      1.00        19
          11       0.94      1.00      0.97        17
          12       1.00      1.00      1.00        14
          13       0.96      1.00      0.98        23
          14       1.00      0.96      0.98        23
          15       1.00      1.00      1.00        17
          16       0.94      0.89      0.92  

## Export Model

In [20]:
import joblib

joblib.dump(stacking_classifier, 'model.pkl')

['model.pkl']