# A model to recommend crops based on features

In [66]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [67]:
# Load the crop recommendation dataset into a DataFrame.
# The path is relative, so the CSV must sit in the notebook's working directory.
crop_data = pd.read_csv("Crop_recommendation_data.csv")

In [68]:
# Preview the first five rows to confirm the file parsed into the expected columns.
crop_data.head(5)

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [69]:
# Column dtypes and non-null counts. In the recorded run every column reports
# 2200 non-null values out of 2200 rows, i.e. the dataset has no missing values.
crop_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


In [70]:
# Summary statistics for the numeric feature columns (the string `label` column is excluded).
crop_data.describe()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
count,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0
mean,50.551818,53.362727,48.149091,25.616244,71.481779,6.46948,103.463655
std,36.917334,32.985883,50.647931,5.063749,22.263812,0.773938,54.958389
min,0.0,5.0,5.0,8.825675,14.25804,3.504752,20.211267
25%,21.0,28.0,20.0,22.769375,60.261953,5.971693,64.551686
50%,37.0,51.0,32.0,25.598693,80.473146,6.425045,94.867624
75%,84.25,68.0,49.0,28.561654,89.948771,6.923643,124.267508
max,140.0,145.0,205.0,43.675493,99.981876,9.935091,298.560117


In [71]:
# NOTE(review): this repeats the five-row preview shown right after loading the data;
# the frame has not been modified in between, so this cell is a candidate for removal.
crop_data.head(5)

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [72]:
# Class-balance check: count the rows per crop label.
# In the recorded run every crop has the same number of samples.
crop_data["label"].value_counts()

label
rice           100
maize          100
jute           100
cotton         100
coconut        100
papaya         100
orange         100
apple          100
muskmelon      100
watermelon     100
grapes         100
mango          100
banana         100
pomegranate    100
lentil         100
blackgram      100
mungbean       100
mothbeans      100
pigeonpeas     100
kidneybeans    100
chickpea       100
coffee         100
Name: count, dtype: int64

- Extracting the features and encoding the target labels

In [73]:
# Split the frame into NumPy arrays: every column except the last one is a
# feature, and the last column is the target (the crop label).

X = crop_data.iloc[:, :-1].to_numpy()
y = crop_data.iloc[:, -1].to_numpy()

In [74]:
# Encode the crop names in the target as integer class ids.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Fit on the raw `label` column instead of on `y` itself so that this cell is
# idempotent. With `y = le.fit_transform(y)`, running the cell a second time
# would refit the encoder on the already-encoded integers, and `le.classes_`
# would then hold integers rather than crop names, so the encoder could no
# longer translate predictions back to crops.
y = le.fit_transform(crop_data["label"])

# Using Logistic Regression

In [75]:
from sklearn.model_selection import train_test_split

# Stratified hold-out split so every crop keeps the same share in both parts;
# the fixed seed makes the split reproducible. `test_size` is left at the
# library default.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

- Feature Scaling

In [76]:
# Standardise the features. The scaler learns its statistics from the training
# split only and the same fitted transform is then applied to both splits.
# NOTE(review): X_train / X_test are overwritten here, so running this cell
# twice refits `sc` on data that has already been scaled.

from sklearn.preprocessing import StandardScaler

sc = StandardScaler().fit(X_train)

X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [77]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the scaled training features. `C` is the inverse
# regularisation strength; `max_iter` caps the number of solver iterations.
logreg = LogisticRegression(
    C=10,
    max_iter=10_000,
)
logreg.fit(X_train, y_train)

In [78]:
# Mean accuracy on the training split (the data the model was fitted on).
print(logreg.score(X_train, y_train))
# Disabled test-split equivalent; the following cell reports test accuracy via accuracy_score.
# print(logreg.score(X_test, y_test))

0.9860606060606061


In [79]:
from sklearn.metrics import accuracy_score

# Predict the held-out split and report the share of correct predictions as a percentage.
y_pred_log = logreg.predict(X_test)
print(f"Logreg Model Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred_log) * 100:.2f}%")

Logreg Model Accuracy: 97.82%


In [80]:
# Raw logistic-regression predictions for the test split, as the integer class
# ids produced by the label encoder `le`.
# NOTE(review): this prints the whole prediction array; `le.inverse_transform`
# would map the ids back to crop names if a readable sample is wanted.
print(y_pred_log)

[18  2  8  8 17 19  1  8  8 21  4  2 13 20  1 19  5 13 11 10  0 10 19  5
 12  0  6 14  2  9  2  8 21  0 15 18  6 13  4 21  2 15 13 13 18 16 10 17
  9  5  6 20  3  7  2 17  1 20  3 11  7 20 12 21 18 11 13 21  3  1 18  1
 13 20 18  0 17 15 16 20 16 15  8  8  9 10 15 12  5  9 10 10  6 15  9 21
 17 11  2 21 17  7 15  7 15  9 20 19 12  8 14 16  0 11 19 19 16 12 18  8
  1 11 12 18  5  7  7 21  1  8  3 11  5  2 11  5 18 20  0  2 16  6  5 16
 15 19 16 13  1  8 20 10 20  9  3  5 14 17  6 12  8 20 15 20  2  9 12 15
  4 13  9 18 15 21  3  9  5  5  9 15  2  2 10  0 14  4 10 21 14 12 16  4
  7  0  2  2  4 12  4  0  6  6  9  3  5 17 11  9 19 11  8 21 12  2  0  8
  3  1 15 15  9 13  7  4  5 14  6 11 18 14 14 11  0  1 16  3  9 16 18  1
  3 18  5  7  5 16 11  3 21  8 14 10  2  7  7 17  6 15  1  2 19 19 10  2
 17  2 13 10 13  5  6  0  7  3 14  4  7 18 18  2  8  7 21 21  4  6  4  1
  8  8 17 15  0 13 21  6  1  8 11 19 20 20 15  8  0  0 19 17  1  7 16 18
  0  7  1 12 12 21  9  1 14  7 21  8 12  3 16  6  5

In [81]:
# Persist everything needed to serve predictions from a fresh process.
import pickle

# The classifier was fitted on standardised features and integer class ids, so
# it is only usable together with the fitted scaler (to transform raw inputs
# the same way) and the label encoder (to turn predicted ids back into crops).
with open("logreg_crop_classifier.pkl", "wb") as model_file:
    pickle.dump(logreg, model_file)

with open("label_encoder.pkl", "wb") as f:
    pickle.dump(le, f)

# Previously missing: without the scaler, a reloaded model would be fed
# unscaled features and its predictions would not match this notebook.
with open("standard_scaler.pkl", "wb") as scaler_file:
    pickle.dump(sc, scaler_file)

# Using a Random Forest Classifier (RFC)

In [82]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 100 trees, each limited to depth 5; the seed fixes the
# forest's randomness so results are reproducible. It is trained on the same
# scaled training split as the logistic regression model.
rfc = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)

rfc.fit(X_train, y_train)

In [83]:
# Mean accuracy of the random forest on the training split.
print(rfc.score(X_train, y_train))
# Disabled test-split equivalent; the following cell reports test accuracy via accuracy_score.
# print(rfc.score(X_test, y_test))

0.9824242424242424


In [84]:
# Predict the held-out split with the random forest and report accuracy as a percentage.
y_pred_rfc = rfc.predict(X_test)
print(f"RFC Model Accuracy: {accuracy_score(y_true=y_test, y_pred=y_pred_rfc) * 100:.2f}%")

RFC Model Accuracy: 98.00%


In [85]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the logistic regression predictions;
# the rows are keyed by the encoded class ids.
print(f"Classification Report of the Logistic Regression Model: \n{classification_report(y_true=y_test, y_pred=y_pred_log)}")

Classification Report of the Logistic Regression Model: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        25
           1       1.00      1.00      1.00        25
           2       0.93      1.00      0.96        25
           3       1.00      1.00      1.00        25
           4       1.00      1.00      1.00        25
           5       1.00      1.00      1.00        25
           6       0.96      0.96      0.96        25
           7       1.00      1.00      1.00        25
           8       0.86      0.96      0.91        25
           9       1.00      1.00      1.00        25
          10       0.96      0.92      0.94        25
          11       0.96      0.96      0.96        25
          12       1.00      1.00      1.00        25
          13       0.92      0.92      0.92        25
          14       1.00      1.00      1.00        25
          15       1.00      1.00      1.00        25
          16       1.00 

In [86]:
# Per-class precision, recall and F1 for the random forest predictions.
# The heading previously said "Logistic Regression Model" (copied from the
# cell above) although the report is computed from `y_pred_rfc`.
print(f"\nClassification Report of the RFC Model: \n{classification_report(y_test, y_pred_rfc)}")


Classification Report of the Logistic Regression Model: 
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        25
           1       1.00      1.00      1.00        25
           2       0.92      0.96      0.94        25
           3       1.00      1.00      1.00        25
           4       1.00      1.00      1.00        25
           5       1.00      1.00      1.00        25
           6       1.00      1.00      1.00        25
           7       1.00      1.00      1.00        25
           8       0.86      1.00      0.93        25
           9       1.00      1.00      1.00        25
          10       0.96      0.88      0.92        25
          11       0.96      1.00      0.98        25
          12       1.00      1.00      1.00        25
          13       0.88      0.88      0.88        25
          14       1.00      1.00      1.00        25
          15       1.00      1.00      1.00        25
          16       1.00