In [74]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import StandardScaler
from category_encoders import  OneHotEncoder, OrdinalEncoder



# Data Preparation

## importing datasets

lets import our dataset, extract usefull features, and do some clearning.

In [6]:
df1 = pd.read_csv("Dataset/Crop_recommendation.csv")
df2 = pd.read_csv("Dataset/Crops.csv")


In [7]:
df1.head()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


## Data exploration 

lest do some `EDA` to see what this dataset has to offer and what modifications required

In [8]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1697 entries, 0 to 1696
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            1697 non-null   int64  
 1   P            1697 non-null   int64  
 2   K            1697 non-null   int64  
 3   temperature  1697 non-null   float64
 4   humidity     1697 non-null   float64
 5   ph           1697 non-null   float64
 6   rainfall     1697 non-null   float64
 7   label        1697 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 106.2+ KB


In [9]:
df1['label'].unique()

array(['rice', 'maize', 'Soyabeans', 'beans', 'peas', 'groundnuts',
       'cowpeas', 'banana', 'mango', 'grapes', 'watermelon', 'apple',
       'orange', 'cotton', 'coffee'], dtype=object)

lets subset this data to extract only crops that are planted here in Rwanda 

In [10]:
df1 = (
    df1[df1['label']
        .isin(['rice','maize','sayobeans','beans','peas','groundnuts','coffee'])]
    )

In [11]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 693 entries, 0 to 1640
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            693 non-null    int64  
 1   P            693 non-null    int64  
 2   K            693 non-null    int64  
 3   temperature  693 non-null    float64
 4   humidity     693 non-null    float64
 5   ph           693 non-null    float64
 6   rainfall     693 non-null    float64
 7   label        693 non-null    object 
dtypes: float64(4), int64(3), object(1)
memory usage: 48.7+ KB


In [12]:
df1.describe(include='all')

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
count,693.0,693.0,693.0,693.0,693.0,693.0,693.0,693
unique,,,,,,,,6
top,,,,,,,,rice
freq,,,,,,,,139
mean,55.225108,51.359307,25.626263,24.368896,55.522137,6.369355,121.723306,
std,35.255526,15.154671,8.56058,4.119512,20.32413,1.024402,67.557516,
min,0.0,15.0,15.0,15.330426,18.09224,3.504752,5.314507,
25%,23.0,40.0,19.0,21.519447,43.614441,5.716223,69.638339,
50%,61.0,53.0,23.0,24.385346,58.252046,6.217974,109.024141,
75%,87.0,61.0,33.0,26.730724,69.027623,6.962386,170.991983,


In [13]:
df1['label'].value_counts(sort=1)

rice          139
beans         125
maize         119
coffee        110
peas          100
groundnuts    100
Name: label, dtype: int64

`from here i will have to redo the codes `

In [14]:
df1 = (
    df1[df1['label']
        .isin(['maize','sayobeans','beans','peas','groundnuts','coffee'])]
    )
df1.columns

Index(['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label'], dtype='object')

#### Data split

In [15]:
X = df1[df1.columns[:6]]
X.head()

Unnamed: 0,N,P,K,temperature,humidity,ph
100,71,54,16,22.6136,63.690706,5.749914
101,61,44,17,26.100184,71.574769,6.931757
102,80,43,16,23.558821,71.593514,6.657965
103,73,58,21,19.97216,57.682729,6.596061
104,61,38,20,18.478913,62.695039,5.970458


In [16]:
y = df1[df1.columns[-1]]
y.head()

100    maize
101    maize
102    maize
103    maize
104    maize
Name: label, dtype: object

In [17]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.3,random_state=42)

In [18]:
y_train.value_counts(sort=1)

beans         87
coffee        83
peas          77
maize         75
groundnuts    65
Name: label, dtype: int64

In [19]:
y_test.value_counts(sort=1)

maize         44
beans         38
groundnuts    35
coffee        27
peas          23
Name: label, dtype: int64

#### Model Build

In [57]:
naive_bayes_model = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    MultinomialNB()
)

naive_bayes_model.fit(X_train,y_train)



In [58]:
print("naive bayes Model Training Accuracy",naive_bayes_model.score(X_train,y_train).round(2))
print("naive_bayes Model Training Accuracy",naive_bayes_model.score(X_test,y_test).round(2))

naive bayes Model Training Accuracy 0.95
naive_bayes Model Training Accuracy 0.93


lets create a `xgboost` model and tun hyperparameters automatical

In [70]:
# from sklearn.pipeline import Pipeline
boost_model = make_pipeline(
        OrdinalEncoder(),
        GradientBoostingClassifier(random_state=42, max_leaf_nodes=4),
     
)
boost_model

In [33]:
boost_model.named_steps

{'ordinalencoder': OrdinalEncoder(),
 'gradientboostingclassifier': GradientBoostingClassifier(max_leaf_nodes=4, random_state=42)}

In [71]:
%%time

param_grid = {
    "gradientboostingclassifier__learning_rate": (0.01, 0.1, 1, 10),
    "gradientboostingclassifier__max_leaf_nodes": (3, 10, 30),
    'gradientboostingclassifier__n_estimators':(100,200,300),
    "gradientboostingclassifier__max_depth":(3,6,9),
    
  
}
model_grid_search = GridSearchCV(boost_model, param_grid=param_grid, scoring='accuracy', n_jobs=2, cv=2)
model_grid_search.fit(X_train, y_train)

CPU times: total: 203 ms
Wall time: 1min 42s


In [62]:
print('train accuracy',model_grid_search.score(X_train, y_train))
print('test accuracy',model_grid_search.score(X_test, y_test).round(2))

train accuracy 1.0
test accuracy 0.96


In [47]:
print(f"The best set of parameters is: {model_grid_search.best_params_}")

The best set of parameters is: {'gradientboostingclassifier__learning_rate': 1, 'gradientboostingclassifier__max_depth': 3, 'gradientboostingclassifier__max_leaf_nodes': 10, 'gradientboostingclassifier__n_estimators': 100}


In [66]:
rfc_model = make_pipeline(
    OrdinalEncoder(),
    RandomForestClassifier(random_state=42)
)
rfc_model

In [67]:

%%time

rfc_param_grid = {
    "randomforestclassifier__min_samples_split":(2,3,4,5),
    'randomforestclassifier__n_estimators':(100,200,300),
    "randomforestclassifier__max_depth":(5,10,15,20),
    
  
}
rfc_model_grid_search = GridSearchCV(rfc_model, param_grid=rfc_param_grid, scoring='accuracy', n_jobs=2, cv=2)
rfc_model_grid_search.fit(X_train, y_train)

CPU times: total: 31.2 ms
Wall time: 12.6 s


In [68]:
print('random forest train accuracy',rfc_model_grid_search.score(X_train, y_train))
print('random forest test accuracy',rfc_model_grid_search.score(X_test, y_test).round(2))

random forest train accuracy 1.0
random forest test accuracy 0.98


In [69]:
print(f"The best set of parameters is: {rfc_model_grid_search.best_params_}")

The best set of parameters is: {'randomforestclassifier__max_depth': 10, 'randomforestclassifier__min_samples_split': 2, 'randomforestclassifier__n_estimators': 100}


In [72]:
def transf(n,p,k,temp,hum,ph):
    y =pd.DataFrame([{
    'N':n,
    'P':p,
    'K':k,
    'temperature':temp,
    'humidity':hum,
    'ph':ph
    }])
    return y

result = rfc_model_grid_search.predict(transf(25,68,77,20.09340593,15.11279612,7.701446446))[0]
print('Recommended Crop: ',result)

Recommended Crop:  beans


In [44]:
import pickle

pickle.dump(model,open('model.pkl','wb'))