## Importing my toolkits here

In [3]:
import numpy as np
import pandas as pd
from sklearn import model_selection

## Importing Dataset

In [4]:
initial_d = pd.read_csv('Crop_recommendation.csv')
print(initial_d.head(5))
print("Shape of Dataset: ", initial_d.shape)

    N   P   K  temperature   humidity        ph    rainfall label
0  90  42  43    20.879744  82.002744  6.502985  202.935536  rice
1  85  58  41    21.770462  80.319644  7.038096  226.655537  rice
2  60  55  44    23.004459  82.320763  7.840207  263.964248  rice
3  74  35  40    26.491096  80.158363  6.980401  242.864034  rice
4  78  42  42    20.130175  81.604873  7.628473  262.717340  rice
Shape of Dataset:  (2200, 8)


### All cool till now :)

## Seperating Labels from Features

In [5]:
init_label= initial_d['label']
init_features = initial_d.drop("label",axis=1)
print("Labels:-")
print(init_label.head(5))
print("Features:-")
print(init_features.head(5))
print(init_features.shape)

Labels:-
0    rice
1    rice
2    rice
3    rice
4    rice
Name: label, dtype: object
Features:-
    N   P   K  temperature   humidity        ph    rainfall
0  90  42  43    20.879744  82.002744  6.502985  202.935536
1  85  58  41    21.770462  80.319644  7.038096  226.655537
2  60  55  44    23.004459  82.320763  7.840207  263.964248
3  74  35  40    26.491096  80.158363  6.980401  242.864034
4  78  42  42    20.130175  81.604873  7.628473  262.717340
(2200, 7)


## Data Pre-Processing

In [6]:
#Standardizing the data

from sklearn.preprocessing import StandardScaler
standardized_features = StandardScaler().fit_transform(init_features) # Returns np array
standardized_features = pd.DataFrame(standardized_features) #Convert back np array to Panda DataFrame
print(standardized_features.head(5))


          0         1         2         3         4         5         6
0  1.068797 -0.344551 -0.101688 -0.935587  0.472666  0.043302  1.810361
1  0.933329  0.140616 -0.141185 -0.759646  0.397051  0.734873  2.242058
2  0.255986  0.049647 -0.081939 -0.515898  0.486954  1.771510  2.921066
3  0.635298 -0.556811 -0.160933  0.172807  0.389805  0.660308  2.537048
4  0.743673 -0.344551 -0.121436 -1.083647  0.454792  1.497868  2.898373


## All good for first model-KNN

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

#We have 2200 Points; 
#We split it such that we get 20% of the points(440) points in Test Dataset
Xtrain0, Xtest, Ytrain0, Ytest= model_selection.train_test_split(init_features, init_label, test_size=0.2, random_state=42)

print("==========================")
print("Train Data Shape")
print(Xtrain0.shape)
print(Ytrain0.shape)
print("===========================")
print("Test Data Shape")
print(Xtest.shape)
print(Ytest.shape)


Train Data Shape
(1760, 7)
(1760,)
Test Data Shape
(440, 7)
(440,)


### We are using Cross Validation for determining right value of K(Hyperparameter )

In [15]:
for i in range(1, 30, 2):
    knn = KNeighborsClassifier(n_neighbors=i)
    score = cross_val_score(knn, Xtrain0.to_numpy(), Ytrain0, cv=5, scoring='accuracy')
    print('\n Cross Validation Score for k = %d is ' % (i))
    print(score.mean())


 Cross Validation Score for k = 1 is 
0.9823863636363637

 Cross Validation Score for k = 3 is 
0.9801136363636364

 Cross Validation Score for k = 5 is 
0.9806818181818182

 Cross Validation Score for k = 7 is 
0.9806818181818182

 Cross Validation Score for k = 9 is 
0.9772727272727273

 Cross Validation Score for k = 11 is 
0.975

 Cross Validation Score for k = 13 is 
0.975

 Cross Validation Score for k = 15 is 
0.9715909090909092

 Cross Validation Score for k = 17 is 
0.9698863636363637

 Cross Validation Score for k = 19 is 
0.96875

 Cross Validation Score for k = 21 is 
0.9670454545454545

 Cross Validation Score for k = 23 is 
0.965909090909091

 Cross Validation Score for k = 25 is 
0.9659090909090908

 Cross Validation Score for k = 27 is 
0.9613636363636363

 Cross Validation Score for k = 29 is 
0.9607954545454545


### K=5 gives us best score on cross validation

In [18]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(Xtrain0.to_numpy(), Ytrain0)
pred = knn.predict(Xtest.to_numpy())
acc = accuracy_score(Ytest, pred, normalize=True) * float(100)
print('Test Accuracy Score for this Model is= %f%%' % (acc))

from sklearn.metrics import classification_report
print(classification_report(Ytest, pred))

Test Accuracy Score for this Model is= 97.045455%
              precision    recall  f1-score   support

       apple       1.00      1.00      1.00        23
      banana       1.00      1.00      1.00        21
   blackgram       0.95      1.00      0.98        20
    chickpea       1.00      1.00      1.00        26
     coconut       1.00      1.00      1.00        27
      coffee       1.00      0.94      0.97        17
      cotton       0.94      1.00      0.97        17
      grapes       1.00      1.00      1.00        14
        jute       0.76      0.96      0.85        23
 kidneybeans       0.95      1.00      0.98        20
      lentil       0.85      1.00      0.92        11
       maize       1.00      0.95      0.98        21
       mango       1.00      1.00      1.00        19
   mothbeans       1.00      0.92      0.96        24
    mungbean       1.00      1.00      1.00        19
   muskmelon       1.00      1.00      1.00        17
      orange       1.00      1.

## Training on Gaussian Naive Bayes

In [19]:
from sklearn.naive_bayes import GaussianNB
GNBclf = GaussianNB()
GNBclf.fit(Xtrain0, Ytrain0)
YPred = GNBclf.predict(Xtest)
score = cross_val_score(GNBclf, Xtrain0, Ytrain0, cv=5, scoring='accuracy')
avgCVScore=score.mean()
print("5-fold CV score for Naive Bayes:, ", avgCVScore)

acc = accuracy_score(Ytest, YPred)
print("\nGaussian Naive Bayes's Accuracy aginst Test Dataset:", acc)

from sklearn.metrics import classification_report
print("\n")
print(classification_report(Ytest, YPred))

5-fold CV score for Naive Bayes:,  0.9954545454545455

Gaussian Naive Bayes's Accuracy aginst Test Dataset: 0.9954545454545455


              precision    recall  f1-score   support

       apple       1.00      1.00      1.00        23
      banana       1.00      1.00      1.00        21
   blackgram       1.00      1.00      1.00        20
    chickpea       1.00      1.00      1.00        26
     coconut       1.00      1.00      1.00        27
      coffee       1.00      1.00      1.00        17
      cotton       1.00      1.00      1.00        17
      grapes       1.00      1.00      1.00        14
        jute       0.92      1.00      0.96        23
 kidneybeans       1.00      1.00      1.00        20
      lentil       1.00      1.00      1.00        11
       maize       1.00      1.00      1.00        21
       mango       1.00      1.00      1.00        19
   mothbeans       1.00      1.00      1.00        24
    mungbean       1.00      1.00      1.00        19
   mus

### Gaussian Naive Bayes has better accuracy than KNN. Hence we will chose Gaussian Naive Bayes for our recommendation system. 

## Saving our model on pickle file

In [20]:
clf=GaussianNB()
clf.fit(init_features, init_label)
import pickle
from joblib import dump, load
dump(clf, 'GNBClassifier.pkl')

['GNBClassifier.pkl']