In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the crop dataset from crop.csv (path is relative to the working directory).
df = pd.read_csv(
    'crop.csv'
)

In [None]:
# Summary statistics for the columns of the dataset.
df.describe()

In [None]:
# Preview the first 200 rows of the dataset.
df.head(200)

In [None]:
# Report the (rows, columns) shape of the loaded frame.
print("The shape of this data set is ",df.shape)

In [None]:
# Count missing (null / NaN) values in each column.

df.isnull().sum()
# If every count above is zero, the dataset has no missing values and needs no imputation.

In [None]:
# Check which crops are present in the dataset and how many rows each one has.
df['label'].value_counts()

In [None]:
# Bar chart of the 'N' column for each crop label.
sns.barplot(x='N', y='label', data=df)




In [None]:
# Bar chart of the 'P' column for each crop label.
sns.barplot(x='P', y='label', data=df)


In [None]:
# Bar chart of the 'K' column for each crop label.
sns.barplot(x='K', y='label', data=df)


In [None]:
# Distribution of every numeric feature, drawn as a 2x4 grid of
# histogram + KDE panels (seven panels; the eighth slot stays empty).
#
# sns.distplot is deprecated, so each panel uses sns.histplot with
# kde=True / stat='density', which is its documented replacement.
plt.rcParams['figure.figsize'] = (15, 7)

# (column, colour, x-axis label) for each panel, in display order.
distribution_panels = [
    ('N', 'lightgrey', 'Ratio of Nitrogen'),
    ('P', 'skyblue', 'Ratio of Phosphorous'),
    ('K', 'darkblue', 'Ratio of Potassium'),
    ('temperature', 'black', 'Temperature'),
    ('rainfall', 'grey', 'Rainfall'),
    ('humidity', 'lightgreen', 'Humidity'),
    ('ph', 'darkgreen', 'pH Level'),
]

for position, (column, colour, axis_label) in enumerate(distribution_panels, start=1):
    ax = plt.subplot(2, 4, position)
    sns.histplot(
        df[column],
        color=colour,
        kde=True,
        stat='density',
        kde_kws={'cut': 3},  # extend the KDE past the data range, as distplot did
        ax=ax,
    )
    ax.set_xlabel(axis_label, fontsize=12)
    ax.grid()

plt.suptitle('Distribution for Agricultural Conditions', fontsize = 20)
plt.show()

In [None]:
# Joint distribution of the 'N' column against humidity.
sns.jointplot(data=df, x='N', y='humidity')

In [None]:
# Joint distribution of the 'P' column against humidity.
sns.jointplot(data=df, x='P', y='humidity')

In [None]:
# Joint distribution of the 'K' column against humidity.
sns.jointplot(data=df, x='K', y='humidity')

In [None]:

# Crops for the Summer Season, Winter Season and Rainy Season,
# selected with the temperature / humidity / rainfall thresholds below.

separator = "-----------------------------------"

summer_mask = (df['temperature'] > 30) & (df['humidity'] > 50)
winter_mask = (df['temperature'] < 20) & (df['humidity'] > 30)
rainy_mask = (df['rainfall'] > 200) & (df['humidity'] > 30)

print("Summer Crops")
print(df.loc[summer_mask, 'label'].unique())
print(separator)
print("Winter Crops")
print(df.loc[winter_mask, 'label'].unique())
print(separator)
print("Rainy Crops")
print(df.loc[rainy_mask, 'label'].unique())

In [None]:
# Pairwise relationships between all columns, coloured by crop label.
sns.pairplot(df, hue='label')

In [None]:
# Independent features: every column except 'label'
# (e.g. nitrogen, phosphorus, potassium, temperature, rainfall).
# Dependent feature: the crop name stored in 'label'.

x = df.drop(columns=['label'])
y = df['label']

In [None]:
# Report the shapes of the feature matrix and the target vector.
feature_shape, target_shape = x.shape, y.shape
print("Shape of x:", feature_shape)
print("Shape of y:", target_shape)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score,confusion_matrix,roc_auc_score


In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into train and test sets with a 60:40 ratio;
# random_state is fixed so the split is repeatable.
split_parts = train_test_split(x, y, test_size=0.4, random_state=0)
x_train, x_test, y_train, y_test = split_parts

In [None]:
# Print the shape of each train / test partition.

for message, partition in (
    ("The Shape of x train:", x_train),
    ("The Shape of x test:", x_test),
    ("The Shape of y train:", y_train),
    ("The Shape of y test:", y_test),
):
    print(message, partition.shape)

In [None]:
# Stratified 10-fold cross-validation of the candidate classifiers on the
# training split, printing the per-fold scores and their min / max / mean.

import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

K = StratifiedKFold(n_splits=10)

# Candidate classifiers, keyed by a short display name.
models = dict(
    KNN=KNeighborsClassifier(),
    DT=DecisionTreeClassifier(),
    RFC=RandomForestClassifier(),
    GBC=GradientBoostingClassifier(),
    XGB=XGBClassifier(),
)

# Empty results table with 'Model' and 'Accuracy' columns.
model_accuracy = pd.DataFrame(columns=['Model', 'Accuracy'])

# The two boosted models are left out of this cross-validation pass.
skipped_models = ('XGB', 'GBC')

for model_name, estimator in models.items():
    if model_name in skipped_models:
        continue

    fold_scores = cross_val_score(estimator, x_train, y_train, cv=K)
    print(model_name)
    print(fold_scores)
    print("Minimum value: ", np.min(fold_scores))
    print("Maximum value: ", np.max(fold_scores))
    print("Average: ", np.mean(fold_scores))
    print('\n')



In [None]:
# Fit every candidate model on the training split and record its accuracy on
# both the hold-out and the training data.
#
# Rows are collected in a plain list and turned into a DataFrame once:
# DataFrame.append was removed in pandas 2.0, and rebuilding the table here
# also keeps the cell idempotent (re-running it does not duplicate rows).
accuracy_records = []
for test, clf in models.items():
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    acc = accuracy_score(y_test, y_pred)
    train_pred = clf.predict(x_train)
    train_acc = accuracy_score(y_train, train_pred)
    print("\n", test + ' scores')
    print(acc)
    accuracy_records.append({'Model': test, 'Accuracy': acc, 'Train_acc': train_acc})

model_accuracy = pd.DataFrame(accuracy_records, columns=['Model', 'Accuracy', 'Train_acc'])


In [None]:
# Display the per-model accuracy table.
model_accuracy

In [None]:
# Rank the models from highest to lowest hold-out accuracy.
model_accuracy.sort_values(by='Accuracy', ascending=False)

In [None]:


# Mean 10-fold cross-validated accuracy of KNN on the training split
# for every neighbour count k in 1..39.
accuracy_rate = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), x_train, y_train, cv=10
    ).mean()
    for k in range(1, 40)
]

In [None]:
# Error rate of KNN for k in 1..39, i.e. 1 - mean cross-validated accuracy.
# accuracy_rate already holds those means for the same k range, data and
# cv=10 folds, so derive the error from it instead of repeating all
# 39 x 10 model fits.
error_rate = [1 - mean_accuracy for mean_accuracy in accuracy_rate]

In [None]:
# Plot the cross-validated KNN error rate against the neighbour count K.
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    range(1, 40),
    error_rate,
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
ax.set_title('Error rate vs  K Value')
ax.set_xlabel('K')
ax.set_ylabel('Error rate')

In [None]:

# Train KNN with 4 neighbours and measure its accuracy on the hold-out split.
knn = KNeighborsClassifier(n_neighbors=4).fit(x_train, y_train)
y_pred = knn.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
acc


# Since the error rate is increasing and KNN also tends to underfit the data, it is not used.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Exhaustive 3-fold grid search over random-forest hyper-parameters,
# run on the training split using all available cores.
RFC = RandomForestClassifier()
params = dict(
    n_estimators=[300, 400, 500],
    max_depth=[5, 10, 15],
    min_samples_split=[2, 5, 8],
)
grid_rand = GridSearchCV(estimator=RFC, param_grid=params, cv=3, verbose=3, n_jobs=-1)
grid_rand.fit(x_train, y_train)

# Score the search's predictions on the hold-out split.
pred_rand = grid_rand.predict(x_test)
print(classification_report(y_test, pred_rand))

# Best mean cross-validation score and the parameter combination that produced it.
print('Best score : ', grid_rand.best_score_)
print('Best params : ', grid_rand.best_params_)

In [None]:
# Train the final random forest classifier and measure its hold-out accuracy.
#
# The hyper-parameters are read from the grid search (grid_rand.best_params_)
# rather than typed in by hand, so this cell always matches the search result;
# hand-copied values can silently drift (e.g. an n_estimators that is not even
# part of the searched grid).

RFC = RandomForestClassifier(**grid_rand.best_params_)
RFC.fit(x_train, y_train)
y_pred = RFC.predict(x_test)
acc = accuracy_score(y_test, y_pred)
acc


In [None]:
# Now try a boosted model (XGBoost) and report its hold-out accuracy as a percentage.
# NOTE(review): y_train holds the raw 'label' column; confirm the installed
# xgboost version accepts non-integer class labels.

XGB = XGBClassifier()
XGB.fit(x_train, y_train)

y_pred = XGB.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
accuracy_pct = acc * 100
print('Accuracy', accuracy_pct)

In [None]:
# Predict the crop for one hand-written sample (a single row of 7 feature values).
# NOTE(review): the values are presumably in the same order as the training
# columns of x - verify against x.columns.
sample = np.array([[20, 30, 50, 15, 80, 9.5, 100]])
prediction = XGB.predict(sample)
print("The Suggested Crop is:", prediction)

In [None]:
import pickle

# Persist the trained XGB model to crops.pkl. The context manager guarantees
# the file is flushed and closed even if pickling raises, so no handle is
# leaked and the file on disk is complete.
with open("crops.pkl", 'wb') as file:
    pickle.dump(XGB, file)