## Project analyzing two algorithms for classification in a farming dataset

In [175]:
# Import the pandas library

import pandas as pd


##### Load the dataset :


In [176]:
df= pd.read_csv("farming.csv")


In [177]:
df

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.717340,rice
...,...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507,coffee
2196,99,15,27,27.417112,56.636362,6.086922,127.924610,coffee
2197,118,33,30,24.131797,67.225123,6.362608,173.322839,coffee
2198,117,32,34,26.272418,52.127394,6.758793,127.175293,coffee


In [178]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


In [179]:
# To check the shape of file

df.shape

(2200, 8)

In [180]:
# To check the datatype of file data

df.dtypes

N                int64
P                int64
K                int64
temperature    float64
humidity       float64
ph             float64
rainfall       float64
label           object
dtype: object

### Separating the data based on objects:
  * Data that include objects- char
  * Data that exclude objects- num

In [181]:
char =df.select_dtypes(include='object')


In [182]:
char

Unnamed: 0,label
0,rice
1,rice
2,rice
3,rice
4,rice
...,...
2195,coffee
2196,coffee
2197,coffee
2198,coffee


In [183]:
num =df.select_dtypes(exclude='object')


In [184]:
num

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
0,90,42,43,20.879744,82.002744,6.502985,202.935536
1,85,58,41,21.770462,80.319644,7.038096,226.655537
2,60,55,44,23.004459,82.320763,7.840207,263.964248
3,74,35,40,26.491096,80.158363,6.980401,242.864034
4,78,42,42,20.130175,81.604873,7.628473,262.717340
...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507
2196,99,15,27,27.417112,56.636362,6.086922,127.924610
2197,118,33,30,24.131797,67.225123,6.362608,173.322839
2198,117,32,34,26.272418,52.127394,6.758793,127.175293


##### Encoding the char objects:


In [185]:
char['label'].unique( )


array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
       'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
       'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
       'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee'],
      dtype=object)

In [186]:
from sklearn.preprocessing import LabelEncoder
le =LabelEncoder() 


char['label']=le.fit_transform(char['label'])

In [187]:
char['label'].unique( )


array([20, 11,  3,  9, 18, 13, 14,  2, 10, 19,  1, 12,  7, 21, 15,  0, 16,
       17,  4,  6,  8,  5])

In [188]:
char

Unnamed: 0,label
0,20
1,20
2,20
3,20
4,20
...,...
2195,5
2196,5
2197,5
2198,5


In [189]:
# Concatinating/combining the num objects with encoded char objects:

clean_data=pd.concat([char,num],axis=1)

In [190]:
# Saving the cleaned data in a separate csv file to use further

clean_data.to_csv('Farming_cleaned.csv',index=False)

#### Further training of dataset:-


In [191]:
df.dropna(inplace=True)

x= df.drop('label',axis=1)
y= df[['label']]

In [192]:
x

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
0,90,42,43,20.879744,82.002744,6.502985,202.935536
1,85,58,41,21.770462,80.319644,7.038096,226.655537
2,60,55,44,23.004459,82.320763,7.840207,263.964248
3,74,35,40,26.491096,80.158363,6.980401,242.864034
4,78,42,42,20.130175,81.604873,7.628473,262.717340
...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507
2196,99,15,27,27.417112,56.636362,6.086922,127.924610
2197,118,33,30,24.131797,67.225123,6.362608,173.322839
2198,117,32,34,26.272418,52.127394,6.758793,127.175293


In [193]:
y

Unnamed: 0,label
0,rice
1,rice
2,rice
3,rice
4,rice
...,...
2195,coffee
2196,coffee
2197,coffee
2198,coffee


##### Split the dataset into training and testing sets :


In [194]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state=42)

In [195]:
x_train


Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
1656,17,16,14,16.396243,92.181519,6.625539,102.944161
752,37,79,19,27.543848,69.347863,7.143943,69.408782
892,7,73,25,27.521856,63.132153,7.288057,45.208411
1041,101,70,48,25.360592,75.031933,6.012697,116.553145
1179,0,17,30,35.474783,47.972305,6.279134,97.790725
...,...,...,...,...,...,...,...
1638,10,5,5,21.213070,91.353492,7.817846,112.983436
1095,108,94,47,27.359116,84.546250,6.387431,90.812505
1130,11,36,31,27.920633,51.779659,6.475449,100.258567
1294,11,124,204,13.429886,80.066340,6.361141,71.400430


In [196]:
y_train


Unnamed: 0,label
1656,orange
752,blackgram
892,lentil
1041,banana
1179,mango
...,...
1638,orange
1095,banana
1130,mango
1294,grapes


In [197]:
x_test

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
1451,101,17,47,29.494014,94.729813,6.185053,26.308209
1334,98,8,51,26.179346,86.522581,6.259336,49.430510
1761,59,62,49,43.360515,93.351916,6.941497,114.778071
1735,44,60,55,34.280461,90.555616,6.825371,98.540477
1576,30,137,200,22.914300,90.704756,5.603413,118.604465
...,...,...,...,...,...,...,...
59,99,55,35,21.723831,80.238990,6.501698,277.962619
71,67,45,38,22.727910,82.170688,7.300411,260.887506
1908,121,47,16,23.605640,79.295731,7.723240,72.498009
1958,116,52,19,22.942767,75.371706,6.114526,67.080226


In [198]:
y_test

Unnamed: 0,label
1451,muskmelon
1334,watermelon
1761,papaya
1735,papaya
1576,apple
...,...
59,rice
71,rice
1908,cotton
1958,cotton


In [199]:
# Finding the shape od test and train data set 

x_train.shape, x_test.shape, y_train.shape, y_test.shape


((1760, 7), (440, 7), (1760, 1), (440, 1))

## Applying Decision Tree Algorithm 

In [200]:
# Decision Tree Classifier

In [201]:
import pandas as pd
from sklearn.metrics import classification_report,precision_score,recall_score,f1_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

#### Load the dataset :

In [202]:
df = pd.read_csv("Farming_cleaned.csv")


In [203]:
df

Unnamed: 0,label,N,P,K,temperature,humidity,ph,rainfall
0,20,90,42,43,20.879744,82.002744,6.502985,202.935536
1,20,85,58,41,21.770462,80.319644,7.038096,226.655537
2,20,60,55,44,23.004459,82.320763,7.840207,263.964248
3,20,74,35,40,26.491096,80.158363,6.980401,242.864034
4,20,78,42,42,20.130175,81.604873,7.628473,262.717340
...,...,...,...,...,...,...,...,...
2195,5,107,34,32,26.774637,66.413269,6.780064,177.774507
2196,5,99,15,27,27.417112,56.636362,6.086922,127.924610
2197,5,118,33,30,24.131797,67.225123,6.362608,173.322839
2198,5,117,32,34,26.272418,52.127394,6.758793,127.175293


In [204]:
df.dropna(inplace=True)

x= df.drop('label',axis=1)
y= df[['label']]

In [205]:
x

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
0,90,42,43,20.879744,82.002744,6.502985,202.935536
1,85,58,41,21.770462,80.319644,7.038096,226.655537
2,60,55,44,23.004459,82.320763,7.840207,263.964248
3,74,35,40,26.491096,80.158363,6.980401,242.864034
4,78,42,42,20.130175,81.604873,7.628473,262.717340
...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507
2196,99,15,27,27.417112,56.636362,6.086922,127.924610
2197,118,33,30,24.131797,67.225123,6.362608,173.322839
2198,117,32,34,26.272418,52.127394,6.758793,127.175293


In [206]:
y

Unnamed: 0,label
0,20
1,20
2,20
3,20
4,20
...,...
2195,5
2196,5
2197,5
2198,5


#### Split the dataset into training and testing sets :

In [207]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)


#### Instantiate the Decision Tree Classifier :


In [208]:
# Making an object dt
dt= DecisionTreeClassifier()

#### Train the model :

In [209]:
dt.fit(x_train,y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [210]:
# Finding score of testing data

dt.score(x_test,y_test)


0.9886363636363636

In [211]:
# Finding score of training data

dt.score(x_train,y_train)


1.0

#### Make predictions on the test set :

In [212]:
pred = dt.predict(x_test)


In [213]:
pred=pd.DataFrame(pred)


In [214]:
pred

Unnamed: 0,0
0,2
1,9
2,1
3,16
4,19
...,...
435,0
436,18
437,14
438,17


### Classification Report:-

In [215]:
print(classification_report(y_test,pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00        22
           1       0.95      1.00      0.98        21
           2       1.00      1.00      1.00        16
           3       1.00      1.00      1.00        23
           4       1.00      1.00      1.00        21
           5       1.00      1.00      1.00        15
           6       1.00      1.00      1.00        26
           7       1.00      1.00      1.00        21
           8       1.00      0.87      0.93        15
           9       1.00      1.00      1.00        22
          10       1.00      1.00      1.00        18
          11       1.00      1.00      1.00        21
          12       1.00      1.00      1.00        19
          13       1.00      1.00      1.00        14
          14       1.00      0.93      0.97        30
          15       1.00      1.00      1.00        20
          16       1.00      1.00      1.00        23
          17       0.93    

In [216]:
### Analyzing the precision score, recall score and F1 score of the algorithm


# print(precision_score(y_test,pred))
# print(recall_score(y_test,pred))
# print(f1_score(y_test,pred))

#### Finding the confusion matrix :

In [217]:
print(confusion_matrix(y_test,pred))

[[22  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0 21  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0 16  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0 23  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0 21  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0 15  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0 26  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0 21  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0 13  0  0  0  0  0  0  0  0  0  0  0  2  0]
 [ 0  0  0  0  0  0  0  0  0 22  0  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0 18  0  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0 21  0  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0 19  0  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 14  0  0  0  0  0  0  0  0]
 [ 0  0  0  0  0  0  0  0  0  0  0

### Evaluate the model :-

In [218]:
accuracy = accuracy_score(y_test, pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.99


## Applying Random Forest Algorithm

In [219]:
# Random Forest Classifier

In [220]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,precision_score,recall_score,f1_score, confusion_matrix

#### Load the dataset :

In [221]:
df = pd.read_csv("Farming_cleaned.csv")


In [222]:
df

Unnamed: 0,label,N,P,K,temperature,humidity,ph,rainfall
0,20,90,42,43,20.879744,82.002744,6.502985,202.935536
1,20,85,58,41,21.770462,80.319644,7.038096,226.655537
2,20,60,55,44,23.004459,82.320763,7.840207,263.964248
3,20,74,35,40,26.491096,80.158363,6.980401,242.864034
4,20,78,42,42,20.130175,81.604873,7.628473,262.717340
...,...,...,...,...,...,...,...,...
2195,5,107,34,32,26.774637,66.413269,6.780064,177.774507
2196,5,99,15,27,27.417112,56.636362,6.086922,127.924610
2197,5,118,33,30,24.131797,67.225123,6.362608,173.322839
2198,5,117,32,34,26.272418,52.127394,6.758793,127.175293


In [223]:
x= df.drop('label',axis=1)
y= df[['label']]

In [224]:
x

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
0,90,42,43,20.879744,82.002744,6.502985,202.935536
1,85,58,41,21.770462,80.319644,7.038096,226.655537
2,60,55,44,23.004459,82.320763,7.840207,263.964248
3,74,35,40,26.491096,80.158363,6.980401,242.864034
4,78,42,42,20.130175,81.604873,7.628473,262.717340
...,...,...,...,...,...,...,...
2195,107,34,32,26.774637,66.413269,6.780064,177.774507
2196,99,15,27,27.417112,56.636362,6.086922,127.924610
2197,118,33,30,24.131797,67.225123,6.362608,173.322839
2198,117,32,34,26.272418,52.127394,6.758793,127.175293


In [225]:
y

Unnamed: 0,label
0,20
1,20
2,20
3,20
4,20
...,...
2195,5
2196,5
2197,5
2198,5


#### Split the dataset into training and testing sets :

In [226]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)

#### Instantiate the Random Forest Classifier :

In [227]:
# n_estimators: The number of trees in the forest.
# random_state: Controls the randomness of the bootstrapping of the samples used when building trees.

rfc = RandomForestClassifier(n_estimators=100, random_state=42)

#### Train the model :


In [228]:
rfc.fit(x_train, y_train)

  return fit_method(estimator, *args, **kwargs)


0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [229]:
# Finding score of testing data

rfc.score(x_test,y_test)


0.9931818181818182

In [230]:
# Finding score of training data

rfc.score(x_train,y_train)

1.0

#### Make predictions on the test set :

In [231]:

y_pred = rfc.predict(x_test)

In [232]:
y_pred

array([15, 21, 17, 17,  0, 12,  0, 13, 14, 10,  2,  4, 19,  8,  4, 19,  0,
       11, 17, 15,  5, 17, 16, 17,  3,  8, 14, 16, 18, 20, 19, 13,  8, 10,
        8,  2,  8,  3,  3,  9, 17, 12,  2, 11, 14, 11, 18,  4, 15, 11,  2,
        5,  7, 14,  5,  9,  6,  0,  1,  2, 21,  4, 10, 16, 17, 18, 16, 20,
       15, 18, 15,  4,  8,  1,  2, 17,  1,  6, 21, 16,  5,  3, 20, 13, 16,
       12,  5, 13,  2, 19, 11, 13,  6, 17, 18, 13,  9,  5,  2, 10,  4, 20,
       16, 15, 21,  9, 21,  1, 18, 13,  1,  8,  6, 19, 18,  3, 11,  4, 19,
       20, 18,  7,  2,  4,  3,  2,  4, 11,  1, 13,  1,  9, 19,  3,  4, 16,
       18,  1,  1,  0,  9, 15, 14, 13,  4, 11,  0,  4,  9, 13, 14, 10, 21,
       14, 18, 18, 18,  9, 11,  8,  3,  0, 16,  6, 20,  4,  7, 10, 21,  7,
        7,  2, 19,  3,  4, 11, 10,  7, 21,  8,  5,  5,  9,  8, 13,  9,  1,
        9,  4, 17, 17, 14, 12, 19, 21,  9, 11,  0,  2,  3,  7,  7,  1,  6,
       20, 19, 14,  1,  8, 14, 11,  3,  3,  3,  0, 20,  9, 17,  5,  2,  9,
       12, 12,  4, 17,  0

### Evaluate the model :-


In [233]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Random Forest Classifier: {accuracy:.2f}")

Accuracy of the Random Forest Classifier: 0.99


### Classification Report :-

In [234]:
print(classification_report(y_test,pred))


              precision    recall  f1-score   support

           0       0.00      0.00      0.00        23
           1       0.05      0.05      0.05        21
           2       0.12      0.10      0.11        20
           3       0.04      0.04      0.04        26
           4       0.05      0.04      0.04        27
           5       0.00      0.00      0.00        17
           6       0.08      0.12      0.09        17
           7       0.00      0.00      0.00        14
           8       0.00      0.00      0.00        23
           9       0.05      0.05      0.05        20
          10       0.00      0.00      0.00        11
          11       0.19      0.19      0.19        21
          12       0.05      0.05      0.05        19
          13       0.00      0.00      0.00        24
          14       0.00      0.00      0.00        19
          15       0.00      0.00      0.00        17
          16       0.00      0.00      0.00        14
          17       0.00    