In [1]:
#import libraries
import pandas as pd
import numpy as np

In [2]:
#load the data
data=pd.read_csv("/content/Crop_recommendation.csv")
data.head()   #initial dataset

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90.0,42.0,43.0,20.879744,82.002744,6.502985,202.9355362,rice
1,85.0,58.0,41.0,21.770462,80.319644,7.038096,226.6555374,rice
2,60.0,55.0,44.0,23.004459,82.320763,7.840207,263.9642476,rice
3,74.0,35.0,40.0,26.491096,80.158363,6.980401,242.8640342,rice
4,78.0,42.0,42.0,20.130175,81.604873,7.628473,262.7173405,rice


In [3]:
#check for null values
print(data.isnull().sum())

N              17
P              24
K              15
temperature    16
humidity       17
ph             23
rainfall       20
label          11
dtype: int64


In [4]:
#data dimensions
print(data.shape)

(2200, 8)


**Fill missing values with the maximum value in the column**

In [5]:
data_max = data.fillna(data.max())

print("\nCleaned Dataset - Max Value:")
print(data_max.head())


Cleaned Dataset - Max Value:
      N     P     K  temperature   humidity        ph     rainfall label
0  90.0  42.0  43.0    20.879744  82.002744  6.502985  202.9355362  rice
1  85.0  58.0  41.0    21.770462  80.319644  7.038096  226.6555374  rice
2  60.0  55.0  44.0    23.004459  82.320763  7.840207  263.9642476  rice
3  74.0  35.0  40.0    26.491096  80.158363  6.980401  242.8640342  rice
4  78.0  42.0  42.0    20.130175  81.604873  7.628473  262.7173405  rice


  data_max = data.fillna(data.max())


**Most Frequent Value**

In [6]:
from sklearn.impute import SimpleImputer
frequent_val=SimpleImputer(missing_values=np.nan,strategy='most_frequent')

In [7]:
#replacing all of the missing data with the most frequent value

# Instantiate SimpleImputer with strategy='most_frequent'
frequent_imputer = SimpleImputer(strategy='most_frequent')

# Columns to be imputed
columns_to_impute = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label']

# Impute missing values
data[columns_to_impute] = frequent_imputer.fit_transform(data[columns_to_impute])

In [8]:
#printing the preprocessed dataset
data

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90.0,42.0,43.0,20.879744,82.002744,6.502985,202.9355362,rice
1,85.0,58.0,41.0,21.770462,80.319644,7.038096,226.6555374,rice
2,60.0,55.0,44.0,23.004459,82.320763,7.840207,263.9642476,rice
3,74.0,35.0,40.0,26.491096,80.158363,6.980401,242.8640342,rice
4,78.0,42.0,42.0,20.130175,81.604873,7.628473,262.7173405,rice
...,...,...,...,...,...,...,...,...
2195,107.0,34.0,32.0,26.774637,66.413269,6.780064,177.7745075,coffee
2196,99.0,15.0,27.0,27.417112,56.636362,6.086922,127.92461,coffee
2197,118.0,33.0,30.0,24.131797,67.225123,6.362608,173.3228386,coffee
2198,117.0,32.0,34.0,26.272418,52.127394,6.758793,],coffee


In [9]:
print(data.isna().sum())

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64


**Fill missing values with the minimum value in the column**

In [10]:
df_min = data.fillna(data.min())

print("\nCleaned Dataset - Min Value:")
print(df_min.head())



Cleaned Dataset - Min Value:
      N     P     K  temperature   humidity        ph     rainfall label
0  90.0  42.0  43.0    20.879744  82.002744  6.502985  202.9355362  rice
1  85.0  58.0  41.0    21.770462  80.319644  7.038096  226.6555374  rice
2  60.0  55.0  44.0    23.004459  82.320763  7.840207  263.9642476  rice
3  74.0  35.0  40.0    26.491096  80.158363  6.980401  242.8640342  rice
4  78.0  42.0  42.0    20.130175  81.604873  7.628473  262.7173405  rice


**Fill missing values using linear interpolation**

In [13]:
#load the data
data=pd.read_csv("/content/Crop_recommendation.csv")
data.head()   #initial dataset

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90.0,42.0,43.0,20.879744,82.002744,6.502985,202.9355362,rice
1,85.0,58.0,41.0,21.770462,80.319644,7.038096,226.6555374,rice
2,60.0,55.0,44.0,23.004459,82.320763,7.840207,263.9642476,rice
3,74.0,35.0,40.0,26.491096,80.158363,6.980401,242.8640342,rice
4,78.0,42.0,42.0,20.130175,81.604873,7.628473,262.7173405,rice


In [14]:
df_interpolated = data.interpolate()

# Display cleaned datasets
print("\nCleaned Dataset - Interpolated:")
print(df_interpolated.head())


Cleaned Dataset - Interpolated:
      N     P     K  temperature   humidity        ph     rainfall label
0  90.0  42.0  43.0    20.879744  82.002744  6.502985  202.9355362  rice
1  85.0  58.0  41.0    21.770462  80.319644  7.038096  226.6555374  rice
2  60.0  55.0  44.0    23.004459  82.320763  7.840207  263.9642476  rice
3  74.0  35.0  40.0    26.491096  80.158363  6.980401  242.8640342  rice
4  78.0  42.0  42.0    20.130175  81.604873  7.628473  262.7173405  rice


**Using KNN Imputer**

In [15]:
#load the data
data=pd.read_csv("/content/Crop_recommendation.csv")
data.head()   #initial dataset

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90.0,42.0,43.0,20.879744,82.002744,6.502985,202.9355362,rice
1,85.0,58.0,41.0,21.770462,80.319644,7.038096,226.6555374,rice
2,60.0,55.0,44.0,23.004459,82.320763,7.840207,263.9642476,rice
3,74.0,35.0,40.0,26.491096,80.158363,6.980401,242.8640342,rice
4,78.0,42.0,42.0,20.130175,81.604873,7.628473,262.7173405,rice


In [16]:
print(data.isna().sum())

N              17
P              24
K              15
temperature    16
humidity       17
ph             23
rainfall       20
label          11
dtype: int64


In [17]:
#handle categorical data
new_data=pd.get_dummies(data,drop_first=True)
new_data #121 columns after handling categorical values

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall_100.0497183,rainfall_100.1173443,rainfall_100.118612,rainfall_100.1692639,...,label_mango,label_mothbeans,label_mungbean,label_muskmelon,label_orange,label_papaya,label_pigeonpeas,label_pomegranate,label_rice,label_watermelon
0,90.0,42.0,43.0,20.879744,82.002744,6.502985,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,85.0,58.0,41.0,21.770462,80.319644,7.038096,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,60.0,55.0,44.0,23.004459,82.320763,7.840207,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,74.0,35.0,40.0,26.491096,80.158363,6.980401,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,78.0,42.0,42.0,20.130175,81.604873,7.628473,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,107.0,34.0,32.0,26.774637,66.413269,6.780064,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2196,99.0,15.0,27.0,27.417112,56.636362,6.086922,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2197,118.0,33.0,30.0,24.131797,67.225123,6.362608,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2198,117.0,32.0,34.0,26.272418,52.127394,6.758793,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
#use KNN Imputer
from sklearn.impute import KNNImputer
knn=KNNImputer(n_neighbors=2)
imputed_data=knn.fit_transform(new_data)
final_data=pd.DataFrame(imputed_data)

In [19]:
#check for null values in the imputed dataset
final_data.isna().sum()

0       0
1       0
2       0
3       0
4       0
       ..
2200    0
2201    0
2202    0
2203    0
2204    0
Length: 2205, dtype: int64

**Fixed Value**

In [20]:
#load the data
data=pd.read_csv("/content/Crop_recommendation.csv")
data.head()   #initial dataset

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90.0,42.0,43.0,20.879744,82.002744,6.502985,202.9355362,rice
1,85.0,58.0,41.0,21.770462,80.319644,7.038096,226.6555374,rice
2,60.0,55.0,44.0,23.004459,82.320763,7.840207,263.9642476,rice
3,74.0,35.0,40.0,26.491096,80.158363,6.980401,242.8640342,rice
4,78.0,42.0,42.0,20.130175,81.604873,7.628473,262.7173405,rice


In [21]:
#Fill missing values with a fixed value (e.g., 0)
df_fixed_value = data.fillna(0)

print("\nCleaned Dataset - Fixed Value:")
print(df_fixed_value.head())


Cleaned Dataset - Fixed Value:
      N     P     K  temperature   humidity        ph     rainfall label
0  90.0  42.0  43.0    20.879744  82.002744  6.502985  202.9355362  rice
1  85.0  58.0  41.0    21.770462  80.319644  7.038096  226.6555374  rice
2  60.0  55.0  44.0    23.004459  82.320763  7.840207  263.9642476  rice
3  74.0  35.0  40.0    26.491096  80.158363  6.980401  242.8640342  rice
4  78.0  42.0  42.0    20.130175  81.604873  7.628473  262.7173405  rice
