In [2]:
import os
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

In [3]:
# Dataset location. Defaults to ~/Downloads/Crop_recommendation.csv so the notebook is
# not tied to one user's home directory; set the CROP_DATA_PATH environment variable
# to load the file from anywhere else.
DATA_PATH = Path(os.environ.get('CROP_DATA_PATH', Path.home() / 'Downloads' / 'Crop_recommendation.csv'))

# read_csv raises FileNotFoundError if DATA_PATH does not exist.
df = pd.read_csv(DATA_PATH)

In [4]:
# Preview the first ten rows to sanity-check the load.
df.head(n=10)

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice
5,69,37,42,23.058049,83.370118,7.073454,251.055,rice
6,69,55,38,22.708838,82.639414,5.700806,271.32486,rice
7,94,53,40,20.277744,82.894086,5.718627,241.974195,rice
8,89,54,38,24.515881,83.535216,6.685346,230.446236,rice
9,68,58,38,23.223974,83.033227,6.336254,221.209196,rice


## Checking Statistics of the data:

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall
count,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0,2200.0
mean,50.551818,53.362727,48.149091,25.616244,71.481779,6.46948,103.463655
std,36.917334,32.985883,50.647931,5.063749,22.263812,0.773938,54.958389
min,0.0,5.0,5.0,8.825675,14.25804,3.504752,20.211267
25%,21.0,28.0,20.0,22.769375,60.261953,5.971693,64.551686
50%,37.0,51.0,32.0,25.598693,80.473146,6.425045,94.867624
75%,84.25,68.0,49.0,28.561654,89.948771,6.923643,124.267508
max,140.0,145.0,205.0,43.675493,99.981876,9.935091,298.560117


## Insights:

- The dataset includes **2,200 samples** for each variable, indicating no missing entries.
- **N, P, and K** (nutrients) show large standard deviations and a wide range, with several *potential outliers* especially in K (max: 205 vs 75th percentile: 49), suggesting the dataset covers diverse soil types and possibly extreme cases.
- **Temperature** and **humidity** also span wide intervals:
  - *Temperature*: 8.8°C to 43.7°C — covers both cooler and hotter climates.
  - *Humidity*: 14.3% to almost 100% — from very dry to highly humid environments.
- **pH** values mostly cluster between 6 and 7, ideal for many crops, but values stretch from very acidic (3.5) to fairly alkaline (9.9), indicating broad soil acidity representation.
- **Rainfall** distribution is highly spread out (min: 20.2, max: 298.6, mean: 103.5), showing data from both arid and very wet conditions.
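- For most features the mean is higher than the median — markedly so for N, P, K and rainfall — **suggesting right-skewed distributions** influenced by high-value outliers (temperature and pH are close to symmetric). Humidity is the exception: its mean (71.5) is well below its median (80.5), which points to a left-skewed distribution.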
- The **diversity and spread** in all these variables highlight the dataset's *potential suitability for machine learning*, especially for models aiming for generalization across different agricultural and climatic conditions.

## Shape of Dataset

In [6]:
# (number of rows, number of columns)
df.shape

(2200, 8)

## Dataset Information
- Nitrogen values (`N`)
- Phosphorus values (`P`)
- Potassium values (`K`)
- Temperature (`temperature`)
- Humidity levels (`humidity`)
- pH values (`ph`)
- Rainfall (`rainfall`)
- Crop name (`label`)

In [7]:
# Distinct crop names that appear in the 'label' column.
df['label'].unique()


array(['rice', 'maize', 'chickpea', 'kidneybeans', 'pigeonpeas',
       'mothbeans', 'mungbean', 'blackgram', 'lentil', 'pomegranate',
       'banana', 'mango', 'grapes', 'watermelon', 'muskmelon', 'apple',
       'orange', 'papaya', 'coconut', 'cotton', 'jute', 'coffee'],
      dtype=object)

In [8]:
# Count missing values per column (isna is the canonical spelling of isnull).
df.isna().sum()

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64

In [9]:
df.info()   # Prints column names, non-null counts, dtypes and memory usage.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2200 entries, 0 to 2199
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   N            2200 non-null   int64  
 1   P            2200 non-null   int64  
 2   K            2200 non-null   int64  
 3   temperature  2200 non-null   float64
 4   humidity     2200 non-null   float64
 5   ph           2200 non-null   float64
 6   rainfall     2200 non-null   float64
 7   label        2200 non-null   object 
dtypes: float64(4), int64(3), object(1)
memory usage: 137.6+ KB


In [10]:
df.duplicated().sum()   # Number of rows that repeat an earlier row exactly.

np.int64(0)

In [13]:
df.nunique()  # Number of distinct values in each column.


N               137
P               117
K                73
temperature    2200
humidity       2200
ph             2200
rainfall       2200
label            22
dtype: int64

In [14]:
df['label'].value_counts()  # Number of rows for each crop label.

label
rice           100
maize          100
jute           100
cotton         100
coconut        100
papaya         100
orange         100
apple          100
muskmelon      100
watermelon     100
grapes         100
mango          100
banana         100
pomegranate    100
lentil         100
blackgram      100
mungbean       100
mothbeans      100
pigeonpeas     100
kidneybeans    100
chickpea       100
coffee         100
Name: count, dtype: int64

## Exploring data

In [15]:
# Another look at the first five rows before exploring individual columns.
df.head(n=5)

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice


In [19]:
# Discrete columns: few enough distinct values that listing them all is informative.
for column in ('label', 'P', 'K'):
    print(f"Categories in '{column}' variable:", df[column].unique())

# temperature and humidity are continuous (every one of the 2,200 rows has its own value),
# so report a distinct-value count and range instead of dumping the full array.
for column in ('temperature', 'humidity'):
    values = df[column]
    print(
        f"'{column}' variable: {values.nunique()} unique values, "
        f"range {values.min():.2f} to {values.max():.2f}"
    )

Categories in 'label' variable:  ['rice' 'maize' 'chickpea' 'kidneybeans' 'pigeonpeas' 'mothbeans'
 'mungbean' 'blackgram' 'lentil' 'pomegranate' 'banana' 'mango' 'grapes'
 'watermelon' 'muskmelon' 'apple' 'orange' 'papaya' 'coconut' 'cotton'
 'jute' 'coffee']
Categories in 'P' variable:   [ 42  58  55  35  37  53  54  46  56  50  48  38  45  40  59  41  47  49
  51  57  39  43  44  60  52  36  72  67  73  70  62  74  66  63  71  78
  80  68  65  77  76  79  61  64  69  75  24  18  26  27  25  21  30  11
   5  10   7  20  22  15  23   8  16  29  17   6  19  13   9  14  28  94
  95  92  89  88  87  85  86  83  91  81  84  90  82  93  33  31  34  32
 130 144 123 125 131 140 122 134 145 139 141 138 136 132 133 121 126 120
 142 135 129 128 137 127 124 143  12]
Categories in 'K' variable: [ 43  41  44  40  42  38  36  37  39  35  45  16  17  21  20  19  25  22
  15  18  23  24  77  84  85  81  75  79  76  83  78  80  82  46  50  53
  54  49  55  52  47  48  51  27  31  32  34  33  30  28  2