In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn



In [2]:
df = pd.read_csv('crop_yield.csv')
df.head()

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [3]:
df.shape

(19689, 10)

In [4]:
df.isnull().sum()

Crop               0
Crop_Year          0
Season             0
State              0
Area               0
Production         0
Annual_Rainfall    0
Fertilizer         0
Pesticide          0
Yield              0
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Crop             19689 non-null  object 
 1   Crop_Year        19689 non-null  int64  
 2   Season           19689 non-null  object 
 3   State            19689 non-null  object 
 4   Area             19689 non-null  float64
 5   Production       19689 non-null  int64  
 6   Annual_Rainfall  19689 non-null  float64
 7   Fertilizer       19689 non-null  float64
 8   Pesticide        19689 non-null  float64
 9   Yield            19689 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.5+ MB


In [6]:
df.duplicated().sum()

0

In [7]:
df.describe()

Unnamed: 0,Crop_Year,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
count,19689.0,19689.0,19689.0,19689.0,19689.0,19689.0,19689.0
mean,2009.127584,179926.6,16435940.0,1437.755177,24103310.0,48848.35,79.954009
std,6.498099,732828.7,263056800.0,816.909589,94946000.0,213287.4,878.306193
min,1997.0,0.5,0.0,301.3,54.17,0.09,0.0
25%,2004.0,1390.0,1393.0,940.7,188014.6,356.7,0.6
50%,2010.0,9317.0,13804.0,1247.6,1234957.0,2421.9,1.03
75%,2015.0,75112.0,122718.0,1643.7,10003850.0,20041.7,2.388889
max,2020.0,50808100.0,6326000000.0,6552.7,4835407000.0,15750510.0,21105.0


In [8]:
state_area_sum = df.groupby('State')['Area'].sum().reset_index()
state_area_sum['Area_in_Lakhs'] = state_area_sum['Area'] / 100000
print(state_area_sum[['State', 'Area_in_Lakhs']])

                State  Area_in_Lakhs
0      Andhra Pradesh    2389.253350
1   Arunachal Pradesh      57.255310
2               Assam     773.426000
3               Bihar    1679.954310
4        Chhattisgarh    1105.361940
5               Delhi       7.979030
6                 Goa      27.496005
7             Gujarat    2224.343332
8             Haryana    1315.069936
9    Himachal Pradesh     178.102078
10  Jammu and Kashmir     187.431870
11          Jharkhand     304.355268
12          Karnataka    2579.073990
13             Kerala     363.910603
14     Madhya Pradesh    4920.879945
15        Maharashtra    4619.395521
16            Manipur      57.276390
17          Meghalaya      51.787760
18            Mizoram      15.421895
19           Nagaland      84.267317
20             Odisha    1246.466600
21         Puducherry       6.535410
22             Punjab    1620.402000
23             Sikkim      17.593990
24         Tamil Nadu    1103.727970
25          Telangana     397.921420
2

In [9]:
filtered_states = state_area_sum[state_area_sum['Area'] > 100000000]
print(filtered_states)

             State          Area  Area_in_Lakhs
0   Andhra Pradesh  2.389253e+08    2389.253350
3            Bihar  1.679954e+08    1679.954310
4     Chhattisgarh  1.105362e+08    1105.361940
7          Gujarat  2.224343e+08    2224.343332
8          Haryana  1.315070e+08    1315.069936
12       Karnataka  2.579074e+08    2579.073990
14  Madhya Pradesh  4.920880e+08    4920.879945
15     Maharashtra  4.619396e+08    4619.395521
20          Odisha  1.246467e+08    1246.466600
22          Punjab  1.620402e+08    1620.402000
24      Tamil Nadu  1.103728e+08    1103.727970
27   Uttar Pradesh  5.426726e+08    5426.726350
29     West Bengal  2.376823e+08    2376.822649


In [10]:

states_to_filter = [
    'Andhra Pradesh', 'Bihar', 'Chhattisgarh', 'Gujarat', 'Haryana', 
    'Karnataka', 'Madhya Pradesh', 'Maharashtra', 'Odisha', 
    'Punjab', 'Tamil Nadu', 'Uttar Pradesh', 'West Bengal'
]

df_1 = df[df['State'].isin(states_to_filter)]

print(df_1)

               Crop  Crop_Year       Season      State      Area  Production  \
27         Arecanut       1997  Whole Year   Karnataka   93100.0      133342   
28        Arhar/Tur       1997  Kharif       Karnataka  421810.0       98473   
29            Bajra       1997  Kharif       Karnataka  301149.0      132056   
30            Bajra       1997  Summer       Karnataka     820.0         662   
31     Black pepper       1997  Kharif       Karnataka    3825.0         924   
...             ...        ...          ...        ...       ...         ...   
19676     Sugarcane       2018  Winter          Odisha    6778.0      417672   
19677          Urad       2018  Autumn          Odisha   13720.0        3583   
19678          Urad       2018  Summer          Odisha    4571.0        2336   
19679          Urad       2018  Winter          Odisha   39560.0       13123   
19680         Wheat       2018  Summer          Odisha     147.0         268   

       Annual_Rainfall   Fertilizer  Pe

In [11]:
df_1.shape

(11470, 10)

In [12]:
df_1['Crop'].value_counts()

Crop
Maize                    587
Rice                     563
Moong(Green Gram)        503
Groundnut                471
Urad                     462
Sesamum                  423
Jowar                    398
Bajra                    378
Sunflower                346
Ragi                     335
Potato                   304
Arhar/Tur                297
Gram                     289
Sugarcane                288
Horse-gram               285
Small millets            285
Rapeseed &Mustard        282
Cotton(lint)             280
Wheat                    274
Onion                    274
Dry chillies             232
Castor seed              225
Tobacco                  223
Linseed                  188
Other Kharif pulses      183
Soyabean                 173
Other  Rabi pulses       168
Garlic                   163
Ginger                   160
Safflower                157
Peas & beans (Pulses)    157
Masoor                   156
Sweet potato             156
Barley                   156
Coriander

In [13]:
crop_counts = df_1['Crop'].value_counts()
crops_to_keep = crop_counts[crop_counts >= 200].index
df_2 = df_1[df_1['Crop'].isin(crops_to_keep)]
print(df_2)

               Crop  Crop_Year       Season      State      Area  Production  \
28        Arhar/Tur       1997  Kharif       Karnataka  421810.0       98473   
29            Bajra       1997  Kharif       Karnataka  301149.0      132056   
30            Bajra       1997  Summer       Karnataka     820.0         662   
34     Dry chillies       1997  Kharif       Karnataka  186579.0      139206   
35     Dry chillies       1997  Rabi         Karnataka    4565.0        7895   
...             ...        ...          ...        ...       ...         ...   
19676     Sugarcane       2018  Winter          Odisha    6778.0      417672   
19677          Urad       2018  Autumn          Odisha   13720.0        3583   
19678          Urad       2018  Summer          Odisha    4571.0        2336   
19679          Urad       2018  Winter          Odisha   39560.0       13123   
19680         Wheat       2018  Summer          Odisha     147.0         268   

       Annual_Rainfall   Fertilizer  Pe

In [14]:
df_2['Crop'].value_counts()

Crop
Maize                587
Rice                 563
Moong(Green Gram)    503
Groundnut            471
Urad                 462
Sesamum              423
Jowar                398
Bajra                378
Sunflower            346
Ragi                 335
Potato               304
Arhar/Tur            297
Gram                 289
Sugarcane            288
Horse-gram           285
Small millets        285
Rapeseed &Mustard    282
Cotton(lint)         280
Onion                274
Wheat                274
Dry chillies         232
Castor seed          225
Tobacco              223
Name: count, dtype: int64

In [15]:
df_2.shape

(8004, 10)

****Data engineering****

From here i plan to do medeling 

In [16]:
df_2.columns

Index(['Crop', 'Crop_Year', 'Season', 'State', 'Area', 'Production',
       'Annual_Rainfall', 'Fertilizer', 'Pesticide', 'Yield'],
      dtype='object')

In [17]:
X = df_2.drop(['Production', 'Yield'], axis=1)
y = df_2['Production']


In [18]:
from sklearn.model_selection import train_test_split

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [20]:
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.compose import ColumnTransformer

In [21]:
ohe = OneHotEncoder(drop='first')
scaler = StandardScaler()

In [22]:
X_train.head(1)

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Annual_Rainfall,Fertilizer,Pesticide
16915,Small millets,2019,Kharif,Madhya Pradesh,79810.0,1446.7,13708165.6,29529.7


In [23]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Define the OneHotEncoder to ignore unknown categories
ohe = OneHotEncoder(handle_unknown='ignore')

# Define your preprocessor with both encoders
preprocessor = ColumnTransformer(
    transformers=[
        ('OneHotEncoder', ohe, [0, 2, 3]),  # Categorical columns
        ('StandardScaler', StandardScaler(), [1, 4, 5, 6, 7])  # Numerical columns
    ],
    remainder='passthrough'  # Keep other columns unchanged
)



In [24]:
preprocessor

In [25]:
X_train_dummy = preprocessor.fit_transform(X_train)
X_test_dummy = preprocessor.fit_transform(X_test)

In [26]:
X_train_dummy

<5602x47 sparse matrix of type '<class 'numpy.float64'>'
	with 44816 stored elements in Compressed Sparse Row format>

In [27]:
X_test_dummy

<2402x47 sparse matrix of type '<class 'numpy.float64'>'
	with 19216 stored elements in Compressed Sparse Row format>

****Training model****

In [28]:
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from  sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error,r2_score

In [29]:
models = {
    'lr':LinearRegression(),
    'lss':Lasso(),
    'rg':Ridge(),
    'Knr':KNeighborsRegressor(),
    'dtr':DecisionTreeRegressor() 

}
for name, mod in models.items():
    mod.fit(X_train_dummy,y_train)
    y_pred = mod.predict(X_test_dummy)

    print(f"{name} MSE : {mean_squared_error(y_test,y_pred)} score {r2_score(y_test,y_pred)}")

lr MSE : 52353748156321.66 score 0.36794191388584707


  model = cd_fast.sparse_enet_coordinate_descent(


lss MSE : 52353756472205.13 score 0.36794181348957034
rg MSE : 52337360286107.66 score 0.36813976191483677
Knr MSE : 2204691396612.589 score 0.9733831277857222
dtr MSE : 1548699086295.1748 score 0.9813028137445348


In [30]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error, r2_score

# Define the scorer for MSE and R^2
mse_scorer = make_scorer(mean_squared_error, greater_is_better=False)  # Negating MSE to ensure consistency in comparison
r2_scorer = make_scorer(r2_score)

# Number of cross-validation folds
cv_folds = 20

# Perform cross-validation for each model
for name, mod in models.items():
    # Cross-validation for MSE
    mse_scores = cross_val_score(mod, X_train_dummy, y_train, cv=cv_folds, scoring=mse_scorer)
    # Cross-validation for R^2
    r2_scores = cross_val_score(mod, X_train_dummy, y_train, cv=cv_folds, scoring=r2_scorer)
    
    # Printing average of the cross-validation scores
    print(f"{name} - Average MSE: {-mse_scores.mean()} | Average R^2: {r2_scores.mean()}")



lr - Average MSE: 44008564580491.94 | Average R^2: 0.197185281821824


  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(
  model = cd_fast.sparse_enet_coordinate_descent(


lss - Average MSE: 44008526466213.29 | Average R^2: 0.1971877007630251
rg - Average MSE: 44002336163886.58 | Average R^2: 0.20173952145333654
Knr - Average MSE: 3202393963383.5596 | Average R^2: 0.9171295268204327
dtr - Average MSE: 1539761536235.9473 | Average R^2: 0.9692938349670713


****Select Best model for prediction****

In [31]:
Knr = KNeighborsRegressor()
Knr.fit(X_train_dummy,y_train)
Knr.predict(X_test_dummy)


array([6.64390800e+05, 6.14000000e+01, 9.20000000e+02, ...,
       1.61354882e+07, 2.69108000e+05, 4.26780000e+03])

In [32]:
X_train.head(5)

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Annual_Rainfall,Fertilizer,Pesticide
16915,Small millets,2019,Kharif,Madhya Pradesh,79810.0,1446.7,13708165.6,29529.7
3660,Tobacco,2012,Whole Year,Andhra Pradesh,134000.0,968.7,20207200.0,41540.0
14171,Small millets,2014,Rabi,West Bengal,523.0,1483.5,78952.08,172.59
6364,Tobacco,1998,Whole Year,Andhra Pradesh,193100.0,1048.3,19078280.0,55999.0
10628,Onion,2008,Rabi,Gujarat,54400.0,746.1,7781376.0,4896.0


In [33]:
def predicition(Crop, Crop_Year, Season, State, Area, Annual_Rainfall, Fertilizer, Pesticide):
    features = np.array([[Crop, Crop_Year, Season, State, Area, Annual_Rainfall, Fertilizer, Pesticide]])
    treasformed_features = preprocessor.transform(features)
    predicted_value = Knr.predict(treasformed_features).reshape(1,-1)
    return predicted_value[0]

In [34]:
Crop = 'Onion' 
Crop_Year = 2025
Season = 'Rabi'
State = 'Gujarat'
Area = 30000.0
Annual_Rainfall = 600.0
Fertilizer = 7000000.0
Pesticide = 3000.0

predicted_value = predicition(Crop, Crop_Year, Season, State, Area, Annual_Rainfall, Fertilizer, Pesticide)
print(predicted_value)

[800609.6]




****Pickle File****

In [35]:
import pickle

# Save the trained model
pickle.dump(Knr, open('Knr.pkl', 'wb'))

# Save the preprocessor
pickle.dump(preprocessor, open('preprocessor.pkl', 'wb'))


In [36]:
print(sklearn.__version__)

1.5.2
