In [1]:
import numpy as np 
import pandas as pd 

In [2]:
df = pd.read_csv('crop_yield.csv')

In [3]:
df.head(5)

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [4]:
df['Season'].unique()

array(['Whole Year ', 'Kharif     ', 'Rabi       ', 'Autumn     ',
       'Summer     ', 'Winter     '], dtype=object)

In [5]:
df.shape

(19689, 10)

In [None]:
# data preprocessing

In [7]:
df=df[df['Crop_Year']!=2020]


In [8]:
# removing spaces present in season column
df['Season'] = df['Season'].str.strip()

In [9]:
# Convert Fertilizer and pesticide columns from kgs into tons

df['Fertilizer'] = df['Fertilizer'].apply(lambda x: x/1000)
df['Fertilizer'] = df['Fertilizer'].apply(lambda x: np.round(x,3))

df['Pesticide'] = df['Pesticide'].apply(lambda x: x/1000)
df['Pesticide'] = df['Pesticide'].apply(lambda x: np.round(x,3))

In [10]:
df['Input_Per_Unit_Area'] = (df['Fertilizer'] + df['Pesticide']) / df['Area']

In [11]:
# dropping this row because it's a extreme outlier
df.drop(119, inplace=True)

In [12]:
df.drop(columns=['Fertilizer','Pesticide'], inplace=True)

In [13]:
# Categorize crop_year
bins = [1990, 2000, 2010, 2020]  # Example bins, adjust based on your data range
labels = ['90s', '2000s', '2010s']
df['Year_Interval'] = pd.cut(df['Crop_Year'], bins=bins, labels=labels)

In [14]:
df.drop(columns=['Crop_Year'],inplace=True)

In [15]:
df.shape

(19651, 9)

In [16]:
df.columns

Index(['Crop', 'Season', 'State', 'Area', 'Production', 'Annual_Rainfall',
       'Yield', 'Input_Per_Unit_Area', 'Year_Interval'],
      dtype='object')

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19651 entries, 0 to 19688
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype   
---  ------               --------------  -----   
 0   Crop                 19651 non-null  object  
 1   Season               19651 non-null  object  
 2   State                19651 non-null  object  
 3   Area                 19651 non-null  float64 
 4   Production           19651 non-null  int64   
 5   Annual_Rainfall      19651 non-null  float64 
 6   Yield                19651 non-null  float64 
 7   Input_Per_Unit_Area  19651 non-null  float64 
 8   Year_Interval        19651 non-null  category
dtypes: category(1), float64(4), int64(1), object(3)
memory usage: 1.4+ MB


In [18]:
df.describe()

Unnamed: 0,Area,Production,Annual_Rainfall,Yield,Input_Per_Unit_Area
count,19651.0,19651.0,19651.0,19651.0,19651.0
mean,180227.2,16467210.0,1437.967162,80.100282,0.137329
std,733500.0,263310100.0,817.676055,879.148542,0.026137
min,0.5,0.0,301.3,0.0,0.0948
25%,1396.0,1400.0,940.7,0.599372,0.10855
50%,9328.0,13830.0,1247.0,1.03,0.14476
75%,75238.0,123005.5,1643.7,2.390714,0.158
max,50808100.0,6326000000.0,6552.7,21105.0,0.1725


In [19]:
df['Year_Interval'].unique()

['90s', '2000s', '2010s']
Categories (3, object): ['90s' < '2000s' < '2010s']

In [20]:
df.columns

Index(['Crop', 'Season', 'State', 'Area', 'Production', 'Annual_Rainfall',
       'Yield', 'Input_Per_Unit_Area', 'Year_Interval'],
      dtype='object')

In [21]:
df.head(5)

Unnamed: 0,Crop,Season,State,Area,Production,Annual_Rainfall,Yield,Input_Per_Unit_Area,Year_Interval
0,Arecanut,Whole Year,Assam,73814.0,56708,2051.4,0.796087,0.09548,90s
1,Arhar/Tur,Kharif,Assam,6637.0,4685,2051.4,0.710435,0.09548,90s
2,Castor seed,Kharif,Assam,796.0,22,2051.4,0.238333,0.09548,90s
3,Coconut,Whole Year,Assam,19656.0,126905000,2051.4,5238.051739,0.09548,90s
4,Cotton(lint),Kharif,Assam,1739.0,794,2051.4,0.420909,0.09548,90s


In [22]:
num_col = ['Area','Production','Annual_Rainfall','Input_Per_Unit_Area']
ordinal_col = ['Season','Year_Interval']
nominal_col = ['Crop','State']
season_order = ['Winter','Summer','Autumn','Rabi','Kharif','Whole Year']
year_order = ['90s','2000s','2010s']

In [23]:
df['Season'].unique()

array(['Whole Year', 'Kharif', 'Rabi', 'Autumn', 'Summer', 'Winter'],
      dtype=object)

In [24]:
from sklearn.preprocessing import RobustScaler, PowerTransformer, OneHotEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [25]:
X = df.drop(columns=['Yield'])
y = df[['Yield']]

In [26]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [28]:
# data encoding

In [29]:
preprocessor = ColumnTransformer(transformers=[
    ('yeo_johnson_transform', PowerTransformer(method='yeo-johnson'), num_col),
    ('robust_scaler', RobustScaler(),num_col),
    ('Season_order', OrdinalEncoder(categories=[['Winter','Summer','Autumn','Rabi','Kharif','Whole Year']]), ['Season']),
    ('Year_order',OrdinalEncoder(categories=[['90s','2000s','2010s']]), ['Year_Interval']),
    ('OHE', OneHotEncoder(drop='first',handle_unknown='ignore'), nominal_col)
], remainder='passthrough')

In [30]:
# model training

In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression

In [32]:
# Feature selection using KBest method to reduce curse of dimensionality
kbest = SelectKBest(score_func=f_regression, k=40)

In [33]:
# After hyperparameter tuning best parameters are selected to reduce runtime and increse performance of model
rf = RandomForestRegressor(max_features=0.75, max_samples=0.75, n_estimators=400, n_jobs=-1)

In [34]:
pipe = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('kbest', kbest),
    ('RF_regressor', rf)
])

In [35]:
pipe.fit(X_train,y_train.values.ravel())

In [36]:
y_pred = pipe.predict(X_test)

In [37]:
from sklearn.metrics import accuracy_score, r2_score
r2_score(y_test,y_pred)

0.915755422398867

In [38]:
import pickle

# Save the full pipeline (includes preprocessor, feature selection, and model)
pickle.dump(pipe, open('crop_yield_pipeline.pkl', 'wb'))
pickle.dump(preprocessor, open('crop_yield_preprocessor.pkl', 'wb'))


In [44]:
# Test the loaded models
def test_loaded_models():
    # Load the models
    loaded_pipeline, loaded_preprocessor = load_models()
    
    # Create a sample input (make sure it matches your training data structure)
    sample_input = pd.DataFrame({
        'Crop': ['Arecanut'],
        'Season': ['Whole Year'],
        'State': ['Assam'],
        'Area': [73814.0],
        'Production': [56708],
        'Annual_Rainfall': [2051.4],
        'Input_Per_Unit_Area': [0.09548],
        'Year_Interval': ['90s']
    })
    
    # Make predictions
    prediction = predict_crop_yield(sample_input, loaded_pipeline)
    print(f"Predicted yield: {prediction[0]:.2f}")

# Example of how to use the saved models
if __name__ == "__main__":
    test_loaded_models()

Predicted yield: 0.82
