In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pickle

In [None]:
# Load the raw crop-yield table from the CSV file next to the notebook.
data = pd.read_csv(filepath_or_buffer="crop_yield.csv")

In [None]:
# Schema summary first: info() prints column dtypes / non-null counts and
# returns None, so it is called as a statement rather than placed in a tuple
# (a tuple would be shown as plain text with a trailing `None`).
data.info()

# Last expression of the cell: rich preview of the first five rows.
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Crop             19689 non-null  object 
 1   Crop_Year        19689 non-null  int64  
 2   Season           19689 non-null  object 
 3   State            19689 non-null  object 
 4   Area             19689 non-null  float64
 5   Production       19689 non-null  int64  
 6   Annual_Rainfall  19689 non-null  float64
 7   Fertilizer       19689 non-null  float64
 8   Pesticide        19689 non-null  float64
 9   Yield            19689 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.5+ MB


(           Crop  Crop_Year       Season  State     Area  Production  \
 0      Arecanut       1997  Whole Year   Assam  73814.0       56708   
 1     Arhar/Tur       1997  Kharif       Assam   6637.0        4685   
 2   Castor seed       1997  Kharif       Assam    796.0          22   
 3      Coconut        1997  Whole Year   Assam  19656.0   126905000   
 4  Cotton(lint)       1997  Kharif       Assam   1739.0         794   
 
    Annual_Rainfall  Fertilizer  Pesticide        Yield  
 0           2051.4  7024878.38   22882.34     0.796087  
 1           2051.4   631643.29    2057.47     0.710435  
 2           2051.4    75755.32     246.76     0.238333  
 3           2051.4  1870661.52    6093.36  5238.051739  
 4           2051.4   165500.63     539.09     0.420909  ,
 None)

In [None]:
# Separate features (X) and target variable (y).
# 'Production' is deliberately excluded from the features: it is a harvest
# outcome recorded together with the target (yield is production per unit
# area), so it is not available when a yield forecast is actually needed.
# Keeping it leaks the target into the model and inflates the test metrics;
# expect lower, more honest scores than a run that includes it.
target_col = 'Yield'
leaky_cols = ['Production']
X = data.drop(columns=[target_col] + leaky_cols)
y = data[target_col]

# Identify categorical and numerical columns used by the preprocessing step.
# NOTE(review): the head() preview suggests 'Season'/'Crop' values may carry
# trailing whitespace - confirm, and normalise consistently at training and
# inference time if so.
categorical_cols = ['Crop', 'Season', 'State']
numerical_cols = ['Crop_Year', 'Area', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']

# Hold out 20% of the rows for evaluation; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Preprocessing

In [None]:
# Column-wise preprocessing, one (name, transformer, columns) triple per group.
# Numeric features are standardised.
numeric_step = ('num', StandardScaler(), numerical_cols)
# Categorical features are one-hot encoded; handle_unknown='ignore' keeps
# transform() from raising on categories that were not seen during fit.
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])


Building pipeline

In [None]:
# Random forest regressor with a fixed seed so repeated runs give the same model.
forest = RandomForestRegressor(n_estimators=100, random_state=42)

# Single estimator that applies the preprocessing and then the regressor,
# so raw DataFrames can be passed to fit() / predict().
model_pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', forest),
    ]
)

In [None]:
# Shrink the forest to 10 trees for faster training, then fit on the training
# split. set_params() returns the pipeline itself, so fit() can be chained.
model_pipeline.set_params(regressor__n_estimators=10).fit(X_train, y_train)

# Score the held-out split.
y_pred = model_pipeline.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Persist the fitted pipeline (preprocessing + regressor) as a pickle file.
model_file_path = 'crop_yield_model.pkl'
with open(model_file_path, mode='wb') as model_file:
    pickle.dump(model_pipeline, model_file)

# Cell output: test RMSE, test R^2 and where the model was written.
rmse, r2, model_file_path


(122.34698619160595, 0.9813179726998675, 'crop_yield_model.pkl')