In [85]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt

In [86]:
df = pd.read_csv('crop_yield.csv')

In [87]:
print(df['State'].unique())
print(df['Crop'].unique())
print(df['Season'].unique())

['Assam' 'Karnataka' 'Kerala' 'Meghalaya' 'West Bengal' 'Puducherry' 'Goa'
 'Andhra Pradesh' 'Tamil Nadu' 'Odisha' 'Bihar' 'Gujarat' 'Madhya Pradesh'
 'Maharashtra' 'Mizoram' 'Punjab' 'Uttar Pradesh' 'Haryana'
 'Himachal Pradesh' 'Tripura' 'Nagaland' 'Chhattisgarh' 'Uttarakhand'
 'Jharkhand' 'Delhi' 'Manipur' 'Jammu and Kashmir' 'Telangana'
 'Arunachal Pradesh' 'Sikkim']
['Arecanut' 'Arhar/Tur' 'Castor seed' 'Coconut ' 'Cotton(lint)'
 'Dry chillies' 'Gram' 'Jute' 'Linseed' 'Maize' 'Mesta' 'Niger seed'
 'Onion' 'Other  Rabi pulses' 'Potato' 'Rapeseed &Mustard' 'Rice'
 'Sesamum' 'Small millets' 'Sugarcane' 'Sweet potato' 'Tapioca' 'Tobacco'
 'Turmeric' 'Wheat' 'Bajra' 'Black pepper' 'Cardamom' 'Coriander' 'Garlic'
 'Ginger' 'Groundnut' 'Horse-gram' 'Jowar' 'Ragi' 'Cashewnut' 'Banana'
 'Soyabean' 'Barley' 'Khesari' 'Masoor' 'Moong(Green Gram)'
 'Other Kharif pulses' 'Safflower' 'Sannhamp' 'Sunflower' 'Urad'
 'Peas & beans (Pulses)' 'other oilseeds' 'Other Cereals' 'Cowpea(Lobia)'
 'Oilsee

In [88]:
df.head(5)

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19689 entries, 0 to 19688
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Crop             19689 non-null  object 
 1   Crop_Year        19689 non-null  int64  
 2   Season           19689 non-null  object 
 3   State            19689 non-null  object 
 4   Area             19689 non-null  float64
 5   Production       19689 non-null  int64  
 6   Annual_Rainfall  19689 non-null  float64
 7   Fertilizer       19689 non-null  float64
 8   Pesticide        19689 non-null  float64
 9   Yield            19689 non-null  float64
dtypes: float64(5), int64(2), object(3)
memory usage: 1.5+ MB


In [90]:
df.isnull().sum()

Crop               0
Crop_Year          0
Season             0
State              0
Area               0
Production         0
Annual_Rainfall    0
Fertilizer         0
Pesticide          0
Yield              0
dtype: int64

In [91]:
df.duplicated().sum()

0

In [92]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import joblib

In [93]:
df["Crop"] = df["Crop"].str.strip()
le_crop = LabelEncoder()
df['Crop'] = le_crop.fit_transform(df['Crop'])  # Fit and transform ONCE

In [94]:
df["State"] = df["State"].str.strip()
le_state = LabelEncoder()
df['State'] = le_state.fit_transform(df['State'])

In [95]:
df["Season"] = df["Season"].str.strip()
le_season = LabelEncoder()
df['Season'] = le_season.fit_transform(df['Season'])

In [96]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [97]:
# Define features and target
X = df.drop(['Yield'], axis=1)
y = df['Yield']

In [98]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [99]:
# Define numeric and categorical features
numeric_features = ['Crop_Year', 'Area', 'Production', 'Annual_Rainfall', 'Fertilizer', 'Pesticide']
categorical_features = ['Crop', 'Season', 'State']  # Already label encoded

In [100]:
# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', 'passthrough', categorical_features)  # Already encoded
    ])

In [101]:
# Full pipeline with RandomForest
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        verbose=1  # Optional: shows training progress
    ))
])

In [102]:
# Train the model
pipeline.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   17.4s finished


In [103]:
# Save EVERYTHING needed for prediction
joblib.dump({
    'pipeline': pipeline,
    'label_encoders': {
        'crop': le_crop,
        'state': le_state,
        'season': le_season
    },
    'feature_order': list(X_train.columns)  # Preserve column order
}, 'crop_yield_model_full.pkl')

['crop_yield_model_full.pkl']

In [104]:
# Verify it works
test_sample = X_train.iloc[0:1]
print("Test prediction:", pipeline.predict(test_sample)[0])
print("Actual yield:", y_train.iloc[0])

Test prediction: 2.7908669047500045
Actual yield: 2.86


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [105]:
print(f"Predicted Yield: {pipeline.predict(test_sample)[0]:.2f} tons/ha (Expected: {y_train.iloc[0]:.2f} tons/ha) - Good harvest expected!")

Predicted Yield: 2.79 tons/ha (Expected: 2.86 tons/ha) - Good harvest expected!


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [106]:
pred = pipeline.predict(test_sample)[0]
rating = "Excellent" if pred > 3.0 else "Good" if pred > 2.0 else "Needs improvement"
print(f"Predicted Yield: {pred:.2f} tons/ha - {rating}")


Predicted Yield: 2.79 tons/ha - Good


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [107]:
yield_kg_ha = pipeline.predict(test_sample)[0] * 1000  # Convert tons to kg
print(f"Predicted yield: {yield_kg_ha:,.0f} kg/ha (Actual: {y_train.iloc[0] * 1000:,.0f} kg/ha)")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Predicted yield: 2,791 kg/ha (Actual: 2,860 kg/ha)


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


In [108]:
kg_ha = pipeline.predict(test_sample)[0] * 1000
status = "Excellent" if kg_ha > 3000 else "Good" if kg_ha > 2000 else "Needs care"
print(f"{kg_ha:,.0f} kg/ha - {status} yield")

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.0s finished


2,791 kg/ha - Good yield
