In [1]:
# Step 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [2]:
# Step 2: Load the crop-yield dataset (path is relative to the notebook's working directory)
df = pd.read_csv("crop_yield.csv")

# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [11]:
# Trim leading/trailing whitespace from the three text columns (in place on df).
for text_col in ("Crop", "Season", "State"):
    df[text_col] = df[text_col].str.strip()

# Distinct labels of each text column; these arrays are what the encoders are fitted on.
crop_categories, season_categories, state_categories = (
    df[name].unique() for name in ("Crop", "Season", "State")
)

# Display the crop labels as the cell output.
crop_categories

array(['Arecanut', 'Arhar/Tur', 'Castor seed', 'Coconut', 'Cotton(lint)',
       'Dry chillies', 'Gram', 'Jute', 'Linseed', 'Maize', 'Mesta',
       'Niger seed', 'Onion', 'Other  Rabi pulses', 'Potato',
       'Rapeseed &Mustard', 'Rice', 'Sesamum', 'Small millets',
       'Sugarcane', 'Sweet potato', 'Tapioca', 'Tobacco', 'Turmeric',
       'Wheat', 'Bajra', 'Black pepper', 'Cardamom', 'Coriander',
       'Garlic', 'Ginger', 'Groundnut', 'Horse-gram', 'Jowar', 'Ragi',
       'Cashewnut', 'Banana', 'Soyabean', 'Barley', 'Khesari', 'Masoor',
       'Moong(Green Gram)', 'Other Kharif pulses', 'Safflower',
       'Sannhamp', 'Sunflower', 'Urad', 'Peas & beans (Pulses)',
       'other oilseeds', 'Other Cereals', 'Cowpea(Lobia)',
       'Oilseeds total', 'Guar seed', 'Other Summer Pulses', 'Moth'],
      dtype=object)

In [12]:
# Display the distinct Season labels collected from df['Season'].
season_categories

array(['Whole Year', 'Kharif', 'Rabi', 'Autumn', 'Summer', 'Winter'],
      dtype=object)

In [13]:
# One independent (still unfitted) LabelEncoder per categorical column.
crop_enc, season_enc, state_enc = LabelEncoder(), LabelEncoder(), LabelEncoder()

In [14]:
# Step 3: Fit each encoder on its column's distinct labels and save it for future use.
# LabelEncoder.fit returns the fitted encoder itself, so fitting and dumping
# can be written as a single expression per encoder.
joblib.dump(crop_enc.fit(crop_categories), "Crop_encoder.pkl")
joblib.dump(season_enc.fit(season_categories), "Season_encoder.pkl")

# Kept as the bare last expression so the cell still displays the list of
# written filenames returned by joblib.dump.
joblib.dump(state_enc.fit(state_categories), "State_encoder.pkl")

['State_encoder.pkl']

In [6]:
# Re-inspect the frame before the feature matrices are built from it.
df.head()

Unnamed: 0,Crop,Crop_Year,Season,State,Area,Production,Annual_Rainfall,Fertilizer,Pesticide,Yield
0,Arecanut,1997,Whole Year,Assam,73814.0,56708,2051.4,7024878.38,22882.34,0.796087
1,Arhar/Tur,1997,Kharif,Assam,6637.0,4685,2051.4,631643.29,2057.47,0.710435
2,Castor seed,1997,Kharif,Assam,796.0,22,2051.4,75755.32,246.76,0.238333
3,Coconut,1997,Whole Year,Assam,19656.0,126905000,2051.4,1870661.52,6093.36,5238.051739
4,Cotton(lint),1997,Kharif,Assam,1739.0,794,2051.4,165500.63,539.09,0.420909


In [7]:
# Step 4: Split Data for Yield Prediction
# Features for Yield: All except Production & Yield
# "Predicted_Yield" is excluded too: it only exists once the Step 6 cell has
# run, and ignoring it when absent keeps this cell safe to re-run.
X_yield = df.drop(columns=["Yield", "Production", "Predicted_Yield"], errors="ignore")

# Crop / Season / State are text columns, while the scikit-learn estimators
# fitted on this matrix need numeric input. Replace them with the integer
# codes of the fitted LabelEncoders - the same encoders that are saved to disk
# and applied to new samples in the prediction cell.
X_yield = X_yield.assign(
    Crop=crop_enc.transform(df["Crop"]),
    Season=season_enc.transform(df["Season"]),
    State=state_enc.transform(df["State"]),
)
y_yield = df["Yield"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_yield_train, X_yield_test, y_yield_train, y_yield_test = train_test_split(
    X_yield, y_yield, test_size=0.2, random_state=42
)


In [8]:
# Step 5: Train DecisionTreeRegressor for Yield
# fit() returns the estimator, so construction and training are chained.
yield_model = DecisionTreeRegressor(random_state=42).fit(X_yield_train, y_yield_train)

# Evaluate Yield Model on the held-out rows
yield_preds = yield_model.predict(X_yield_test)
yield_mse = mean_squared_error(y_yield_test, yield_preds)
print("Yield Model RMSE:", np.sqrt(yield_mse))  # RMSE is the square root of the MSE
print("Yield Model R²:", r2_score(y_yield_test, yield_preds))

Yield Model RMSE: 170.48389608579333
Yield Model R²: 0.9637252711734178


In [9]:
# Step 6: Add predicted Yield to original data (for training Production model)
# NOTE(review): this predicts on every row of X_yield, which includes the rows
# the tree was fitted on (X_yield_train), so the new column mixes in-sample
# and held-out predictions - confirm that is acceptable for the Production model.
df["Predicted_Yield"] = yield_model.predict(X_yield)

In [10]:
# Step 7: Prepare Data for Production Prediction
# Features: the exact matrix the yield model was given plus its prediction.
# Building X_prod from X_yield (instead of re-deriving it from df) guarantees
# both models see identical columns in identical order, and mirrors the
# prediction cell, which copies the yield sample and appends "Predicted_Yield".
X_prod = X_yield.assign(Predicted_Yield=df["Predicted_Yield"])
y_prod = df["Production"]

# Same hold-out fraction and seed as the yield split.
X_prod_train, X_prod_test, y_prod_train, y_prod_test = train_test_split(
    X_prod, y_prod, test_size=0.2, random_state=42
)

In [11]:
# Step 8: Train LinearRegression for Production
# fit() returns the estimator, so construction and training are chained.
prod_model = LinearRegression().fit(X_prod_train, y_prod_train)

# Evaluate Production Model on the held-out rows
prod_preds = prod_model.predict(X_prod_test)
prod_mse = mean_squared_error(y_prod_test, prod_preds)
print("Production Model RMSE:", np.sqrt(prod_mse))  # RMSE is the square root of the MSE
print("Production Model R²:", r2_score(y_prod_test, prod_preds))

Production Model RMSE: 223636541.3179426
Production Model R²: 0.3421407981415997


In [12]:
# Step 9: Save Models and Encoders
# Save models
joblib.dump(yield_model, "yield_model.pkl")
joblib.dump(prod_model, "production_model.pkl")

# Save encoders
# Map each categorical column name to its fitted encoder. The keys produce the
# "Crop_encoder.pkl" / "Season_encoder.pkl" / "State_encoder.pkl" file names
# that the prediction cell loads.
encoders = {"Crop": crop_enc, "Season": season_enc, "State": state_enc}
for col, enc in encoders.items():
    joblib.dump(enc, f"{col}_encoder.pkl")

In [30]:
def safe_label_encode(encoder, value, label_name):
    """Encode a single label, failing with a readable message if it is unknown.

    Args:
        encoder: Fitted encoder exposing ``classes_`` and ``transform``.
        value: The label to encode. Strings are stripped of surrounding
            whitespace first, matching how the training columns were cleaned
            before the encoders were fitted.
        label_name: Human-readable name of the feature, used in the error text.

    Returns:
        The encoded value for ``value`` (first element of ``encoder.transform``).

    Raises:
        ValueError: If the cleaned value is not one of ``encoder.classes_``.
    """
    # Only strings can carry stray whitespace; leave other types untouched.
    value_clean = value.strip() if isinstance(value, str) else value
    if value_clean not in encoder.classes_:
        raise ValueError(f"❌ '{value}' not found in {label_name} encoder classes: {list(encoder.classes_)}")
    return encoder.transform([value_clean])[0]

In [15]:
# Sample Input (Change these values for testing)
sample_input = {
    "Crop": "Wheat",
    "Crop_Year": 2023,
    "Season": "Rabi",
    "State": "Punjab",
    "Area": 1200.0,
    "Annual_Rainfall": 850.0,
    "Fertilizer": 300.0,
    "Pesticide": 10.0
}

# Load models & encoders
# joblib files are pickle-based: only load artifacts produced by this notebook.
yield_model = joblib.load("yield_model.pkl")
production_model = joblib.load("production_model.pkl")
crop_enc = joblib.load("Crop_encoder.pkl")
state_enc = joblib.load("State_encoder.pkl")
season_enc = joblib.load("Season_encoder.pkl")

# Encode categorical features through safe_label_encode so that a label the
# encoder has never seen fails with a message listing the valid classes.
# Values are stripped here as well, mirroring the cleaning of the training columns.
# The key order matches the feature columns used for training.
encoded_input = {
    "Crop": safe_label_encode(crop_enc, sample_input["Crop"].strip(), "Crop"),
    "Crop_Year": sample_input["Crop_Year"],
    "Season": safe_label_encode(season_enc, sample_input["Season"].strip(), "Season"),
    "State": safe_label_encode(state_enc, sample_input["State"].strip(), "State"),
    "Area": sample_input["Area"],
    "Annual_Rainfall": sample_input["Annual_Rainfall"],
    "Fertilizer": sample_input["Fertilizer"],
    "Pesticide": sample_input["Pesticide"]
}

# Convert to a one-row DataFrame for model input
X_yield_sample = pd.DataFrame([encoded_input])

# Predict Yield
predicted_yield = yield_model.predict(X_yield_sample)[0]

# Add Predicted Yield to input for production prediction
X_production_sample = X_yield_sample.copy()
X_production_sample["Predicted_Yield"] = predicted_yield

# Predict Production
# NOTE(review): the linear model's output is unconstrained, so it can be
# negative (as in the recorded run) - treat such values as a model-quality signal.
predicted_production = production_model.predict(X_production_sample)[0]

print(f"✅ Predicted Yield: {predicted_yield:.2f}")
print(f"✅ Predicted Production: {predicted_production:.2f}")


✅ Predicted Yield: 1.20
✅ Predicted Production: -10202932.20


In [16]:
# Example: Show mapping for Crop encoder
# (pandas is already imported as `pd` in the notebook's first cell, so it is
# not re-imported here.)
# A label's encoded value is its position in the encoder's classes_ array.
crop_label_map = {label: idx for idx, label in enumerate(crop_enc.classes_)}
print("Crop Encoding Mapping:")
print(pd.DataFrame(list(crop_label_map.items()), columns=["Label", "Encoded"]))

Crop Encoding Mapping:
                    Label  Encoded
0                Arecanut        0
1               Arhar/Tur        1
2                   Bajra        2
3                  Banana        3
4                  Barley        4
5            Black pepper        5
6                Cardamom        6
7               Cashewnut        7
8             Castor seed        8
9                 Coconut        9
10              Coriander       10
11           Cotton(lint)       11
12          Cowpea(Lobia)       12
13           Dry chillies       13
14                 Garlic       14
15                 Ginger       15
16                   Gram       16
17              Groundnut       17
18              Guar seed       18
19             Horse-gram       19
20                  Jowar       20
21                   Jute       21
22                Khesari       22
23                Linseed       23
24                  Maize       24
25                 Masoor       25
26                  Mesta       