In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from joblib import dump

In [7]:
df = pd.read_excel("Final dataset.xlsx")

print("Columns:", df.columns)
print("Shape:", df.shape)

df = df.dropna(subset=["Yield"])

df.columns = [c.strip().replace(" ", "_") for c in df.columns]

target = "Yield"

features = [
    "Year", "District", "Crop", "Area", "Production",
    "Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec",
    "AnnualRainfall","NetSownArea","CroppingIntensity","NetIrrigatedArea"
]

X = df[features]
y = df[target]

categorical = ["District", "Crop"]
numerical = list(set(features) - set(categorical))

preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", StandardScaler(), numerical)
    ]
)

model = RandomForestRegressor(n_estimators=200, random_state=42)

pipe = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
print("R2 Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))

dump(pipe, "yield_prediction_pipeline.joblib")

def predict_for_farmer(crop, land_area_ha, district, month=None, year=2023):
    """
    Predict yield for a farmer's input.
    crop: str
    land_area_ha: float
    district: str
    month: optional str ("Jan".."Dec")
    year: int
    """
    sample = {
        "Year": year,
        "District": district,
        "Crop": crop,
        "Area": land_area_ha,
        "Production": 0,  
        "AnnualRainfall": df["AnnualRainfall"].mean(),
        "NetSownArea": df["NetSownArea"].mean(),
        "CroppingIntensity": df["CroppingIntensity"].mean(),
        "NetIrrigatedArea": df["NetIrrigatedArea"].mean(),
    }

    for m in ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]:
        sample[m] = df[m].mean()

    if month and month in sample:
        sample[month] = df[month].mean() * 1.2 

    sample_df = pd.DataFrame([sample])

    yield_pred = pipe.predict(sample_df)[0]

    production_pred = yield_pred * land_area_ha

    return {"Predicted Yield (t/ha)": yield_pred,
            "Predicted Production (t)": production_pred}


Columns: Index(['Year', 'District', 'Crop', 'Area', 'Yield', 'Production', 'Jan', 'Feb',
       'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
       'AnnualRainfall', 'NetSownArea', 'CroppingIntensity',
       'NetIrrigatedArea'],
      dtype='object')
Shape: (1050, 22)
R2 Score: 0.9724519783588753
MAE: 82.56142512077297


In [8]:
from joblib import load
import pandas as pd

pipe = load("yield_prediction_pipeline.joblib")

def predict_for_farmer(crop, land_area_ha, district, month=None, year=2023):
    """
    Predict yield for a farmer's input.
    crop: str
    land_area_ha: float
    district: str
    month: optional str ("Jan".."Dec")
    year: int
    """
    avg_rainfall = df["AnnualRainfall"].mean()
    avg_net_sown = df["NetSownArea"].mean()
    avg_cropping_intensity = df["CroppingIntensity"].mean()
    avg_irrigated_area = df["NetIrrigatedArea"].mean()
    sample = {
        "Year": year,
        "District": district,
        "Crop": crop,
        "Area": land_area_ha,
        "Production": 0,  
        "AnnualRainfall": avg_rainfall,
        "NetSownArea": avg_net_sown,
        "CroppingIntensity": avg_cropping_intensity,
        "NetIrrigatedArea": avg_irrigated_area
    }

    for m in ["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"]:
        sample[m] = 100  

    if month and month in sample:
        sample[month] = sample[month] * 1.2

    sample_df = pd.DataFrame([sample])

    yield_pred = pipe.predict(sample_df)[0]

    production_pred = yield_pred * land_area_ha

    return {
        "Predicted Yield (t/ha)": round(yield_pred, 2),
        "Predicted Production (t)": round(production_pred, 2)
    }

crop = input("Enter the crop:")
land_area_ha= float(input("Enter the size of farm in ha:"))
district= input("Enter your district:")
month=input("Enter the month (3 letters only):")
year=int(input("Enter the year"))
result = predict_for_farmer(crop, land_area_ha, district, month, year)
print(result)


Enter the crop: Wheat
Enter the size of farm in ha: 0.21
Enter your district: Jajpur
Enter the month (3 letters only): Mar
Enter the year 2027


{'Predicted Yield (t/ha)': 705.12, 'Predicted Production (t)': 148.07}
