In [1]:
from pathlib import Path
import os

In [2]:
# Cell-1: Set working directory to project root (auto)
from pathlib import Path
import os

# Try to move up until we find the folder that has 'data' and 'src'
p = Path.cwd()
for _ in range(5):  # max 5 levels up
    if (p / "data").exists() and (p / "src").exists():
        os.chdir(p)
        break
    p = p.parent

print("CWD set to:", Path.cwd())  # should be .../Agriculture_Production

CWD set to: C:\Users\riken\Downloads\Agriculture_Production


In [3]:
# Cell-2: Load the combined CSV
import pandas as pd
import numpy as np

df = pd.read_csv("data/interim/agri_combined.csv")
print("Shape:", df.shape)
df.head()

Shape: (275, 14)


Unnamed: 0,crop,variety,state,season,year,quantity,production,unit,cost,recommended_zone,source_file,source_sheet,area_ha,yield_q_ha
0,Total Foodgrains,,,,2007,128.5,158.8,Tons,,,datafile2.csv,csv,128.5,123.6
1,Rice,,,,2007,168.5,200.8,Tons,,,datafile2.csv,csv,168.5,119.2
2,Wheat,,,,2007,115.0,131.6,Tons,,,datafile2.csv,csv,115.0,114.4
3,Jowar,,,,2007,120.7,124.3,Tons,,,datafile2.csv,csv,120.7,103.0
4,Bajra,,,,2007,94.5,136.4,Tons,,,datafile2.csv,csv,94.5,144.3


In [4]:
import pandas as pd
import numpy as np

# df already loaded in your notebook. If not:
# df = pd.read_csv("data/interim/agri_combined.csv")

print("Original shape:", df.shape)
df = df.drop_duplicates()

# Ensure numeric types
for col in ['production','quantity','cost','year']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Keep only rows with valid non-negative production
df = df[df['production'].notna() & (df['production'] >= 0)]

# Clean basic text columns
for col in ['crop','variety','state','season','unit','recommended_zone']:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()
        df.loc[df[col].isin(['nan','None','NaN']), col] = np.nan

print("After cleaning:", df.shape)
display(df.head())
print("Columns:", df.columns.tolist())

Original shape: (275, 14)
After cleaning: (275, 14)


Unnamed: 0,crop,variety,state,season,year,quantity,production,unit,cost,recommended_zone,source_file,source_sheet,area_ha,yield_q_ha
0,Total Foodgrains,,,,2007,128.5,158.8,Tons,,,datafile2.csv,csv,128.5,123.6
1,Rice,,,,2007,168.5,200.8,Tons,,,datafile2.csv,csv,168.5,119.2
2,Wheat,,,,2007,115.0,131.6,Tons,,,datafile2.csv,csv,115.0,114.4
3,Jowar,,,,2007,120.7,124.3,Tons,,,datafile2.csv,csv,120.7,103.0
4,Bajra,,,,2007,94.5,136.4,Tons,,,datafile2.csv,csv,94.5,144.3


Columns: ['crop', 'variety', 'state', 'season', 'year', 'quantity', 'production', 'unit', 'cost', 'recommended_zone', 'source_file', 'source_sheet', 'area_ha', 'yield_q_ha']


In [5]:
print("Years:", sorted(df['year'].dropna().unique().tolist()) if 'year' in df.columns else "No year")
print("\nMissing ratio per column:\n", df.isna().mean().sort_values(ascending=False))

if 'state' in df.columns:
    print("\nStates:", df['state'].nunique())
if 'crop' in df.columns:
    print("Crops:", df['crop'].nunique())

print("\nProduction summary:\n", df['production'].describe())

if set(['state','crop']).issubset(df.columns):
    top = df.groupby(['state','crop'])['production'].sum().sort_values(ascending=False).head(10)
    print("\nTop 10 state–crop by total production:\n", top)

Years: [2007, 2008, 2009, 2010, 2011]

Missing ratio per column:
 variety             1.0
state               1.0
season              1.0
cost                1.0
recommended_zone    1.0
crop                0.0
year                0.0
quantity            0.0
production          0.0
unit                0.0
source_file         0.0
source_sheet        0.0
area_ha             0.0
yield_q_ha          0.0
dtype: float64

States: 0
Crops: 55

Production summary:
 count     275.000000
mean      183.042182
std       196.681803
min        42.100000
25%       116.650000
50%       155.400000
75%       199.050000
max      1790.600000
Name: production, dtype: float64

Top 10 state–crop by total production:
 Series([], Name: production, dtype: float64)


In [6]:
target = 'production'

# Auto-select features (drop target + helper cols)
drop_cols = {target, 'source_file', 'source_sheet'}
feature_cols = [c for c in df.columns if c not in drop_cols]

# Drop all-null or constant columns
feature_cols = [c for c in feature_cols if df[c].notna().any()]
feature_cols = [c for c in feature_cols if df[c].nunique(dropna=True) > 1]
print("Using features:", feature_cols)

# Time-aware split
years = sorted(df['year'].dropna().unique().tolist()) if 'year' in df.columns else []
print("Unique years:", years)

if len(years) >= 4:
    test_years  = years[-2:]
    valid_years = [years[-3]]
    train_years = [y for y in years if y not in test_years + valid_years]
elif len(years) == 3:
    test_years  = [years[-1]]
    valid_years = [years[-2]]
    train_years = [years[0]]
elif len(years) == 2:
    test_years  = [years[-1]]
    valid_years = []
    train_years = [years[0]]
else:
    # Fallback: random split if years not available/too few
    test_years, valid_years, train_years = [], [], []

if train_years:
    train = df[df['year'].isin(train_years)]
    remain = df[~df.index.isin(train.index)]
    valid = remain[remain['year'].isin(valid_years)] if valid_years else remain.sample(frac=0.5, random_state=42)
    test  = remain[~remain.index.isin(valid.index)] if test_years else df.sample(frac=0.2, random_state=42)
else:
    # random split fallback
    train = df.sample(frac=0.7, random_state=42)
    remain = df.drop(train.index)
    valid = remain.sample(frac=0.5, random_state=42)
    test  = remain.drop(valid.index)

print("Split sizes -> train:", train.shape, "valid:", valid.shape, "test:", test.shape)

Using features: ['crop', 'year', 'quantity', 'area_ha', 'yield_q_ha']
Unique years: [2007, 2008, 2009, 2010, 2011]
Split sizes -> train: (110, 14) valid: (55, 14) test: (110, 14)


In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate(y_true, y_pred, name=""):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = mean_squared_error(y_true, y_pred,)
    r2 = r2_score(y_true, y_pred)
    print(f"{name} | MAE: {mae:.2f} | RMSE: {rmse:.2f} | R2: {r2:.3f}")

# choose grouping columns present
group_cols = [c for c in ['state','crop','season'] if c in df.columns]
if not group_cols:
    group_cols = [c for c in ['state','crop'] if c in df.columns]

def group_median_baseline(train_df, apply_df, group_cols, target='production'):
    med = train_df.groupby(group_cols)[target].median()
    default = train_df[target].median()
    preds = []
    for _, row in apply_df[group_cols].iterrows():
        key = tuple(row.values)
        preds.append(med.get(key, default))
    return np.array(preds)

if group_cols:
    base_valid = group_median_baseline(train, valid, group_cols, target)
    evaluate(valid[target], base_valid, f"Baseline (valid) by {group_cols}")
else:
    print("Skipping baseline: no suitable grouping columns found")

Baseline (valid) by ['state', 'crop', 'season'] | MAE: 71.93 | RMSE: 35420.17 | R2: -0.027


In [8]:
# If this import fails: pip install catboost
from catboost import CatBoostRegressor, Pool

# Identify categorical vs numeric
cat_cols = [c for c in feature_cols if df[c].dtype == 'object']
num_cols = [c for c in feature_cols if c not in cat_cols]

def make_pool(d):
    X = d[feature_cols].copy()
    cat_idx = [X.columns.get_loc(c) for c in cat_cols]
    return Pool(X, d[target], cat_features=cat_idx)

train_pool = make_pool(train)
valid_pool = make_pool(valid)
test_pool  = make_pool(test)

cat = CatBoostRegressor(
    loss_function='RMSE',
    depth=8,
    learning_rate=0.05,
    n_estimators=1500,
    random_seed=42,
    eval_metric='RMSE',
    verbose=200
)
cat.fit(train_pool, eval_set=valid_pool, use_best_model=True)

pred_val = cat.predict(valid_pool)
evaluate(valid[target], pred_val, "CatBoost (valid)")

pred_test = cat.predict(test_pool)
evaluate(test[target], pred_test, "CatBoost (test)")

0:	learn: 185.7837188	test: 183.1805506	best: 183.1805506 (0)	total: 189ms	remaining: 4m 42s
200:	learn: 10.6006114	test: 18.5537818	best: 18.5537818 (200)	total: 14.4s	remaining: 1m 33s
400:	learn: 2.0545811	test: 16.4330781	best: 16.4330781 (400)	total: 32.6s	remaining: 1m 29s
600:	learn: 0.7964589	test: 16.2174901	best: 16.2174901 (600)	total: 48.8s	remaining: 1m 12s
800:	learn: 0.4463215	test: 16.2357827	best: 16.2115652 (622)	total: 1m 6s	remaining: 58.1s
1000:	learn: 0.2899293	test: 16.2328388	best: 16.2115652 (622)	total: 1m 23s	remaining: 41.6s
1200:	learn: 0.2009692	test: 16.2408032	best: 16.2115652 (622)	total: 1m 39s	remaining: 24.8s
1400:	learn: 0.1450130	test: 16.2483412	best: 16.2115652 (622)	total: 2m	remaining: 8.53s
1499:	learn: 0.1266891	test: 16.2517873	best: 16.2115652 (622)	total: 2m 7s	remaining: 0us

bestTest = 16.21156525
bestIteration = 622

Shrink model to first 623 iterations.
CatBoost (valid) | MAE: 7.96 | RMSE: 262.81 | R2: 0.992
CatBoost (test) | MAE: 14.8

In [10]:
import joblib
# Retrain on train+valid for final model
train_valid = pd.concat([train, valid], axis=0)
tv_pool = Pool(train_valid[feature_cols], train_valid[target],
               cat_features=[train_valid[feature_cols].columns.get_loc(c) for c in cat_cols])

cat.fit(tv_pool, verbose=False)
joblib.dump(cat, "models/production_predictor.joblib")
print("Saved model -> models/production_predictor.joblib")

Saved model -> models/production_predictor.joblib


In [11]:
import pandas as pd, glob, os

for p in glob.glob("data/raw/*.csv"):
    if os.path.basename(p).lower() == "datafile2.csv":
        continue  # we already used this
    print("\n====", os.path.basename(p), "====")
    df0 = pd.read_csv(p, nrows=5)
    print("Columns:", list(df0.columns))
    cand = [c for c in df0.columns if str(c).lower().startswith('particular')]
    if cand:
        print("Particulars sample:", df0[cand[0]].astype(str).head(10).tolist())


==== datafile.csv ====
Columns: ['Crop', '2004-05', '2005-06', '2006-07', '2007-08', '2008-09', '2009-10', '2010-11', '2011-12']

==== datafile1.csv ====
Columns: ['Crop', 'State', 'Cost of Cultivation (`/Hectare) A2+FL', 'Cost of Cultivation (`/Hectare) C2', 'Cost of Production (`/Quintal) C2', 'Yield (Quintal/ Hectare) ']

==== datafile3.csv ====
Columns: ['Crop', 'Variety', 'Season/ duration in days', 'Recommended Zone', 'Unnamed: 4']

==== produce.csv ====
Columns: ['Particulars', 'Frequency', 'Unit', ' 3-1993', ' 3-1994', ' 3-1995', ' 3-1996', ' 3-1997', ' 3-1998', ' 3-1999', ' 3-2000', ' 3-2001', ' 3-2002', ' 3-2003', ' 3-2004', ' 3-2005', ' 3-2006', ' 3-2007', ' 3-2008', ' 3-2009', ' 3-2010', ' 3-2011', ' 3-2012', ' 3-2013', ' 3-2014']
Particulars sample: ['Agricultural Production Foodgrains', 'Agricultural Production Foodgrains Kharif', 'Agricultural Production Foodgrains Rabi', 'Agricultural Production Foodgrains Rice', 'Agricultural Production Foodgrains Rice Kharif']


In [12]:
df = pd.read_csv("data/interim/agri_combined.csv")

In [13]:
from pathlib import Path
import os

p = Path.cwd()
for _ in range(5):
    if (p / "data").exists() and (p / "src").exists():
        os.chdir(p)
        break
    p = p.parent
print("CWD:", Path.cwd())

CWD: C:\Users\riken\Downloads\Agriculture_Production


In [14]:
import pandas as pd
import numpy as np

df = pd.read_csv("data/interim/agri_combined.csv")
print("Loaded:", df.shape)
df = df.drop_duplicates()

# Numeric columns
for col in ['production','quantity','cost','year','area_ha','yield_q_ha']:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Keep valid production
df = df[df['production'].notna() & (df['production'] >= 0)].copy()

# Normalize text cols
for col in ['crop','variety','state','season','unit','recommended_zone']:
    if col in df.columns:
        df[col] = df[col].astype(str).str.strip()
        df.loc[df[col].isin(['nan','None','NaN']), col] = np.nan

print("After cleaning:", df.shape)
print("Years:", sorted(df['year'].dropna().unique().tolist())[:10], "...", sorted(df['year'].dropna().unique().tolist())[-5:])
if 'unit' in df.columns:
    print("Unit counts:\n", df['unit'].value_counts().head())
print("Sources:\n", df['source_file'].value_counts())

Loaded: (2111, 15)
After cleaning: (2111, 15)
Years: [1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002] ... [2010, 2011, 2012, 2013, 2014]
Unit counts:
 unit
Tons    2111
Name: count, dtype: int64
Sources:
 source_file
produce.csv      1740
datafile2.csv     275
datafile.csv       96
Name: count, dtype: int64


In [15]:
# Choose grouping keys dynamically
cand_keys = ['state','crop','season']
keys = [k for k in cand_keys if k in df.columns and df[k].notna().any()]
if 'crop' not in keys and 'crop' in df.columns:
    keys = ['crop']  # at least crop
print("Group keys:", keys)

# Sort and create lags
df = df.sort_values(keys + ['year']).reset_index(drop=True)

def lag(s, n): return s.shift(n)

# 1-yr lags
df['prod_prev1']  = df.groupby(keys)['production'].transform(lambda s: lag(s,1))
if 'area_ha' in df.columns:
    df['area_prev1']  = df.groupby(keys)['area_ha'].transform(lambda s: lag(s,1))
if 'yield_q_ha' in df.columns:
    df['yield_prev1'] = df.groupby(keys)['yield_q_ha'].transform(lambda s: lag(s,1))

# 2-yr moving avg of production (lagged)
df['prod_ma2'] = df.groupby(keys)['production'].transform(lambda s: lag(s,1).rolling(2).mean())

# 2-yr lags for deltas
df['prod_prev2']  = df.groupby(keys)['production'].transform(lambda s: lag(s,2))
if 'area_ha' in df.columns:
    df['area_prev2']  = df.groupby(keys)['area_ha'].transform(lambda s: lag(s,2))
if 'yield_q_ha' in df.columns:
    df['yield_prev2'] = df.groupby(keys)['yield_q_ha'].transform(lambda s: lag(s,2))

# Deltas (prev1 - prev2)
df['prod_delta']  = df['prod_prev1']  - df['prod_prev2']
if 'area_prev1' in df.columns and 'area_prev2' in df.columns:
    df['area_delta']  = df['area_prev1']  - df['area_prev2']
if 'yield_prev1' in df.columns and 'yield_prev2' in df.columns:
    df['yield_delta'] = df['yield_prev1'] - df['yield_prev2']

# Keep rows with at least last year's production
df_model = df.dropna(subset=['prod_prev1']).copy()
print("Model rows:", df_model.shape)
df_model.head()

Group keys: ['crop', 'season']
Model rows: (302, 25)


Unnamed: 0,crop,variety,state,season,year,quantity,production,unit,cost,recommended_zone,...,prod_prev1,area_prev1,yield_prev1,prod_ma2,prod_prev2,area_prev2,yield_prev2,prod_delta,area_delta,yield_delta
59,Bajra,,,Kharif,2006,,7.684,Tons,,,...,7.9313,,,,,,,,,
60,Bajra,,,Kharif,2007,,8.4237,Tons,,,...,7.684,,,7.80765,7.9313,,,-0.2473,,
61,Bajra,,,Kharif,2008,,9.9701,Tons,,,...,8.4237,,,8.05385,7.684,,,0.7397,,
62,Bajra,,,Kharif,2009,,8.8871,Tons,,,...,9.9701,,,9.1969,8.4237,,,1.5464,,
63,Bajra,,,Kharif,2010,,6.506412,Tons,,,...,8.8871,,,9.4286,9.9701,,,-1.083,,


In [16]:
years = sorted(df_model['year'].dropna().unique().tolist())
print("Years in model:", years)

test_year  = years[-1]
valid_year = years[-2] if len(years) > 1 else years[-1]
train_years = [y for y in years if y not in [valid_year, test_year]]

train = df_model[df_model['year'].isin(train_years)]
valid = df_model[df_model['year'] == valid_year]
test  = df_model[df_model['year'] == test_year]
print("Split sizes -> train:", train.shape, "valid:", valid.shape, "test:", test.shape)

Years in model: [2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014]
Split sizes -> train: (238, 25) valid: (32, 25) test: (32, 25)


In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate(y_true, y_pred, name=""):
    y_true = np.asarray(y_true); y_pred = np.asarray(y_pred)
    try:
        rmse = mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{name} | MAE: {mean_absolute_error(y_true, y_pred):.3f} | RMSE: {rmse:.3f} | R2: {r2_score(y_true, y_pred):.3f}")

evaluate(valid['production'], valid['prod_prev1'], "Naive baseline (valid)")
evaluate(test['production'],  test['prod_prev1'],  "Naive baseline (test)")

Naive baseline (valid) | MAE: 14.135 | RMSE: 35.649 | R2: -0.089
Naive baseline (test) | MAE: 13.970 | RMSE: 35.458 | R2: -0.039


In [21]:
# 6) Model: RandomForest + OneHot + Imputation (handles NaNs)

import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Evaluation helper (compatible with older sklearn)
def evaluate(y_true, y_pred, name=""):
    y_true = np.asarray(y_true); y_pred = np.asarray(y_pred)
    try:
        rmse = mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
        rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f"{name} | MAE: {mean_absolute_error(y_true, y_pred):.3f} | RMSE: {rmse:.3f} | R2: {r2_score(y_true, y_pred):.3f}")

target = 'production'

# Forecast-safe features (only lagged and past info)
forecast_feats = [
    'year','prod_prev1','prod_prev2','prod_ma2','prod_delta',
    'area_prev1','area_prev2','area_delta',
    'yield_prev1','yield_prev2','yield_delta',
    'crop','season','state'
]
forecast_feats = [c for c in forecast_feats if c in df_model.columns]

# Remove features that are all-NaN in any split
def prune_feats(feats, dataframes):
    good = []
    for c in feats:
        ok = True
        for d in dataframes:
            if c not in d.columns or d[c].dropna().shape[0] == 0:
                ok = False; break
        if ok: good.append(c)
    return good

feature_cols = prune_feats(forecast_feats, [train, valid, test])
print("Using features:", feature_cols)

# Split columns by type
cat_cols = [c for c in feature_cols if df_model[c].dtype == 'object']
num_cols = [c for c in feature_cols if c not in cat_cols]

# Replace inf with NaN, then impute
for d in [train, valid, test]:
    d.replace([np.inf, -np.inf], np.nan, inplace=True)

# Build transformers
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')

# OneHot (version-safe)
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
except TypeError:
    try:
        ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        ohe = OneHotEncoder(handle_unknown='ignore')

pre = ColumnTransformer(
    transformers=[
        ('num', num_imputer, num_cols),
        ('cat', Pipeline(steps=[('imputer', cat_imputer), ('ohe', ohe)]), cat_cols),
    ],
    remainder='drop'
)

rf = RandomForestRegressor(
    n_estimators=600,
    random_state=42,
    n_jobs=-1,
    max_depth=None
)

pipe = Pipeline([('pre', pre), ('model', rf)])

# Fit
pipe.fit(train[feature_cols], train[target])

# Validate
pred_val = pipe.predict(valid[feature_cols])
evaluate(valid[target], pred_val, "RandomForest (valid)")

# Test
pred_test = pipe.predict(test[feature_cols])
evaluate(test[target], pred_test, "RandomForest (test)")

Using features: ['year', 'prod_prev1', 'prod_prev2', 'prod_ma2', 'prod_delta', 'crop', 'season']
RandomForest (valid) | MAE: 1.493 | RMSE: 3.248 | R2: 0.991
RandomForest (test) | MAE: 1.934 | RMSE: 3.610 | R2: 0.989


In [22]:
import joblib, os, json
os.makedirs("models", exist_ok=True)

# Retrain on train+valid
train_valid = pd.concat([train, valid], axis=0)
pipe.fit(train_valid[feature_cols], train_valid[target])

# Save model
joblib.dump(pipe, "models/production_predictor.joblib")

# Save meta with feature order (needed for app)
meta = {
    "feature_cols": feature_cols,
    "used_features": feature_cols,  # same here
}
with open("models/meta.json", "w") as f:
    json.dump(meta, f, indent=2)

print("Saved -> models/production_predictor.joblib and models/meta.json")

Saved -> models/production_predictor.joblib and models/meta.json


In [23]:
# app/app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib, json

st.set_page_config(page_title="Crop Production Forecast", page_icon="🌾")
st.title("Crop Production Forecast (India) 🌾")

# Load model + meta
model = joblib.load("models/production_predictor.joblib")
with open("models/meta.json", "r") as f:
    meta = json.load(f)
feature_cols = meta["feature_cols"]

# Load combined data for crop options (optional)
@st.cache_data
def load_data():
    try:
        df = pd.read_csv("data/interim/agri_combined.csv")
    except Exception:
        df = pd.DataFrame()
    return df

df_all = load_data()
crop_options = sorted([c for c in df_all.get('crop', pd.Series([])).dropna().unique().tolist()]) if not df_all.empty else []

# Inputs
col1, col2 = st.columns(2)
crop = col1.selectbox("Crop", options=crop_options if crop_options else ["Wheat","Rice","Maize","Sugarcane"], index=0)
season = col2.selectbox("Season (optional)", options=["Kharif","Rabi","—"], index=0)
if season == "—":
    season = None

year = st.number_input("Forecast Year (e.g., 2012)", min_value=1900, max_value=2100, value=2012, step=1)

st.markdown("Provide last two years' production for this crop (Tons). If you don't know, check your dataset or leave 2-years-ago empty.")
prod_prev1 = st.number_input("Last year production (prod_prev1) [Tons]", min_value=0.0, value=100.0, step=1.0)
prod_prev2 = st.number_input("2 years ago production (prod_prev2) [Tons] (optional)", min_value=0.0, value=0.0, step=1.0)

# Derive lag features used by the model
prod_ma2 = None
prod_delta = None
if prod_prev1 > 0 and prod_prev2 > 0:
    prod_ma2 = (prod_prev1 + prod_prev2) / 2.0
    prod_delta = prod_prev1 - prod_prev2
else:
    # Fallbacks if prod_prev2 unknown
    prod_ma2 = prod_prev1
    prod_delta = np.nan  # imputer will handle

# Build input row with exactly the model's feature columns
row = {
    "year": int(year),
    "prod_prev1": float(prod_prev1),
    "prod_prev2": float(prod_prev2) if prod_prev2 > 0 else np.nan,
    "prod_ma2": float(prod_ma2) if prod_ma2 is not None else np.nan,
    "prod_delta": float(prod_delta) if prod_delta is not None else np.nan,
    "crop": crop,
    "season": season if season is not None else np.nan,
}

# Keep only features the model expects; add any missing as NaN
X = pd.DataFrame([row])
for c in feature_cols:
    if c not in X.columns:
        X[c] = np.nan
X = X[feature_cols]

if st.button("Predict"):
    try:
        pred = float(model.predict(X)[0])
        st.success(f"Estimated production: {pred:,.2f} Tons")
        with st.expander("Model input (debug)"):
            st.dataframe(X)
    except Exception as e:
        st.error(f"Prediction failed: {e}")

2025-10-21 09:33:39.713 
  command:

    streamlit run C:\ProgramData\anaconda3\Lib\site-packages\ipykernel_launcher.py [ARGUMENTS]
2025-10-21 09:33:40.079 No runtime found, using MemoryCacheStorageManager
2025-10-21 09:33:40.088 No runtime found, using MemoryCacheStorageManager
2025-10-21 09:33:40.140 Session state does not function when running a script without `streamlit run`
