In [3]:
# %% [markdown]
# ## 1️⃣ Import Libraries

import os
import time
import joblib
import cloudpickle
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error


In [4]:
# %% [markdown]
# ## 2️⃣ Load and Clean Data

# Read the raw price export. Further CSVs can be read the same way and
# appended to the list handed to pd.concat below.
kolhapur = pd.read_csv("datasets/kolhapur.csv")
data = pd.concat([kolhapur], ignore_index=True)

# Header names may carry stray whitespace; strip it before any column lookup.
data.columns = data.columns.str.strip()

# Dates are parsed day-first; unparseable values become NaT and are discarded
# together with rows that have no modal price.
data["Arrival_Date"] = pd.to_datetime(data["Arrival_Date"], dayfirst=True, errors="coerce")
data = data.dropna(subset=["Arrival_Date", "Modal_Price"])

# Calendar features derived from the arrival date (appended in this order).
data = data.assign(
    Year=lambda frame: frame["Arrival_Date"].dt.year,
    Month=lambda frame: frame["Arrival_Date"].dt.month,
    Day=lambda frame: frame["Arrival_Date"].dt.day,
    DayOfWeek=lambda frame: frame["Arrival_Date"].dt.dayofweek,
    WeekOfYear=lambda frame: frame["Arrival_Date"].dt.isocalendar().week.astype(int),
)

# Model inputs and the column to predict.
target = "Modal_Price"
features = ["District", "Market", "Commodity", "Year", "Month", "Day", "DayOfWeek", "WeekOfYear"]

# Keep only rows where every model input and the target are present.
data = data.dropna(subset=[*features, target])

# De-duplicate and remember the row counts on either side for the report.
before_size = len(data)
data = data.drop_duplicates()
after_size = len(data)

print("‚úÖ Data loaded and cleaned successfully!")
print(f"Before removing duplicates: {before_size:,} rows")
print(f"After removing duplicates:  {after_size:,} rows")
print(f"Duplicates removed:         {before_size - after_size:,} rows")
display(data.head())


‚úÖ Data loaded and cleaned successfully!
Before removing duplicates: 200,650 rows
After removing duplicates:  200,650 rows
Duplicates removed:         0 rows


Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min_Price,Max_Price,Modal_Price,Commodity_Code,Year,Month,Day,DayOfWeek,WeekOfYear
0,Maharashtra,Kolhapur,Gadhinglaj,Chili Red,Other,FAQ,2016-02-10,7500,36000,19000,26,2016,2,10,2,6
1,Maharashtra,Kolhapur,Gadhinglaj,Chili Red,Other,FAQ,2016-03-12,7500,45000,26500,26,2016,3,12,5,10
2,Maharashtra,Kolhapur,Gadhinglaj,Chili Red,Other,FAQ,2016-10-31,8800,25000,16000,26,2016,10,31,0,44
3,Maharashtra,Kolhapur,Gadhinglaj,Chili Red,Other,FAQ,2016-11-09,7000,26600,15000,26,2016,11,9,2,45
4,Maharashtra,Kolhapur,Gadhinglaj,Chili Red,Other,FAQ,2016-11-12,8500,29000,14000,26,2016,11,12,5,45


In [5]:
# %% [markdown]
# ## 3️⃣ Model Configuration and Preprocessing

# File the fitted pipeline is written to by save_model() and read from by
# load_model(); train_model() skips training when this file already exists.
MODEL_PATH = "backup/crop_price_model_01.pkl"

# Columns passed to the OneHotEncoder in create_preprocessor().
CATEGORICAL_FEATURES = ["District", "Market", "Commodity"]
# Columns passed to the StandardScaler in create_preprocessor().
NUMERIC_FEATURES = ["Year", "Month", "Day", "DayOfWeek", "WeekOfYear"]

def create_preprocessor():
    """Return the ColumnTransformer that prepares raw feature columns.

    CATEGORICAL_FEATURES are one-hot encoded (with handle_unknown="ignore",
    so categories not seen during fitting do not raise at transform time) and
    NUMERIC_FEATURES are standardised. Any other column is dropped.
    """
    column_steps = [
        ("cat", OneHotEncoder(handle_unknown="ignore"), CATEGORICAL_FEATURES),
        ("num", StandardScaler(), NUMERIC_FEATURES),
    ]
    return ColumnTransformer(transformers=column_steps, remainder="drop")


In [6]:
# %% [markdown]
# ## 4️⃣ Define Helper Functions

def prepare_data(data: pd.DataFrame, features: list, target: str,
                 test_size: float = 0.2, random_state: int = 42):
    """Split the dataset into train/test sets.

    Args:
        data: Frame holding both the feature columns and the target column.
        features: Names of the columns used as model inputs.
        target: Name of the column to predict.
        test_size: Share of rows held out for evaluation, forwarded to
            ``train_test_split``. Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            repeatable. Defaults to 42.

    Returns:
        The ``(X_train, X_test, y_train, y_test)`` sequence produced by
        ``train_test_split``.

    Raises:
        KeyError: If a requested column is absent from ``data`` (raised by
            pandas during column selection).
    """
    # NOTE(review): train_test_split shuffles rows by default, so held-out
    # rows are not guaranteed to be later in time than training rows --
    # confirm this is acceptable for a price-forecasting evaluation.
    X = data[features]
    y = data[target]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)


def create_model():
    """Assemble the unfitted pipeline: preprocessing, then a random forest.

    Returns a two-step Pipeline whose "preprocessor" step comes from
    create_preprocessor() and whose "regressor" step is a
    RandomForestRegressor with fixed hyper-parameters and a fixed seed.
    """
    forest_params = dict(
        n_estimators=150,      # a bit higher for stability
        max_depth=20,
        min_samples_split=4,
        n_jobs=-1,
        random_state=42,
    )
    steps = [
        ("preprocessor", create_preprocessor()),
        ("regressor", RandomForestRegressor(**forest_params)),
    ]
    return Pipeline(steps)


def save_model(model):
    """Persist ``model`` to MODEL_PATH using joblib, falling back to cloudpickle.

    The parent directory of MODEL_PATH is created first if it is missing.
    Without that step both the joblib write and the cloudpickle fallback fail
    with FileNotFoundError on a fresh checkout, discarding a freshly trained
    model.

    Args:
        model: The fitted object to serialise.

    Raises:
        OSError: If the directory cannot be created or the fallback write
            fails.
        Exception: Whatever cloudpickle raises if the fallback serialisation
            itself fails.
    """
    # dirname is "" when MODEL_PATH has no directory component; makedirs("")
    # would raise, so only create a directory when there is one to create.
    target_dir = os.path.dirname(MODEL_PATH)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    try:
        joblib.dump(model, MODEL_PATH)
    except Exception as e:
        # Best-effort fallback: report why joblib failed, then retry with cloudpickle.
        print(f"‚ö†Ô∏è Joblib save failed: {e}, trying cloudpickle...")
        with open(MODEL_PATH, "wb") as f:
            cloudpickle.dump(model, f)


def load_model():
    """Read the persisted model from MODEL_PATH.

    joblib is tried first. If it raises, the error is printed and the file is
    re-read with cloudpickle; an exception from that second attempt
    propagates to the caller.
    """
    try:
        restored = joblib.load(MODEL_PATH)
    except Exception as e:
        print(f"‚ö†Ô∏è Joblib load failed ({e}), trying cloudpickle...")
        with open(MODEL_PATH, "rb") as stream:
            restored = cloudpickle.load(stream)
    return restored


In [7]:
# %% [markdown]
# ## 5️⃣ Train or Load Model

def train_model(data: pd.DataFrame, features: list, target: str, force_retrain: bool = False):
    """Return a fitted pipeline, reusing the one saved at MODEL_PATH when possible.

    If MODEL_PATH exists and ``force_retrain`` is false, the saved model is
    loaded and returned without looking at ``data``. The saved model is not
    checked against the current data or feature list, so pass
    ``force_retrain=True`` after changing either.

    Otherwise a new pipeline is fitted on the training split, scored on the
    held-out split (R^2 and RMSE are printed), saved to MODEL_PATH and
    returned.

    Args:
        data: Cleaned frame containing ``features`` and ``target``.
        features: Names of the model input columns.
        target: Name of the column to predict.
        force_retrain: When true, ignore any saved model and train again,
            overwriting the file at MODEL_PATH. Defaults to False, which is
            the original caching behaviour.

    Returns:
        The loaded or newly fitted model.
    """
    model_is_cached = os.path.exists(MODEL_PATH)
    if model_is_cached and not force_retrain:
        print("‚úÖ Loaded saved model (no retraining needed)")
        return load_model()

    if model_is_cached:
        # Only reachable with force_retrain=True: the "no saved model found"
        # message would be wrong here.
        print("Retraining model (force_retrain=True, existing file will be overwritten)...")
    else:
        print("‚öôÔ∏è Training new model (no saved model found)...")

    X_train, X_test, y_train, y_test = prepare_data(data, features, target)
    model = create_model()

    start = time.time()
    model.fit(X_train, y_train)
    end = time.time()

    print(f"‚úÖ Training complete in {end - start:.2f} seconds")

    # Evaluate on the held-out split only; the model is saved regardless of the score.
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print(f"üìä R¬≤: {r2:.3f}, RMSE: {rmse:.2f}")

    save_model(model)
    print(f"üíæ Model saved as {MODEL_PATH}")

    return model


# train_model() returns the pipeline stored at MODEL_PATH when that file
# exists; otherwise it trains, evaluates and saves a new one.
model = train_model(data, features, target)


‚úÖ Loaded saved model (no retraining needed)


In [8]:
# %% [markdown]
# ## 6️⃣ Predict Future Crop Prices

from datetime import datetime

def predict_crop_price(region: str, crop: str, date: str, market: str = None, model=None):
    """Predict the modal price of ``crop`` in ``region`` on ``date``.

    Args:
        region: District name, used as the "District" feature.
        crop: Commodity name, used as the "Commodity" feature.
        date: Anything ``pd.to_datetime`` can parse into a single timestamp.
        market: Market name. When omitted, the most frequent market recorded
            for ``region`` in the module-level ``data`` frame is used.
        model: Fitted pipeline to predict with. When omitted, the model is
            loaded from disk via ``load_model()``; pass an in-memory model to
            avoid re-reading the file on every call.

    Returns:
        The predicted price as a plain ``float`` rounded to 2 decimals.

    Raises:
        ValueError: If ``market`` is omitted and ``data`` has no rows for
            ``region``, so no default market can be chosen.
    """
    if model is None:
        model = load_model()

    date_obj = pd.to_datetime(date)
    year, month, day = date_obj.year, date_obj.month, date_obj.day
    day_of_week = date_obj.dayofweek
    week_of_year = date_obj.isocalendar()[1]

    # Fallback: most frequent market for the region (ties resolved by
    # Series.mode()'s sorted order).
    if market is None:
        region_markets = data.loc[data["District"] == region, "Market"].mode()
        if region_markets.empty:
            # Previously surfaced as an opaque IndexError from .iloc[0].
            raise ValueError(
                f"No market data available for region {region!r}; "
                "pass `market` explicitly."
            )
        market = region_markets.iloc[0]

    sample = pd.DataFrame([{
        "District": region,
        "Market": market,
        "Commodity": crop,
        "Year": year,
        "Month": month,
        "Day": day,
        "DayOfWeek": day_of_week,
        "WeekOfYear": week_of_year
    }])

    price = model.predict(sample)[0]
    # Convert the numpy scalar so callers receive a built-in float.
    return round(float(price), 2)


# Demo call: onion price in Kolhapur on 2025-11-01 (market left to the default).
region, crop, date = "Kolhapur", "Onion", "2025-11-01"

predicted_price = predict_crop_price(region=region, crop=crop, date=date)
print(f"üí∞ Predicted price of {crop} in {region} on {date}: ‚Çπ{predicted_price} per quintal")


üí∞ Predicted price of Onion in Kolhapur on 2025-11-01: ‚Çπ911.83 per quintal
