In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m99.2/99.2 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [5]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
import joblib

# --- CONFIGURATION ---
# Relative path of the raw price CSV; train_model() passes it to load_and_prep_data().
FILE_PATH = "Vegetables_fruit_prices.csv"
# Destination path given to model.save_model() once training has finished.
MODEL_FILE = "market_price_model.cbm"

def load_and_prep_data(filepath):
    """Load the raw price CSV and reshape it to one row per priced item.

    The input file holds, on each row, weather readings plus one fruit and one
    vegetable with their prices. This function renames the raw headers, derives
    month/year from ``Date``, and stacks the fruit and vegetable columns into a
    single ``Item`` / ``Price`` pair tagged with a ``Category``.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file (read with ``latin1`` encoding).

    Returns
    -------
    pandas.DataFrame
        Columns: Date, Month_Num, Year, Region, Temperature, Rainfall,
        Humidity, Item, Price, Category. Rows whose price is missing,
        non-numeric, zero or negative, and rows with any missing value, are
        removed. The index is not reset after filtering.

    Raises
    ------
    KeyError
        If no column name contains ``"Temp"``, or if any other expected
        column is absent from the file.
    """
    print("‚è≥ Loading dataset...")
    # low_memory=False lets pandas infer each column's dtype from the whole
    # file rather than chunk by chunk, which avoids mixed-type DtypeWarnings.
    df = pd.read_csv(filepath, encoding='latin1', low_memory=False)

    # 1. Clean column names.
    # The temperature header carries a unit symbol that does not survive the
    # encoding, so it is located by substring instead of by exact name.
    temp_col = next((c for c in df.columns if "Temp" in c), None)
    if temp_col is None:
        raise KeyError(
            f"No column containing 'Temp' found; available columns: {list(df.columns)}"
        )

    df = df.rename(columns={
        temp_col: 'Temperature',
        'Rainfall (mm)': 'Rainfall',
        'Humidity (%)': 'Humidity',
        'fruit_Commodity': 'Fruit_Item',
        'fruit_Price per Unit (LKR/kg)': 'Fruit_Price',
        'vegitable_Commodity': 'Veg_Item',
        'vegitable_Price per Unit (LKR/kg)': 'Veg_Price'
    })

    # 2. Date features.
    df['Date'] = pd.to_datetime(df['Date'])
    df['Month_Num'] = df['Date'].dt.month
    df['Year'] = df['Date'].dt.year

    # 3. Stack fruits and vegetables into one 'Item' and one 'Price' column.
    common_cols = ['Date', 'Month_Num', 'Year', 'Region', 'Temperature', 'Rainfall', 'Humidity']
    stacked_parts = []
    for item_col, price_col, category in (
        ('Fruit_Item', 'Fruit_Price', 'Fruit'),
        ('Veg_Item', 'Veg_Price', 'Vegetable'),
    ):
        part = (
            df[common_cols + [item_col, price_col]]
            .rename(columns={item_col: 'Item', price_col: 'Price'})
            .assign(Category=category)
        )
        stacked_parts.append(part)

    final_df = pd.concat(stacked_parts, ignore_index=True)

    # A price column read as text (e.g. containing placeholder tokens) would
    # make the comparison below raise TypeError; unparseable values become NaN
    # and are dropped together with the other invalid prices.
    final_df['Price'] = pd.to_numeric(final_df['Price'], errors='coerce')

    # Remove rows with 0 or negative prices, then any row with a missing value.
    final_df = final_df[final_df['Price'] > 0].dropna()

    print(f"‚úÖ Data Prepared: {len(final_df)} rows")
    return final_df

def train_model(filepath=None, model_file=None, test_size=0.2, random_state=42):
    """Train a CatBoost price regressor, report hold-out metrics and save it.

    Parameters
    ----------
    filepath : str or path-like, optional
        CSV to load via ``load_and_prep_data``. Defaults to ``FILE_PATH``.
    model_file : str or path-like, optional
        Where the fitted model is written. Defaults to ``MODEL_FILE``.
    test_size : float, optional
        Fraction of rows held out for evaluation (default 0.2).
    random_state : int, optional
        Seed for the train/test split only (default 42).

    Returns
    -------
    CatBoostRegressor
        The fitted model, so later notebook cells can inspect or reuse it
        without reloading it from disk.
    """
    # Resolve defaults at call time so edits to the config constants are honoured.
    if filepath is None:
        filepath = FILE_PATH
    if model_file is None:
        model_file = MODEL_FILE

    df = load_and_prep_data(filepath)

    # --- FEATURES & TARGET ---
    features = ['Region', 'Item', 'Category', 'Month_Num', 'Temperature', 'Rainfall', 'Humidity']
    target = 'Price'

    X = df[features]
    y = df[target]

    # CatBoost is told which columns are categorical by their position in X.
    cat_features_indices = [X.columns.get_loc(c) for c in ['Region', 'Item', 'Category']]

    # Split
    print("üöÄ Training Model (CatBoost)...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Initialize CatBoost Regressor
    model = CatBoostRegressor(
        iterations=1000,
        learning_rate=0.05,
        depth=6,
        loss_function='RMSE',
        cat_features=cat_features_indices,
        verbose=100  # log the training loss every 100 iterations
    )

    model.fit(X_train, y_train)

    # Evaluate on the held-out rows
    preds = model.predict(X_test)
    r2 = r2_score(y_test, preds)
    mae = mean_absolute_error(y_test, preds)

    print("\nüèÜ Model Performance:")
    print(f"   R¬≤ Score: {r2:.4f}")
    print(f"   MAE: Rs {mae:.2f}")

    # Save
    model.save_model(model_file)
    print(f"üíæ Model saved to '{model_file}'")

    return model

# Kick off training only when this code runs as the main program (a notebook
# cell or a script), not when it is imported as a module.
if __name__ == "__main__":
    train_model()

‚è≥ Loading dataset...


  df = pd.read_csv(filepath, encoding='latin1') # Added encoding parameter


‚úÖ Data Prepared: 260000 rows
üöÄ Training Model (CatBoost)...
0:	learn: 129.8176951	total: 164ms	remaining: 2m 44s
100:	learn: 119.5275697	total: 10.6s	remaining: 1m 34s
200:	learn: 115.0931022	total: 20.3s	remaining: 1m 20s
300:	learn: 110.6288979	total: 30.7s	remaining: 1m 11s
400:	learn: 107.5053598	total: 39.8s	remaining: 59.5s
500:	learn: 105.1181253	total: 50.2s	remaining: 50s
600:	learn: 103.5193950	total: 1m	remaining: 40.4s
700:	learn: 101.9536248	total: 1m 11s	remaining: 30.6s
800:	learn: 100.8670164	total: 1m 22s	remaining: 20.5s
900:	learn: 99.9085733	total: 1m 32s	remaining: 10.2s
999:	learn: 98.9014674	total: 1m 41s	remaining: 0us

üèÜ Model Performance:
   R¬≤ Score: 0.4688
   MAE: Rs 77.43
üíæ Model saved to 'market_price_model.cbm'
